Compare commits

...

245 Commits

Author SHA1 Message Date
David Gibson
125a9cb8e3 Update dtc submodule to v1.4.3
Since the last submodule update (which was v1.4.2) dtc and libfdt have
gained some features which would be useful in qemu.  There's now a v1.4.3
upstream release, so update our submodule to point to it.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-03 17:25:32 +11:00
Alexey Kardashevskiy
a438fa121f pseries: Update SLOF firmware image
Various fixes in this update; the full list is:

  > qemu-bootlist: Take the "-boot strict=off" setting properly into account
  > virtio-scsi: initialize vring avail queue buffers
  > virtio: Remove global variables in block and 9p driver
  > Remove superfluous checkpoints in tree.fs
  > Provide "write" function in the disk-label package
  > virtio: Implement block write support
  > scsi: Add SCSI block write support
  > deblocker: Add a 'write' function
  > virtio-scsi: Fix descriptor order for SCSI WRITE commands
  > board-qemu: Add a possibility to use hvterm input instead of USB keyboard
  > Do not try to use virtio-gpu in VGA mode
  > virtio: Fix stack comment of virtio-blk-read
  > envvar: Do not read default values for /options from the NVRAM anymore
  > envvar: Set properties in /options during "(set-defaults)"

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-03 17:25:32 +11:00
Peter Maydell
ecb24d334a Merge remote-tracking branch 'remotes/rth/tags/pull-tgt-20170302' into staging
Queued sparc patch

# gpg: Signature made Wed 01 Mar 2017 19:53:21 GMT
# gpg:                using RSA key 0xAD1270CC4DD0279B
# gpg: Good signature from "Richard Henderson <rth7680@gmail.com>"
# gpg:                 aka "Richard Henderson <rth@redhat.com>"
# gpg:                 aka "Richard Henderson <rth@twiddle.net>"
# Primary key fingerprint: 9CB1 8DDA F8E8 49AD 2AFC  16A4 AD12 70CC 4DD0 279B

* remotes/rth/tags/pull-tgt-20170302:
  target/sparc: Restore ldstub of odd asis

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2017-03-02 22:06:41 +00:00
Peter Maydell
6835504887 Merge remote-tracking branch 'remotes/kraxel/tags/pull-audio-20170301-1' into staging
audio: replay support, sdl2 fix.

# gpg: Signature made Wed 01 Mar 2017 15:38:09 GMT
# gpg:                using RSA key 0x4CB6D8EED3E87138
# gpg: Good signature from "Gerd Hoffmann (work) <kraxel@redhat.com>"
# gpg:                 aka "Gerd Hoffmann <gerd@kraxel.org>"
# gpg:                 aka "Gerd Hoffmann (private) <kraxel@gmail.com>"
# Primary key fingerprint: A032 8CFF B93A 17A7 9901  FE7D 4CB6 D8EE D3E8 7138

* remotes/kraxel/tags/pull-audio-20170301-1:
  audio/sdlaudio: Allow audio playback with SDL2
  audio: make audio poll timer deterministic
  replay: add record/replay for audio passthrough

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2017-03-02 20:31:49 +00:00
Peter Maydell
b49d31a05a Merge remote-tracking branch 'remotes/kraxel/tags/pull-docs-20170301-1' into staging
docs: update sample configuration files

# gpg: Signature made Wed 01 Mar 2017 13:43:34 GMT
# gpg:                using RSA key 0x4CB6D8EED3E87138
# gpg: Good signature from "Gerd Hoffmann (work) <kraxel@redhat.com>"
# gpg:                 aka "Gerd Hoffmann <gerd@kraxel.org>"
# gpg:                 aka "Gerd Hoffmann (private) <kraxel@gmail.com>"
# Primary key fingerprint: A032 8CFF B93A 17A7 9901  FE7D 4CB6 D8EE D3E8 7138

* remotes/kraxel/tags/pull-docs-20170301-1:
  mach-virt: Provide sample configuration files
  q35: Improve sample configuration files

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2017-03-02 19:27:30 +00:00
Peter Maydell
251501a371 Merge remote-tracking branch 'remotes/dgilbert/tags/pull-migration-20170228a' into staging
Migration pull

Note: The 'postcopy: Update userfaultfd.h header' patch is part of
Paolo's header update and will disappear if this is applied after it.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>

# gpg: Signature made Tue 28 Feb 2017 12:38:34 GMT
# gpg:                using RSA key 0x0516331EBC5BFDE7
# gpg: Good signature from "Dr. David Alan Gilbert (RH2) <dgilbert@redhat.com>"
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg:          There is no indication that the signature belongs to the owner.
# Primary key fingerprint: 45F5 C71B 4A0C B7FB 977A  9FA9 0516 331E BC5B FDE7

* remotes/dgilbert/tags/pull-migration-20170228a: (27 commits)
  postcopy: Add extra check for COPY function
  postcopy: Add doc about hugepages and postcopy
  postcopy: Check for userfault+hugepage feature
  postcopy: Update userfaultfd.h header
  postcopy: Allow hugepages
  postcopy: Send whole huge pages
  postcopy: Mask fault addresses to huge page boundary
  postcopy: Load huge pages in one go
  postcopy: Use temporary for placing zero huge pages
  postcopy: Plumb pagesize down into place helpers
  postcopy: Record largest page size
  postcopy: enhance ram_block_discard_range for hugepages
  exec: ram_block_discard_range
  postcopy: Chunk discards for hugepages
  postcopy: Transmit and compare individual page sizes
  postcopy: Transmit ram size summary word
  migration: fix use-after-free of to_dst_file
  migration: Update docs to discourage version bumps
  migration: fix id leak regression
  migrate: Introduce a 'dc->vmsd' check to avoid segfault for --only-migratable
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2017-03-02 17:39:12 +00:00
Peter Maydell
c9fc677a35 Merge remote-tracking branch 'remotes/elmarco/tags/leak-pull-request' into staging
# gpg: Signature made Wed 01 Mar 2017 09:02:53 GMT
# gpg:                using RSA key 0xDAE8E10975969CE5
# gpg: Good signature from "Marc-André Lureau <marcandre.lureau@redhat.com>"
# gpg:                 aka "Marc-André Lureau <marcandre.lureau@gmail.com>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg:          It is not certain that the signature belongs to the owner.
# Primary key fingerprint: 87A9 BD93 3F87 C606 D276  F62D DAE8 E109 7596 9CE5

* remotes/elmarco/tags/leak-pull-request: (28 commits)
  tests: fix virtio-blk-test leaks
  tests: add specialized device_find function
  tests: fix usb-test leaks
  tests: allows to run single test in usb-hcd-ehci-test
  usb: release the created buses
  bus: do not unref hotplug handler
  tests: fix virtio-9p-test leaks
  tests: fix virtio-scsi-test leak
  tests: fix e1000e leaks
  tests: fix i440fx-test leaks
  tests: fix e1000-test leak
  tests: fix tco-test leaks
  tests: fix eepro100-test leak
  pc: pcihp: avoid adding ACPI_PCIHP_PROP_BSEL twice
  tests: fix ipmi-bt-test leak
  tests: fix ipmi-kcs-test leak
  tests: fix bios-tables-test leak
  tests: fix hd-geo-test leaks
  tests: fix ide-test leaks
  tests: fix vhost-user-test leaks
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2017-03-02 15:25:37 +00:00
Peter Maydell
ab711e216b Merge remote-tracking branch 'remotes/dgibson/tags/ppc-for-2.9-20170301' into staging
ppc patch queue for 2017-03-01

I was hoping to get this pull request squeezed in before the soft
freeze, but I ran into some difficulties during testing.  Everything
here was at least posted before the soft freeze, so I'm hoping we can
still merge it for 2.9.

The biggest things here are:
    * Cleanups to handling of hashed page tables, that will make
      adding support for the POWER9 MMU easier
    * Cleanups to the XICS interrupt controller that will make
      implementing the powernv machine easier
    * TCG implementation of extended overflow and carry handling for
      POWER9

It also includes:
    * Increasing the CPU limit for pseries to 1024 vCPUs
    * Generating proper OF node names in qemu (making hotplug and
      coldplug logic closer together)

# gpg: Signature made Wed 01 Mar 2017 04:43:06 GMT
# gpg:                using RSA key 0x6C38CACA20D9B392
# gpg: Good signature from "David Gibson <david@gibson.dropbear.id.au>"
# gpg:                 aka "David Gibson (Red Hat) <dgibson@redhat.com>"
# gpg:                 aka "David Gibson (ozlabs.org) <dgibson@ozlabs.org>"
# gpg:                 aka "David Gibson (kernel.org) <dwg@kernel.org>"
# Primary key fingerprint: 75F4 6586 AE61 A66C C44E  87DC 6C38 CACA 20D9 B392

* remotes/dgibson/tags/ppc-for-2.9-20170301: (50 commits)
  Add PowerPC 32-bit guest memory dump support
  ppc/xics: rename 'ICPState *' variables to 'icp'
  ppc/xics: move InterruptStatsProvider to the sPAPR machine
  ppc/xics: move ics-simple post_load under the machine
  ppc/xics: remove the XICSState classes
  ppc/xics: export the XICS init routines
  ppc/xics: move the ICP array under the sPAPR machine
  ppc/xics: register the reset handler of ICP objects
  ppc/xics: simplify spapr_dt_xics() interface
  ppc/xics: use the QOM interface to grab an ICP
  ppc/xics: move the cpu_setup() handler under the ICPState class
  ppc/xics: simplify the cpu_setup() handler
  ppc/xics: move kernel_xics_fd out of KVMXICSState
  ppc/xics: extend the QOM interface to handle ICPs
  ppc/xics: remove the XICS list of ICS
  ppc/xics: register the reset handler of ICS objects
  ppc/xics: remove xics_find_source()
  ppc/xics: use the QOM interface to resend irqs
  ppc/xics: use the QOM interface to get irqs
  ppc/xics: use the QOM interface under the sPAPR machine
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2017-03-02 13:50:55 +00:00
Peter Maydell
4bc0d39a2f Merge remote-tracking branch 'remotes/mcayland/tags/qemu-openbios-signed' into staging
Update OpenBIOS images

# gpg: Signature made Tue 28 Feb 2017 22:09:11 GMT
# gpg:                using RSA key 0x5BC2C56FAE0F321F
# gpg: Good signature from "Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>"
# Primary key fingerprint: CC62 1AB9 8E82 200D 915C  C9C4 5BC2 C56F AE0F 321F

* remotes/mcayland/tags/qemu-openbios-signed:
  Update OpenBIOS images to 0cd97cc built from submodule.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2017-03-02 12:30:11 +00:00
Peter Maydell
666095c852 Merge remote-tracking branch 'remotes/ehabkost/tags/x86-pull-request' into staging
x86 queue, 2017-02-27

"-cpu max" and query-cpu-model-expansion support for x86. This
should be the last x86 pull request before 2.9 soft freeze.

# gpg: Signature made Mon 27 Feb 2017 16:24:15 GMT
# gpg:                using RSA key 0x2807936F984DC5A6
# gpg: Good signature from "Eduardo Habkost <ehabkost@redhat.com>"
# Primary key fingerprint: 5A32 2FD5 ABC4 D3DB ACCF  D1AA 2807 936F 984D C5A6

* remotes/ehabkost/tags/x86-pull-request:
  i386: Improve query-cpu-model-expansion full mode
  i386: Implement query-cpu-model-expansion QMP command
  i386: Define static "base" CPU model
  i386: Don't set CPUClass::cpu_def on "max" model
  i386: Make "max" model not use any host CPUID info on TCG
  i386: Create "max" CPU model
  qapi-schema: Comment about full expansion of non-migration-safe models
  i386: Reorganize and document CPUID initialization steps
  i386: Rename X86CPU::host_features to X86CPU::max_features
  i386: Add ordering field to CPUClass
  i386: Unset cannot_destroy_with_object_finalize_yet on "host" model

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2017-03-02 11:18:01 +00:00
Peter Maydell
f1d640524d Merge remote-tracking branch 'remotes/kraxel/tags/pull-seabios-20170228-1' into staging
seabios: update to 1.10.2 release

# gpg: Signature made Tue 28 Feb 2017 08:57:57 GMT
# gpg:                using RSA key 0x4CB6D8EED3E87138
# gpg: Good signature from "Gerd Hoffmann (work) <kraxel@redhat.com>"
# gpg:                 aka "Gerd Hoffmann <gerd@kraxel.org>"
# gpg:                 aka "Gerd Hoffmann (private) <kraxel@gmail.com>"
# Primary key fingerprint: A032 8CFF B93A 17A7 9901  FE7D 4CB6 D8EE D3E8 7138

* remotes/kraxel/tags/pull-seabios-20170228-1:
  seabios: update to 1.10.2 release

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2017-03-02 10:17:45 +00:00
Peter Maydell
d377b80338 Merge remote-tracking branch 'remotes/rth/tags/pull-tcg-20170301' into staging
Queued TCG patch

# gpg: Signature made Tue 28 Feb 2017 21:30:32 GMT
# gpg:                using RSA key 0xAD1270CC4DD0279B
# gpg: Good signature from "Richard Henderson <rth7680@gmail.com>"
# gpg:                 aka "Richard Henderson <rth@redhat.com>"
# gpg:                 aka "Richard Henderson <rth@twiddle.net>"
# Primary key fingerprint: 9CB1 8DDA F8E8 49AD 2AFC  16A4 AD12 70CC 4DD0 279B

* remotes/rth/tags/pull-tcg-20170301:
  aarch64: Change ext type to TCGType to fix warnings

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2017-03-02 08:35:14 +00:00
Peter Maydell
b9fe31392b Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging
Block layer patches

# gpg: Signature made Tue 28 Feb 2017 20:35:32 GMT
# gpg:                using RSA key 0x7F09B272C88F2FD6
# gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>"
# Primary key fingerprint: DC3D EB15 9A9A F95D 3D74  56FE 7F09 B272 C88F 2FD6

* remotes/kevin/tags/for-upstream: (46 commits)
  block: Add Error parameter to bdrv_append()
  block: Add Error parameter to bdrv_set_backing_hd()
  block: Assertions for resize permission
  block: Assertions for write permissions
  block: Pass BdrvChild to bdrv_aligned_preadv/pwritev and copy-on-read
  tests: Remove FIXME comments
  nbd/server: Use real permissions for NBD exports
  migration/block: Use real permissions
  hmp: Request permissions in qemu-io
  commit: Add filter-node-name to block-commit
  mirror: Add filter-node-name to blockdev-mirror
  stream: Use real permissions in streaming block job
  mirror: Use real permissions in mirror/active commit block job
  blockjob: Factor out block_job_remove_all_bdrv()
  block: Allow backing file links in change_parent_backing_link()
  block: BdrvChildRole.attach/detach() callbacks
  block: Fix pending requests check in bdrv_append()
  backup: Use real permissions in backup block job
  commit: Use real permissions for HMP 'commit'
  commit: Use real permissions in commit block job
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2017-03-01 23:09:46 +00:00
Peter Maydell
1e0addb682 Merge remote-tracking branch 'remotes/sstabellini/tags/xen-20170228-tag' into staging
Xen 2017/02/28

# gpg: Signature made Tue 28 Feb 2017 19:13:08 GMT
# gpg:                using RSA key 0x894F8F4870E1AE90
# gpg: Good signature from "Stefano Stabellini <sstabellini@kernel.org>"
# gpg:                 aka "Stefano Stabellini <stefano.stabellini@eu.citrix.com>"
# Primary key fingerprint: D04E 33AB A51F 67BA 07D3  0AEA 894F 8F48 70E1 AE90

* remotes/sstabellini/tags/xen-20170228-tag:
  Add a new qmp command to do checkpoint, query xen replication status
  Add a new qmp command to start/stop replication

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2017-03-01 20:33:47 +00:00
Richard Henderson
3db010c339 target/sparc: Restore ldstub of odd asis
Fixes the booting of ss20 roms.

Cc: qemu-stable@nongnu.org
Reported-by: Michael Russo <mike@papersolve.com>
Tested-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2017-03-02 06:52:43 +11:00
Peter Maydell
b28f9db1a7 Merge remote-tracking branch 'remotes/pmaydell/tags/pull-target-arm-20170228-1' into staging
target-arm queue:
 * raspi2: add gpio controller and sdhost controller, with
   the wiring so the guest can switch which controller the
   SD card is attached to
   (this is sufficient to get raspbian kernels to boot)
 * GICv3: support state save/restore from KVM
 * update Linux headers to 4.11
 * refactor and QOMify the ARMv7M container object

# gpg: Signature made Tue 28 Feb 2017 17:11:49 GMT
# gpg:                using RSA key 0x3C2525ED14360CDE
# gpg: Good signature from "Peter Maydell <peter.maydell@linaro.org>"
# gpg:                 aka "Peter Maydell <pmaydell@gmail.com>"
# gpg:                 aka "Peter Maydell <pmaydell@chiark.greenend.org.uk>"
# Primary key fingerprint: E1A5 C593 CD41 9DE2 8E83  15CF 3C25 25ED 1436 0CDE

* remotes/pmaydell/tags/pull-target-arm-20170228-1: (21 commits)
  bcm2835: add sdhost and gpio controllers
  bcm2835_gpio: add bcm2835 gpio controller
  hw/sd: add card-reparenting function
  qdev: Have qdev_set_parent_bus() handle devices already on a bus
  hw/intc/arm_gicv3_kvm: Reset GICv3 cpu interface registers
  target-arm: Add GICv3CPUState in CPUARMState struct
  hw/intc/arm_gicv3_kvm: Implement get/put functions
  hw/intc/arm_gicv3_kvm: Add ICC_SRE_EL1 register to vmstate
  update Linux headers to 4.11
  update-linux-headers: update for 4.11
  stm32f205: Rename 'nvic' local to 'armv7m'
  stm32f205: Create armv7m object without using armv7m_init()
  armv7m: Split systick out from NVIC
  armv7m: Don't put core v7M devices under CONFIG_STELLARIS
  armv7m: Make bitband device take the address space to access
  armv7m: Make NVIC expose a memory region rather than mapping itself
  armv7m: Make ARMv7M object take memory region link
  armv7m: Use QOMified armv7m object in armv7m_init()
  armv7m: QOMify the armv7m container
  armv7m: Move NVICState struct definition into header
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2017-03-01 17:58:54 +00:00
Thomas Huth
bcf19777df audio/sdlaudio: Allow audio playback with SDL2
When compiling with SDL2, the semaphore trick used in sdlaudio.c
does not work - QEMU locks up completely in this case. To avoid
the hang and get at least some audio playback up and running (it's
a little bit crackling, but better than nothing), we can use the
SDL locking functions SDL_LockAudio() and SDL_UnlockAudio() to sync
with the sound playback thread instead.
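
As a rough, self-contained sketch of the locking pattern described above
(this is not the actual sdlaudio.c change; the buffer and helper names
are hypothetical):

    #include <SDL.h>
    #include <string.h>

    /* Data shared between the QEMU side and SDL's playback thread
     * (hypothetical buffer, for illustration only). */
    static Uint8 mixed_buf[4096];
    static int   mixed_len;

    /* Called from QEMU: protect the shared buffer with SDL's audio
     * lock instead of the semaphore trick used with SDL 1.2. */
    static void push_samples(const Uint8 *src, int len)
    {
        SDL_LockAudio();
        memcpy(mixed_buf, src, len);
        mixed_len = len;
        SDL_UnlockAudio();
    }

    /* SDL invokes this callback from its playback thread; SDL holds
     * the same lock around the callback, so accesses are serialized. */
    static void sdl_callback(void *opaque, Uint8 *stream, int len)
    {
        int n = mixed_len < len ? mixed_len : len;
        memcpy(stream, mixed_buf, n);
        mixed_len -= n;
    }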

Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-id: 1485852398-2327-1-git-send-email-thuth@redhat.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2017-03-01 15:12:03 +01:00
Pavel Dovgalyuk
1ffc266539 audio: make audio poll timer deterministic
This patch changes the resetting strategy of the audio polling timer:
it does not change the expiration time if the timer is already set.
This is needed to make the timer deterministic and to enable execution
record/replay for audio devices.

audio_reset_timer is used in the function audio_vm_change_state_handler,
so every time the VM is stopped or restarted the timer would be reset
to a new timeout. The virtual clock does not advance while the VM is
stopped, so there is no need to reset the timeout when the VM restarts.
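
A hedged sketch of the difference (the helper below is hypothetical, not
audio_reset_timer() itself, and assumes QEMU's timer API): timer_mod()
always rewrites the deadline, while timer_mod_anticipate() only moves it
earlier or arms an unarmed timer, so a VM stop/restart cannot postpone
the poll:

    #include "qemu/osdep.h"
    #include "qemu/timer.h"

    static void rearm_poll_timer(QEMUTimer *t, int64_t period_ns)
    {
        int64_t next = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + period_ns;

        /* Unlike timer_mod(), this leaves an already-armed, earlier
         * deadline untouched, keeping the expiry deterministic. */
        timer_mod_anticipate(t, next);
    }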

v2: updated commit message
v3: now using timer_mod_anticipate function (as suggested by Yurii Zubrytskyi)

Signed-off-by: Pavel Dovgalyuk <pavel.dovgaluk@ispras.ru>
Message-id: 20170214071510.6112.76764.stgit@PASHA-ISP
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2017-03-01 15:12:03 +01:00
Pavel Dovgalyuk
3d4d16f4dc replay: add record/replay for audio passthrough
This patch adds recording and replaying of audio data. It saves synchronization
information for audio output and for input from the microphone.

v2: removed unneeded whitespace change

Signed-off-by: Pavel Dovgalyuk <pavel.dovgaluk@ispras.ru>
Message-id: 20170202055054.4848.94901.stgit@PASHA-ISP.lan02.inno

[ kraxel: add qemu/error-report.h include to fix osx build failure ]

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2017-03-01 15:11:44 +01:00
Peter Maydell
7287e3556f Merge remote-tracking branch 'remotes/gkurz/tags/cve-2016-9602-for-upstream' into staging
This pull request has all the fixes for CVE-2016-9602, so that it can
be easily picked up by downstreams, as suggested by Michael Tokarev.

# gpg: Signature made Tue 28 Feb 2017 10:21:32 GMT
# gpg:                using DSA key 0x02FC3AEB0101DBC2
# gpg: Good signature from "Greg Kurz <groug@kaod.org>"
# gpg:                 aka "Greg Kurz <groug@free.fr>"
# gpg:                 aka "Greg Kurz <gkurz@linux.vnet.ibm.com>"
# gpg:                 aka "Gregory Kurz (Groug) <groug@free.fr>"
# gpg:                 aka "[jpeg image of size 3330]"
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg:          There is no indication that the signature belongs to the owner.
# Primary key fingerprint: 2BD4 3B44 535E C0A7 9894  DBA2 02FC 3AEB 0101 DBC2

* remotes/gkurz/tags/cve-2016-9602-for-upstream: (28 commits)
  9pfs: local: drop unused code
  9pfs: local: open2: don't follow symlinks
  9pfs: local: mkdir: don't follow symlinks
  9pfs: local: mknod: don't follow symlinks
  9pfs: local: symlink: don't follow symlinks
  9pfs: local: chown: don't follow symlinks
  9pfs: local: chmod: don't follow symlinks
  9pfs: local: link: don't follow symlinks
  9pfs: local: improve error handling in link op
  9pfs: local: rename: use renameat
  9pfs: local: renameat: don't follow symlinks
  9pfs: local: lstat: don't follow symlinks
  9pfs: local: readlink: don't follow symlinks
  9pfs: local: truncate: don't follow symlinks
  9pfs: local: statfs: don't follow symlinks
  9pfs: local: utimensat: don't follow symlinks
  9pfs: local: remove: don't follow symlinks
  9pfs: local: unlinkat: don't follow symlinks
  9pfs: local: lremovexattr: don't follow symlinks
  9pfs: local: lsetxattr: don't follow symlinks
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2017-03-01 13:53:20 +00:00
Peter Maydell
e3280ffbf5 Merge remote-tracking branch 'remotes/famz/tags/docker-pull-request' into staging
# gpg: Signature made Tue 28 Feb 2017 12:40:00 GMT
# gpg:                using RSA key 0xCA35624C6A9171C6
# gpg: Good signature from "Fam Zheng <famz@redhat.com>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg:          It is not certain that the signature belongs to the owner.
# Primary key fingerprint: 5003 7CB7 9706 0F76 F021  AD56 CA35 624C 6A91 71C6

* remotes/famz/tags/docker-pull-request:
  .shippable: add s390x-cross target
  new: dockerfiles/debian-s390-cross

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2017-03-01 13:06:00 +00:00
Marc-André Lureau
80e1eea37a tests: fix virtio-blk-test leaks
Use qvirtio_pci_device_find_slot() to avoid leaking the non-hotplugged
device. Add assert() to avoid further leaks in the future.

Use qvirtio_pci_device_free() to correctly free QVirtioPCIDevice.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
2017-03-01 11:58:57 +04:00
Marc-André Lureau
c4523aae06 tests: add specialized device_find function
Allow specifying which slot to look for the device in.

This will be used in the following patch to avoid leaking when multiple
devices exist and we want to look up the hotplugged one.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
2017-03-01 11:57:04 +04:00
Marc-André Lureau
62030ed135 tests: fix usb-test leaks
Fix the usb tests leaks.

Spotted by ASAN.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Gerd Hoffmann <kraxel@redhat.com>
2017-03-01 11:51:29 +04:00
Marc-André Lureau
d3510ff9d7 tests: allows to run single test in usb-hcd-ehci-test
pci_init() shouldn't be a test function, but should instead be called before
any test. This allows running a single test with -p /x86_64/ehci/....

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Gerd Hoffmann <kraxel@redhat.com>
2017-03-01 11:51:29 +04:00
Marc-André Lureau
cd7bc87868 usb: release the created buses
Leaks spotted by ASAN.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Gerd Hoffmann <kraxel@redhat.com>
2017-03-01 11:51:29 +04:00
Marc-André Lureau
675f22c6d3 bus: do not unref hotplug handler
Apparently, none of the bus owners gives a reference to the hotplug
handler property, so do not unref it on bus release.

Furthermore, a bus is allowed to be its own hotplug handler, as can
be seen in the qbus_set_bus_hotplug_handler() function. However, in this
case, the reference can't be given to the property, or this would create
a cyclic dependency and the bus would never be freed.

Each bus owner should manage the lifecycle of the hotplug handler.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
2017-03-01 11:51:28 +04:00
Marc-André Lureau
2b880bcdbe tests: fix virtio-9p-test leaks
Spotted by ASAN.

Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Greg Kurz <groug@kaod.org>
2017-03-01 11:51:28 +04:00
Marc-André Lureau
3caab54d08 tests: fix virtio-scsi-test leak
Spotted by ASAN.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
2017-03-01 11:51:28 +04:00
Marc-André Lureau
448fe3c134 tests: fix e1000e leaks
Spotted by ASAN.

This hunk adds an assertion. It checks that we're finding no more than
one e1000e device: each hit allocates, but there is only one g_free().

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
2017-03-01 11:51:28 +04:00
Marc-André Lureau
1bab33ab4a tests: fix i440fx-test leaks
Spotted by ASAN.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
2017-03-01 11:51:28 +04:00
Marc-André Lureau
8829e16f5e tests: fix e1000-test leak
Spotted by ASAN.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
2017-03-01 11:51:28 +04:00
Marc-André Lureau
34779e8c39 tests: fix tco-test leaks
Spotted by ASAN.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
2017-03-01 11:51:25 +04:00
Marc-André Lureau
6afff1ffa3 tests: fix eepro100-test leak
Spotted by ASAN.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Stefan Weil <sw@weilnetz.de>
2017-03-01 11:51:05 +04:00
Igor Mammedov
f0c9d64a68 pc: pcihp: avoid adding ACPI_PCIHP_PROP_BSEL twice
PCI hotplug for bridges was only introduced in 2.0; however,
  acpi_set_bsel()->object_property_add_uint32_ptr(bus, ACPI_PCIHP_PROP_BSEL)
did not take into account that in legacy mode (1.7), where PCI hotplug
for bridges is unavailable, the ACPI_PCIHP_PROP_BSEL property for the
only bus "PCI.0" had already been created earlier at acpi_pcihp_init() time.

We managed to live with it only because the error raised by adding
a duplicate property in acpi_set_bsel() was ignored, which
resulted in needlessly leaking the just-allocated (int)bus_bsel.

The issue affects only the 1.7 machine type: QEMU-generated ACPI tables
were introduced at that time, but PCI hotplug for bridges did not
arrive until the next release (2.0).

Fix it by removing the duplicate ACPI_PCIHP_PROP_BSEL initialization
in acpi_pcihp_init() and doing it in only one place, acpi_set_pci_info().

PS:
do not ignore the error returned by object_property_add_uint32_ptr();
abort QEMU instead, since it is a programming error that should be fixed
rather than ignored.
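
A sketch of the "PS" point (illustrative only, not the actual hunk; the
helper name and exact include set are assumed from QEMU of that era):
the BSEL property is added in one place, and a failure aborts instead of
being silently ignored and leaking bus_bsel:

    #include "qemu/osdep.h"
    #include "qapi/error.h"
    #include "qom/object.h"
    #include "hw/acpi/pcihp.h"

    static void set_bsel_once(Object *bus_obj, unsigned *bsel_alloc)
    {
        uint32_t *bus_bsel = g_malloc(sizeof(*bus_bsel));

        *bus_bsel = (*bsel_alloc)++;
        /* &error_abort turns a duplicate-property programming error
         * into an immediate abort rather than a silent leak. */
        object_property_add_uint32_ptr(bus_obj, ACPI_PCIHP_PROP_BSEL,
                                       bus_bsel, &error_abort);
    }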

Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Reported-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-Id: <1470211497-116801-1-git-send-email-imammedo@redhat.com>
Reviewed-by: Marcel Apfelbaum <marcel@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
[ Marc-André - Remove now unused ACPI_PCIHP_LEGACY_SIZE ]
Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2017-03-01 11:51:05 +04:00
Marc-André Lureau
2607b660e9 tests: fix ipmi-bt-test leak
Spotted by ASAN.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
2017-03-01 11:51:05 +04:00
Marc-André Lureau
3f5f5d04cd tests: fix ipmi-kcs-test leak
Spotted by ASAN.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
2017-03-01 11:51:05 +04:00
Marc-André Lureau
f11dc27bcc tests: fix bios-tables-test leak
The inner array should be freed too.
Spotted by ASAN.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
2017-03-01 11:51:04 +04:00
Marc-André Lureau
2c8f86961b tests: fix hd-geo-test leaks
Spotted by ASAN.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
2017-03-01 11:51:04 +04:00
Marc-André Lureau
f5aa4bdc76 tests: fix ide-test leaks
Spotted by ASAN.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
2017-03-01 11:51:04 +04:00
Marc-André Lureau
0c0eb30260 tests: fix vhost-user-test leaks
Spotted by ASAN.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
2017-03-01 11:51:00 +04:00
Marc-André Lureau
fb6faea888 tests: fix q35-test leaks
Spotted by ASAN.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
2017-03-01 11:50:33 +04:00
Mike Nawrocki
356bb70ed1 Add PowerPC 32-bit guest memory dump support
This patch extends support for the `dump-guest-memory` command to the
32-bit PowerPC architecture. It relies on the assumption that a 64-bit
guest will not dump a 32-bit core file (and vice versa).

[dwg: I suspect this patch won't cover all cases, in particular a
32-bit machine type on a 64-bit qemu build.  However, it does strictly
more than what we had before, so might as well apply as a starting
point]

Signed-off-by: Mike Nawrocki <michael.nawrocki@gtri.gatech.edu>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:53:58 +11:00
Cédric Le Goater
8e4fba203e ppc/xics: rename 'ICPState *' variables to 'icp'
'ICPState *' variables are currently named 'ss'. This is confusing, so
let's give them an appropriate name: 'icp'.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:40 +11:00
Cédric Le Goater
6449da4545 ppc/xics: move InterruptStatsProvider to the sPAPR machine
It provides a better monitor output of the ICP and ICS objects;
otherwise the objects are printed out of order.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:40 +11:00
Cédric Le Goater
a7ff1212e9 ppc/xics: move ics-simple post_load under the machine
The ICS object uses a post_load() handler which implicitly relies
on the fact that the internal state of the ICS and ICP objects has
been restored, but this is not guaranteed. So let's move the code
under the post_load() handler of the machine, where we know the objects
have been fully restored.

The icp_resend() handler of the XICSFabric QOM interface is also
removed as it is now obsolete.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:40 +11:00
Cédric Le Goater
e6f7e110ee ppc/xics: remove the XICSState classes
The XICSState classes are not used anymore. They have now been fully
deprecated by the XICSFabric QOM interface. Do the cleanups.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:40 +11:00
Cédric Le Goater
2192a9303d ppc/xics: export the XICS init routines
There is nothing left related to the XICS object in the realize
functions of the KVMXICSState and XICSState class. So adapt the
interfaces to call these routines directly from the sPAPR machine init
sequence.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:40 +11:00
Cédric Le Goater
852ad27e14 ppc/xics: move the ICP array under the sPAPR machine
This is the last step to remove the XICSState abstraction and have the
machine hold all the objects related to interrupts: ICSs and ICPs.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:40 +11:00
Cédric Le Goater
20147f2fce ppc/xics: register the reset handler of ICP objects
The reset of the ICP objects is currently handled by XICS but this can
be done for each individual ICP.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:40 +11:00
Cédric Le Goater
b0ec31290c ppc/xics: simplify spapr_dt_xics() interface
spapr_dt_xics() only needs the number of servers to build the device
tree nodes. Let's change the routine interface to reflect that.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
Cédric Le Goater
b4f27d71e3 ppc/xics: use the QOM interface to grab an ICP
Also introduce a xics_icp_get() helper to simplify the changes.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
Cédric Le Goater
f023243432 ppc/xics: move the cpu_setup() handler under the ICPState class
The cpu_setup() handler is currently under the XICSState class but it
really belongs under ICPState as it is setting up an individual vCPU.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
Cédric Le Goater
bf50860d1b ppc/xics: simplify the cpu_setup() handler
The cpu_setup() handler currently takes a 'XICSState *' argument to
grab the kernel ICP file descriptor. This interface can be simplified
by using the 'xics' backlink of the ICP object.

This change is also required by subsequent patches which makes use of
the QOM interface for XICS.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
Cédric Le Goater
729f8a4f48 ppc/xics: move kernel_xics_fd out of KVMXICSState
The kernel ICP file descriptor is the only reason behind the
KVMXICSState class and it's in the way of more cleanups. Let's make it
a static for the moment and move forward.

If this is a problem, we could use an attribute under the sPAPR machine
later on.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
Cédric Le Goater
b2fc59aaf9 ppc/xics: extend the QOM interface to handle ICPs
Let's add two new handlers for ICPs. One is to get an ICP object from
a server number and a second is to resend the irqs when needed.

The icp_resend() handler is a temporary workaround needed by the
ics-simple post_load() handler. It will be removed when the post_load
portion can be done at the machine level.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
Cédric Le Goater
d114a66225 ppc/xics: remove the XICS list of ICS
This is not used anymore.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
Cédric Le Goater
c79b2fdd7b ppc/xics: register the reset handler of ICS objects
The reset of the ICS objects is currently handled by XICS but this can
be done for each individual ICS. This also reduces the use of the XICS
list of ICS.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
Cédric Le Goater
be1fe35199 ppc/xics: remove xics_find_source()
It is not used anymore now that we have the QOM interface for XICS.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
Cédric Le Goater
2cd908d0ad ppc/xics: use the QOM interface to resend irqs
Also change the ICPState 'xics' backlink to be a XICSFabric; this
removes the need to use qdev_get_machine() to get the QOM interface
in some of the routines.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
Cédric Le Goater
f7759e4331 ppc/xics: use the QOM interface to get irqs
Signed-off-by: Cédric Le Goater <clg@kaod.org>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
Cédric Le Goater
7844e12b28 ppc/xics: use the QOM interface under the sPAPR machine
Add 'ics_get' and 'ics_resend' handlers to the sPAPR machine. These
are relatively simple for a single ICS.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
Cédric Le Goater
51b180051e ppc/xics: introduce a XICSFabric QOM interface to handle ICSs
This interface provides two simple handlers. One is to get an ICS
(Interrupt Source Controller) object from an irq number and a second
to resend the irqs when needed.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
Cédric Le Goater
b9038e7806 ppc/xics: add an InterruptStatsProvider interface to ICS and ICP objects
This is, again, to reduce the use of the list of ICS objects. Let's
make each individual ICS and ICP object an InterruptStatsProvider and
remove this same interface from XICSState.

The InterruptStatsProvider will be moved at the machine level after
the XICS cleanups are completed.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
Cédric Le Goater
681bfaded6 ppc/xics: store the ICS object under the sPAPR machine
A list of ICS objects was introduced under the XICS object for the
PowerNV machine but, for the sPAPR machine, it brings extra complexity
as there is only a single ICS. To simplify the code, let's add the ICS
pointer under the sPAPR machine and try to reduce the use of this list
where possible.

Also, change the xics_spapr_*() routines to use an ICS object instead
of an XICSState and change their name to reflect that these are
specific to the sPAPR ICS object.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
Cédric Le Goater
817bb6a446 ppc/xics: remove set_nr_servers() handler from XICSStateClass
Today, the ICP (Interrupt Controller Presenter) objects are created by
the 'nr_servers' property handler of the XICS object and a class
handler. They are realized in the XICS object realize routine.

Let's simplify the process by creating the ICP objects along with the
XICS object at the machine level.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
Cédric Le Goater
4e4169f7a2 ppc/xics: remove set_nr_irqs() handler from XICSStateClass
Today, the ICS (Interrupt Controller Source) object is created and
realized by the init and realize routines of the XICS object, but some
of the parameters are only known at the machine level.

These parameters are passed from the sPAPR machine to the ICS object
in a rather convoluted way using property handlers and a class handler
of the XICS object. The number of irqs required to allocate the IRQ
state objects in the ICS realize routine is one of them.

Let's simplify the process by creating the ICS object along with the
XICS object at the machine level and link the ICS into the XICS list
of ICSs at this level also. In the sPAPR machine, there is only a
single ICS but that will change with the PowerNV machine.

Also, QOMify the creation of the objects and get rid of the
superfluous code.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
David Gibson
738d5db824 xics: XICS should not be a SysBusDevice
Currently xics - the component of the IBM POWER interrupt controller
representing the overall interrupt fabric / architecture - is
represented as a descendant of SysBusDevice.  However, this is not
really correct: the xics presents nothing in MMIO space, so it should
be an "unattached" device in the current QOM model.

Since this device will always be created by the machine type, not created
specifically from the command line, and because it has no migrated state,
it should be safe to move it around the device composition tree.

Therefore this patch changes it to a descendant of TYPE_DEVICE, and
makes it an unattached device.  So that its reset handler still gets
called correctly, we add a qdev_set_parent_bus() to attach it to
sysbus.  It's not really clear that this is correct (instead of using
register_reset()), but it appears to be a common technique.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[clg corrected problems with reset]
Signed-off-by: Cédric Le Goater <clg@kaod.org>
[dwg folded together and updated commit message]
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
Greg Kurz
a8eeafda19 spapr/pci: populate PCI DT in reverse order
Since commit 1d2d974244 "spapr_pci: enumerate and add PCI device tree", QEMU
populates the PCI device tree in the opposite order compared to SLOF.

Before 1d2d974244:

Populating /pci@800000020000000
                     00 0000 (D) : 1af4 1000    virtio [ net ]
                     00 0800 (D) : 1af4 1001    virtio [ block ]
                     00 1000 (D) : 1af4 1009    virtio [ network ]
Populating /pci@800000020000000/unknown-legacy-device@2

7e5294b8 :  /pci@800000020000000
7e52b998 :  |-- ethernet@0
7e52c0c8 :  |-- scsi@1
7e52c7e8 :  +-- unknown-legacy-device@2 ok

Since 1d2d974244:

Populating /pci@800000020000000
                     00 1000 (D) : 1af4 1009    virtio [ network ]
Populating /pci@800000020000000/unknown-legacy-device@2
                     00 0800 (D) : 1af4 1001    virtio [ block ]
                     00 0000 (D) : 1af4 1000    virtio [ net ]

7e5e8118 :  /pci@800000020000000
7e5ea6a0 :  |-- unknown-legacy-device@2
7e5eadb8 :  |-- scsi@1
7e5eb4d8 :  +-- ethernet@0 ok

This behaviour change is not actually a bug, since no assumptions should be
made about DT ordering. But it has no real justification either, other than
being a consequence of the way fdt_add_subnode() inserts new elements
at the front of the FDT rather than appending them at the tail.

This patch reverts to the historical SLOF ordering by walking PCI devices
in reverse order. This reconciles pseries with the behavior of the x86
machine types. It is expected to make things easier when porting existing
applications to power.
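
A minimal sketch of the approach (the helper name is hypothetical and
this is not the actual spapr_pci.c hunk): since fdt_add_subnode()
prepends children, creating the nodes in descending devfn order leaves
the flattened tree in ascending order, as SLOF produced:

    int devfn;

    for (devfn = PCI_SLOT_MAX * PCI_FUNC_MAX - 1; devfn >= 0; devfn--) {
        PCIDevice *dev = bus->devices[devfn];

        if (!dev) {
            continue;
        }
        /* hypothetical helper wrapping fdt_add_subnode() + properties */
        spapr_add_pci_device_node(fdt, bus_offset, dev);
    }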

Signed-off-by: Greg Kurz <gkurz@linux.vnet.ibm.com>
Tested-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
(slight update to the changelog)
Signed-off-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
Nikunj A Dadhania
b63d043418 target/ppc: add mcrxrx instruction
mcrxrx: Move to CR from XER Extended

Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
Nikunj A Dadhania
c44027ffb9 target/ppc: add ov32 flag in divide operations
Add helper_div_compute_ov() in the int_helper for updating the overflow
flags.

For Divide Word:
SO, OV, and OV32 bits reflect overflow of the 32-bit result

For Divide DoubleWord:
SO, OV, and OV32 bits reflect overflow of the 64-bit result

Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
Nikunj A Dadhania
61aa9a697a target/ppc: add ov32 flag for multiply low insns
For Multiply Word:
SO, OV, and OV32 bits reflect overflow of the 32-bit result

For Multiply DoubleWord:
SO, OV, and OV32 bits reflect overflow of the 64-bit result

Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
Nikunj A Dadhania
1480d71cbe target/ppc: use tcg ops for neg instruction
Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
Nikunj A Dadhania
dc0ad84449 target/ppc: update overflow flags for add/sub
* SO and OV reflect overflow of the 64-bit result in 64-bit mode and
  overflow of the low-order 32-bit result in 32-bit mode

* OV32 reflects overflow of the low-order 32-bit result independent of the mode
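
A self-contained sketch of those rules for addition, in plain C (not the
TCG implementation; the helper names are ours):

    #include <stdint.h>
    #include <stdbool.h>

    /* Signed overflow of r = a + b at full 64-bit width (used for OV
     * in 64-bit mode): operands agree in sign but the result differs. */
    static bool add_ovf64(uint64_t a, uint64_t b, uint64_t r)
    {
        return ((a ^ r) & (b ^ r)) >> 63;
    }

    /* Signed overflow of the low-order 32-bit result (used for OV32
     * always, and for OV in 32-bit mode). */
    static bool add_ovf32(uint64_t a, uint64_t b, uint64_t r)
    {
        return (((a ^ r) & (b ^ r)) >> 31) & 1;
    }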

Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
Nikunj A Dadhania
33903d0aa4 target/ppc: update ca32 in arithmetic substract
Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
Nikunj A Dadhania
6b10d008a0 target/ppc: update ca32 in arithmetic add
Adds a routine to compute ca32: gen_op_arith_compute_ca32.

In 64-bit mode, the new compute-ca32 routine is used, while in 32-bit
mode CA and CA32 will have the same value.
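
For illustration, a self-contained computation of the 32-bit carry-out
(plain C, not the gen_op_arith_compute_ca32 TCG code):

    #include <stdint.h>

    /* Carry out of bit 32 of an addition with carry-in; this is what
     * CA32 reports regardless of the mode. */
    static unsigned add_ca32(uint64_t a, uint64_t b, unsigned carry_in)
    {
        uint64_t low = (a & 0xffffffffULL) + (b & 0xffffffffULL) + carry_in;

        return (unsigned)(low >> 32);   /* 0 or 1 */
    }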

Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
Nikunj A Dadhania
dd09c36159 target/ppc: support for 32-bit carry and overflow
POWER ISA 3.0 adds CA32 and OV32 status in 64-bit mode. Add the flags
and corresponding defines.

Moreover, CA32 is updated when CA is updated and OV32 is updated when OV
is updated.

Arithmetic instructions:
    * Additions and Subtractions:

        addic, addic., subfic, addc, subfc, adde, subfe, addme, subfme,
        addze, and subfze always update CA and CA32.

        => CA reflects the carry out of bit 0 in 64-bit mode and out of
           bit 32 in 32-bit mode.
        => CA32 reflects the carry out of bit 32 independent of the
           mode.

        => SO and OV reflect overflow of the 64-bit result in 64-bit
           mode and overflow of the low-order 32-bit result in 32-bit
           mode
        => OV32 reflects overflow of the low-order 32-bit independent of
           the mode

    * Multiply Low and Divide:

        For mulld, divd, divde, divdu and divdeu: SO, OV, and OV32 bits
        reflect overflow of the 64-bit result

        For mullw, divw, divwe, divwu and divweu: SO, OV, and OV32 bits
        reflect overflow of the 32-bit result

     * Negate with OE=1 (nego)

       For 64-bit mode if the register RA contains
       0x8000_0000_0000_0000, OV and OV32 are set to 1.

       For 32-bit mode if the register RA contains 0x8000_0000, OV and
       OV32 are set to 1.

Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
David Gibson
e78308fd39 target/ppc: Correct SDR1 masking
SDR_64_HTABORG, which indicates the bits of the SDR1 register to use for
the base of a 64-bit machine's hashed page table (HPT), isn't correct.  It
includes the top 46 bits of the register, but in fact the top 4 bits must
be zero (according to the ISA v2.07).  No actual implementation has
supported close to 2^60 bytes of physical address space, so it's kind of
irrelevant, but we might as well correct this.
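
For illustration, the mask values implied by that description (inferred
from the text above, not quoted from the patch):

    /* Top 46 bits of the 64-bit SDR1 register (bits 63..18). */
    #define HTABORG_TOP46_BITS   0xFFFFFFFFFFFC0000ULL
    /* The same field with the top 4 bits (63..60) forced to zero,
     * i.e. the corrected HTABORG mask described above. */
    #define HTABORG_CORRECTED    0x0FFFFFFFFFFC0000ULL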

In addition, although we checked for bad size values in SDR1, we never
reported an error if entirely invalid bits were set there.  Add this check
to ppc_store_sdr1().

Reported-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
Suraj Jitindar Singh
8d63351f9f target/ppc: Remove the function ppc_hash64_set_sdr1()
The function ppc_hash64_set_sdr1 basically checked the htabsize and set an
error if it was too big; otherwise it just stored the value in SPR_SDR1.

Given that the only function which calls ppc_hash64_set_sdr1() is
ppc_store_sdr1(), why not handle the checking in ppc_store_sdr1() to avoid
the extra function call. Note that ppc_store_sdr1() already stores the
value in SPR_SDR1 anyway, so we were doing it twice.

Signed-off-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
[dwg: Remove unnecessary error temporary]
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
David Gibson
e57ca75ce3 target/ppc: Manage external HPT via virtual hypervisor
The pseries machine type implements the behaviour of a PAPR compliant
hypervisor, without actually executing such a hypervisor on the virtual
CPU.  To do this we need some hooks in the CPU code to make hypervisor
facilities get redirected to the machine instead of emulated internally.

For hypercalls this is managed through the cpu->vhyp field, which points
to a QOM interface with a method implementing the hypercall.

For the hashed page table (HPT) - also a hypervisor resource - we use an
older hack.  CPUPPCState has an 'external_htab' field which when non-NULL
indicates that the HPT is stored in qemu memory, rather than within the
guest's address space.

For consistency - and to make some future extensions easier - this merges
the external HPT mechanism into the vhyp mechanism.  Methods are added
to vhyp for the basic operations the core hash MMU code needs: map_hptes()
and unmap_hptes() for reading the HPT, store_hpte() for updating it and
hpt_mask() to retrieve its size.
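
A rough sketch of the shape such an interface might take (only the method
names come from the paragraph above; the struct name, types and signatures
here are illustrative, not QEMU's actual definitions):

    #include <stdint.h>

    typedef uint64_t hwaddr_t;   /* stand-in for QEMU's hwaddr */

    typedef struct VhypHptOps {
        /* Map a run of n HPTEs starting at index ptex for reading. */
        const uint64_t *(*map_hptes)(void *vhyp, hwaddr_t ptex, int n);
        void (*unmap_hptes)(void *vhyp, const uint64_t *hptes,
                            hwaddr_t ptex, int n);
        /* Update a single HPTE (two 64-bit words). */
        void (*store_hpte)(void *vhyp, hwaddr_t ptex,
                           uint64_t pte0, uint64_t pte1);
        /* Size of the HPT expressed as an index mask. */
        hwaddr_t (*hpt_mask)(void *vhyp);
    } VhypHptOps;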

To match this, the pseries machine now sets these vhyp fields in its
existing vhyp class, rather than reaching into the cpu object to set the
external_htab field.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
2017-03-01 11:23:39 +11:00
David Gibson
36778660d7 target/ppc: Eliminate htab_base and htab_mask variables
CPUPPCState includes fields htab_base and htab_mask which store the base
address (GPA) and size (as a mask) of the guest's hashed page table (HPT).
These are set when the SDR1 register is updated.

Keeping these in sync with the SDR1 is actually a little bit fiddly, and
probably not useful for performance, since keeping them expands the size of
CPUPPCState.  It also makes some upcoming changes harder to implement.

This patch removes these fields, in favour of calculating them directly
from the SDR1 contents when necessary.

This does make a change to the behaviour of attempting to write a bad value
(invalid HPT size) to the SDR1 with an mtspr instruction.  Previously, the
bad value would be stored in SDR1 and could be retrieved with a later
mfspr, but the HPT size as used by the softmmu would be clamped to the
allowed values.  Now, writing a bad value is treated as a no-op.  An error
message is printed in both new and old versions.

I'm not sure which behaviour, if either, matches real hardware.  I don't
think it matters that much, since it's pretty clear that if an OS writes
a bad value to SDR1, it's not going to boot.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
2017-03-01 11:23:39 +11:00
David Gibson
7222b94a83 target/ppc: Cleanup HPTE accessors for 64-bit hash MMU
Accesses to the hashed page table (HPT) are complicated by the fact that
the HPT could be in one of three places:
   1) Within guest memory - when we're emulating a full guest CPU at the
      hardware level (e.g. powernv, mac99, g3beige)
   2) Within qemu, but outside guest memory - when we're emulating user and
      supervisor instructions within TCG, but instead of emulating
      the CPU's hypervisor mode, we just emulate a hypervisor's behaviour
      (pseries in TCG or KVM-PR)
   3) Within the host kernel - a pseries machine using KVM-HV
      acceleration.  Mostly accesses to the HPT are handled by KVM,
      but there are a few cases where qemu needs to access it via a
      special fd for the purpose.

In order to batch accesses to the fd in case (3), we use a somewhat awkward
ppc_hash64_start_access() / ppc_hash64_stop_access() pair, which for case
(3) reads / releases several HPTEs from the kernel as a batch (usually a
whole PTEG).  For cases (1) & (2) it just returns an address value.  The
actual HPTE load helpers then need to interpret the returned token
differently in the 3 cases.

This patch keeps the same basic structure, but simplifies the details.
First start_access() / stop_access() are renamed to map_hptes() and
unmap_hptes() to make their operation more obvious.  Second, map_hptes()
now always returns a qemu pointer, which can always be used in the same way
by the load_hpte() helpers.  In case (1) it comes from address_space_map()
in case (2) directly from qemu's HPT buffer and in case (3) from a
temporary buffer read from the KVM fd.

While we're at it, make things a bit more consistent in terms of types and
variable names: avoid variables named 'index' (it shadows index(3) which
can lead to confusing results), use 'hwaddr ptex' for HPTE indices and
uint64_t for each of the HPTE words, use ptex throughout the call stack
instead of pte_offset in some places (we still need that at the bottom
layer, but nowhere else).

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
David Gibson
7d6250e3d1 target/ppc: SDR1 is a hypervisor resource
At present the SDR1 register - the base of the system's hashed page table
(HPT) - is represented as an SPR with supervisor read and write permission.
However, on CPUs which have a hypervisor mode, the SDR1 is a hypervisor
only resource.  Change the permission checking on the SPR to reflect this.

Now that this is done, we don't need to check for an external HPT executing
mtsdr1: an external HPT only applies when we're emulating the behaviour of
a hypervisor, rather than modelling the CPU's hypervisor mode internally,
so if we're permitted to execute mtsdr1, we don't have an external HPT.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
2017-03-01 11:23:39 +11:00
David Gibson
b7b0b1f13a target/ppc: Merge cpu_ppc_set_vhyp() with cpu_ppc_set_papr()
cpu_ppc_set_papr() sets up various aspects of CPU state for use with PAPR
paravirtualized guests.  However, it doesn't set the virtual hypervisor,
so callers must also call cpu_ppc_set_vhyp() so that PAPR hypercalls are
handled properly.  This is a bit silly, so fold setting the virtual
hypervisor into cpu_ppc_set_papr().

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
2017-03-01 11:23:39 +11:00
David Gibson
c6404adebf pseries: Minor cleanups to HPT management hypercalls
* Standardize on 'ptex' instead of 'pte_index' for HPTE index variables
   for consistency and brevity
 * Avoid variables named 'index'; shadowing index(3) from libc can lead to
   surprising bugs if the variable is removed, because compiler errors
   might not appear for remaining references
 * Clarify index calculations in h_enter() - we have two cases, H_EXACT
   where the exact HPTE slot is given, and !H_EXACT where we search for
   an empty slot within the hash bucket.  Make the calculation more
   consistent between the cases.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
2017-03-01 11:23:39 +11:00
David Gibson
1ad9f0a464 target/ppc: Fix KVM-HV HPTE accessors
When a 'pseries' guest is running with KVM-HV, the guest's hashed page
table (HPT) is stored within the host kernel, so it is not directly
accessible to qemu.  Most of the time, qemu doesn't need to access it:
we're using the hardware MMU, and KVM itself implements the guest
hypercalls for manipulating the HPT.

However, qemu does need access to the in-KVM HPT to implement
get_phys_page_debug() for the benefit of the gdbstub, and maybe for
other debug operations.

To allow this, 7c43bca "target-ppc: Fix page table lookup with kvm
enabled" added kvmppc_hash64_read_pteg() to target/ppc/kvm.c to read
in a batch of HPTEs from the KVM table.  Unfortunately, there are a
couple of problems with this:

First, the name of the function implies it always reads a whole PTEG
from the HPT, but in fact in some cases it's used to grab individual
HPTEs (which ends up pulling 8 HPTEs, not aligned to a PTEG from the
kernel).

Second, and more importantly, the code to read the HPTEs from KVM is
simply wrong, in general.  The data from the fd that KVM provides is
designed mostly for compact migration rather than this sort of one-off
access, and so needs some decoding for this purpose.  The current code
will work in some cases, but if there are invalid HPTEs then it will
not get sane results.

This patch rewrites the HPTE reading function to have a simpler
interface (just read n HPTEs into a caller provided buffer), and to
correctly decode the stream from the kernel.

For consistency we also clean up the similar function for altering
HPTEs within KVM (introduced in c138593 "target-ppc: Update
ppc_hash64_store_hpte to support updating in-kernel htab").

Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
Greg Kurz
6244bb7e58 sysemu: support up to 1024 vCPUs
Some systems can already provide more than 255 hardware threads.

Bumping the QEMU limit to 1024 seems reasonable:
- it has no visible overhead in top;
- the limit itself has no effect on hot paths.

Cc: Greg Kurz <gkurz@linux.vnet.ibm.com>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
Nikunj A Dadhania
f32899de97 target/ppc: introduce helper_update_ov_legacy
Removes duplicate code and will be useful for consolidating flags.

Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
Laurent Vivier
2530a1a5cf spapr: generate DT node names
When DT node names for PCI devices are generated by SLOF,
they are generated according to the type of the device
(for instance, ethernet for a virtio-net-pci device).

Node names for hotplugged devices are generated by QEMU.
This patch adds the mechanism for QEMU to create the node
name according to the device type too.

The data structure has been roughly copied from OpenBIOS/OpenHackware;
the node names come from SLOF.

Example:

Hotplugging some PCI cards with QEMU monitor:

device_add virtio-tablet-pci
device_add virtio-serial-pci
device_add virtio-mouse-pci
device_add virtio-scsi-pci
device_add virtio-gpu-pci
device_add ne2k_pci
device_add nec-usb-xhci
device_add intel-hda

What we can see in linux device tree:

for dir in /proc/device-tree/pci@800000020000000/*@*/; do
    echo $dir
    cat $dir/name
    echo
done

WITHOUT this patch:

/proc/device-tree/pci@800000020000000/pci@0/
pci
/proc/device-tree/pci@800000020000000/pci@1/
pci
/proc/device-tree/pci@800000020000000/pci@2/
pci
/proc/device-tree/pci@800000020000000/pci@3/
pci
/proc/device-tree/pci@800000020000000/pci@4/
pci
/proc/device-tree/pci@800000020000000/pci@5/
pci
/proc/device-tree/pci@800000020000000/pci@6/
pci
/proc/device-tree/pci@800000020000000/pci@7/
pci

WITH this patch:

/proc/device-tree/pci@800000020000000/communication-controller@1/
communication-controller
/proc/device-tree/pci@800000020000000/display@4/
display
/proc/device-tree/pci@800000020000000/ethernet@5/
ethernet
/proc/device-tree/pci@800000020000000/input-controller@0/
input-controller
/proc/device-tree/pci@800000020000000/mouse@2/
mouse
/proc/device-tree/pci@800000020000000/multimedia-device@7/
multimedia-device
/proc/device-tree/pci@800000020000000/scsi@3/
scsi
/proc/device-tree/pci@800000020000000/usb-xhci@6/
usb-xhci

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:39 +11:00
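As a rough illustration of the approach described above, the idea is a simple
lookup from PCI class ID to the node name SLOF would have generated, falling
back to the generic "pci" name.  The sketch below is illustrative only: the
entries and helper names are assumptions, not the exact table this patch adds.

    /* Sketch only: map a PCI class ID to a device tree node name. */
    typedef struct {
        uint16_t class_id;
        const char *name;
    } PciNodeName;

    static const PciNodeName pci_node_names[] = {
        { 0x0100, "scsi" },      /* PCI_CLASS_STORAGE_SCSI */
        { 0x0200, "ethernet" },  /* PCI_CLASS_NETWORK_ETHERNET */
        { 0x0380, "display" },   /* PCI_CLASS_DISPLAY_OTHER */
    };

    static const char *pci_node_name(uint16_t class_id)
    {
        for (size_t i = 0;
             i < sizeof(pci_node_names) / sizeof(pci_node_names[0]); i++) {
            if (pci_node_names[i].class_id == class_id) {
                return pci_node_names[i].name;
            }
        }
        return "pci"; /* generic fallback, as in the "WITHOUT" output above */
    }
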
Laurent Vivier
089f7e827d PCI: add missing classes in pci_ids.h to build device tree
To allow QEMU to add PCI entries to the device tree,
we need a more exhaustive list of PCI class IDs.

This patch synchronizes as much as possible with
pci_ids.h and adds some missing IDs from SLOF.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:38 +11:00
Nikunj A Dadhania
1bd33d0d7c target/ppc: optimize gen_write_xer()
Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:38 +11:00
Nikunj A Dadhania
00b7078831 target/ppc: move cpu_[read, write]_xer to cpu.c
Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2017-03-01 11:23:38 +11:00
Mark Cave-Ayland
9b40d1ee13 Update OpenBIOS images to 0cd97cc built from submodule.
Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
2017-02-28 21:50:03 +00:00
Pranith Kumar
dc1eccd661 aarch64: Change ext type to TCGType to fix warnings
To fix the following warnings:

In file included from /users/pranith/qemu/tcg/tcg.c:255:
/users/pranith/qemu/tcg/aarch64/tcg-target.inc.c:879:24: warning: implicit conversion from enumeration type 'TCGMemOp' (aka 'enum TCGMemOp') to different enumeration type 'TCGType' (aka 'enum TCGType')
      [-Wenum-conversion]
        tcg_out_cmp(s, ext, a, b, b_const);
        ~~~~~~~~~~~    ^~~
/users/pranith/qemu/tcg/aarch64/tcg-target.inc.c:893:36: warning: implicit conversion from enumeration type 'TCGMemOp' (aka 'enum TCGMemOp') to different enumeration type 'TCGType' (aka 'enum TCGType')
      [-Wenum-conversion]
        tcg_out_insn(s, 3201, CBZ, ext, a, offset);
        ~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~
/users/pranith/qemu/tcg/aarch64/tcg-target.inc.c:389:65: note: expanded from macro 'tcg_out_insn'
    glue(tcg_out_insn_,FMT)(S, glue(glue(glue(I,FMT),_),OP), ## __VA_ARGS__)
                                                                ^
/users/pranith/qemu/tcg/aarch64/tcg-target.inc.c:895:37: warning: implicit conversion from enumeration type 'TCGMemOp' (aka 'enum TCGMemOp') to different enumeration type 'TCGType' (aka 'enum TCGType')
      [-Wenum-conversion]
        tcg_out_insn(s, 3201, CBNZ, ext, a, offset);
        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~
/users/pranith/qemu/tcg/aarch64/tcg-target.inc.c:389:65: note: expanded from macro 'tcg_out_insn'
    glue(tcg_out_insn_,FMT)(S, glue(glue(glue(I,FMT),_),OP), ## __VA_ARGS__)
                                                                ^
/users/pranith/qemu/tcg/aarch64/tcg-target.inc.c:1610:27: warning: implicit conversion from enumeration type 'TCGType' (aka 'enum TCGType') to different enumeration type 'TCGMemOp' (aka 'enum TCGMemOp')
      [-Wenum-conversion]
        tcg_out_brcond(s, ext, a2, a0, a1, const_args[1], arg_label(args[3]));
        ~~~~~~~~~~~~~~    ^~~

Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>
Message-Id: <20170217154311.13920-1-bobby.prani@gmail.com>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2017-03-01 08:28:06 +11:00
Marc-André Lureau
f3f8e81150 tests: fix endianness-test leaks
Spotted by ASAN.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
2017-03-01 00:09:28 +04:00
Marc-André Lureau
072bdb07c5 tests: fix ptimer leaks
Spotted by ASAN.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
2017-03-01 00:09:28 +04:00
Marc-André Lureau
461a862022 glib-compat: add g_test_add_data_func_full fallback
Move the fallback from qtest_add_data_func_full() to glib-compat.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
2017-03-01 00:09:28 +04:00
Marc-André Lureau
e703dcbaee timer: use an inline function for free
Similarly to allocation, do it from an inline function. This allows
tests to use only the headers for allocating and freeing timers.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
2017-03-01 00:09:28 +04:00
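A minimal sketch of the pattern, assuming (as the message implies) that freeing
a timer is just a g_free() of the QEMUTimer, mirroring the inline allocation
side:

    /* Sketch: freeing moves into the header as a static inline, so tests can
     * allocate and free timers using only the headers. */
    static inline void timer_free(QEMUTimer *ts)
    {
        g_free(ts);
    }
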
Marc-André Lureau
14324f585d tests: fix leaks in test-io-channel-command
No need for strdup; also fix leaks when socat is missing.

Spotted by ASAN.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: "Daniel P. Berrange" <berrange@redhat.com>
Reviewed-by: Greg Kurz <groug@kaod.org>
2017-03-01 00:09:28 +04:00
Marc-André Lureau
dc491fead0 tests: fix qmp response leak
Spotted by ASAN.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-03-01 00:09:28 +04:00
Marc-André Lureau
fc34059f08 qtest: fix a memory leak
Spotted by ASAN.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
2017-03-01 00:09:28 +04:00
Peter Maydell
758af5e862 Merge remote-tracking branch 'remotes/cohuck/tags/s390x-20170228' into staging
Network boot for s390x. More information (and instructions
for building an s390-netboot.img) can be found at
http://wiki.qemu-project.org/Features/S390xNetworkBoot

# gpg: Signature made Tue 28 Feb 2017 11:27:18 GMT
# gpg:                using RSA key 0xDECF6B93C6F02FAF
# gpg: Good signature from "Cornelia Huck <huckc@linux.vnet.ibm.com>"
# gpg:                 aka "Cornelia Huck <cornelia.huck@de.ibm.com>"
# Primary key fingerprint: C3D0 D66D C362 4FF6 A8C0  18CE DECF 6B93 C6F0 2FAF

* remotes/cohuck/tags/s390x-20170228:
  pc-bios/s390-ccw.img: rebuild image
  pc-bios/s390-ccw: Use the ccw bios to start the network boot
  s390x/ipl: Load network boot image
  s390x/ipl: Extend S390IPLState to support network boot
  elf-loader: Allow late loading of elf

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2017-02-28 19:52:26 +00:00
Kevin Wolf
b2c2832c61 block: Add Error parameter to bdrv_append()
Aborting on error in bdrv_append() isn't correct. This patch fixes it
and lets the callers handle failures.

Test case 085 needs a reference output update. This is caused by the
reversed order of bdrv_set_backing_hd() and change_parent_backing_link()
in bdrv_append(): When the backing file of the new node is set, the
parent nodes are still pointing to the old top, so the backing blocker
is now initialised with the node name rather than the BlockBackend name.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
2017-02-28 20:47:51 +01:00
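A sketch of the caller-side pattern this change enables; the error handling
shown is the usual QEMU Error idiom, and 'bs_new', 'bs_top' and 'errp' are
placeholders rather than a specific call site:

    Error *local_err = NULL;

    bdrv_append(bs_new, bs_top, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);   /* hand the failure upwards */
        return;
    }
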
Kevin Wolf
12fa4af61f block: Add Error parameter to bdrv_set_backing_hd()
Not all callers of bdrv_set_backing_hd() know for sure that attaching
the backing file will be allowed by the permission system. Return the
error from the function rather than aborting.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
2017-02-28 20:47:51 +01:00
Kevin Wolf
c8f6d58edb block: Assertions for resize permission
This adds an assertion that ensures that the necessary resize permission
has been granted before bdrv_truncate() is called.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
2017-02-28 20:47:50 +01:00
Kevin Wolf
afa4b29323 block: Assertions for write permissions
This adds assertions that ensure that the necessary write permissions
have been granted before someone attempts to write to a node.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
2017-02-28 20:47:50 +01:00
Kevin Wolf
85c97ca7a1 block: Pass BdrvChild to bdrv_aligned_preadv/pwritev and copy-on-read
This is where we want to check the permissions, so we need to have the
BdrvChild around where they are stored.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
2017-02-28 20:47:50 +01:00
Kevin Wolf
2807c0cd43 tests: Remove FIXME comments
Not requesting any permissions is actually correct for these test cases
because no actual I/O or other operation covered by the permission
system is performed.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
2017-02-28 20:47:50 +01:00
Kevin Wolf
8a7ce4f933 nbd/server: Use real permissions for NBD exports
NBD can't cope with device size changes, so resize must be forbidden,
but otherwise we can tolerate anything. Depending on whether the export
is writable or not, we only require consistent reads and writes.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
2017-02-28 20:47:50 +01:00
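A sketch of the permission choice described above, assuming the BLK_PERM_*
constants from this series; 'writable' and 'blk' are placeholders:

    /* Always need consistent reads; need writes only for writable exports.
     * Share everything except resize, which NBD cannot cope with. */
    uint64_t perm = BLK_PERM_CONSISTENT_READ;

    if (writable) {
        perm |= BLK_PERM_WRITE;
    }
    blk = blk_new(perm, BLK_PERM_ALL & ~BLK_PERM_RESIZE);
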
Kevin Wolf
6f5ef23a3f migration/block: Use real permissions
Request BLK_PERM_CONSISTENT_READ for the source of block migration, and
handle potential permission errors as well as we can in this place
(which is not very good, but it matches the other failure cases).

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
2017-02-28 20:47:50 +01:00
Kevin Wolf
887354bd13 hmp: Request permissions in qemu-io
The HMP command 'qemu-io' is a bit tricky because it wants to work on
the original BlockBackend, but additional permissions could be required.
The details are explained in a comment in the code, but in summary, just
request whatever permissions the current qemu-io command needs.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
2017-02-28 20:47:50 +01:00
Kevin Wolf
0db832f42e commit: Add filter-node-name to block-commit
Management tools need to be able to know about every node in the graph
and need a way to address them. Changing the graph structure was okay
because libvirt doesn't really manage the node level yet, but future
libvirt versions need to deal with both new and old versions of qemu.

This new option to block-commit allows the client to set a node-name
for the automatically inserted filter driver, and at the same time
serves as a witness for a future libvirt that this version of qemu does
automatically insert a filter driver.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
2017-02-28 20:47:50 +01:00
Kevin Wolf
6cdbceb12c mirror: Add filter-node-name to blockdev-mirror
Management tools need to be able to know about every node in the graph
and need a way to address them. Changing the graph structure was okay
because libvirt doesn't really manage the node level yet, but future
libvirt versions need to deal with both new and old versions of qemu.

This new option to blockdev-mirror allows the client to set a node-name
for the automatically inserted filter driver, and at the same time
serves as a witness for a future libvirt that this version of qemu does
automatically insert a filter driver.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
2017-02-28 20:47:50 +01:00
Kevin Wolf
a170a91fd3 stream: Use real permissions in streaming block job
The correct permissions are relatively obvious here (and explained in
code comments). For intermediate streaming, we need to reopen the top
node read-write before creating the job now because the permissions
system catches attempts to get the BLK_PERM_WRITE_UNCHANGED permission
on a read-only node.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
2017-02-28 20:47:50 +01:00
Kevin Wolf
4ef85a9c23 mirror: Use real permissions in mirror/active commit block job
The mirror block job is mainly used for two different scenarios:
Mirroring to an otherwise unused, independent target node, or for active
commit where the target node is part of the backing chain of the source.

Similarly to the commit block job patch, we need to insert a new filter
node to keep the permissions correct during active commit.

Note that one change this implies is that job->blk points to
mirror_top_bs as its root now, and mirror_top_bs (rather than the actual
source node) contains the bs->job pointer. This requires qemu-img commit
to get the job by name now rather than just taking bs->job.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
Acked-by: Max Reitz <mreitz@redhat.com>
2017-02-28 20:47:46 +01:00
Kevin Wolf
bbc02b90bc blockjob: Factor out block_job_remove_all_bdrv()
In some cases, we want to remove op blockers on intermediate nodes
before the whole block job transaction has completed (because they block
restoring the final graph state during completion). Provide a function
for this.

The whole block job lifecycle is a bit messed up and it's hard to
actually do all things in the right order, but I'll leave simplifying
this for another day.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
2017-02-28 20:40:37 +01:00
Kevin Wolf
3e44c8e08a block: Allow backing file links in change_parent_backing_link()
Now that the backing file child role implements .attach/.detach
callbacks, nothing prevents us from modifying the graph even if that
involves changing backing file links.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
2017-02-28 20:40:37 +01:00
Kevin Wolf
db95dbba3b block: BdrvChildRole.attach/detach() callbacks
Backing files are somewhat special compared to other kinds of children
because they are attached and detached using bdrv_set_backing_hd()
rather than the normal set of functions, which does a few more things
like setting backing blockers, toggling the BDRV_O_NO_BACKING flag,
setting parent_bs->backing_file, etc.

These special features are a reason why change_parent_backing_link()
can't handle backing files yet. With abstracting the additional features
into .attach/.detach callbacks, we get a step closer to a function that
can actually deal with this.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
2017-02-28 20:40:37 +01:00
Kevin Wolf
dd65a52e4a block: Fix pending requests check in bdrv_append()
bdrv_append() cares about isolation of the node that it modifies, but
not about activity in some subtree below it. Instead of using the
recursive bdrv_requests_pending(), directly check bs->in_flight, which
considers only the node in question.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
2017-02-28 20:40:37 +01:00
Kevin Wolf
4e9e4323d5 backup: Use real permissions in backup block job
The backup block job doesn't have very complicated requirements: It
needs to read from the source and write to the target, but it's fine
with either side being changed. The only restriction is that we can't
resize the image because the job uses a cached value.

qemu-iotests 055 needs to be changed because it used a target which was
already attached to a virtio-blk device. The permission system correctly
forbids this (virtio-blk can't accept another writer with its default
share-rw=off).

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
2017-02-28 20:40:37 +01:00
Kevin Wolf
d3f0675922 commit: Use real permissions for HMP 'commit'
This is a little simpler than the commit block job because it's
synchronous and only commits into the immediate backing file, but
otherwise it does more or less the same.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
2017-02-28 20:40:37 +01:00
Kevin Wolf
8dfba27977 commit: Use real permissions in commit block job
This is probably one of the most interesting conversions to the new
op blocker system because a commit block job intentionally leaves some
intermediate block nodes in the backing chain that aren't valid on their
own any more; only the whole chain together results in a valid view.

In order to provide the 'consistent read' permission to the parents of
the 'top' node of the commit job, a new filter block driver is inserted
above 'top' which doesn't require 'consistent read' on its backing
chain. Subsequently, the commit job can block 'consistent read' on all
intermediate nodes without causing a conflict.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
2017-02-28 20:40:37 +01:00
Kevin Wolf
76d554e20b blockjob: Add permissions to block_job_add_bdrv()
Block jobs don't actually do I/O through the reference they create
with block_job_add_bdrv(), but they might want to use the permission
system to express what the block job does to intermediate nodes. This
adds permissions to block_job_add_bdrv() to provide the means to request
permissions.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
2017-02-28 20:40:37 +01:00
Kevin Wolf
26de9438c1 block: Add BdrvChildRole.stay_at_node
When the parents' child links are updated in bdrv_append() or
bdrv_replace_in_backing_chain(), this should affect all child links of
BlockBackends or other nodes, but not child links held for other
purposes (like for setting permissions). This patch allows the
behaviour to be controlled per BdrvChildRole.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
2017-02-28 20:40:37 +01:00
Kevin Wolf
d083319fe0 block: Include details on permission errors in message
Instead of just saying that there was some conflict, we can be specific
and tell which permissions were in conflict and in which direction the
conflict goes.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
2017-02-28 20:40:37 +01:00
Kevin Wolf
b541155587 block: Add BdrvChildRole.get_parent_desc()
For meaningful error messages in the permission system, we need to get
some human-readable description of the parent of a BdrvChild.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
2017-02-28 20:40:37 +01:00
Kevin Wolf
c6cc12bfa7 blockjob: Add permissions to block_job_create()
This function creates a BlockBackend internally, so the block jobs need
to tell it what they want to do with the BB.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
2017-02-28 20:40:37 +01:00
Kevin Wolf
dabd18f64c hw/block: Introduce share-rw qdev property
By default, don't allow another writer for block devices that are
attached to a guest device. For the cases where this setup is intended
(e.g. using a cluster filesystem on the disk), the new option can be
used to allow it.

This change affects only devices using DEFINE_BLOCK_PROPERTIES().
Devices directly using DEFINE_PROP_DRIVE() still accept writers
unconditionally.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
2017-02-28 20:40:36 +01:00
Kevin Wolf
a17c17a274 hw/block: Request permissions
This makes all device emulations with a qdev drive property request
permissions on their BlockBackend. The only thing we block at this point
is resizing images for some devices that can't support it.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
2017-02-28 20:40:36 +01:00
Kevin Wolf
39829a01ae block: Allow error return in BlockDevOps.change_media_cb()
Some devices allow a media change between read-only and read-write
media. They need to adapt the permissions in their .change_media_cb()
implementation, which can fail. So add an Error parameter to the
function.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
2017-02-28 20:40:36 +01:00
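A sketch of the resulting callback shape; the exact BlockDevOps prototype may
differ slightly, so treat this as illustrative:

    /* change_media_cb() can now fail, e.g. when switching between read-only
     * and read-write media requires permissions that cannot be granted. */
    void (*change_media_cb)(void *opaque, bool load, Error **errp);
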
Kevin Wolf
c62d32f503 block: Request real permissions in blk_new_open()
We can figure out the necessary permissions from the flags that the
caller passed.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
2017-02-28 20:40:36 +01:00
Kevin Wolf
55880601d8 block: Add BDRV_O_RESIZE for blk_new_open()
blk_new_open() is a convenience function that processes flags rather
than QDict options as a simple way to just open an image file.

In order to keep it convenient in the future, it must automatically
request the necessary permissions. This can easily be inferred from the
flags for read and write, but we need another flag that tells us whether
to get the resize permission.

We can't just always request it because that means that no block jobs
can run on the resulting BlockBackend (which is something that e.g.
qemu-img commit wants to do), but we can't omit it unconditionally either because
most of the .bdrv_create() implementations call blk_truncate().

The solution is to introduce another flag that is passed by all users
that want to resize the image.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
2017-02-28 20:40:36 +01:00
Kevin Wolf
d7086422b1 block: Add error parameter to blk_insert_bs()
Now that blk_insert_bs() requests the BlockBackend permissions for the
node it attaches to, it can fail. Instead of aborting, pass the errors
to the callers.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
2017-02-28 20:40:36 +01:00
Kevin Wolf
6d0eb64d5c block: Add permissions to blk_new()
We want every user to be specific about the permissions it needs, so
we'll pass the initial permissions as parameters to blk_new(). A user
only needs to call blk_set_perm() if it wants to change the permissions
after the fact.

The permissions are stored in the BlockBackend and applied whenever a
BlockDriverState should be attached in blk_insert_bs().

This does not include actually choosing the right set of permissions
everywhere yet. Instead, the usual FIXME comment is added to each place
and will be addressed in individual patches.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
2017-02-28 20:40:36 +01:00
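A sketch of how a user of the new interface looks, under the assumption that
blk_new() now takes the requested and shared permission masks and that
blk_set_perm() adjusts them later; 'blk' and 'errp' are placeholders:

    /* Request consistent reads and writes; allow other users to do anything. */
    blk = blk_new(BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE, BLK_PERM_ALL);

    /* Later, if the requirements change (may fail if another user conflicts): */
    if (blk_set_perm(blk, BLK_PERM_CONSISTENT_READ, BLK_PERM_ALL, errp) < 0) {
        /* handle the conflict */
    }
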
Kevin Wolf
981776b348 block: Add permissions to BlockBackend
The BlockBackend can now store the permissions that its user requires.
This is necessary because nodes can be ejected from or inserted into a
BlockBackend and all of these operations must make sure that the user
still gets what it requested initially.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
2017-02-28 20:40:36 +01:00
Kevin Wolf
f68c598be6 block: Request real permissions in bdrv_attach_child()
Now that all block drivers with children tell us what permissions they
need from each of their children, bdrv_attach_child() can use this
information and make the right requirements while trying to attach new
children.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
2017-02-28 20:40:36 +01:00
Kevin Wolf
78e421c9fb block: Require .bdrv_child_perm() with child nodes
All block drivers that can have child nodes implement .bdrv_child_perm()
now. Make this officially a requirement by asserting that only drivers
without children can omit .bdrv_child_perm().

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
2017-02-28 20:40:36 +01:00
Kevin Wolf
91ef38257a vvfat: Implement .bdrv_child_perm()
vvfat is the last remaining driver that can have children, but doesn't
implement .bdrv_child_perm() yet. The default handlers aren't suitable
here, so let's implement a very simple driver-specific one that protects
the internal child from being used by other users as well as our
permissions permit.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
2017-02-28 20:40:36 +01:00
Kevin Wolf
862f215fab block: Request child permissions in format drivers
This makes use of the .bdrv_child_perm() implementation for formats that
we just added. All format drivers expose the permissions they actually
need now, so that they can be set accordingly and updated when parents
are attached or detached.

The only format not included here is raw, which was already converted
with the other filter drivers.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
2017-02-28 20:40:36 +01:00
Kevin Wolf
6b1a044afb block: Default .bdrv_child_perm() for format drivers
Almost all format drivers have the same characteristics as far as
permissions are concerned: They have one or more children for storing
their own data and, more importantly, metadata (can be written to and
grow even without external write requests, must be protected against
other writers and present consistent data) and optionally a backing file
(this is just data, so like for a filter, it only depends on what the
parent nodes need).

This provides a default implementation that can be shared by most of
our format drivers.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
2017-02-28 20:40:36 +01:00
Kevin Wolf
d7010dfb68 block: Request child permissions in filter drivers
All callers will have to request permissions for all of their child
nodes. Block drivers that act as simple filters can use the default
implementation of .bdrv_child_perm().

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
2017-02-28 20:40:36 +01:00
Kevin Wolf
6a1b9ee152 block: Default .bdrv_child_perm() for filter drivers
Most filters need permissions related to read and write for their
children, but only if the node has a parent that wants to use the same
operation on the filter. The same is true for resize.

This adds a default implementation that simply forwards all necessary
permissions to all children of the node and leaves the other permissions
unchanged.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
2017-02-28 20:40:36 +01:00
Kevin Wolf
33a610c398 block: Involve block drivers in permission granting
In many cases, the required permissions of one node on its children
depend on what its parents require from it. For example, the raw format
or most filter drivers only need to request consistent reads if that's
something that one of their parents wants.

In order to achieve this, this patch introduces two new BlockDriver
callbacks. The first one lets drivers first check (recursively) whether
the requested permissions can be set; the second one actually sets the
new permission bitmask.

Also add helper functions that drivers can use in their implementation
of the callbacks to update their permissions on a specific child.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
2017-02-28 20:40:36 +01:00
Kevin Wolf
d5e6f437c5 block: Let callers request permissions when attaching a child node
When attaching a node as a child to a new parent, the required and
shared permissions for this parent are checked against all other parents
of the node now, and an error is returned if there is a conflict.

This allows error returns to a function that previously always
succeeded, and the same is true for quite a few callers and their
callers. Converting all of them within the same patch would be too much,
so for now everyone tells that they don't need any permissions and allow
everyone else to do anything. This way we can use &error_abort initially
and convert caller by caller to pass actual permission requirements and
implement error handling.

All these places are marked with FIXME comments and it will be the job
of the next patches to clean them up again.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
2017-02-28 20:40:36 +01:00
Kevin Wolf
8b2ff5291f block: Add Error argument to bdrv_attach_child()
It will have to return an error soon, so prepare the callers for it.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
2017-02-28 20:40:35 +01:00
Kevin Wolf
7006c9a761 block: Add op blocker permission constants
This patch defines the permission categories that will be used by the
new op blocker system.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Acked-by: Fam Zheng <famz@redhat.com>
2017-02-28 20:40:35 +01:00
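A sketch of the shape of these definitions.  Only BLK_PERM_CONSISTENT_READ,
BLK_PERM_WRITE, BLK_PERM_WRITE_UNCHANGED and BLK_PERM_RESIZE are named
elsewhere in this series; the bit values and the BLK_PERM_ALL convenience mask
below are illustrative assumptions:

    enum {
        BLK_PERM_CONSISTENT_READ = 0x01, /* reads return consistent guest data */
        BLK_PERM_WRITE           = 0x02, /* writes that change visible data */
        BLK_PERM_WRITE_UNCHANGED = 0x04, /* writes that keep the data unchanged */
        BLK_PERM_RESIZE          = 0x08, /* growing or truncating the node */
    };

    #define BLK_PERM_ALL 0x0f            /* OR of the permissions above */
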
Markus Armbruster
9e19ad4e49 option: Tweak invalid size error message and unbreak iotest 049
Commit 75cdcd1 neglected to update tests/qemu-iotests/049.out, and
made the error message for negative size worse.  Fix that.

Reported-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Tested-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2017-02-28 20:40:31 +01:00
Peter Lieven
2d9187bc65 qemu-img: make convert async
The convert process is currently implemented entirely with synchronous
operations: it reads one buffer and then writes it, with no parallelism, and
each request blocks until it has completed.

This can be a big performance hit when the convert process reads from and
writes to devices which do not benefit from kernel readahead or pagecache.
In our environment, the following two use cases are the most common ones
when using qemu-img convert:

a) reading from NFS and writing to iSCSI for deploying templates
b) reading from iSCSI and writing to NFS for backups

In both processes we use libiscsi and libnfs so we have no kernel cache.

This patch changes the convert process to use coroutines running in parallel,
which can significantly improve performance for network storage devices:

qemu-img (master)
 nfs -> iscsi 22.8 secs
 nfs -> ram   11.7 secs
 ram -> iscsi 12.3 secs

qemu-img-async (8 coroutines, in-order write disabled)
 nfs -> iscsi 11.0 secs
 nfs -> ram   10.4 secs
 ram -> iscsi  9.0 secs

This patch introduces two new command-line parameters: the -m parameter to specify
the number of coroutines running in parallel (defaults to 8), and the -W parameter to
allow qemu-img to write to the target out of order rather than sequentially. The
latter improves performance as the writes do not have to wait for each other to complete.

Signed-off-by: Peter Lieven <pl@kamp.de>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2017-02-28 20:40:31 +01:00
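A usage example with the new parameters; the image names and output format are
placeholders:

    qemu-img convert -m 16 -W -O qcow2 source.img destination.qcow2

Here -m 16 runs 16 coroutines in parallel and -W lets writes to the target
complete out of order.
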
Zhang Chen
daa33c5215 Add a new qmp command to do checkpoint, query xen replication status
We can call this QMP command to trigger a checkpoint from outside of QEMU.
Xen COLO will need this function.

Signed-off-by: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
Signed-off-by: Wen Congyang <wencongyang@gmail.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Stefano Stabellini <sstabellini@kernel.org>
2017-02-28 11:02:12 -08:00
Zhang Chen
2c9639ecab Add a new qmp command to start/stop replication
We can call this QMP command to start/stop replication from outside of QEMU.
Xen COLO, for example, needs this function.

Signed-off-by: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
Signed-off-by: Wen Congyang <wencongyang@gmail.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
Reviewed-by: zhanghailiang <zhang.zhanghailiang@huawei.com>
Signed-off-by: Stefano Stabellini <sstabellini@kernel.org>
2017-02-28 11:01:56 -08:00
Clement Deschamps
1eeb5c7dea bcm2835: add sdhost and gpio controllers
This adds the bcm2835_sdhost and bcm2835_gpio to the BCM2835 platform.

To support the SD controller selection (alternate function of GPIOs
48-53), the bcm2835_gpio now exposes an sdbus.
It also has a link to the sdbus of both the sdhci and sdhost controllers,
and the card is reparented from one bus to the other when the alternate
function of GPIOs 48-53 is modified.

Signed-off-by: Clement Deschamps <clement.deschamps@antfield.fr>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Message-id: 1488293711-14195-5-git-send-email-peter.maydell@linaro.org
Message-id: 20170224164021.9066-5-clement.deschamps@antfield.fr
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2017-02-28 17:10:00 +00:00
Clement Deschamps
d72fc9dcb1 bcm2835_gpio: add bcm2835 gpio controller
This adds the BCM2835 GPIO controller.

It currently implements:
- The 54 GPIOs as outputs (qemu_irq)
- The SD controller selection via alternate function of GPIOs 48-53

Signed-off-by: Clement Deschamps <clement.deschamps@antfield.fr>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Message-id: 1488293711-14195-4-git-send-email-peter.maydell@linaro.org
Message-id: 20170224164021.9066-4-clement.deschamps@antfield.fr
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2017-02-28 17:10:00 +00:00
Clement Deschamps
97fb87cc5d hw/sd: add card-reparenting function
Provide a new function sdbus_reparent_card() in sd core for reparenting
a card from a SDBus to another one.

This function is required by the raspi platform, where the two SD
controllers can be dynamically switched.

Signed-off-by: Clement Deschamps <clement.deschamps@antfield.fr>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Message-id: 1488293711-14195-3-git-send-email-peter.maydell@linaro.org
Message-id: 20170224164021.9066-3-clement.deschamps@antfield.fr
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
[PMM: added a doc comment to the header file; changed to
 use new behaviour of qdev_set_parent_bus()]
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2017-02-28 17:10:00 +00:00
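A sketch of the new interface as described above; the parameter names are
assumptions:

    /* Move the card currently plugged into 'from' onto the bus 'to'. */
    void sdbus_reparent_card(SDBus *from, SDBus *to);
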
Peter Maydell
91c968ac72 qdev: Have qdev_set_parent_bus() handle devices already on a bus
Instead of qdev_set_parent_bus() silently doing the wrong
thing if it's handed a device that's already on a bus,
have it remove the device from the old bus and add it to
the new one. This is useful for the raspi2 sdcard.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Message-id: 1488293711-14195-2-git-send-email-peter.maydell@linaro.org
2017-02-28 17:10:00 +00:00
Vijaya Kumar K
07a5628cb8 hw/intc/arm_gicv3_kvm: Reset GICv3 cpu interface registers
Reset the CPU interface registers of the GICv3 when the CPU is reset.
For this, an ARMCPRegInfo struct is registered with one ICC
register whose resetfn is called when the CPU is reset.

All the ICC registers are reset in one single register
reset function instead of calling a resetfn for each ICC
register.

Signed-off-by: Vijaya Kumar K <Vijaya.Kumar@cavium.com>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Message-id: 1487850673-26455-6-git-send-email-vijay.kilari@gmail.com
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2017-02-28 17:10:00 +00:00
Vijaya Kumar K
d3a3e52962 target-arm: Add GICv3CPUState in CPUARMState struct
Add a gicv3state void pointer to the CPUARMState struct
to store the GICv3CPUState.

For use cases like CPU reset, we need to reset the
GICv3CPUState of the CPU. In such scenarios, this pointer
comes in handy.

Signed-off-by: Vijaya Kumar K <Vijaya.Kumar@cavium.com>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Message-id: 1487850673-26455-5-git-send-email-vijay.kilari@gmail.com
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2017-02-28 17:10:00 +00:00
Vijaya Kumar K
367b9f527b hw/intc/arm_gicv3_kvm: Implement get/put functions
This actually implements pre_save and post_load methods for in-kernel
vGICv3.

Signed-off-by: Pavel Fedin <p.fedin@samsung.com>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Vijaya Kumar K <Vijaya.Kumar@cavium.com>
Message-id: 1487850673-26455-4-git-send-email-vijay.kilari@gmail.com
[PMM:
 * use decimal, not 0bnnn
 * fixed typo in names of ICC_APR0R_EL1 and ICC_AP1R_EL1
 * completely rearranged the get and put functions to read and write
   the state in a natural order, rather than mixing distributor and
   redistributor state together]
Signed-off-by: Vijaya Kumar K <Vijaya.Kumar@cavium.com>
[Vijay:
 * Update macro KVM_VGIC_ATTR
 * Use 32 bit access for gicd and gicr
 * GICD_IROUTER, GICD_TYPER, GICR_PROPBASER and GICR_PENDBASER reg
   access  are changed from 64-bit to 32-bit access
 * Add ICC_SRE_EL1 save and restore
 * Dropped translate_fn mechanism and coded functions to handle
   save and restore of edge_trigger and priority
 * Number of APnR register saved/restored based on number of
   priority bits supported]
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
2017-02-28 17:10:00 +00:00
Vijaya Kumar K
6692aac411 hw/intc/arm_gicv3_kvm: Add ICC_SRE_EL1 register to vmstate
To save and restore the ICC_SRE_EL1 register, introduce a vmstate
subsection and load it only if non-zero.
Also initialize icc_sre_el1 to 0x7 in the pre_load
function.

Signed-off-by: Vijaya Kumar K <Vijaya.Kumar@cavium.com>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Message-id: 1487850673-26455-3-git-send-email-vijay.kilari@gmail.com
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2017-02-28 17:10:00 +00:00
Paolo Bonzini
3a5eb5b4a9 update Linux headers to 4.11
virtio_mmio.h would be deleted by the update; I am leaving it in, though
it was a mistake to add it.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2017-02-28 16:18:49 +00:00
Paolo Bonzini
f717e6245f update-linux-headers: update for 4.11
The linux-headers/asm-arm/unistd.h file has been split into three
sub-files; copy them along.  However, building them requires
setting ARCH rather than SRCARCH.

SRCARCH defaults to $(ARCH) anyway; to avoid future occurrence of
the same problem use ARCH for all architectures where SRCARCH=ARCH.
Currently these are all except x86, sparc, sh and tile.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-id: 20170221122920.16245-2-pbonzini@redhat.com
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2017-02-28 16:18:49 +00:00
Peter Maydell
8a85e0654e stm32f205: Rename 'nvic' local to 'armv7m'
The local variable 'nvic' in stm32f205_soc_realize() no longer
holds a direct pointer to the NVIC device; it is a pointer to
the ARMv7M container object. Rename it 'armv7m' accordingly.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Alistair Francis <alistair.francis@xilinx.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Message-id: 1487604965-23220-12-git-send-email-peter.maydell@linaro.org
2017-02-28 16:18:49 +00:00
Peter Maydell
b72e2f6856 stm32f205: Create armv7m object without using armv7m_init()
Switch the stm32f205 SoC to create the armv7m object directly
rather than via the armv7m_init() wrapper. This fits better
with the SoC model's very QOMified design.

In particular this means we can push loading the guest image
out to the top level board code where it belongs, rather
than the SoC object having a QOM property for the filename
to load.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Alistair Francis <alistair.francis@xilinx.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Message-id: 1487604965-23220-11-git-send-email-peter.maydell@linaro.org
2017-02-28 16:18:49 +00:00
Peter Maydell
ff68dacbc7 armv7m: Split systick out from NVIC
The SysTick timer isn't really part of the NVIC proper;
we just modelled it that way back when we couldn't
easily have devices that only occupied a small chunk
of a memory region. Split it out into its own device.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Message-id: 1487604965-23220-10-git-send-email-peter.maydell@linaro.org
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
2017-02-28 16:18:49 +00:00
Peter Maydell
743eb70560 armv7m: Don't put core v7M devices under CONFIG_STELLARIS
The NVIC is a core v7M device that exists for all v7M CPUs;
put it under a CONFIG_ARM_V7M rather than hiding it under
CONFIG_STELLARIS.

(We'll use CONFIG_ARM_V7M for the SysTick device too
when we split it out of the NVIC.)

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Message-id: 1487604965-23220-9-git-send-email-peter.maydell@linaro.org
2017-02-28 16:18:49 +00:00
Peter Maydell
f68d881c9b armv7m: Make bitband device take the address space to access
Instead of the bitband device doing a cpu_physical_memory_read/write,
make it take a MemoryRegion which specifies where it should be
accessing, and use address_space_read/write to access the
corresponding AddressSpace.

Since this entails pretty much a rewrite, convert away from
old_mmio in the process.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Message-id: 1487604965-23220-8-git-send-email-peter.maydell@linaro.org
2017-02-28 16:18:49 +00:00
Peter Maydell
98957a94ef armv7m: Make NVIC expose a memory region rather than mapping itself
Make the NVIC device expose a memory region for its users
to map, rather than mapping itself into the system memory
space on realize, and get the one user (the ARMv7M object)
to do this.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Message-id: 1487604965-23220-7-git-send-email-peter.maydell@linaro.org
2017-02-28 16:18:49 +00:00
Peter Maydell
618119c2d3 armv7m: Make ARMv7M object take memory region link
Make the ARMv7M object take a memory region link which it uses
to wire up the bitband rather than having them always put
themselves in the system address space.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Message-id: 1487604965-23220-6-git-send-email-peter.maydell@linaro.org
2017-02-28 16:18:49 +00:00
Peter Maydell
21e0c38fe2 armv7m: Use QOMified armv7m object in armv7m_init()
Make the legacy armv7m_init() function use the newly QOMified
armv7m object rather than doing everything by hand.

We can return the armv7m object rather than the NVIC from
armv7m_init() because its interface to the rest of the
board (GPIOs, etc) is identical.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Alistair Francis <alistair.francis@xilinx.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Message-id: 1487604965-23220-5-git-send-email-peter.maydell@linaro.org
2017-02-28 16:18:49 +00:00
Peter Maydell
56b7c66f49 armv7m: QOMify the armv7m container
Create a proper QOM object for the armv7m container, which
holds the CPU, the NVIC and the bitband regions.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Message-id: 1487604965-23220-4-git-send-email-peter.maydell@linaro.org
2017-02-28 16:18:49 +00:00
Peter Maydell
6bf436cf9d armv7m: Move NVICState struct definition into header
Move the NVICState struct definition into a header, so we can
embed it into other QOM objects like SoCs.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Message-id: 1487604965-23220-3-git-send-email-peter.maydell@linaro.org
2017-02-28 16:18:49 +00:00
Peter Maydell
3651c28569 armv7m: Abstract out the "load kernel" code
Abstract the "load kernel" code out of armv7m_init() into its own
function.  This includes the registration of the CPU reset function,
to parallel how we handle this for A profile cores.

We make the function public so that boards which choose to
directly instantiate an ARMv7M device object can call it.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Alistair Francis <alistair.francis@xilinx.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Message-id: 1487604965-23220-2-git-send-email-peter.maydell@linaro.org
2017-02-28 16:18:49 +00:00
Alex Bennée
1ed9251515 .shippable: add s390x-cross target
Use the new debian-s390x-cross.docker target to cross-compile for
s390.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Message-Id: <20170227143028.16428-3-alex.bennee@linaro.org>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Fam Zheng <famz@redhat.com>
2017-02-28 20:31:01 +08:00
Alex Bennée
267004d991 new: dockerfiles/debian-s390-cross
This adds an s390 cross-build target to our library of docker setups.
There is an issue with the xfslibs-dev:s390x package having a clash, so
we do a || apt-get -f install to fix up the rest of the dependencies.

This doesn't build on the debian.docker file as we are using the
multilib compiler which is only available in stretch (the current
testing repo).

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
CC: Christian Borntraeger <borntraeger@de.ibm.com>
Message-Id: <20170227143028.16428-2-alex.bennee@linaro.org>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Fam Zheng <famz@redhat.com>
2017-02-28 20:31:01 +08:00
Dr. David Alan Gilbert
665414ad06 postcopy: Add extra check for COPY function
As an extra sanity check, make sure the region we're registering
can perform UFFDIO_COPY;  the COPY will fail later but this
gives a cleaner failure.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Message-Id: <20170224182844.32452-17-dgilbert@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2017-02-28 11:30:24 +00:00
Dr. David Alan Gilbert
0c1f4036db postcopy: Add doc about hugepages and postcopy
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Message-Id: <20170224182844.32452-16-dgilbert@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2017-02-28 11:30:24 +00:00
Dr. David Alan Gilbert
7e8cafb713 postcopy: Check for userfault+hugepage feature
We need extra Linux kernel support (~4.11) to support userfaults
on hugetlbfs; check for them.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Message-Id: <20170224182844.32452-15-dgilbert@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2017-02-28 11:30:24 +00:00
Dr. David Alan Gilbert
61a502128b postcopy: Update userfaultfd.h header
Just the userfaultfd.h update from Paolo's header
update run;

* Drop this patch after Paolo's update goes in *

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Message-Id: <20170224182844.32452-14-dgilbert@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2017-02-28 11:30:24 +00:00
Dr. David Alan Gilbert
433bd0223c postcopy: Allow hugepages
Allow huge pages in postcopy.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Message-Id: <20170224182844.32452-13-dgilbert@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2017-02-28 11:30:24 +00:00
Dr. David Alan Gilbert
4c011c37ec postcopy: Send whole huge pages
The RAM save code uses ram_save_host_page to send whole
host pages at a time;  change this to use the host page size associated
with the RAMBlock, which may be a huge page.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Message-Id: <20170224182844.32452-12-dgilbert@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2017-02-28 11:30:24 +00:00
Dr. David Alan Gilbert
332847f075 postcopy: Mask fault addresses to huge page boundary
Currently the fault address received by userfault is rounded to
the host page boundary and a host page is requested from the source.
Use the current RAMBlock page size instead of the general host page
size so that for RAMBlocks backed by huge pages we request the whole
huge page.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Message-Id: <20170224182844.32452-11-dgilbert@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2017-02-28 11:30:24 +00:00
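A sketch of the address rounding this implies, assuming the per-RAMBlock page
size is a power of two; 'rb' and 'fault_addr' are placeholders, and
qemu_ram_pagesize() is assumed here as the per-RAMBlock page size accessor:

    uint64_t pagesize = qemu_ram_pagesize(rb);        /* RAMBlock's page size */
    uint64_t aligned  = fault_addr & ~(pagesize - 1);
    /* 'aligned' is what gets requested from the source, covering the whole
     * (possibly huge) page containing the fault. */
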
Dr. David Alan Gilbert
28abd20014 postcopy: Load huge pages in one go
The existing postcopy RAM load loop already ensures that it
glues together whole host pages from the target-page-sized chunks sent
over the wire.  Modify the definition of 'host page' that it uses
to be the RAMBlock page size, and thus huge pages where appropriate.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Message-Id: <20170224182844.32452-10-dgilbert@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2017-02-28 11:30:23 +00:00
Dr. David Alan Gilbert
41d84210d4 postcopy: Use temporary for placing zero huge pages
The kernel can't do UFFDIO_ZEROPAGE for huge pages, so we have
to allocate a temporary (always zero) page and use UFFDIO_COPYPAGE
on it.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Message-Id: <20170224182844.32452-9-dgilbert@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2017-02-28 11:30:23 +00:00
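A sketch of the workaround, using userfaultfd's copy ioctl (UFFDIO_COPY in the
uapi headers) with a pre-zeroed, page-sized temporary buffer; 'zero_page',
'pagesize', 'host_addr' and 'uffd' are placeholders:

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/userfaultfd.h>

    struct uffdio_copy copy = {
        .dst  = (uint64_t)(uintptr_t)host_addr,   /* where the fault occurred */
        .src  = (uint64_t)(uintptr_t)zero_page,   /* temporary all-zero page */
        .len  = pagesize,
        .mode = 0,
    };

    if (ioctl(uffd, UFFDIO_COPY, &copy)) {
        /* report the error; the huge page could not be placed */
    }
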
Dr. David Alan Gilbert
df9ff5e1e3 postcopy: Plumb pagesize down into place helpers
Now that we deal with both normal-size pages and huge pages, we need
to tell the place handlers the size we're dealing with
and make sure the temporary page is large enough.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Message-Id: <20170224182844.32452-8-dgilbert@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2017-02-28 11:30:23 +00:00
Dr. David Alan Gilbert
67f11b5c23 postcopy: Record largest page size
Record the largest page size in use; we'll need it soon for allocating
temporary buffers.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Message-Id: <20170224182844.32452-7-dgilbert@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2017-02-28 11:30:23 +00:00
Dr. David Alan Gilbert
e2fa71f527 postcopy: enhance ram_block_discard_range for hugepages
Unfortunately madvise(DONTNEED) doesn't work on hugetlbfs,
so use fallocate(FALLOC_FL_PUNCH_HOLE) instead.
qemu_fd_getpagesize only sets the page size based on a file
if the file is from hugetlbfs.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Message-Id: <20170224182844.32452-6-dgilbert@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2017-02-28 11:30:23 +00:00
Dr. David Alan Gilbert
d3a5038c46 exec: ram_block_discard_range
Create ram_block_discard_range in exec.c to replace
postcopy_ram_discard_range and most of ram_discard_range.

Those two routines are a bit of a weird combination, and
ram_discard_range is about to get more complex for hugepages.
It's OS-dependent code (so shouldn't be in migration/ram.c) but
it needs quite a bit of the innards of RAMBlock, so it doesn't belong in
os*.c either.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Message-Id: <20170224182844.32452-5-dgilbert@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2017-02-28 11:30:23 +00:00
Dr. David Alan Gilbert
29c5917201 postcopy: Chunk discards for hugepages
At the start of the postcopy phase, partially sent huge pages
must be discarded.  The code for dealing with host page sizes larger
than the target page size can be reused for this case.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Message-Id: <20170224182844.32452-4-dgilbert@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2017-02-28 11:30:23 +00:00
Dr. David Alan Gilbert
ef08fb389f postcopy: Transmit and compare individual page sizes
When using postcopy with hugepages, we require the source
and destination page sizes for any RAMBlock to match; note
that different RAMBlocks in the same VM can have different
page sizes.

Transmit them as part of the RAM information header and
fail if there's a difference.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Message-Id: <20170224182844.32452-3-dgilbert@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2017-02-28 11:30:23 +00:00
Dr. David Alan Gilbert
e8ca1db29b postcopy: Transmit ram size summary word
Replace the host page size in the 'advise' command with a pagesize
summary bitmap; if the VM is just using normal RAM then
this will be exactly the same as before, but if it is using
huge pages the values will differ, and thus:
   a) Migration from/to old qemu's that don't understand huge pages
      will fail early.
   b) Migrations with different-sized RAMBlocks will also fail early.

This catches it very early; earlier than the detailed per-block
check in the next patch.
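
A sketch of how such a summary word can be built (struct and function names
here are made up for illustration; since page sizes are powers of two, OR-ing
them yields a compact bitmap that is identical to the plain page size when
only one size is in use):

    #include <stdint.h>
    #include <stddef.h>

    /* Illustrative RAM block descriptor; QEMU's RAMBlock is richer. */
    struct ram_block {
        size_t page_size;
        struct ram_block *next;
    };

    /* OR the page sizes of all RAM blocks together.  The result is a bitmap
     * of the sizes in use, and it degenerates to the old single page-size
     * value when the VM only uses normal RAM. */
    static uint64_t ram_pagesize_summary(const struct ram_block *blocks)
    {
        uint64_t summary = 0;
        const struct ram_block *b;

        for (b = blocks; b != NULL; b = b->next) {
            summary |= b->page_size;
        }
        return summary;
    }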

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Message-Id: <20170224182844.32452-2-dgilbert@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2017-02-28 11:30:23 +00:00
Vladimir Sementsov-Ogievskiy
f9c8caa04f migration: fix use-after-free of to_dst_file
hmp_savevm calls qemu_savevm_state(f), which sets to_dst_file=f in the
global migration state. Then hmp_savevm closes f (g_free is called).

The next access to to_dst_file in the migration state (for example, from
qmp_migrate_set_speed) will then use it after it has been freed.
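
The shape of the fix, as a hedged sketch with stand-in types (QEMU's
MigrationState and QEMUFile are of course richer, and the real code uses
qemu_fclose()):

    #include <stdio.h>
    #include <stdlib.h>

    /* Stand-ins for illustration only. */
    typedef struct QEMUFileStub {
        FILE *fp;
    } QEMUFileStub;

    typedef struct MigrationStateStub {
        QEMUFileStub *to_dst_file;
    } MigrationStateStub;

    /* Drop the reference held in the global migration state before the file
     * object is freed, so that a later command looking at to_dst_file sees
     * NULL instead of a dangling pointer. */
    static void savevm_finish(MigrationStateStub *ms, QEMUFileStub *f)
    {
        ms->to_dst_file = NULL;
        free(f); /* stands in for qemu_fclose()/g_free() */
    }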

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Message-Id: <20170225193155.447462-5-vsementsov@virtuozzo.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2017-02-28 11:30:23 +00:00
Dr. David Alan Gilbert
5f9412bbac migration: Update docs to discourage version bumps
Version bumps break backwards migration; update the docs
to explain to people why that's bad and how to avoid it.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Message-Id: <20170210110359.8210-1-dgilbert@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2017-02-28 11:30:22 +00:00
Marc-André Lureau
128e4e1089 migration: fix id leak regression
This leak was introduced in commit
581f08bac2.

(it stands out quickly with ASAN once the rest of the leaks are also
removed from make check with this series)

Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
Cc: Juan Quintela <quintela@redhat.com>
Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-Id: <20170221141451.28305-31-marcandre.lureau@redhat.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2017-02-28 11:30:22 +00:00
Ashijeet Acharya
7562f90707 migrate: Introduce a 'dc->vmsd' check to avoid segfault for --only-migratable
Commit a3a3d8c7 introduced a segfault bug when checking
'dc->vmsd->unmigratable', which caused QEMU to crash when trying to add
devices that have not yet set their 'dc->vmsd' during initialization.
Place a 'dc->vmsd' check before it so that we do not segfault for
such devices.

NOTE: This doesn't compromise the functioning of the --only-migratable
option, as all the unmigratable devices do set their 'dc->vmsd'.

Introduce a new function check_migratable() and move the
only_migratable check inside it, also use stubs to avoid user-mode qemu
build failures.

Signed-off-by: Ashijeet Acharya <ashijeetacharya@gmail.com>
Message-Id: <1487009088-23891-1-git-send-email-ashijeetacharya@gmail.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2017-02-28 11:30:22 +00:00
Laurent Vivier
9cd49026aa vmstate-static-checker: update white list with spapr_pci
To fix migration between 2.7 and 2.8, some fields have
been renamed and managed with the help of a PHB property
(pre_2_8_migration):

    5c4537b spapr: Fix 2.7<->2.8 migration of PCI host bridge

So we need to add them to the white list:

    dma_liobn[0],
    mem_win_addr, mem_win_size,
    io_win_addr, io_win_size

become

    mig_liobn,
    mig_mem_win_addr, mig_mem_win_size,
    mig_io_win_addr, mig_io_win_size

CC: David Gibson <david@gibson.dropbear.id.au>
CC: Dr. David Alan Gilbert <dgilbert@redhat.com>
CC: Thomas Huth <thuth@redhat.com>
CC: Greg Kurz <groug@kaod.org>
CC: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Message-Id: <20170214133331.28997-1-lvivier@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2017-02-28 11:30:22 +00:00
Halil Pasic
4333309961 tests/test-vmstate.c: test array of ptr to primitive
Let's have a test for arrays of pointers to some primitive type, with
non-null and null pointers intermixed.

Signed-off-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>

Message-Id: <20170222160119.52771-6-pasic@linux.vnet.ibm.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2017-02-28 11:30:22 +00:00
Halil Pasic
cc95883185 tests/test-vmstate.c: test array of ptr with null
Add test for VMSTATE_ARRAY_OF_POINTER_TO_STRUCT with an array
containing some null pointer.

Signed-off-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Message-Id: <20170222160119.52771-5-pasic@linux.vnet.ibm.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
   Fixed type cast in assert to uintptr_t rather than uint64_t
2017-02-28 11:30:06 +00:00
Halil Pasic
07d4e69147 migration/vmstate: fix array of ptr with nullptrs
Make VMS_ARRAY_OF_POINTER cope with null pointers. Previously the
reward for trying to migrate an array with some null pointers in it was
an illegal memory access, that is, a swift and painless death of the
process.  Let's make vmstate cope with this scenario.

The general approach is, when we encounter a null pointer (element),
instead of following the pointer to save/load the data behind it, we
save/load a placeholder. This way we can detect the case where a null
pointer was expected on the load side but non-null data was saved instead.
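
A toy sketch of the scheme (FILE-based and with a made-up marker value, not
QEMU's actual vmstate wire format):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    #define NULLPTR_MARKER 0x30 /* made-up placeholder value for illustration */

    /* Save an array of pointers to uint32_t: for a NULL element emit a
     * one-byte placeholder instead of dereferencing it; non-NULL elements
     * are saved as before. */
    static void save_ptr_array(FILE *out, uint32_t *const *arr, size_t n)
    {
        for (size_t i = 0; i < n; i++) {
            if (arr[i] == NULL) {
                fputc(NULLPTR_MARKER, out);
            } else {
                fwrite(arr[i], sizeof(**arr), 1, out);
            }
        }
    }

    /* Load side: where the destination element is NULL, expect exactly the
     * placeholder; anything else means the source saved real data and the
     * two sides disagree. */
    static int load_ptr_array(FILE *in, uint32_t **arr, size_t n)
    {
        for (size_t i = 0; i < n; i++) {
            if (arr[i] == NULL) {
                if (fgetc(in) != NULLPTR_MARKER) {
                    return -1;
                }
            } else if (fread(arr[i], sizeof(**arr), 1, in) != 1) {
                return -1;
            }
        }
        return 0;
    }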

Signed-off-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Reviewed-by: Guenther Hutzl <hutzl@linux.vnet.ibm.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Message-Id: <20170222160119.52771-4-pasic@linux.vnet.ibm.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2017-02-28 11:29:00 +00:00
Halil Pasic
cbfda0e6cf migration/vmstate: split up vmstate_base_addr
Currently vmstate_base_addr does several things: it pinpoints the field
within the struct, possibly allocates memory and possibly does the first
pointer dereference. Obviously allocation is needed only for load.

Let us split up the functionality of vmstate_base_addr and move the
address manipulations (that is, everything but the allocation logic) into
load and save so it becomes more obvious what is actually going on. This
way all the address calculations (and the handling of the flags
controlling them) are in one place and the sequence is more obvious.

The newly introduced function vmstate_handle_alloc also fixes the
allocation for the unused VMS_VBUFFER|VMS_MULTIPLY|VMS_ALLOC scenario
and is substantially simpler than the original vmstate_base_addr.

In load and save some asserts are added so it's easier to debug
situations where we would end up with a null pointer dereference.

Signed-off-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Message-Id: <20170222160119.52771-3-pasic@linux.vnet.ibm.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2017-02-28 11:29:00 +00:00
Halil Pasic
e84641f73d migration/vmstate: renames in (load|save)_state
The vmstate_(load|save)_state functions start out with a void *opaque
pointing to some struct, and manipulate one or more elements of one field
within that struct.

First the field within the struct is pinpointed as opaque + offset; then,
if this is a pointer, it is dereferenced to obtain a pointer to
the first element of the vmstate field. Pointers to further elements, if
any, are calculated as first_element + i * element_size (where i is the
zero-based index of the element in question).
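
In code, the addressing described above amounts to something like this
(a sketch with illustrative parameter names, not the actual vmstate code):

    #include <stddef.h>

    /* Pinpoint the i-th element of a vmstate field inside an opaque struct:
     * the field lives at opaque + offset; if the field is declared as a
     * pointer, dereference it to reach the first element; element i is then
     * first_elem + i * element_size. */
    static void *field_element(void *opaque, size_t offset, int field_is_pointer,
                               size_t element_size, size_t i)
    {
        char *field = (char *)opaque + offset;
        char *first_elem = field_is_pointer ? *(char **)field : field;

        return first_elem + i * element_size;
    }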

Currently base_addr and addr are used as the variable names for the
pointer to the first element and the pointer to the current element being
processed. This is suboptimal because base_addr is somewhat
counter-intuitive (being obtained as base + offset) and both base_addr
and addr are not very descriptive (that we have a pointer should be clear
from the fact that it is declared as one).

Let's make things easier to understand by renaming base_addr to first_elem
and addr to curr_elem. This has the additional benefit of harmonizing
with other names within the scope (n_elems, vmstate_n_elems).

Signed-off-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Message-Id: <20170222160119.52771-2-pasic@linux.vnet.ibm.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2017-02-28 11:29:00 +00:00
Daniel Henrique Barboza
87c9cc1c30 Changing error message of QMP 'migrate_set_downtime' to seconds
Using QMP, the error message of 'migrate_set_downtime' was displaying
the value in milliseconds, which is misleading because the command
accepts the value in seconds:

{ "execute": "migrate_set_downtime", "arguments": {"value": 3000}}
{"error": {"class": "GenericError", "desc": "Parameter 'downtime_limit'
expects an integer in the range of 0 to 2000000 milliseconds"}}

This message is also seen in HMP when trying to set the same
parameter:

(qemu) migrate_set_parameter downtime-limit 3000000
Parameter 'downtime_limit' expects an integer in the range of 0 to
2000000 milliseconds

To allow for a proper error message when using QMP, validation
of the user input was added to 'qmp_migrate_set_downtime'.
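
A sketch of what such a check can look like (the real change lives in
qmp_migrate_set_downtime and reports through QEMU's Error API; the limit
below is derived from the 2000000 ms quoted in the error message above):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    #define MAX_DOWNTIME_SECONDS 2000 /* i.e. 2000000 milliseconds */

    /* Validate the downtime argument while it is still expressed in seconds,
     * so the error text can talk about seconds as well. */
    static bool downtime_seconds_valid(double value, char *err, size_t errlen)
    {
        if (value < 0 || value > MAX_DOWNTIME_SECONDS) {
            snprintf(err, errlen,
                     "Parameter 'downtime_limit' expects an integer in the "
                     "range of 0 to %d seconds", MAX_DOWNTIME_SECONDS);
            return false;
        }
        return true;
    }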

Signed-off-by: Daniel Henrique Barboza <danielhb@linux.vnet.ibm.com>
Message-Id: <20170222151729.5812-1-danielhb@linux.vnet.ibm.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2017-02-28 11:29:00 +00:00
Cornelia Huck
e8ebf60f6d pc-bios/s390-ccw.img: rebuild image
Contains the following commits:
- pc-bios/s390-ccw: Use the ccw bios to start the network boot

Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
2017-02-28 12:04:48 +01:00
Farhan Ali
99b72e0fbb pc-bios/s390-ccw: Use the ccw bios to start the network boot
We want to use the ccw bios to start the final network boot. To do
this we use the ccw bios to detect whether the boot device is a virtio
network device and to retrieve the start address of the
network boot image.

Signed-off-by: Farhan Ali <alifm@linux.vnet.ibm.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
2017-02-28 12:04:48 +01:00
Farhan Ali
f38b5b7fc4 s390x/ipl: Load network boot image
Load the network boot image into guest RAM when the boot
device selected is a network device. Use some of the reserved
space in IplBlockCcw to store the start address of the netboot
image.

A user could also use 'chreipl' (diag 308/5) to change the boot device.
So every time we update the IPLB, we need to check whether the selected
boot device is a network device so we can appropriately load the
network boot image.

Signed-off-by: Farhan Ali <alifm@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
2017-02-28 12:04:48 +01:00
Farhan Ali
5f31ade055 s390x/ipl: Extend S390IPLState to support network boot
Add a new field to S390IPLState to store the name of the network boot
loader.

Signed-off-by: Farhan Ali <alifm@linux.vnet.ibm.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
2017-02-28 12:04:48 +01:00
Farhan Ali
34f1b23f8a elf-loader: Allow late loading of elf
The current QEMU ROM infrastructure rejects late loading of ROMs,
and since ELFs are currently loaded as ROMs, this prevents delayed loading
of ELFs. So when loading an ELF, allow the user to specify whether it
should be loaded as a ROM or not.

If an ELF is not loaded as a ROM, it is not restored on a
guest reboot/reset, and so it is up to the user to handle the reloading.

Signed-off-by: Farhan Ali <alifm@linux.vnet.ibm.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
2017-02-28 12:04:48 +01:00
Greg Kurz
c23d5f1d5b 9pfs: local: drop unused code
Now that all the callbacks have been converted to use "at" syscalls, we
can drop this code.

Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-02-28 11:21:15 +01:00
Greg Kurz
a565fea565 9pfs: local: open2: don't follow symlinks
The local_open2() callback is vulnerable to symlink attacks because it
calls:

(1) open() which follows symbolic links for all path elements but the
    rightmost one
(2) local_set_xattr()->setxattr() which follows symbolic links for all
    path elements
(3) local_set_mapped_file_attr() which calls in turn local_fopen() and
    mkdir(), both functions following symbolic links for all path
    elements but the rightmost one
(4) local_post_create_passthrough() which calls in turn lchown() and
    chmod(), both functions also following symbolic links

This patch converts local_open2() to rely on opendir_nofollow() and
mkdirat() to fix (1), as well as local_set_xattrat(),
local_set_mapped_file_attrat() and local_set_cred_passthrough() to
fix (2), (3) and (4) respectively. Since local_open2() already opens
a descriptor to the target file, local_set_cred_passthrough() is
modified to reuse it instead of opening a new one.

The mapped and mapped-file security modes are supposed to be identical,
except for the place where credentials and file modes are stored. While
here, we also make that explicit by sharing the call to openat().

This partly fixes CVE-2016-9602.

Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-02-28 11:21:15 +01:00
Greg Kurz
3f3a16990b 9pfs: local: mkdir: don't follow symlinks
The local_mkdir() callback is vulnerable to symlink attacks because it
calls:

(1) mkdir() which follows symbolic links for all path elements but the
    rightmost one
(2) local_set_xattr()->setxattr() which follows symbolic links for all
    path elements
(3) local_set_mapped_file_attr() which calls in turn local_fopen() and
    mkdir(), both functions following symbolic links for all path
    elements but the rightmost one
(4) local_post_create_passthrough() which calls in turn lchown() and
    chmod(), both functions also following symbolic links

This patch converts local_mkdir() to rely on opendir_nofollow() and
mkdirat() to fix (1), as well as local_set_xattrat(),
local_set_mapped_file_attrat() and local_set_cred_passthrough() to
fix (2), (3) and (4) respectively.

The mapped and mapped-file security modes are supposed to be identical,
except for the place where credentials and file modes are stored. While
here, we also make that explicit by sharing the call to mkdirat().

This partly fixes CVE-2016-9602.

Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-02-28 11:21:15 +01:00
Greg Kurz
d815e72190 9pfs: local: mknod: don't follow symlinks
The local_mknod() callback is vulnerable to symlink attacks because it
calls:

(1) mknod() which follows symbolic links for all path elements but the
    rightmost one
(2) local_set_xattr()->setxattr() which follows symbolic links for all
    path elements
(3) local_set_mapped_file_attr() which calls in turn local_fopen() and
    mkdir(), both functions following symbolic links for all path
    elements but the rightmost one
(4) local_post_create_passthrough() which calls in turn lchown() and
    chmod(), both functions also following symbolic links

This patch converts local_mknod() to rely on opendir_nofollow() and
mknodat() to fix (1), as well as local_set_xattrat() and
local_set_mapped_file_attrat() to fix (2) and (3) respectively.

A new local_set_cred_passthrough() helper based on fchownat() and
fchmodat_nofollow() is introduced as a replacement to
local_post_create_passthrough() to fix (4).

The mapped and mapped-file security modes are supposed to be identical,
except for the place where credentials and file modes are stored. While
here, we also make that explicit by sharing the call to mknodat().

This partly fixes CVE-2016-9602.

Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-02-28 11:21:15 +01:00
Greg Kurz
38771613ea 9pfs: local: symlink: don't follow symlinks
The local_symlink() callback is vulnerable to symlink attacks because it
calls:

(1) symlink() which follows symbolic links for all path elements but the
    rightmost one
(2) open(O_NOFOLLOW) which follows symbolic links for all path elements but
    the rightmost one
(3) local_set_xattr()->setxattr() which follows symbolic links for all
    path elements
(4) local_set_mapped_file_attr() which calls in turn local_fopen() and
    mkdir(), both functions following symbolic links for all path
    elements but the rightmost one

This patch converts local_symlink() to rely on opendir_nofollow() and
symlinkat() to fix (1), openat(O_NOFOLLOW) to fix (2), as well as
local_set_xattrat() and local_set_mapped_file_attrat() to fix (3) and
(4) respectively.

This partly fixes CVE-2016-9602.

Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-02-28 11:21:15 +01:00
Greg Kurz
d369f20763 9pfs: local: chown: don't follow symlinks
The local_chown() callback is vulnerable to symlink attacks because it
calls:

(1) lchown() which follows symbolic links for all path elements but the
    rightmost one
(2) local_set_xattr()->setxattr() which follows symbolic links for all
    path elements
(3) local_set_mapped_file_attr() which calls in turn local_fopen() and
    mkdir(), both functions following symbolic links for all path
    elements but the rightmost one

This patch converts local_chown() to rely on open_nofollow() and
fchownat() to fix (1), as well as local_set_xattrat() and
local_set_mapped_file_attrat() to fix (2) and (3) respectively.

This partly fixes CVE-2016-9602.

Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-02-28 11:21:15 +01:00
Greg Kurz
e3187a45dd 9pfs: local: chmod: don't follow symlinks
The local_chmod() callback is vulnerable to symlink attacks because it
calls:

(1) chmod() which follows symbolic links for all path elements
(2) local_set_xattr()->setxattr() which follows symbolic links for all
    path elements
(3) local_set_mapped_file_attr() which calls in turn local_fopen() and
    mkdir(), both functions following symbolic links for all path
    elements but the rightmost one

We would need fchmodat() to implement AT_SYMLINK_NOFOLLOW to fix (1).
Unfortunately this isn't the case on Linux: the kernel doesn't even have a
flags argument to the syscall :-\ It is impossible to fix it in userspace
in a race-free manner. This patch hence converts local_chmod() to rely on
open_nofollow() and fchmod(). This fixes the vulnerability but introduces
a limitation: the target file must be readable and/or writable for the
call to openat() to succeed.
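
For reference, a reduced sketch of that approach (QEMU's real helper walks
every path component with O_NOFOLLOW, not just the last one, and handles
errors more carefully):

    #include <fcntl.h>
    #include <sys/stat.h>
    #include <unistd.h>

    /* Change the mode of 'name' relative to a trusted directory fd without
     * following a symlink for the final component.  The file has to be
     * openable for reading or writing, which is the limitation mentioned
     * above. */
    static int chmod_nofollow(int dirfd, const char *name, mode_t mode)
    {
        int ret;
        int fd = openat(dirfd, name, O_RDONLY | O_NOFOLLOW);

        if (fd < 0) {
            fd = openat(dirfd, name, O_WRONLY | O_NOFOLLOW);
        }
        if (fd < 0) {
            return -1;
        }
        ret = fchmod(fd, mode);
        close(fd);
        return ret;
    }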

It introduces a local_set_xattrat() replacement to local_set_xattr()
based on fsetxattrat() to fix (2), and a local_set_mapped_file_attrat()
replacement to local_set_mapped_file_attr() based on local_fopenat()
and mkdirat() to fix (3). No effort is made to factor out code because
both local_set_xattr() and local_set_mapped_file_attr() will be dropped
when all users have been converted to use the "at" versions.

This partly fixes CVE-2016-9602.

Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-02-28 11:21:15 +01:00
Greg Kurz
ad0b46e6ac 9pfs: local: link: don't follow symlinks
The local_link() callback is vulnerable to symlink attacks because it calls:

(1) link() which follows symbolic links for all path elements but the
    rightmost one
(2) local_create_mapped_attr_dir()->mkdir() which follows symbolic links
    for all path elements but the rightmost one

This patch converts local_link() to rely on opendir_nofollow() and linkat()
to fix (1), mkdirat() to fix (2).

This partly fixes CVE-2016-9602.

Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-02-28 11:21:15 +01:00
Greg Kurz
6dd4b1f1d0 9pfs: local: improve error handling in link op
When using the mapped-file security model, we also have to create a link
for the metadata file if it exists. In case of failure, we should rollback.

That's what this patch does.

Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-02-28 11:21:15 +01:00
Greg Kurz
d2767edec5 9pfs: local: rename: use renameat
The local_rename() callback is vulnerable to symlink attacks because it
uses rename() which follows symbolic links in all path elements but the
rightmost one.

This patch simply transforms local_rename() into a wrapper around
local_renameat() which is symlink-attack safe.

This partly fixes CVE-2016-9602.

Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-02-28 11:21:15 +01:00
Greg Kurz
99f2cf4b2d 9pfs: local: renameat: don't follow symlinks
The local_renameat() callback is currently a wrapper around local_rename()
which is vulnerable to symlink attacks.

This patch rewrites local_renameat() to have its own implementation, based
on local_opendir_nofollow() and renameat().

This partly fixes CVE-2016-9602.

Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-02-28 11:21:15 +01:00
Greg Kurz
f9aef99b3e 9pfs: local: lstat: don't follow symlinks
The local_lstat() callback is vulnerable to symlink attacks because it
calls:

(1) lstat() which follows symbolic links in all path elements but the
    rightmost one
(2) getxattr() which follows symbolic links in all path elements
(3) local_mapped_file_attr()->local_fopen()->openat(O_NOFOLLOW) which
    follows symbolic links in all path elements but the rightmost
    one

This patch converts local_lstat() to rely on opendir_nofollow() and
fstatat(AT_SYMLINK_NOFOLLOW) to fix (1), fgetxattrat_nofollow() to
fix (2).

A new local_fopenat() helper is introduced as a replacement to
local_fopen() to fix (3). No effort is made to factor out code
because local_fopen() will be dropped when all users have been
converted to call local_fopenat().

This partly fixes CVE-2016-9602.

Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-02-28 11:21:15 +01:00
Greg Kurz
bec1e9546e 9pfs: local: readlink: don't follow symlinks
The local_readlink() callback is vulnerable to symlink attacks because it
calls:

(1) open(O_NOFOLLOW) which follows symbolic links for all path elements but
    the rightmost one
(2) readlink() which follows symbolic links for all path elements but the
    rightmost one

This patch converts local_readlink() to rely on open_nofollow() to fix (1)
and opendir_nofollow(), readlinkat() to fix (2).

This partly fixes CVE-2016-9602.

Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-02-28 11:21:15 +01:00
Greg Kurz
ac125d993b 9pfs: local: truncate: don't follow symlinks
The local_truncate() callback is vulnerable to symlink attacks because
it calls truncate() which follows symbolic links in all path elements.

This patch converts local_truncate() to rely on open_nofollow() and
ftruncate() instead.

This partly fixes CVE-2016-9602.

Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-02-28 11:21:15 +01:00
Greg Kurz
31e51d1c15 9pfs: local: statfs: don't follow symlinks
The local_statfs() callback is vulnerable to symlink attacks because it
calls statfs() which follows symbolic links in all path elements.

This patch converts local_statfs() to rely on open_nofollow() and fstatfs()
instead.

This partly fixes CVE-2016-9602.

Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-02-28 11:21:15 +01:00
Greg Kurz
a33eda0dd9 9pfs: local: utimensat: don't follow symlinks
The local_utimensat() callback is vulnerable to symlink attacks because it
calls qemu_utimens()->utimensat(AT_SYMLINK_NOFOLLOW) which follows symbolic
links in all path elements but the rightmost one or qemu_utimens()->utimes()
which follows symbolic links for all path elements.

This patch converts local_utimensat() to rely on opendir_nofollow() and
utimensat(AT_SYMLINK_NOFOLLOW) directly instead of using qemu_utimens().
It is hence assumed that the OS supports utimensat(), i.e. has glibc 2.6
or higher and linux 2.6.22 or higher, which seems reasonable nowadays.

This partly fixes CVE-2016-9602.

Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-02-28 11:21:15 +01:00
Greg Kurz
a0e640a872 9pfs: local: remove: don't follow symlinks
The local_remove() callback is vulnerable to symlink attacks because it
calls:

(1) lstat() which follows symbolic links in all path elements but the
    rightmost one
(2) remove() which follows symbolic links in all path elements but the
    rightmost one

This patch converts local_remove() to rely on opendir_nofollow(),
fstatat(AT_SYMLINK_NOFOLLOW) to fix (1) and unlinkat() to fix (2).

This partly fixes CVE-2016-9602.

Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-02-28 11:21:15 +01:00
Greg Kurz
df4938a665 9pfs: local: unlinkat: don't follow symlinks
The local_unlinkat() callback is vulnerable to symlink attacks because it
calls remove() which follows symbolic links in all path elements but the
rightmost one.

This patch converts local_unlinkat() to rely on opendir_nofollow() and
unlinkat() instead.

Most of the code is moved to a separate local_unlinkat_common() helper
which will be reused in a subsequent patch to fix the same issue in
local_remove().

This partly fixes CVE-2016-9602.

Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-02-28 11:21:15 +01:00
Greg Kurz
72f0d0bf51 9pfs: local: lremovexattr: don't follow symlinks
The local_lremovexattr() callback is vulnerable to symlink attacks because
it calls lremovexattr() which follows symbolic links in all path elements
but the rightmost one.

This patch introduces a helper to emulate the non-existing fremovexattrat()
function: it is implemented with /proc/self/fd which provides a trusted
path that can be safely passed to lremovexattr().

local_lremovexattr() is converted to use this helper and opendir_nofollow().
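
A sketch of the emulation (single-component file names assumed, as in the 9p
local backend; error handling trimmed):

    #include <limits.h>
    #include <stdio.h>
    #include <sys/xattr.h>

    /* Emulate the non-existing fremovexattrat(): build a path through
     * /proc/self/fd using an already-opened, trusted directory fd, so the
     * directory part of the path cannot be redirected by a guest-planted
     * symlink; lremovexattr() then refuses to follow a symlink in the
     * final component. */
    static int fremovexattrat_nofollow(int dirfd, const char *name,
                                       const char *xattr_name)
    {
        char proc_path[PATH_MAX];

        snprintf(proc_path, sizeof(proc_path), "/proc/self/fd/%d/%s",
                 dirfd, name);
        return lremovexattr(proc_path, xattr_name);
    }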

This partly fixes CVE-2016-9602.

Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-02-28 11:21:15 +01:00
Greg Kurz
3e36aba757 9pfs: local: lsetxattr: don't follow symlinks
The local_lsetxattr() callback is vulnerable to symlink attacks because
it calls lsetxattr() which follows symbolic links in all path elements but
the rightmost one.

This patch introduces a helper to emulate the non-existing fsetxattrat()
function: it is implemented with /proc/self/fd which provides a trusted
path that can be safely passed to lsetxattr().

local_lsetxattr() is converted to use this helper and opendir_nofollow().

This partly fixes CVE-2016-9602.

Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-02-28 11:21:15 +01:00
Greg Kurz
5507904e36 9pfs: local: llistxattr: don't follow symlinks
The local_llistxattr() callback is vulnerable to symlink attacks because
it calls llistxattr() which follows symbolic links in all path elements but
the rightmost one.

This patch introduces a helper to emulate the non-existing flistxattrat()
function: it is implemented with /proc/self/fd which provides a trusted
path that can be safely passed to llistxattr().

local_llistxattr() is converted to use this helper and opendir_nofollow().

This partly fixes CVE-2016-9602.

Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-02-28 11:21:15 +01:00
Greg Kurz
56ad3e54da 9pfs: local: lgetxattr: don't follow symlinks
The local_lgetxattr() callback is vulnerable to symlink attacks because
it calls lgetxattr() which follows symbolic links in all path elements but
the rightmost one.

This patch introduces a helper to emulate the non-existing fgetxattrat()
function: it is implemented with /proc/self/fd which provides a trusted
path that can be safely passed to lgetxattr().

local_lgetxattr() is converted to use this helper and opendir_nofollow().

This partly fixes CVE-2016-9602.

Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-02-28 11:21:15 +01:00
Greg Kurz
996a0d76d7 9pfs: local: open/opendir: don't follow symlinks
The local_open() and local_opendir() callbacks are vulnerable to symlink
attacks because they call:

(1) open(O_NOFOLLOW) which follows symbolic links in all path elements but
    the rightmost one
(2) opendir() which follows symbolic links in all path elements

This patch converts both callbacks to use new helpers based on
openat_nofollow() to only open files and directories if they are
below the virtfs shared folder.

This partly fixes CVE-2016-9602.

Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-02-28 11:21:15 +01:00
Greg Kurz
0e35a37829 9pfs: local: keep a file descriptor on the shared folder
This patch opens the shared folder and caches the file descriptor, so that
it can be used to do symlink-safe path walk.

Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-02-28 11:21:15 +01:00
Greg Kurz
6482a96163 9pfs: introduce relative_openat_nofollow() helper
When using the passthrough security mode, symbolic links created by the
guest are actual symbolic links on the host file system.

Since the resolution of symbolic links during path walk is supposed to
occur on the client side, the server should never receive any path
pointing to an actual symbolic link. This isn't guaranteed by the protocol
though, and malicious code in the guest can trick the server to issue
various syscalls on paths whose one or more elements are symbolic links.
In the case of the "local" backend using the "passthrough" or "none"
security modes, the guest can directly create symbolic links to arbitrary
locations on the host (as per spec). The "mapped-xattr" and "mapped-file"
security modes are also affected to a lesser extent as they require some
help from an external entity to create actual symbolic links on the host,
i.e. another guest using "passthrough" mode for example.

The current code hence relies on O_NOFOLLOW and "l*()" variants of system
calls. Unfortunately, this only applies to the rightmost path component.
A guest could maliciously replace any component in a trusted path with a
symbolic link. This could allow any guest to escape a virtfs shared folder.

This patch introduces a variant of the openat() syscall that successively
opens each path element with O_NOFOLLOW. When passing a file descriptor
pointing to a trusted directory, one is guaranteed to be returned a
file descriptor pointing to a path which is beneath the trusted directory.
This will be used by subsequent patches to implement symlink-safe path walk
for any access to the backend.

Symbolic links aren't the only threats actually: a malicious guest could
change a path element to point to other types of file with undesirable
effects:
- a named pipe or any other thing that would cause openat() to block
- a terminal device which would become QEMU's controlling terminal

These issues can be addressed with O_NONBLOCK and O_NOCTTY.

Two helpers are introduced: one to open intermediate path elements and one
to open the rightmost path element.
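
A condensed sketch of the walk (the real code distinguishes the flags used
for intermediate and final components more carefully and has fuller error
handling):

    #include <fcntl.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    /* Open 'path' relative to the trusted directory fd 'dirfd', opening one
     * component at a time with O_NOFOLLOW (plus O_NONBLOCK and O_NOCTTY to
     * avoid blocking on FIFOs or acquiring a controlling terminal).  The
     * returned fd is guaranteed to refer to something beneath 'dirfd'. */
    static int relative_openat_nofollow(int dirfd, const char *path, int flags)
    {
        int fd = dup(dirfd);
        char *dup_path = strdup(path);
        char *saveptr = NULL;
        char *comp;

        if (fd < 0 || dup_path == NULL) {
            free(dup_path);
            if (fd >= 0) {
                close(fd);
            }
            return -1;
        }

        comp = strtok_r(dup_path, "/", &saveptr);
        while (fd >= 0 && comp != NULL) {
            char *next = strtok_r(NULL, "/", &saveptr);
            /* intermediate components must be directories; the final one
             * gets the caller's flags */
            int comp_flags = next ? O_RDONLY | O_DIRECTORY : flags;
            int next_fd = openat(fd, comp,
                                 comp_flags | O_NOFOLLOW | O_NONBLOCK | O_NOCTTY);

            close(fd);
            fd = next_fd;
            comp = next;
        }

        free(dup_path);
        return fd; /* -1 if any component failed to open, e.g. was a symlink */
    }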

Suggested-by: Jann Horn <jannh@google.com>
Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
(renamed openat_nofollow() to relative_openat_nofollow(),
 assert path is relative and doesn't contain '//',
 fixed side-effect in assert, Greg Kurz)
Signed-off-by: Greg Kurz <groug@kaod.org>
2017-02-28 11:21:15 +01:00
Greg Kurz
21328e1e57 9pfs: remove side-effects in local_open() and local_opendir()
If these functions fail, they should not change *fs. Let's use local
variables to fix this.
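
The pattern, sketched with an illustrative stand-in for the fs handle (the
real structure used by the 9p local backend is different):

    #include <fcntl.h>

    /* Illustrative stand-in for the open-state handle. */
    struct fid_open_state {
        int fd;
    };

    /* Work on a local variable and publish into *fs only once everything
     * has succeeded, so a failure leaves *fs untouched. */
    static int open_no_side_effects(int dirfd, const char *name, int flags,
                                    struct fid_open_state *fs)
    {
        int fd = openat(dirfd, name, flags | O_NOFOLLOW);

        if (fd < 0) {
            return -1; /* *fs not modified on failure */
        }
        fs->fd = fd;
        return 0;
    }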

Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-02-28 11:21:14 +01:00
Greg Kurz
00c90bd1c2 9pfs: remove side-effects in local_init()
If this function fails, it should not modify *ctx.

Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-02-28 11:21:14 +01:00
Greg Kurz
56fc494bdc 9pfs: local: move xattr security ops to 9p-xattr.c
These functions are always called indirectly. It really doesn't make sense
for them to sit in a header file.

Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-02-28 11:21:14 +01:00
Gerd Hoffmann
8779fccbef seabios: update to 1.10.2 release
git shortlog rel-1.10.1..rel-1.10.2
===================================

Ben Warren (5):
      QEMU DMA: Add DMA write capability
      romfile-loader: Switch to using named structs
      QEMU fw_cfg: Add command to write back address of file
      QEMU fw_cfg: Add functions for accessing files by key
      QEMU fw_cfg: Write fw_cfg back on S3 resume

Kevin O'Connor (1):
      ps2port: Disable keyboard/mouse prior to resetting ps2 controller

Ladi Prosek (1):
      ahci: Set upper 32-bit registers to zero

Paul Menzel (1):
      vgasrc: Increase debug level

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2017-02-28 09:54:23 +01:00
Eduardo Habkost
b8097deb35 i386: Improve query-cpu-model-expansion full mode
This keeps the same results for type=static expansion, but makes
type=full expansion return every single QOM property on the CPU
object that has a different value from the "base" CPU model,
plus all the CPU feature flag properties.

Cc: Jiri Denemark <jdenemar@redhat.com>
Message-Id: <20170222190029.17243-4-ehabkost@redhat.com>
Tested-by: Jiri Denemark <jdenemar@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2017-02-27 13:23:35 -03:00
Eduardo Habkost
f99fd7ca2a i386: Implement query-cpu-model-expansion QMP command
Implement query-cpu-model-expansion for target-i386.

This should meet all the requirements while being simple. In the
case of static expansion, it will use the new "base" CPU model,
and in the case of full expansion, it will keep the original CPU
model name+props, and append extra properties.

A future follow-up should improve the implementation of
type=full, so that it returns more detailed data, including every
writable QOM property in the CPU object.

Cc: libvir-list@redhat.com
Cc: Jiri Denemark <jdenemar@redhat.com>
Message-Id: <20170222190029.17243-3-ehabkost@redhat.com>
Tested-by: Jiri Denemark <jdenemar@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2017-02-27 13:23:31 -03:00
Eduardo Habkost
5adbed3088 i386: Define static "base" CPU model
The query-cpu-model-expand QMP command needs at least one static
model, to allow the "static" expansion mode to be implemented.
Instead of defining static versions of every CPU model, define a
"base" CPU model that has absolutely no feature flag enabled.

Despite having no CPUID data set at all, "-cpu base" is even a
functional CPU:

* It can boot a Slackware Linux 1.01 image with a Linux 0.99.12
  kernel[1].
* It is even possible to boot[2] a modern Fedora x86_64 guest by
  manually enabling the following CPU features:
  -cpu base,+lm,+msr,+pae,+fpu,+cx8,+cmov,+sse,+sse2,+fxsr

[1] http://www.qemu-advent-calendar.org/2014/#day-1
[2] This is what can be seen in the guest:
    [root@localhost ~]# cat /proc/cpuinfo
    processor       : 0
    vendor_id       : unknown
    cpu family      : 0
    model           : 0
    model name      : 00/00
    stepping        : 0
    physical id     : 0
    siblings        : 1
    core id         : 0
    cpu cores       : 1
    apicid          : 0
    initial apicid  : 0
    fpu             : yes
    fpu_exception   : yes
    cpuid level     : 1
    wp              : yes
    flags           : fpu msr pae cx8 cmov fxsr sse sse2 lm nopl
    bugs            :
    bogomips        : 5832.70
    clflush size    : 64
    cache_alignment : 64
    address sizes   : 36 bits physical, 48 bits virtual
    power management:

    [root@localhost ~]# x86info -v -a
    x86info v1.30.  Dave Jones 2001-2011
    Feedback to <davej@redhat.com>.

    No TSC, MHz calculation cannot be performed.
    Unknown vendor (0)
    MP Table:

    Family: 0 Model: 0 Stepping: 0
    CPU Model (x86info's best guess):

    eax in: 0x00000000, eax = 00000001 ebx = 00000000 ecx = 00000000 edx = 00000000
    eax in: 0x00000001, eax = 00000000 ebx = 00000800 ecx = 00000000 edx = 07008161

    eax in: 0x80000000, eax = 80000001 ebx = 00000000 ecx = 00000000 edx = 00000000
    eax in: 0x80000001, eax = 00000000 ebx = 00000000 ecx = 00000000 edx = 20000000

    Feature flags:
     fpu            Onboard FPU
     msr            Model-Specific Registers
     pae            Physical Address Extensions
     cx8            CMPXCHG8 instruction
     cmov           CMOV instruction
     fxsr           FXSAVE and FXRSTOR instructions
     sse            SSE support
     sse2           SSE2 support

    Long NOPs supported: yes

    Address sizes : 0 bits physical, 0 bits virtual
    0MHz processor (estimate).

     running at an estimated 0MHz
    [root@localhost ~]#

Message-Id: <20170222190029.17243-2-ehabkost@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Tested-by: Jiri Denemark <jdenemar@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2017-02-27 13:23:27 -03:00
Eduardo Habkost
0bacd8b304 i386: Don't set CPUClass::cpu_def on "max" model
Host CPUID info is used by the "max" CPU model only in KVM mode.
Move the initialization of CPUID data for "max" from class_init
to instance_init, and don't set CPUClass::cpu_def for "max".

Message-Id: <20170222183919.11928-4-ehabkost@redhat.com>
Tested-by: Richard W.M. Jones <rjones@redhat.com>
Tested-by: Jiri Denemark <jdenemar@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2017-02-27 13:23:25 -03:00
Eduardo Habkost
6900d1cc8a i386: Make "max" model not use any host CPUID info on TCG
Instead of reporting host CPUID data on "max", use the qemu64 CPU
model as reference to initialize CPUID
vendor/family/model/stepping/model-id.

Message-Id: <20170222183919.11928-3-ehabkost@redhat.com>
Tested-by: Richard W.M. Jones <rjones@redhat.com>
Tested-by: Jiri Denemark <jdenemar@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2017-02-27 13:23:23 -03:00
Eduardo Habkost
c62f2630f8 i386: Create "max" CPU model
Rename the existing "host" CPU model to "max", and set it to
kvm_enabled=false. The new "max" CPU model will be able to enable
all features supported by TCG out of the box, because its logic
is based on x86_cpu_get_supported_feature_word(), which already
works with TCG.

A new KVM-specific "host" class was added, that simply inherits
everything from "max" except the 'ordering' and 'description'
fields.

Message-Id: <20170222183919.11928-2-ehabkost@redhat.com>
Tested-by: Richard W.M. Jones <rjones@redhat.com>
Tested-by: Jiri Denemark <jdenemar@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2017-02-27 13:23:21 -03:00
Eduardo Habkost
a357a65b66 qapi-schema: Comment about full expansion of non-migration-safe models
Add a note warning that static expansion may not be 100% accurate
when the CPU model is not migration-safe. This will be the case
on x86 when expanding the "host" CPU model, because there are
"host" features that can't have a migration-safe representation
(e.g. "host-cache-info").

Message-Id: <20170116211124.29245-3-ehabkost@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2017-02-27 13:23:13 -03:00
Eduardo Habkost
b8d834a00f i386: Reorganize and document CPUID initialization steps
CPU runnability checks and CPU model expansion have slightly
different requirements. Document the steps involved in loading a
CPU model and realizing a CPU, so their requirements and purpose
are clearly defined.

This patch doesn't change any implementation. It just adds
comments, renames the x86_cpu_load_features() function for clarity
(so it won't be confused with x86_cpu_load_def()), and moves
x86_cpu_filter_features() closer to it.

Message-Id: <20170116211124.29245-2-ehabkost@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2017-02-27 13:23:09 -03:00
Eduardo Habkost
44bd8e5306 i386: Rename X86CPU::host_features to X86CPU::max_features
Rename the field and add a small comment to make its purpose
clearer.

Message-Id: <20170119210449.11991-4-ehabkost@redhat.com>
Tested-by: Jiri Denemark <jdenemar@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2017-02-27 13:23:06 -03:00
Eduardo Habkost
f48c883703 i386: Add ordering field to CPUClass
Instead of using kvm_enabled to order the "-cpu help" list, use a
new "ordering" field for that.

Message-Id: <20170119210449.11991-3-ehabkost@redhat.com>
Tested-by: Jiri Denemark <jdenemar@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2017-02-27 13:23:03 -03:00
Eduardo Habkost
771a13e90d i386: Unset cannot_destroy_with_object_finalize_yet on "host" model
The class is now safe because the assert(kvm_enabled()) line was
removed by commit e435601058.

Message-Id: <20170119210449.11991-2-ehabkost@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2017-02-27 13:22:58 -03:00
266 changed files with 8700 additions and 3375 deletions

View File

@@ -5,6 +5,8 @@ env:
TARGET_LIST=arm-softmmu,arm-linux-user
- IMAGE=debian-arm64-cross
TARGET_LIST=aarch64-softmmu,aarch64-linux-user
- IMAGE=debian-s390x-cross
TARGET_LIST=s390x-softmmu,s390x-linux-user
build:
pre_ci:
- make docker-image-${IMAGE}

View File

@@ -28,6 +28,7 @@
#include "qemu/timer.h"
#include "sysemu/sysemu.h"
#include "qemu/cutils.h"
#include "sysemu/replay.h"
#define AUDIO_CAP "audio"
#include "audio_int.h"
@@ -1112,7 +1113,7 @@ static int audio_is_timer_needed (void)
static void audio_reset_timer (AudioState *s)
{
if (audio_is_timer_needed ()) {
timer_mod (s->ts,
timer_mod_anticipate_ns(s->ts,
qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + conf.period.ticks);
}
else {
@@ -1387,6 +1388,7 @@ static void audio_run_out (AudioState *s)
prev_rpos = hw->rpos;
played = hw->pcm_ops->run_out (hw, live);
replay_audio_out(&played);
if (audio_bug (AUDIO_FUNC, hw->rpos >= hw->samples)) {
dolog ("hw->rpos=%d hw->samples=%d played=%d\n",
hw->rpos, hw->samples, played);
@@ -1450,9 +1452,12 @@ static void audio_run_in (AudioState *s)
while ((hw = audio_pcm_hw_find_any_enabled_in (hw))) {
SWVoiceIn *sw;
int captured, min;
int captured = 0, min;
captured = hw->pcm_ops->run_in (hw);
if (replay_mode != REPLAY_MODE_PLAY) {
captured = hw->pcm_ops->run_in(hw);
}
replay_audio_in(&captured, hw->conv_buf, &hw->wpos, hw->samples);
min = audio_pcm_hw_find_min_in (hw);
hw->total_samples_captured += captured - min;

View File

@@ -166,4 +166,9 @@ int wav_start_capture (CaptureState *s, const char *path, int freq,
bool audio_is_cleaning_up(void);
void audio_cleanup(void);
void audio_sample_to_uint64(void *samples, int pos,
uint64_t *left, uint64_t *right);
void audio_sample_from_uint64(void *samples, int pos,
uint64_t left, uint64_t right);
#endif /* QEMU_AUDIO_H */

View File

@@ -25,6 +25,7 @@
#include "qemu/osdep.h"
#include "qemu-common.h"
#include "qemu/bswap.h"
#include "qemu/error-report.h"
#include "audio.h"
#define AUDIO_CAP "mixeng"
@@ -267,6 +268,37 @@ f_sample *mixeng_clip[2][2][2][3] = {
}
};
void audio_sample_to_uint64(void *samples, int pos,
uint64_t *left, uint64_t *right)
{
struct st_sample *sample = samples;
sample += pos;
#ifdef FLOAT_MIXENG
error_report(
"Coreaudio and floating point samples are not supported by replay yet");
abort();
#else
*left = sample->l;
*right = sample->r;
#endif
}
void audio_sample_from_uint64(void *samples, int pos,
uint64_t left, uint64_t right)
{
struct st_sample *sample = samples;
sample += pos;
#ifdef FLOAT_MIXENG
error_report(
"Coreaudio and floating point samples are not supported by replay yet");
abort();
#else
sample->l = left;
sample->r = right;
#endif
}
/*
* August 21, 1998
* Copyright 1998 Fabrice Bellard.

View File

@@ -38,10 +38,14 @@
#define AUDIO_CAP "sdl"
#include "audio_int.h"
#define USE_SEMAPHORE (SDL_MAJOR_VERSION < 2)
typedef struct SDLVoiceOut {
HWVoiceOut hw;
int live;
#if USE_SEMAPHORE
int rpos;
#endif
int decr;
} SDLVoiceOut;
@@ -53,8 +57,10 @@ static struct {
static struct SDLAudioState {
int exit;
#if USE_SEMAPHORE
SDL_mutex *mutex;
SDL_sem *sem;
#endif
int initialized;
bool driver_created;
} glob_sdl;
@@ -73,31 +79,45 @@ static void GCC_FMT_ATTR (1, 2) sdl_logerr (const char *fmt, ...)
static int sdl_lock (SDLAudioState *s, const char *forfn)
{
#if USE_SEMAPHORE
if (SDL_LockMutex (s->mutex)) {
sdl_logerr ("SDL_LockMutex for %s failed\n", forfn);
return -1;
}
#else
SDL_LockAudio();
#endif
return 0;
}
static int sdl_unlock (SDLAudioState *s, const char *forfn)
{
#if USE_SEMAPHORE
if (SDL_UnlockMutex (s->mutex)) {
sdl_logerr ("SDL_UnlockMutex for %s failed\n", forfn);
return -1;
}
#else
SDL_UnlockAudio();
#endif
return 0;
}
static int sdl_post (SDLAudioState *s, const char *forfn)
{
#if USE_SEMAPHORE
if (SDL_SemPost (s->sem)) {
sdl_logerr ("SDL_SemPost for %s failed\n", forfn);
return -1;
}
#endif
return 0;
}
#if USE_SEMAPHORE
static int sdl_wait (SDLAudioState *s, const char *forfn)
{
if (SDL_SemWait (s->sem)) {
@@ -106,6 +126,7 @@ static int sdl_wait (SDLAudioState *s, const char *forfn)
}
return 0;
}
#endif
static int sdl_unlock_and_post (SDLAudioState *s, const char *forfn)
{
@@ -246,6 +267,7 @@ static void sdl_callback (void *opaque, Uint8 *buf, int len)
int to_mix, decr;
/* dolog ("in callback samples=%d\n", samples); */
#if USE_SEMAPHORE
sdl_wait (s, "sdl_callback");
if (s->exit) {
return;
@@ -264,6 +286,11 @@ static void sdl_callback (void *opaque, Uint8 *buf, int len)
if (!sdl->live) {
goto again;
}
#else
if (s->exit || !sdl->live) {
break;
}
#endif
/* dolog ("in callback live=%d\n", live); */
to_mix = audio_MIN (samples, sdl->live);
@@ -274,7 +301,11 @@ static void sdl_callback (void *opaque, Uint8 *buf, int len)
/* dolog ("in callback to_mix %d, chunk %d\n", to_mix, chunk); */
hw->clip (buf, src, chunk);
#if USE_SEMAPHORE
sdl->rpos = (sdl->rpos + chunk) % hw->samples;
#else
hw->rpos = (hw->rpos + chunk) % hw->samples;
#endif
to_mix -= chunk;
buf += chunk << hw->info.shift;
}
@@ -282,12 +313,21 @@ static void sdl_callback (void *opaque, Uint8 *buf, int len)
sdl->live -= decr;
sdl->decr += decr;
#if USE_SEMAPHORE
again:
if (sdl_unlock (s, "sdl_callback")) {
return;
}
#endif
}
/* dolog ("done len=%d\n", len); */
#if (SDL_MAJOR_VERSION >= 2)
/* SDL2 does not clear the remaining buffer for us, so do it on our own */
if (samples) {
memset(buf, 0, samples << hw->info.shift);
}
#endif
}
static int sdl_write_out (SWVoiceOut *sw, void *buf, int len)
@@ -315,8 +355,12 @@ static int sdl_run_out (HWVoiceOut *hw, int live)
decr = audio_MIN (sdl->decr, live);
sdl->decr -= decr;
#if USE_SEMAPHORE
sdl->live = live - decr;
hw->rpos = sdl->rpos;
#else
sdl->live = live;
#endif
if (sdl->live > 0) {
sdl_unlock_and_post (s, "sdl_run_out");
@@ -405,6 +449,7 @@ static void *sdl_audio_init (void)
return NULL;
}
#if USE_SEMAPHORE
s->mutex = SDL_CreateMutex ();
if (!s->mutex) {
sdl_logerr ("Failed to create SDL mutex\n");
@@ -419,6 +464,7 @@ static void *sdl_audio_init (void)
SDL_QuitSubSystem (SDL_INIT_AUDIO);
return NULL;
}
#endif
s->driver_created = true;
return s;
@@ -428,8 +474,10 @@ static void sdl_audio_fini (void *opaque)
{
SDLAudioState *s = opaque;
sdl_close (s);
#if USE_SEMAPHORE
SDL_DestroySemaphore (s->sem);
SDL_DestroyMutex (s->mutex);
#endif
SDL_QuitSubSystem (SDL_INIT_AUDIO);
s->driver_created = false;
}

583
block.c
View File

@@ -707,6 +707,12 @@ int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough)
return 0;
}
static char *bdrv_child_get_parent_desc(BdrvChild *c)
{
BlockDriverState *parent = c->opaque;
return g_strdup(bdrv_get_device_or_node_name(parent));
}
static void bdrv_child_cb_drained_begin(BdrvChild *child)
{
BlockDriverState *bs = child->opaque;
@@ -774,6 +780,7 @@ static void bdrv_inherited_options(int *child_flags, QDict *child_options,
}
const BdrvChildRole child_file = {
.get_parent_desc = bdrv_child_get_parent_desc,
.inherit_options = bdrv_inherited_options,
.drained_begin = bdrv_child_cb_drained_begin,
.drained_end = bdrv_child_cb_drained_end,
@@ -794,11 +801,63 @@ static void bdrv_inherited_fmt_options(int *child_flags, QDict *child_options,
}
const BdrvChildRole child_format = {
.get_parent_desc = bdrv_child_get_parent_desc,
.inherit_options = bdrv_inherited_fmt_options,
.drained_begin = bdrv_child_cb_drained_begin,
.drained_end = bdrv_child_cb_drained_end,
};
static void bdrv_backing_attach(BdrvChild *c)
{
BlockDriverState *parent = c->opaque;
BlockDriverState *backing_hd = c->bs;
assert(!parent->backing_blocker);
error_setg(&parent->backing_blocker,
"node is used as backing hd of '%s'",
bdrv_get_device_or_node_name(parent));
parent->open_flags &= ~BDRV_O_NO_BACKING;
pstrcpy(parent->backing_file, sizeof(parent->backing_file),
backing_hd->filename);
pstrcpy(parent->backing_format, sizeof(parent->backing_format),
backing_hd->drv ? backing_hd->drv->format_name : "");
bdrv_op_block_all(backing_hd, parent->backing_blocker);
/* Otherwise we won't be able to commit or stream */
bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET,
parent->backing_blocker);
bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_STREAM,
parent->backing_blocker);
/*
* We do backup in 3 ways:
* 1. drive backup
* The target bs is new opened, and the source is top BDS
* 2. blockdev backup
* Both the source and the target are top BDSes.
* 3. internal backup(used for block replication)
* Both the source and the target are backing file
*
* In case 1 and 2, neither the source nor the target is the backing file.
* In case 3, we will block the top BDS, so there is only one block job
* for the top BDS and its backing chain.
*/
bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_SOURCE,
parent->backing_blocker);
bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_TARGET,
parent->backing_blocker);
}
static void bdrv_backing_detach(BdrvChild *c)
{
BlockDriverState *parent = c->opaque;
assert(parent->backing_blocker);
bdrv_op_unblock_all(c->bs, parent->backing_blocker);
error_free(parent->backing_blocker);
parent->backing_blocker = NULL;
}
/*
* Returns the options and flags that bs->backing should get, based on the
* given options and flags for the parent BDS
@@ -823,7 +882,10 @@ static void bdrv_backing_options(int *child_flags, QDict *child_options,
*child_flags = flags;
}
static const BdrvChildRole child_backing = {
const BdrvChildRole child_backing = {
.get_parent_desc = bdrv_child_get_parent_desc,
.attach = bdrv_backing_attach,
.detach = bdrv_backing_detach,
.inherit_options = bdrv_backing_options,
.drained_begin = bdrv_child_cb_drained_begin,
.drained_end = bdrv_child_cb_drained_end,
@@ -1326,15 +1388,352 @@ static int bdrv_fill_options(QDict **options, const char *filename,
return 0;
}
static void bdrv_replace_child(BdrvChild *child, BlockDriverState *new_bs)
/*
* Check whether permissions on this node can be changed in a way that
* @cumulative_perms and @cumulative_shared_perms are the new cumulative
* permissions of all its parents. This involves checking whether all necessary
* permission changes to child nodes can be performed.
*
* A call to this function must always be followed by a call to bdrv_set_perm()
* or bdrv_abort_perm_update().
*/
static int bdrv_check_perm(BlockDriverState *bs, uint64_t cumulative_perms,
uint64_t cumulative_shared_perms, Error **errp)
{
BlockDriver *drv = bs->drv;
BdrvChild *c;
int ret;
/* Write permissions never work with read-only images */
if ((cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) &&
bdrv_is_read_only(bs))
{
error_setg(errp, "Block node is read-only");
return -EPERM;
}
/* Check this node */
if (!drv) {
return 0;
}
if (drv->bdrv_check_perm) {
return drv->bdrv_check_perm(bs, cumulative_perms,
cumulative_shared_perms, errp);
}
/* Drivers that never have children can omit .bdrv_child_perm() */
if (!drv->bdrv_child_perm) {
assert(QLIST_EMPTY(&bs->children));
return 0;
}
/* Check all children */
QLIST_FOREACH(c, &bs->children, next) {
uint64_t cur_perm, cur_shared;
drv->bdrv_child_perm(bs, c, c->role,
cumulative_perms, cumulative_shared_perms,
&cur_perm, &cur_shared);
ret = bdrv_child_check_perm(c, cur_perm, cur_shared, errp);
if (ret < 0) {
return ret;
}
}
return 0;
}
/*
* Notifies drivers that after a previous bdrv_check_perm() call, the
* permission update is not performed and any preparations made for it (e.g.
* taken file locks) need to be undone.
*
* This function recursively notifies all child nodes.
*/
static void bdrv_abort_perm_update(BlockDriverState *bs)
{
BlockDriver *drv = bs->drv;
BdrvChild *c;
if (!drv) {
return;
}
if (drv->bdrv_abort_perm_update) {
drv->bdrv_abort_perm_update(bs);
}
QLIST_FOREACH(c, &bs->children, next) {
bdrv_child_abort_perm_update(c);
}
}
static void bdrv_set_perm(BlockDriverState *bs, uint64_t cumulative_perms,
uint64_t cumulative_shared_perms)
{
BlockDriver *drv = bs->drv;
BdrvChild *c;
if (!drv) {
return;
}
/* Update this node */
if (drv->bdrv_set_perm) {
drv->bdrv_set_perm(bs, cumulative_perms, cumulative_shared_perms);
}
/* Drivers that never have children can omit .bdrv_child_perm() */
if (!drv->bdrv_child_perm) {
assert(QLIST_EMPTY(&bs->children));
return;
}
/* Update all children */
QLIST_FOREACH(c, &bs->children, next) {
uint64_t cur_perm, cur_shared;
drv->bdrv_child_perm(bs, c, c->role,
cumulative_perms, cumulative_shared_perms,
&cur_perm, &cur_shared);
bdrv_child_set_perm(c, cur_perm, cur_shared);
}
}
static void bdrv_get_cumulative_perm(BlockDriverState *bs, uint64_t *perm,
uint64_t *shared_perm)
{
BdrvChild *c;
uint64_t cumulative_perms = 0;
uint64_t cumulative_shared_perms = BLK_PERM_ALL;
QLIST_FOREACH(c, &bs->parents, next_parent) {
cumulative_perms |= c->perm;
cumulative_shared_perms &= c->shared_perm;
}
*perm = cumulative_perms;
*shared_perm = cumulative_shared_perms;
}
static char *bdrv_child_user_desc(BdrvChild *c)
{
if (c->role->get_parent_desc) {
return c->role->get_parent_desc(c);
}
return g_strdup("another user");
}
static char *bdrv_perm_names(uint64_t perm)
{
struct perm_name {
uint64_t perm;
const char *name;
} permissions[] = {
{ BLK_PERM_CONSISTENT_READ, "consistent read" },
{ BLK_PERM_WRITE, "write" },
{ BLK_PERM_WRITE_UNCHANGED, "write unchanged" },
{ BLK_PERM_RESIZE, "resize" },
{ BLK_PERM_GRAPH_MOD, "change children" },
{ 0, NULL }
};
char *result = g_strdup("");
struct perm_name *p;
for (p = permissions; p->name; p++) {
if (perm & p->perm) {
char *old = result;
result = g_strdup_printf("%s%s%s", old, *old ? ", " : "", p->name);
g_free(old);
}
}
return result;
}
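/*
 * Usage sketch (hypothetical caller, not part of the patch): the helper above
 * turns a permission mask into a human-readable list for error messages.
 */
static void example_report_missing_perms(BlockDriverState *bs, Error **errp)
{
    char *names = bdrv_perm_names(BLK_PERM_CONSISTENT_READ | BLK_PERM_RESIZE);
    /* names is now "consistent read, resize" */
    error_setg(errp, "Operation requires %s on node '%s'", names,
               bdrv_get_node_name(bs));
    g_free(names);
}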
/*
* Checks whether a new reference to @bs can be added if the new user requires
* @new_used_perm/@new_shared_perm as its permissions. If @ignore_child is set,
* that child's existing reference is ignored in the calculations; this allows checking
* permission updates for an existing reference.
*
* Needs to be followed by a call to either bdrv_set_perm() or
* bdrv_abort_perm_update(). */
static int bdrv_check_update_perm(BlockDriverState *bs, uint64_t new_used_perm,
uint64_t new_shared_perm,
BdrvChild *ignore_child, Error **errp)
{
BdrvChild *c;
uint64_t cumulative_perms = new_used_perm;
uint64_t cumulative_shared_perms = new_shared_perm;
/* There is no reason why anyone couldn't tolerate write_unchanged */
assert(new_shared_perm & BLK_PERM_WRITE_UNCHANGED);
QLIST_FOREACH(c, &bs->parents, next_parent) {
if (c == ignore_child) {
continue;
}
if ((new_used_perm & c->shared_perm) != new_used_perm) {
char *user = bdrv_child_user_desc(c);
char *perm_names = bdrv_perm_names(new_used_perm & ~c->shared_perm);
error_setg(errp, "Conflicts with use by %s as '%s', which does not "
"allow '%s' on %s",
user, c->name, perm_names, bdrv_get_node_name(c->bs));
g_free(user);
g_free(perm_names);
return -EPERM;
}
if ((c->perm & new_shared_perm) != c->perm) {
char *user = bdrv_child_user_desc(c);
char *perm_names = bdrv_perm_names(c->perm & ~new_shared_perm);
error_setg(errp, "Conflicts with use by %s as '%s', which uses "
"'%s' on %s",
user, c->name, perm_names, bdrv_get_node_name(c->bs));
g_free(user);
g_free(perm_names);
return -EPERM;
}
cumulative_perms |= c->perm;
cumulative_shared_perms &= c->shared_perm;
}
return bdrv_check_perm(bs, cumulative_perms, cumulative_shared_perms, errp);
}
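/*
 * Worked example (made-up values) for the two conflict checks in the loop
 * above. Suppose an existing parent c of @bs has:
 *
 *   c->perm        = BLK_PERM_WRITE
 *   c->shared_perm = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED
 *
 * A new user requesting new_used_perm = BLK_PERM_WRITE is rejected by the
 * first check, because c does not share 'write':
 *   (new_used_perm & c->shared_perm) != new_used_perm
 *
 * A new user requesting nothing but sharing only BLK_PERM_WRITE_UNCHANGED is
 * rejected by the second check, because c already uses 'write' and the new
 * user would not allow it:
 *   (c->perm & new_shared_perm) != c->perm
 */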
/* Needs to be followed by a call to either bdrv_child_set_perm() or
* bdrv_child_abort_perm_update(). */
int bdrv_child_check_perm(BdrvChild *c, uint64_t perm, uint64_t shared,
Error **errp)
{
return bdrv_check_update_perm(c->bs, perm, shared, c, errp);
}
void bdrv_child_set_perm(BdrvChild *c, uint64_t perm, uint64_t shared)
{
uint64_t cumulative_perms, cumulative_shared_perms;
c->perm = perm;
c->shared_perm = shared;
bdrv_get_cumulative_perm(c->bs, &cumulative_perms,
&cumulative_shared_perms);
bdrv_set_perm(c->bs, cumulative_perms, cumulative_shared_perms);
}
void bdrv_child_abort_perm_update(BdrvChild *c)
{
bdrv_abort_perm_update(c->bs);
}
int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, uint64_t shared,
Error **errp)
{
int ret;
ret = bdrv_child_check_perm(c, perm, shared, errp);
if (ret < 0) {
bdrv_child_abort_perm_update(c);
return ret;
}
bdrv_child_set_perm(c, perm, shared);
return 0;
}
#define DEFAULT_PERM_PASSTHROUGH (BLK_PERM_CONSISTENT_READ \
| BLK_PERM_WRITE \
| BLK_PERM_WRITE_UNCHANGED \
| BLK_PERM_RESIZE)
#define DEFAULT_PERM_UNCHANGED (BLK_PERM_ALL & ~DEFAULT_PERM_PASSTHROUGH)
void bdrv_filter_default_perms(BlockDriverState *bs, BdrvChild *c,
const BdrvChildRole *role,
uint64_t perm, uint64_t shared,
uint64_t *nperm, uint64_t *nshared)
{
if (c == NULL) {
*nperm = perm & DEFAULT_PERM_PASSTHROUGH;
*nshared = (shared & DEFAULT_PERM_PASSTHROUGH) | DEFAULT_PERM_UNCHANGED;
return;
}
*nperm = (perm & DEFAULT_PERM_PASSTHROUGH) |
(c->perm & DEFAULT_PERM_UNCHANGED);
*nshared = (shared & DEFAULT_PERM_PASSTHROUGH) |
(c->shared_perm & DEFAULT_PERM_UNCHANGED);
}
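/*
 * Worked example (hypothetical request) for the filter defaults above: only
 * the guest-visible (passthrough) bits are forwarded to the child, while the
 * node-internal bits stay shareable. For a first attach (c == NULL) with
 *
 *   perm   = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE
 *   shared = BLK_PERM_WRITE_UNCHANGED
 *
 * the filter requests on its own child:
 *
 *   *nperm   = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE
 *   *nshared = BLK_PERM_WRITE_UNCHANGED | DEFAULT_PERM_UNCHANGED
 */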
void bdrv_format_default_perms(BlockDriverState *bs, BdrvChild *c,
const BdrvChildRole *role,
uint64_t perm, uint64_t shared,
uint64_t *nperm, uint64_t *nshared)
{
bool backing = (role == &child_backing);
assert(role == &child_backing || role == &child_file);
if (!backing) {
/* Apart from the modifications below, the same permissions are
* forwarded and left alone as for filters */
bdrv_filter_default_perms(bs, c, role, perm, shared, &perm, &shared);
/* Format drivers may touch metadata even if the guest doesn't write */
if (!bdrv_is_read_only(bs)) {
perm |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
}
/* bs->file always needs to be consistent because of the metadata. We
* can never allow other users to resize or write to it. */
perm |= BLK_PERM_CONSISTENT_READ;
shared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
} else {
/* We want consistent read from backing files if the parent needs it.
* No other operations are performed on backing files. */
perm &= BLK_PERM_CONSISTENT_READ;
/* If the parent can deal with changing data, we're okay with a
* writable and resizable backing file. */
/* TODO Require !(perm & BLK_PERM_CONSISTENT_READ), too? */
if (shared & BLK_PERM_WRITE) {
shared = BLK_PERM_WRITE | BLK_PERM_RESIZE;
} else {
shared = 0;
}
shared |= BLK_PERM_CONSISTENT_READ | BLK_PERM_GRAPH_MOD |
BLK_PERM_WRITE_UNCHANGED;
}
*nperm = perm;
*nshared = shared;
}
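/*
 * Worked example (hypothetical request) for the format defaults above: a
 * guest device on a writable format node asks for
 *
 *   perm   = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE
 *   shared = BLK_PERM_WRITE_UNCHANGED
 *
 * For bs->file (role == &child_file, first attach):
 *   *nperm   = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE | BLK_PERM_RESIZE
 *              (the driver may touch metadata even without guest writes)
 *   *nshared = BLK_PERM_WRITE_UNCHANGED | DEFAULT_PERM_UNCHANGED
 *              (nobody else may write to or resize the image file)
 *
 * For bs->backing (role == &child_backing):
 *   *nperm   = BLK_PERM_CONSISTENT_READ
 *   *nshared = BLK_PERM_CONSISTENT_READ | BLK_PERM_GRAPH_MOD |
 *              BLK_PERM_WRITE_UNCHANGED
 *              (WRITE and RESIZE would be shared too if the parent had
 *               shared WRITE)
 */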
static void bdrv_replace_child(BdrvChild *child, BlockDriverState *new_bs,
bool check_new_perm)
{
BlockDriverState *old_bs = child->bs;
uint64_t perm, shared_perm;
if (old_bs) {
if (old_bs->quiesce_counter && child->role->drained_end) {
child->role->drained_end(child);
}
if (child->role->detach) {
child->role->detach(child);
}
QLIST_REMOVE(child, next_parent);
/* Update permissions for old node. This is guaranteed to succeed
* because we're just taking a parent away, so we're loosening
* restrictions. */
bdrv_get_cumulative_perm(old_bs, &perm, &shared_perm);
bdrv_check_perm(old_bs, perm, shared_perm, &error_abort);
bdrv_set_perm(old_bs, perm, shared_perm);
}
child->bs = new_bs;
@@ -1344,23 +1743,46 @@ static void bdrv_replace_child(BdrvChild *child, BlockDriverState *new_bs)
if (new_bs->quiesce_counter && child->role->drained_begin) {
child->role->drained_begin(child);
}
bdrv_get_cumulative_perm(new_bs, &perm, &shared_perm);
if (check_new_perm) {
bdrv_check_perm(new_bs, perm, shared_perm, &error_abort);
}
bdrv_set_perm(new_bs, perm, shared_perm);
if (child->role->attach) {
child->role->attach(child);
}
}
}
BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
const char *child_name,
const BdrvChildRole *child_role,
void *opaque)
uint64_t perm, uint64_t shared_perm,
void *opaque, Error **errp)
{
BdrvChild *child = g_new(BdrvChild, 1);
BdrvChild *child;
int ret;
ret = bdrv_check_update_perm(child_bs, perm, shared_perm, NULL, errp);
if (ret < 0) {
bdrv_abort_perm_update(child_bs);
return NULL;
}
child = g_new(BdrvChild, 1);
*child = (BdrvChild) {
.bs = NULL,
.name = g_strdup(child_name),
.role = child_role,
.opaque = opaque,
.bs = NULL,
.name = g_strdup(child_name),
.role = child_role,
.perm = perm,
.shared_perm = shared_perm,
.opaque = opaque,
};
bdrv_replace_child(child, child_bs);
/* This performs the matching bdrv_set_perm() for the above check. */
bdrv_replace_child(child, child_bs, false);
return child;
}
@@ -1368,10 +1790,24 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
BlockDriverState *child_bs,
const char *child_name,
const BdrvChildRole *child_role)
const BdrvChildRole *child_role,
Error **errp)
{
BdrvChild *child = bdrv_root_attach_child(child_bs, child_name, child_role,
parent_bs);
BdrvChild *child;
uint64_t perm, shared_perm;
bdrv_get_cumulative_perm(parent_bs, &perm, &shared_perm);
assert(parent_bs->drv);
parent_bs->drv->bdrv_child_perm(parent_bs, NULL, child_role,
perm, shared_perm, &perm, &shared_perm);
child = bdrv_root_attach_child(child_bs, child_name, child_role,
perm, shared_perm, parent_bs, errp);
if (child == NULL) {
return NULL;
}
QLIST_INSERT_HEAD(&parent_bs->children, child, next);
return child;
}
@@ -1383,7 +1819,7 @@ static void bdrv_detach_child(BdrvChild *child)
child->next.le_prev = NULL;
}
bdrv_replace_child(child, NULL);
bdrv_replace_child(child, NULL, false);
g_free(child->name);
g_free(child);
@@ -1447,57 +1883,28 @@ static void bdrv_parent_cb_resize(BlockDriverState *bs)
* Sets the backing file link of a BDS. A new reference is created; callers
* which don't need their own reference any more must call bdrv_unref().
*/
void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd,
Error **errp)
{
if (backing_hd) {
bdrv_ref(backing_hd);
}
if (bs->backing) {
assert(bs->backing_blocker);
bdrv_op_unblock_all(bs->backing->bs, bs->backing_blocker);
bdrv_unref_child(bs, bs->backing);
} else if (backing_hd) {
error_setg(&bs->backing_blocker,
"node is used as backing hd of '%s'",
bdrv_get_device_or_node_name(bs));
}
if (!backing_hd) {
error_free(bs->backing_blocker);
bs->backing_blocker = NULL;
bs->backing = NULL;
goto out;
}
bs->backing = bdrv_attach_child(bs, backing_hd, "backing", &child_backing);
bs->open_flags &= ~BDRV_O_NO_BACKING;
pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
pstrcpy(bs->backing_format, sizeof(bs->backing_format),
backing_hd->drv ? backing_hd->drv->format_name : "");
bdrv_op_block_all(backing_hd, bs->backing_blocker);
/* Otherwise we won't be able to commit or stream */
bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET,
bs->backing_blocker);
bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_STREAM,
bs->backing_blocker);
/*
* We do backup in 3 ways:
* 1. drive backup
* The target bs is newly opened, and the source is the top BDS
* 2. blockdev backup
* Both the source and the target are top BDSes.
* 3. internal backup (used for block replication)
* Both the source and the target are backing files
*
* In cases 1 and 2, neither the source nor the target is the backing file.
* In case 3, we will block the top BDS, so there is only one block job
* for the top BDS and its backing chain.
*/
bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_SOURCE,
bs->backing_blocker);
bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_TARGET,
bs->backing_blocker);
bs->backing = bdrv_attach_child(bs, backing_hd, "backing", &child_backing,
errp);
if (!bs->backing) {
bdrv_unref(backing_hd);
}
out:
bdrv_refresh_limits(bs, NULL);
}
@@ -1580,8 +1987,12 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
/* Hook up the backing file link; drop our reference, bs owns the
* backing_hd reference now */
bdrv_set_backing_hd(bs, backing_hd);
bdrv_set_backing_hd(bs, backing_hd, &local_err);
bdrv_unref(backing_hd);
if (local_err) {
ret = -EINVAL;
goto free_exit;
}
qdict_del(parent_options, bdref_key);
@@ -1648,6 +2059,7 @@ BdrvChild *bdrv_open_child(const char *filename,
const BdrvChildRole *child_role,
bool allow_none, Error **errp)
{
BdrvChild *c;
BlockDriverState *bs;
bs = bdrv_open_child_bs(filename, options, bdref_key, parent, child_role,
@@ -1656,7 +2068,13 @@ BdrvChild *bdrv_open_child(const char *filename,
return NULL;
}
return bdrv_attach_child(parent, bs, bdref_key, child_role);
c = bdrv_attach_child(parent, bs, bdref_key, child_role, errp);
if (!c) {
bdrv_unref(bs);
return NULL;
}
return c;
}
static BlockDriverState *bdrv_append_temp_snapshot(BlockDriverState *bs,
@@ -1669,6 +2087,7 @@ static BlockDriverState *bdrv_append_temp_snapshot(BlockDriverState *bs,
int64_t total_size;
QemuOpts *opts = NULL;
BlockDriverState *bs_snapshot;
Error *local_err = NULL;
int ret;
/* if snapshot, we create a temporary backing file and open it
@@ -1718,7 +2137,12 @@ static BlockDriverState *bdrv_append_temp_snapshot(BlockDriverState *bs,
* call bdrv_unref() on it), so in order to be able to return one, we have
* to increase bs_snapshot's refcount here */
bdrv_ref(bs_snapshot);
bdrv_append(bs_snapshot, bs);
bdrv_append(bs_snapshot, bs, &local_err);
if (local_err) {
error_propagate(errp, local_err);
ret = -EINVAL;
goto out;
}
g_free(tmp_filename);
return bs_snapshot;
@@ -1862,9 +2286,12 @@ static BlockDriverState *bdrv_open_inherit(const char *filename,
goto fail;
}
if (file_bs != NULL) {
file = blk_new();
blk_insert_bs(file, file_bs);
file = blk_new(BLK_PERM_CONSISTENT_READ, BLK_PERM_ALL);
blk_insert_bs(file, file_bs, &local_err);
bdrv_unref(file_bs);
if (local_err) {
goto fail;
}
qdict_put(options, "file",
qstring_from_str(bdrv_get_node_name(file_bs)));
@@ -2405,7 +2832,7 @@ static void bdrv_close(BlockDriverState *bs)
bs->drv->bdrv_close(bs);
bs->drv = NULL;
bdrv_set_backing_hd(bs, NULL);
bdrv_set_backing_hd(bs, NULL, &error_abort);
if (bs->file != NULL) {
bdrv_unref_child(bs, bs->file);
@@ -2465,10 +2892,13 @@ static void change_parent_backing_link(BlockDriverState *from,
BdrvChild *c, *next, *to_c;
QLIST_FOREACH_SAFE(c, &from->parents, next_parent, next) {
if (c->role->stay_at_node) {
continue;
}
if (c->role == &child_backing) {
/* @from is generally not allowed to be a backing file, except for
* when @to is the overlay. In that case, @from may not be replaced
* by @to as @to's backing node. */
/* If @from is a backing file of @to, ignore the child to avoid
* creating a loop. We only want to change the pointer of other
* parents. */
QLIST_FOREACH(to_c, &to->children, next) {
if (to_c == c) {
break;
@@ -2479,9 +2909,10 @@ static void change_parent_backing_link(BlockDriverState *from,
}
}
assert(c->role != &child_backing);
bdrv_ref(to);
bdrv_replace_child(c, to);
/* FIXME Are we sure that bdrv_replace_child() can't run into
* &error_abort because of permissions? */
bdrv_replace_child(c, to, true);
bdrv_unref(from);
}
}
@@ -2502,19 +2933,25 @@ static void change_parent_backing_link(BlockDriverState *from,
* parents of bs_top after bdrv_append() returns. If the caller needs to keep a
* reference of its own, it must call bdrv_ref().
*/
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
Error **errp)
{
assert(!bdrv_requests_pending(bs_top));
assert(!bdrv_requests_pending(bs_new));
Error *local_err = NULL;
bdrv_ref(bs_top);
assert(!atomic_read(&bs_top->in_flight));
assert(!atomic_read(&bs_new->in_flight));
bdrv_set_backing_hd(bs_new, bs_top, &local_err);
if (local_err) {
error_propagate(errp, local_err);
goto out;
}
change_parent_backing_link(bs_top, bs_new);
bdrv_set_backing_hd(bs_new, bs_top);
bdrv_unref(bs_top);
/* bs_new is now referenced by its new parents, we don't need the
* additional reference any more. */
out:
bdrv_unref(bs_new);
}
@@ -2658,6 +3095,7 @@ int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
BlockDriverState *base, const char *backing_file_str)
{
BlockDriverState *new_top_bs = NULL;
Error *local_err = NULL;
int ret = -EIO;
if (!top->drv || !base->drv) {
@@ -2690,7 +3128,13 @@ int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
if (ret) {
goto exit;
}
bdrv_set_backing_hd(new_top_bs, base);
bdrv_set_backing_hd(new_top_bs, base, &local_err);
if (local_err) {
ret = -EPERM;
error_report_err(local_err);
goto exit;
}
ret = 0;
exit:
@@ -2705,6 +3149,9 @@ int bdrv_truncate(BdrvChild *child, int64_t offset)
BlockDriverState *bs = child->bs;
BlockDriver *drv = bs->drv;
int ret;
assert(child->perm & BLK_PERM_RESIZE);
if (!drv)
return -ENOMEDIUM;
if (!drv->bdrv_truncate)

View File

@@ -618,14 +618,24 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
goto error;
}
job = block_job_create(job_id, &backup_job_driver, bs, speed,
creation_flags, cb, opaque, errp);
/* job->common.len is fixed, so we can't allow resize */
job = block_job_create(job_id, &backup_job_driver, bs,
BLK_PERM_CONSISTENT_READ,
BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE |
BLK_PERM_WRITE_UNCHANGED | BLK_PERM_GRAPH_MOD,
speed, creation_flags, cb, opaque, errp);
if (!job) {
goto error;
}
job->target = blk_new();
blk_insert_bs(job->target, target);
/* The target must match the source in size, so no resize here either */
job->target = blk_new(BLK_PERM_WRITE,
BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE |
BLK_PERM_WRITE_UNCHANGED | BLK_PERM_GRAPH_MOD);
ret = blk_insert_bs(job->target, target, errp);
if (ret < 0) {
goto error;
}
job->on_source_error = on_source_error;
job->on_target_error = on_target_error;
@@ -652,7 +662,9 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
job->cluster_size = MAX(BACKUP_CLUSTER_SIZE_DEFAULT, bdi.cluster_size);
}
block_job_add_bdrv(&job->common, target);
/* Required permissions are already taken with target's blk_new() */
block_job_add_bdrv(&job->common, "target", target, 0, BLK_PERM_ALL,
&error_abort);
job->common.len = len;
block_job_txn_add_job(txn, &job->common);

View File

@@ -734,6 +734,8 @@ static BlockDriver bdrv_blkdebug = {
.bdrv_file_open = blkdebug_open,
.bdrv_close = blkdebug_close,
.bdrv_reopen_prepare = blkdebug_reopen_prepare,
.bdrv_child_perm = bdrv_filter_default_perms,
.bdrv_getlength = blkdebug_getlength,
.bdrv_truncate = blkdebug_truncate,
.bdrv_refresh_filename = blkdebug_refresh_filename,

View File

@@ -137,6 +137,7 @@ static BlockDriver bdrv_blkreplay = {
.bdrv_file_open = blkreplay_open,
.bdrv_close = blkreplay_close,
.bdrv_child_perm = bdrv_filter_default_perms,
.bdrv_getlength = blkreplay_getlength,
.bdrv_co_preadv = blkreplay_co_preadv,

View File

@@ -320,6 +320,7 @@ static BlockDriver bdrv_blkverify = {
.bdrv_parse_filename = blkverify_parse_filename,
.bdrv_file_open = blkverify_open,
.bdrv_close = blkverify_close,
.bdrv_child_perm = bdrv_filter_default_perms,
.bdrv_getlength = blkverify_getlength,
.bdrv_refresh_filename = blkverify_refresh_filename,

View File

@@ -59,6 +59,9 @@ struct BlockBackend {
bool iostatus_enabled;
BlockDeviceIoStatus iostatus;
uint64_t perm;
uint64_t shared_perm;
bool allow_write_beyond_eof;
NotifierList remove_bs_notifiers, insert_bs_notifiers;
@@ -77,6 +80,7 @@ static const AIOCBInfo block_backend_aiocb_info = {
static void drive_info_del(DriveInfo *dinfo);
static BlockBackend *bdrv_first_blk(BlockDriverState *bs);
static char *blk_get_attached_dev_id(BlockBackend *blk);
/* All BlockBackends */
static QTAILQ_HEAD(, BlockBackend) block_backends =
@@ -99,6 +103,25 @@ static void blk_root_drained_end(BdrvChild *child);
static void blk_root_change_media(BdrvChild *child, bool load);
static void blk_root_resize(BdrvChild *child);
static char *blk_root_get_parent_desc(BdrvChild *child)
{
BlockBackend *blk = child->opaque;
char *dev_id;
if (blk->name) {
return g_strdup(blk->name);
}
dev_id = blk_get_attached_dev_id(blk);
if (*dev_id) {
return dev_id;
} else {
/* TODO Callback into the BB owner for something more detailed */
g_free(dev_id);
return g_strdup("a block device");
}
}
static const char *blk_root_get_name(BdrvChild *child)
{
return blk_name(child->opaque);
@@ -110,6 +133,7 @@ static const BdrvChildRole child_root = {
.change_media = blk_root_change_media,
.resize = blk_root_resize,
.get_name = blk_root_get_name,
.get_parent_desc = blk_root_get_parent_desc,
.drained_begin = blk_root_drained_begin,
.drained_end = blk_root_drained_end,
@@ -117,15 +141,23 @@ static const BdrvChildRole child_root = {
/*
* Create a new BlockBackend with a reference count of one.
* Store an error through @errp on failure, unless it's null.
*
* @perm is a bitmask of BLK_PERM_* constants which describes the permissions
* to request for a block driver node that is attached to this BlockBackend.
* @shared_perm is a bitmask which describes which permissions may be granted
* to other users of the attached node.
* Both sets of permissions can be changed later using blk_set_perm().
*
* Return the new BlockBackend on success, null on failure.
*/
BlockBackend *blk_new(void)
BlockBackend *blk_new(uint64_t perm, uint64_t shared_perm)
{
BlockBackend *blk;
blk = g_new0(BlockBackend, 1);
blk->refcnt = 1;
blk->perm = perm;
blk->shared_perm = shared_perm;
blk_set_enable_write_cache(blk, true);
qemu_co_queue_init(&blk->public.throttled_reqs[0]);
@@ -155,15 +187,33 @@ BlockBackend *blk_new_open(const char *filename, const char *reference,
{
BlockBackend *blk;
BlockDriverState *bs;
uint64_t perm;
blk = blk_new();
/* blk_new_open() is mainly used in .bdrv_create implementations and the
* tools where sharing isn't a concern because the BDS stays private, so we
* just request permission according to the flags.
*
* The exceptions are xen_disk and blockdev_init(); in these cases, the
* caller of blk_new_open() doesn't make use of the permissions, but they
* shouldn't hurt either. We can still share everything here because the
* guest devices will add their own blockers if they can't share. */
perm = BLK_PERM_CONSISTENT_READ;
if (flags & BDRV_O_RDWR) {
perm |= BLK_PERM_WRITE;
}
if (flags & BDRV_O_RESIZE) {
perm |= BLK_PERM_RESIZE;
}
blk = blk_new(perm, BLK_PERM_ALL);
bs = bdrv_open(filename, reference, options, flags, errp);
if (!bs) {
blk_unref(blk);
return NULL;
}
blk->root = bdrv_root_attach_child(bs, "root", &child_root, blk);
blk->root = bdrv_root_attach_child(bs, "root", &child_root,
perm, BLK_PERM_ALL, blk, &error_abort);
return blk;
}
@@ -495,16 +545,49 @@ void blk_remove_bs(BlockBackend *blk)
/*
* Associates a new BlockDriverState with @blk.
*/
void blk_insert_bs(BlockBackend *blk, BlockDriverState *bs)
int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, Error **errp)
{
blk->root = bdrv_root_attach_child(bs, "root", &child_root,
blk->perm, blk->shared_perm, blk, errp);
if (blk->root == NULL) {
return -EPERM;
}
bdrv_ref(bs);
blk->root = bdrv_root_attach_child(bs, "root", &child_root, blk);
notifier_list_notify(&blk->insert_bs_notifiers, blk);
if (blk->public.throttle_state) {
throttle_timers_attach_aio_context(
&blk->public.throttle_timers, bdrv_get_aio_context(bs));
}
return 0;
}
/*
* Sets the permission bitmasks that the user of the BlockBackend needs.
*/
int blk_set_perm(BlockBackend *blk, uint64_t perm, uint64_t shared_perm,
Error **errp)
{
int ret;
if (blk->root) {
ret = bdrv_child_try_set_perm(blk->root, perm, shared_perm, errp);
if (ret < 0) {
return ret;
}
}
blk->perm = perm;
blk->shared_perm = shared_perm;
return 0;
}
void blk_get_perm(BlockBackend *blk, uint64_t *perm, uint64_t *shared_perm)
{
*perm = blk->perm;
*shared_perm = blk->shared_perm;
}
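/*
 * Caller sketch (hypothetical device code, not part of the patch) for the
 * permission-aware BlockBackend API introduced here: declare the needed and
 * shareable bits when creating the BlockBackend, attach a node, and adjust
 * the permissions later with blk_set_perm().
 */
static BlockBackend *example_attach(BlockDriverState *bs, Error **errp)
{
    /* Needs read and write access, lets other users do anything */
    BlockBackend *blk = blk_new(BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE,
                                BLK_PERM_ALL);
    if (blk_insert_bs(blk, bs, errp) < 0) {
        blk_unref(blk);
        return NULL;
    }
    /* Permissions can be dropped (or widened) again at any time */
    if (blk_set_perm(blk, BLK_PERM_CONSISTENT_READ, BLK_PERM_ALL, errp) < 0) {
        blk_unref(blk);
        return NULL;
    }
    return blk;
}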
static int blk_do_attach_dev(BlockBackend *blk, void *dev)
@@ -553,6 +636,7 @@ void blk_detach_dev(BlockBackend *blk, void *dev)
blk->dev_ops = NULL;
blk->dev_opaque = NULL;
blk->guest_block_size = 512;
blk_set_perm(blk, 0, BLK_PERM_ALL, &error_abort);
blk_unref(blk);
}
@@ -620,19 +704,29 @@ void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops,
/*
* Notify @blk's attached device model of media change.
* If @load is true, notify of media load.
* Else, notify of media eject.
*
* If @load is true, notify of media load. This action can fail, meaning that
* the medium cannot be loaded. @errp is set then.
*
* If @load is false, notify of media eject. This can never fail.
*
* Also send DEVICE_TRAY_MOVED events as appropriate.
*/
void blk_dev_change_media_cb(BlockBackend *blk, bool load)
void blk_dev_change_media_cb(BlockBackend *blk, bool load, Error **errp)
{
if (blk->dev_ops && blk->dev_ops->change_media_cb) {
bool tray_was_open, tray_is_open;
Error *local_err = NULL;
assert(!blk->legacy_dev);
tray_was_open = blk_dev_is_tray_open(blk);
blk->dev_ops->change_media_cb(blk->dev_opaque, load);
blk->dev_ops->change_media_cb(blk->dev_opaque, load, &local_err);
if (local_err) {
assert(load == true);
error_propagate(errp, local_err);
return;
}
tray_is_open = blk_dev_is_tray_open(blk);
if (tray_was_open != tray_is_open) {
@@ -646,7 +740,7 @@ void blk_dev_change_media_cb(BlockBackend *blk, bool load)
static void blk_root_change_media(BdrvChild *child, bool load)
{
blk_dev_change_media_cb(child->opaque, load);
blk_dev_change_media_cb(child->opaque, load, NULL);
}
/*

View File

@@ -293,6 +293,7 @@ static BlockDriver bdrv_bochs = {
.instance_size = sizeof(BDRVBochsState),
.bdrv_probe = bochs_probe,
.bdrv_open = bochs_open,
.bdrv_child_perm = bdrv_format_default_perms,
.bdrv_refresh_limits = bochs_refresh_limits,
.bdrv_co_preadv = bochs_co_preadv,
.bdrv_close = bochs_close,

View File

@@ -290,6 +290,7 @@ static BlockDriver bdrv_cloop = {
.instance_size = sizeof(BDRVCloopState),
.bdrv_probe = cloop_probe,
.bdrv_open = cloop_open,
.bdrv_child_perm = bdrv_format_default_perms,
.bdrv_refresh_limits = cloop_refresh_limits,
.bdrv_co_preadv = cloop_co_preadv,
.bdrv_close = cloop_close,

View File

@@ -36,6 +36,7 @@ typedef struct CommitBlockJob {
BlockJob common;
RateLimit limit;
BlockDriverState *active;
BlockDriverState *commit_top_bs;
BlockBackend *top;
BlockBackend *base;
BlockdevOnError on_error;
@@ -83,12 +84,23 @@ static void commit_complete(BlockJob *job, void *opaque)
BlockDriverState *active = s->active;
BlockDriverState *top = blk_bs(s->top);
BlockDriverState *base = blk_bs(s->base);
BlockDriverState *overlay_bs = bdrv_find_overlay(active, top);
BlockDriverState *overlay_bs = bdrv_find_overlay(active, s->commit_top_bs);
int ret = data->ret;
bool remove_commit_top_bs = false;
/* Remove base node parent that still uses BLK_PERM_WRITE/RESIZE before
* the normal backing chain can be restored. */
blk_unref(s->base);
if (!block_job_is_cancelled(&s->common) && ret == 0) {
/* success */
ret = bdrv_drop_intermediate(active, top, base, s->backing_file_str);
ret = bdrv_drop_intermediate(active, s->commit_top_bs, base,
s->backing_file_str);
} else if (overlay_bs) {
/* XXX Can (or should) we somehow keep 'consistent read' blocked even
* after the failed/cancelled commit job is gone? If we already wrote
* something to base, the intermediate images aren't valid any more. */
remove_commit_top_bs = true;
}
/* restore base open flags here if appropriate (e.g., change the base back
@@ -102,9 +114,15 @@ static void commit_complete(BlockJob *job, void *opaque)
}
g_free(s->backing_file_str);
blk_unref(s->top);
blk_unref(s->base);
block_job_completed(&s->common, ret);
g_free(data);
/* If bdrv_drop_intermediate() didn't already do that, remove the commit
* filter driver from the backing chain. Do this as the final step so that
* the 'consistent read' permission can be granted. */
if (remove_commit_top_bs) {
bdrv_set_backing_hd(overlay_bs, top, &error_abort);
}
}
static void coroutine_fn commit_run(void *opaque)
@@ -208,10 +226,38 @@ static const BlockJobDriver commit_job_driver = {
.start = commit_run,
};
static int coroutine_fn bdrv_commit_top_preadv(BlockDriverState *bs,
uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
{
return bdrv_co_preadv(bs->backing, offset, bytes, qiov, flags);
}
static void bdrv_commit_top_close(BlockDriverState *bs)
{
}
static void bdrv_commit_top_child_perm(BlockDriverState *bs, BdrvChild *c,
const BdrvChildRole *role,
uint64_t perm, uint64_t shared,
uint64_t *nperm, uint64_t *nshared)
{
*nperm = 0;
*nshared = BLK_PERM_ALL;
}
/* Dummy node that provides consistent read to its users without requiring it
* from its backing file and that allows writes on the backing file chain. */
static BlockDriver bdrv_commit_top = {
.format_name = "commit_top",
.bdrv_co_preadv = bdrv_commit_top_preadv,
.bdrv_close = bdrv_commit_top_close,
.bdrv_child_perm = bdrv_commit_top_child_perm,
};
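/*
 * Illustrative graph (hypothetical chain) while the commit job runs: the
 * filter is inserted between the overlay and the node being committed,
 *
 *   overlay_bs -> commit_top_bs -> top -> ... -> base
 *
 * Because bdrv_commit_top_child_perm() requests nothing (*nperm = 0) and
 * shares everything (*nshared = BLK_PERM_ALL), parents above keep their
 * 'consistent read' satisfied by the filter itself while the job is free to
 * write to base underneath.
 */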
void commit_start(const char *job_id, BlockDriverState *bs,
BlockDriverState *base, BlockDriverState *top, int64_t speed,
BlockdevOnError on_error, const char *backing_file_str,
Error **errp)
const char *filter_node_name, Error **errp)
{
CommitBlockJob *s;
BlockReopenQueue *reopen_queue = NULL;
@@ -219,7 +265,9 @@ void commit_start(const char *job_id, BlockDriverState *bs,
int orig_base_flags;
BlockDriverState *iter;
BlockDriverState *overlay_bs;
BlockDriverState *commit_top_bs = NULL;
Error *local_err = NULL;
int ret;
assert(top != bs);
if (top == base) {
@@ -234,8 +282,8 @@ void commit_start(const char *job_id, BlockDriverState *bs,
return;
}
s = block_job_create(job_id, &commit_job_driver, bs, speed,
BLOCK_JOB_DEFAULT, NULL, NULL, errp);
s = block_job_create(job_id, &commit_job_driver, bs, 0, BLK_PERM_ALL,
speed, BLOCK_JOB_DEFAULT, NULL, NULL, errp);
if (!s) {
return;
}
@@ -256,30 +304,70 @@ void commit_start(const char *job_id, BlockDriverState *bs,
bdrv_reopen_multiple(bdrv_get_aio_context(bs), reopen_queue, &local_err);
if (local_err != NULL) {
error_propagate(errp, local_err);
block_job_unref(&s->common);
return;
goto fail;
}
}
/* Insert commit_top block node above top, so we can block consistent read
* on the backing chain below it */
commit_top_bs = bdrv_new_open_driver(&bdrv_commit_top, filter_node_name, 0,
errp);
if (commit_top_bs == NULL) {
goto fail;
}
bdrv_set_backing_hd(commit_top_bs, top, &error_abort);
bdrv_set_backing_hd(overlay_bs, commit_top_bs, &error_abort);
s->commit_top_bs = commit_top_bs;
bdrv_unref(commit_top_bs);
/* Block all nodes between top and base, because they will
* disappear from the chain after this operation. */
assert(bdrv_chain_contains(top, base));
for (iter = top; iter != backing_bs(base); iter = backing_bs(iter)) {
block_job_add_bdrv(&s->common, iter);
for (iter = top; iter != base; iter = backing_bs(iter)) {
/* XXX BLK_PERM_WRITE needs to be allowed so we don't block ourselves
* at s->base (if writes are blocked for a node, they are also blocked
* for its backing file). The other option would be a second filter
* driver above s->base. */
ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0,
BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE,
errp);
if (ret < 0) {
goto fail;
}
}
ret = block_job_add_bdrv(&s->common, "base", base, 0, BLK_PERM_ALL, errp);
if (ret < 0) {
goto fail;
}
/* overlay_bs must be blocked because it needs to be modified to
* update the backing image string, but if it's the root node then
* don't block it again */
if (bs != overlay_bs) {
block_job_add_bdrv(&s->common, overlay_bs);
* update the backing image string. */
ret = block_job_add_bdrv(&s->common, "overlay of top", overlay_bs,
BLK_PERM_GRAPH_MOD, BLK_PERM_ALL, errp);
if (ret < 0) {
goto fail;
}
s->base = blk_new();
blk_insert_bs(s->base, base);
s->base = blk_new(BLK_PERM_CONSISTENT_READ
| BLK_PERM_WRITE
| BLK_PERM_RESIZE,
BLK_PERM_CONSISTENT_READ
| BLK_PERM_GRAPH_MOD
| BLK_PERM_WRITE_UNCHANGED);
ret = blk_insert_bs(s->base, base, errp);
if (ret < 0) {
goto fail;
}
s->top = blk_new();
blk_insert_bs(s->top, top);
/* Required permissions are already taken with block_job_add_bdrv() */
s->top = blk_new(0, BLK_PERM_ALL);
ret = blk_insert_bs(s->top, top, errp);
if (ret < 0) {
goto fail;
}
s->active = bs;
@@ -292,6 +380,19 @@ void commit_start(const char *job_id, BlockDriverState *bs,
trace_commit_start(bs, base, top, s);
block_job_start(&s->common);
return;
fail:
if (s->base) {
blk_unref(s->base);
}
if (s->top) {
blk_unref(s->top);
}
if (commit_top_bs) {
bdrv_set_backing_hd(overlay_bs, top, &error_abort);
}
block_job_unref(&s->common);
}
@@ -301,11 +402,14 @@ void commit_start(const char *job_id, BlockDriverState *bs,
int bdrv_commit(BlockDriverState *bs)
{
BlockBackend *src, *backing;
BlockDriverState *backing_file_bs = NULL;
BlockDriverState *commit_top_bs = NULL;
BlockDriver *drv = bs->drv;
int64_t sector, total_sectors, length, backing_length;
int n, ro, open_flags;
int ret = 0;
uint8_t *buf = NULL;
Error *local_err = NULL;
if (!drv)
return -ENOMEDIUM;
@@ -328,11 +432,33 @@ int bdrv_commit(BlockDriverState *bs)
}
}
src = blk_new();
blk_insert_bs(src, bs);
src = blk_new(BLK_PERM_CONSISTENT_READ, BLK_PERM_ALL);
backing = blk_new(BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);
backing = blk_new();
blk_insert_bs(backing, bs->backing->bs);
ret = blk_insert_bs(src, bs, &local_err);
if (ret < 0) {
error_report_err(local_err);
goto ro_cleanup;
}
/* Insert commit_top block node above backing, so we can write to it */
backing_file_bs = backing_bs(bs);
commit_top_bs = bdrv_new_open_driver(&bdrv_commit_top, NULL, BDRV_O_RDWR,
&local_err);
if (commit_top_bs == NULL) {
error_report_err(local_err);
goto ro_cleanup;
}
bdrv_set_backing_hd(commit_top_bs, backing_file_bs, &error_abort);
bdrv_set_backing_hd(bs, commit_top_bs, &error_abort);
ret = blk_insert_bs(backing, backing_file_bs, &local_err);
if (ret < 0) {
error_report_err(local_err);
goto ro_cleanup;
}
length = blk_getlength(src);
if (length < 0) {
@@ -404,8 +530,12 @@ int bdrv_commit(BlockDriverState *bs)
ro_cleanup:
qemu_vfree(buf);
blk_unref(src);
blk_unref(backing);
if (backing_file_bs) {
bdrv_set_backing_hd(bs, backing_file_bs, &error_abort);
}
bdrv_unref(commit_top_bs);
blk_unref(src);
if (ro) {
/* ignoring error return here */

View File

@@ -628,6 +628,7 @@ BlockDriver bdrv_crypto_luks = {
.bdrv_probe = block_crypto_probe_luks,
.bdrv_open = block_crypto_open_luks,
.bdrv_close = block_crypto_close,
.bdrv_child_perm = bdrv_format_default_perms,
.bdrv_create = block_crypto_create_luks,
.bdrv_truncate = block_crypto_truncate,
.create_opts = &block_crypto_create_opts_luks,

View File

@@ -697,6 +697,7 @@ static BlockDriver bdrv_dmg = {
.bdrv_probe = dmg_probe,
.bdrv_open = dmg_open,
.bdrv_refresh_limits = dmg_refresh_limits,
.bdrv_child_perm = bdrv_format_default_perms,
.bdrv_co_preadv = dmg_co_preadv,
.bdrv_close = dmg_close,
};

View File

@@ -925,9 +925,11 @@ bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
}
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
int64_t offset, unsigned int bytes, QEMUIOVector *qiov)
{
BlockDriverState *bs = child->bs;
/* Perform I/O through a temporary buffer so that users who scribble over
* their read buffer while the operation is in progress do not end up
* modifying the image file. This is critical for zero-copy guest I/O
@@ -943,6 +945,8 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
size_t skip_bytes;
int ret;
assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
/* Cover entire cluster so no additional backing file I/O is required when
* allocating cluster in the image file.
*/
@@ -1001,10 +1005,11 @@ err:
* handles copy on read, zeroing after EOF, and fragmentation of large
* reads; any other features must be implemented by the caller.
*/
static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
int64_t align, QEMUIOVector *qiov, int flags)
{
BlockDriverState *bs = child->bs;
int64_t total_bytes, max_bytes;
int ret = 0;
uint64_t bytes_remaining = bytes;
@@ -1050,7 +1055,7 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
}
if (!ret || pnum != nb_sectors) {
ret = bdrv_co_do_copy_on_readv(bs, offset, bytes, qiov);
ret = bdrv_co_do_copy_on_readv(child, offset, bytes, qiov);
goto out;
}
}
@@ -1158,7 +1163,7 @@ int coroutine_fn bdrv_co_preadv(BdrvChild *child,
}
tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
ret = bdrv_aligned_preadv(child, &req, offset, bytes, align,
use_local_qiov ? &local_qiov : qiov,
flags);
tracked_request_end(&req);
@@ -1306,10 +1311,11 @@ fail:
* Forwards an already correctly aligned write request to the BlockDriver,
* after possibly fragmenting it.
*/
static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
int64_t align, QEMUIOVector *qiov, int flags)
{
BlockDriverState *bs = child->bs;
BlockDriver *drv = bs->drv;
bool waited;
int ret;
@@ -1332,6 +1338,8 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
assert(!waited || !req->serialising);
assert(req->overlap_offset <= offset);
assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
assert(child->perm & BLK_PERM_WRITE);
assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);
ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
@@ -1397,12 +1405,13 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
return ret;
}
static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
int64_t offset,
unsigned int bytes,
BdrvRequestFlags flags,
BdrvTrackedRequest *req)
{
BlockDriverState *bs = child->bs;
uint8_t *buf = NULL;
QEMUIOVector local_qiov;
struct iovec iov;
@@ -1430,7 +1439,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
mark_request_serialising(req, align);
wait_serialising_requests(req);
bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align,
ret = bdrv_aligned_preadv(child, req, offset & ~(align - 1), align,
align, &local_qiov, 0);
if (ret < 0) {
goto fail;
@@ -1438,7 +1447,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
memset(buf + head_padding_bytes, 0, zero_bytes);
ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align,
ret = bdrv_aligned_pwritev(child, req, offset & ~(align - 1), align,
align, &local_qiov,
flags & ~BDRV_REQ_ZERO_WRITE);
if (ret < 0) {
@@ -1452,7 +1461,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
if (bytes >= align) {
/* Write the aligned part in the middle. */
uint64_t aligned_bytes = bytes & ~(align - 1);
ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes, align,
ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
NULL, flags);
if (ret < 0) {
goto fail;
@@ -1468,7 +1477,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
mark_request_serialising(req, align);
wait_serialising_requests(req);
bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
ret = bdrv_aligned_preadv(bs, req, offset, align,
ret = bdrv_aligned_preadv(child, req, offset, align,
align, &local_qiov, 0);
if (ret < 0) {
goto fail;
@@ -1476,7 +1485,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
memset(buf, 0, bytes);
ret = bdrv_aligned_pwritev(bs, req, offset, align, align,
ret = bdrv_aligned_pwritev(child, req, offset, align, align,
&local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
}
fail:
@@ -1523,7 +1532,7 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
if (!qiov) {
ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req);
ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
goto out;
}
@@ -1542,7 +1551,7 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
qemu_iovec_init_external(&head_qiov, &head_iov, 1);
bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
ret = bdrv_aligned_preadv(child, &req, offset & ~(align - 1), align,
align, &head_qiov, 0);
if (ret < 0) {
goto fail;
@@ -1584,8 +1593,8 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
align, &tail_qiov, 0);
ret = bdrv_aligned_preadv(child, &req, (offset + bytes) & ~(align - 1),
align, align, &tail_qiov, 0);
if (ret < 0) {
goto fail;
}
@@ -1603,7 +1612,7 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
bytes = ROUND_UP(bytes, align);
}
ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, align,
ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
use_local_qiov ? &local_qiov : qiov,
flags);

View File

@@ -38,7 +38,10 @@ typedef struct MirrorBlockJob {
BlockJob common;
RateLimit limit;
BlockBackend *target;
BlockDriverState *mirror_top_bs;
BlockDriverState *source;
BlockDriverState *base;
/* The name of the graph node to replace */
char *replaces;
/* The BDS to replace */
@@ -327,7 +330,7 @@ static void mirror_do_zero_or_discard(MirrorBlockJob *s,
static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
{
BlockDriverState *source = blk_bs(s->common.blk);
BlockDriverState *source = s->source;
int64_t sector_num, first_chunk;
uint64_t delay_ns = 0;
/* At least the first dirty chunk is mirrored in one iteration. */
@@ -497,12 +500,30 @@ static void mirror_exit(BlockJob *job, void *opaque)
MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
MirrorExitData *data = opaque;
AioContext *replace_aio_context = NULL;
BlockDriverState *src = blk_bs(s->common.blk);
BlockDriverState *src = s->source;
BlockDriverState *target_bs = blk_bs(s->target);
BlockDriverState *mirror_top_bs = s->mirror_top_bs;
Error *local_err = NULL;
/* Make sure that the source BDS doesn't go away before we called
* block_job_completed(). */
bdrv_ref(src);
bdrv_ref(mirror_top_bs);
/* We don't access the source any more. Dropping any WRITE/RESIZE is
* required before it could become a backing file of target_bs. */
bdrv_child_try_set_perm(mirror_top_bs->backing, 0, BLK_PERM_ALL,
&error_abort);
if (s->backing_mode == MIRROR_SOURCE_BACKING_CHAIN) {
BlockDriverState *backing = s->is_none_mode ? src : s->base;
if (backing_bs(target_bs) != backing) {
bdrv_set_backing_hd(target_bs, backing, &local_err);
if (local_err) {
error_report_err(local_err);
data->ret = -EPERM;
}
}
}
if (s->to_replace) {
replace_aio_context = bdrv_get_aio_context(s->to_replace);
@@ -524,10 +545,6 @@ static void mirror_exit(BlockJob *job, void *opaque)
bdrv_drained_begin(target_bs);
bdrv_replace_in_backing_chain(to_replace, target_bs);
bdrv_drained_end(target_bs);
/* We just changed the BDS the job BB refers to */
blk_remove_bs(job->blk);
blk_insert_bs(job->blk, src);
}
if (s->to_replace) {
bdrv_op_unblock_all(s->to_replace, s->replace_blocker);
@@ -540,9 +557,26 @@ static void mirror_exit(BlockJob *job, void *opaque)
g_free(s->replaces);
blk_unref(s->target);
s->target = NULL;
/* Remove the mirror filter driver from the graph. Before this, get rid of
* the blockers on the intermediate nodes so that the resulting state is
* valid. */
block_job_remove_all_bdrv(job);
bdrv_replace_in_backing_chain(mirror_top_bs, backing_bs(mirror_top_bs));
/* We just changed the BDS the job BB refers to (with either or both of the
* bdrv_replace_in_backing_chain() calls), so switch the BB back so the
* cleanup does the right thing. We don't need any permissions any more
* now. */
blk_remove_bs(job->blk);
blk_set_perm(job->blk, 0, BLK_PERM_ALL, &error_abort);
blk_insert_bs(job->blk, mirror_top_bs, &error_abort);
block_job_completed(&s->common, data->ret);
g_free(data);
bdrv_drained_end(src);
bdrv_unref(mirror_top_bs);
bdrv_unref(src);
}
@@ -562,7 +596,7 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
{
int64_t sector_num, end;
BlockDriverState *base = s->base;
BlockDriverState *bs = blk_bs(s->common.blk);
BlockDriverState *bs = s->source;
BlockDriverState *target_bs = blk_bs(s->target);
int ret, n;
@@ -644,7 +678,7 @@ static void coroutine_fn mirror_run(void *opaque)
{
MirrorBlockJob *s = opaque;
MirrorExitData *data;
BlockDriverState *bs = blk_bs(s->common.blk);
BlockDriverState *bs = s->source;
BlockDriverState *target_bs = blk_bs(s->target);
bool need_drain = true;
int64_t length;
@@ -876,9 +910,8 @@ static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
static void mirror_complete(BlockJob *job, Error **errp)
{
MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
BlockDriverState *src, *target;
BlockDriverState *target;
src = blk_bs(job->blk);
target = blk_bs(s->target);
if (!s->synced) {
@@ -910,6 +943,10 @@ static void mirror_complete(BlockJob *job, Error **errp)
replace_aio_context = bdrv_get_aio_context(s->to_replace);
aio_context_acquire(replace_aio_context);
/* TODO Translate this into permission system. Current definition of
* GRAPH_MOD would require requesting it for the parents; they might
* not even be BlockDriverStates, however, so a BdrvChild can't address
* them. May need redefinition of GRAPH_MOD. */
error_setg(&s->replace_blocker,
"block device is in use by block-job-complete");
bdrv_op_block_all(s->to_replace, s->replace_blocker);
@@ -918,13 +955,6 @@ static void mirror_complete(BlockJob *job, Error **errp)
aio_context_release(replace_aio_context);
}
if (s->backing_mode == MIRROR_SOURCE_BACKING_CHAIN) {
BlockDriverState *backing = s->is_none_mode ? src : s->base;
if (backing_bs(target) != backing) {
bdrv_set_backing_hd(target, backing);
}
}
s->should_complete = true;
block_job_enter(&s->common);
}
@@ -980,6 +1010,77 @@ static const BlockJobDriver commit_active_job_driver = {
.drain = mirror_drain,
};
static int coroutine_fn bdrv_mirror_top_preadv(BlockDriverState *bs,
uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
{
return bdrv_co_preadv(bs->backing, offset, bytes, qiov, flags);
}
static int coroutine_fn bdrv_mirror_top_pwritev(BlockDriverState *bs,
uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
{
return bdrv_co_pwritev(bs->backing, offset, bytes, qiov, flags);
}
static int coroutine_fn bdrv_mirror_top_flush(BlockDriverState *bs)
{
return bdrv_co_flush(bs->backing->bs);
}
static int64_t coroutine_fn bdrv_mirror_top_get_block_status(
BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum,
BlockDriverState **file)
{
*pnum = nb_sectors;
*file = bs->backing->bs;
return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_DATA |
(sector_num << BDRV_SECTOR_BITS);
}
static int coroutine_fn bdrv_mirror_top_pwrite_zeroes(BlockDriverState *bs,
int64_t offset, int count, BdrvRequestFlags flags)
{
return bdrv_co_pwrite_zeroes(bs->backing, offset, count, flags);
}
static int coroutine_fn bdrv_mirror_top_pdiscard(BlockDriverState *bs,
int64_t offset, int count)
{
return bdrv_co_pdiscard(bs->backing->bs, offset, count);
}
static void bdrv_mirror_top_close(BlockDriverState *bs)
{
}
static void bdrv_mirror_top_child_perm(BlockDriverState *bs, BdrvChild *c,
const BdrvChildRole *role,
uint64_t perm, uint64_t shared,
uint64_t *nperm, uint64_t *nshared)
{
/* Must be able to forward guest writes to the real image */
*nperm = 0;
if (perm & BLK_PERM_WRITE) {
*nperm |= BLK_PERM_WRITE;
}
*nshared = BLK_PERM_ALL;
}
/* Dummy node that provides consistent read to its users without requiring it
* from its backing file and that allows writes on the backing file chain. */
static BlockDriver bdrv_mirror_top = {
.format_name = "mirror_top",
.bdrv_co_preadv = bdrv_mirror_top_preadv,
.bdrv_co_pwritev = bdrv_mirror_top_pwritev,
.bdrv_co_pwrite_zeroes = bdrv_mirror_top_pwrite_zeroes,
.bdrv_co_pdiscard = bdrv_mirror_top_pdiscard,
.bdrv_co_flush = bdrv_mirror_top_flush,
.bdrv_co_get_block_status = bdrv_mirror_top_get_block_status,
.bdrv_close = bdrv_mirror_top_close,
.bdrv_child_perm = bdrv_mirror_top_child_perm,
};
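/*
 * Worked example (hypothetical parent request) for the mirror filter: unlike
 * commit_top, it must keep guest writes flowing to the real image. If a
 * parent requests perm = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE on
 * mirror_top_bs, bdrv_mirror_top_child_perm() requests on bs->backing:
 *
 *   *nperm   = BLK_PERM_WRITE      (only the write bit is forwarded)
 *   *nshared = BLK_PERM_ALL        (the filter itself tolerates everything)
 */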
static void mirror_start_job(const char *job_id, BlockDriverState *bs,
int creation_flags, BlockDriverState *target,
const char *replaces, int64_t speed,
@@ -992,9 +1093,14 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
void *opaque, Error **errp,
const BlockJobDriver *driver,
bool is_none_mode, BlockDriverState *base,
bool auto_complete)
bool auto_complete, const char *filter_node_name)
{
MirrorBlockJob *s;
BlockDriverState *mirror_top_bs;
bool target_graph_mod;
bool target_is_backing;
Error *local_err = NULL;
int ret;
if (granularity == 0) {
granularity = bdrv_get_default_bitmap_granularity(target);
@@ -1011,14 +1117,62 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
buf_size = DEFAULT_MIRROR_BUF_SIZE;
}
s = block_job_create(job_id, driver, bs, speed, creation_flags,
cb, opaque, errp);
if (!s) {
/* In the case of active commit, add dummy driver to provide consistent
* reads on the top, while disabling it in the intermediate nodes, and make
* the backing chain writable. */
mirror_top_bs = bdrv_new_open_driver(&bdrv_mirror_top, filter_node_name,
BDRV_O_RDWR, errp);
if (mirror_top_bs == NULL) {
return;
}
mirror_top_bs->total_sectors = bs->total_sectors;
/* bdrv_append takes ownership of the mirror_top_bs reference, need to keep
* it alive until block_job_create() even if bs has no parent. */
bdrv_ref(mirror_top_bs);
bdrv_drained_begin(bs);
bdrv_append(mirror_top_bs, bs, &local_err);
bdrv_drained_end(bs);
if (local_err) {
bdrv_unref(mirror_top_bs);
error_propagate(errp, local_err);
return;
}
s->target = blk_new();
blk_insert_bs(s->target, target);
/* Make sure that the source is not resized while the job is running */
s = block_job_create(job_id, driver, mirror_top_bs,
BLK_PERM_CONSISTENT_READ,
BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
BLK_PERM_WRITE | BLK_PERM_GRAPH_MOD, speed,
creation_flags, cb, opaque, errp);
bdrv_unref(mirror_top_bs);
if (!s) {
goto fail;
}
s->source = bs;
s->mirror_top_bs = mirror_top_bs;
/* No resize for the target either; while the mirror is still running, a
* consistent read isn't necessarily possible. We could possibly allow
* writes and graph modifications, though it would likely defeat the
* purpose of a mirror, so leave them blocked for now.
*
* In the case of active commit, things look a bit different, though,
* because the target is an already populated backing file in active use.
* We can allow anything except resize there. */
target_is_backing = bdrv_chain_contains(bs, target);
target_graph_mod = (backing_mode != MIRROR_LEAVE_BACKING_CHAIN);
s->target = blk_new(BLK_PERM_WRITE | BLK_PERM_RESIZE |
(target_graph_mod ? BLK_PERM_GRAPH_MOD : 0),
BLK_PERM_WRITE_UNCHANGED |
(target_is_backing ? BLK_PERM_CONSISTENT_READ |
BLK_PERM_WRITE |
BLK_PERM_GRAPH_MOD : 0));
ret = blk_insert_bs(s->target, target, errp);
if (ret < 0) {
goto fail;
}
s->replaces = g_strdup(replaces);
s->on_source_error = on_source_error;
@@ -1041,18 +1195,40 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
return;
}
block_job_add_bdrv(&s->common, target);
/* Required permissions are already taken with blk_new() */
block_job_add_bdrv(&s->common, "target", target, 0, BLK_PERM_ALL,
&error_abort);
/* In commit_active_start() all intermediate nodes disappear, so
* any jobs in them must be blocked */
if (bdrv_chain_contains(bs, target)) {
if (target_is_backing) {
BlockDriverState *iter;
for (iter = backing_bs(bs); iter != target; iter = backing_bs(iter)) {
block_job_add_bdrv(&s->common, iter);
/* XXX BLK_PERM_WRITE needs to be allowed so we don't block
* ourselves at s->base (if writes are blocked for a node, they are
* also blocked for its backing file). The other option would be a
* second filter driver above s->base (== target). */
ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0,
BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE,
errp);
if (ret < 0) {
goto fail;
}
}
}
trace_mirror_start(bs, s, opaque);
block_job_start(&s->common);
return;
fail:
if (s) {
g_free(s->replaces);
blk_unref(s->target);
block_job_unref(&s->common);
}
bdrv_replace_in_backing_chain(mirror_top_bs, backing_bs(mirror_top_bs));
}
void mirror_start(const char *job_id, BlockDriverState *bs,
@@ -1061,7 +1237,7 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
BlockdevOnError on_source_error,
BlockdevOnError on_target_error,
bool unmap, Error **errp)
bool unmap, const char *filter_node_name, Error **errp)
{
bool is_none_mode;
BlockDriverState *base;
@@ -1075,12 +1251,14 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
mirror_start_job(job_id, bs, BLOCK_JOB_DEFAULT, target, replaces,
speed, granularity, buf_size, backing_mode,
on_source_error, on_target_error, unmap, NULL, NULL, errp,
&mirror_job_driver, is_none_mode, base, false);
&mirror_job_driver, is_none_mode, base, false,
filter_node_name);
}
void commit_active_start(const char *job_id, BlockDriverState *bs,
BlockDriverState *base, int creation_flags,
int64_t speed, BlockdevOnError on_error,
const char *filter_node_name,
BlockCompletionFunc *cb, void *opaque, Error **errp,
bool auto_complete)
{
@@ -1096,7 +1274,8 @@ void commit_active_start(const char *job_id, BlockDriverState *bs,
mirror_start_job(job_id, bs, creation_flags, base, NULL, speed, 0, 0,
MIRROR_LEAVE_BACKING_CHAIN,
on_error, on_error, true, cb, opaque, &local_err,
&commit_active_job_driver, false, base, auto_complete);
&commit_active_job_driver, false, base, auto_complete,
filter_node_name);
if (local_err) {
error_propagate(errp, local_err);
goto error_restore_flags;

View File

@@ -488,7 +488,8 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp)
}
file = blk_new_open(filename, NULL, NULL,
BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
&local_err);
if (file == NULL) {
error_propagate(errp, local_err);
return -EIO;
@@ -762,6 +763,7 @@ static BlockDriver bdrv_parallels = {
.bdrv_probe = parallels_probe,
.bdrv_open = parallels_open,
.bdrv_close = parallels_close,
.bdrv_child_perm = bdrv_format_default_perms,
.bdrv_co_get_block_status = parallels_co_get_block_status,
.bdrv_has_zero_init = bdrv_has_zero_init_1,
.bdrv_co_flush_to_os = parallels_co_flush_to_os,

View File

@@ -823,7 +823,8 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp)
}
qcow_blk = blk_new_open(filename, NULL, NULL,
BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
&local_err);
if (qcow_blk == NULL) {
error_propagate(errp, local_err);
ret = -EIO;
@@ -1052,6 +1053,7 @@ static BlockDriver bdrv_qcow = {
.bdrv_probe = qcow_probe,
.bdrv_open = qcow_open,
.bdrv_close = qcow_close,
.bdrv_child_perm = bdrv_format_default_perms,
.bdrv_reopen_prepare = qcow_reopen_prepare,
.bdrv_create = qcow_create,
.bdrv_has_zero_init = bdrv_has_zero_init_1,

View File

@@ -2202,7 +2202,8 @@ static int qcow2_create2(const char *filename, int64_t total_size,
}
blk = blk_new_open(filename, NULL, NULL,
BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
&local_err);
if (blk == NULL) {
error_propagate(errp, local_err);
return -EIO;
@@ -2266,7 +2267,8 @@ static int qcow2_create2(const char *filename, int64_t total_size,
options = qdict_new();
qdict_put(options, "driver", qstring_from_str("qcow2"));
blk = blk_new_open(filename, NULL, options,
BDRV_O_RDWR | BDRV_O_NO_FLUSH, &local_err);
BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_NO_FLUSH,
&local_err);
if (blk == NULL) {
error_propagate(errp, local_err);
ret = -EIO;
@@ -3113,6 +3115,7 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
uint64_t cluster_size = s->cluster_size;
bool encrypt;
int refcount_bits = s->refcount_bits;
Error *local_err = NULL;
int ret;
QemuOptDesc *desc = opts->list->desc;
Qcow2AmendHelperCBInfo helper_cb_info;
@@ -3262,11 +3265,16 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
}
if (new_size) {
BlockBackend *blk = blk_new();
blk_insert_bs(blk, bs);
BlockBackend *blk = blk_new(BLK_PERM_RESIZE, BLK_PERM_ALL);
ret = blk_insert_bs(blk, bs, &local_err);
if (ret < 0) {
error_report_err(local_err);
blk_unref(blk);
return ret;
}
ret = blk_truncate(blk, new_size);
blk_unref(blk);
if (ret < 0) {
return ret;
}
@@ -3403,6 +3411,7 @@ BlockDriver bdrv_qcow2 = {
.bdrv_reopen_commit = qcow2_reopen_commit,
.bdrv_reopen_abort = qcow2_reopen_abort,
.bdrv_join_options = qcow2_join_options,
.bdrv_child_perm = bdrv_format_default_perms,
.bdrv_create = qcow2_create,
.bdrv_has_zero_init = bdrv_has_zero_init_1,
.bdrv_co_get_block_status = qcow2_co_get_block_status,

View File

@@ -625,7 +625,8 @@ static int qed_create(const char *filename, uint32_t cluster_size,
}
blk = blk_new_open(filename, NULL, NULL,
BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
&local_err);
if (blk == NULL) {
error_propagate(errp, local_err);
return -EIO;
@@ -1704,6 +1705,7 @@ static BlockDriver bdrv_qed = {
.bdrv_open = bdrv_qed_open,
.bdrv_close = bdrv_qed_close,
.bdrv_reopen_prepare = bdrv_qed_reopen_prepare,
.bdrv_child_perm = bdrv_format_default_perms,
.bdrv_create = bdrv_qed_create,
.bdrv_has_zero_init = bdrv_has_zero_init_1,
.bdrv_co_get_block_status = bdrv_qed_co_get_block_status,

View File

@@ -1032,10 +1032,17 @@ static void quorum_add_child(BlockDriverState *bs, BlockDriverState *child_bs,
/* We can safely add the child now */
bdrv_ref(child_bs);
child = bdrv_attach_child(bs, child_bs, indexstr, &child_format);
child = bdrv_attach_child(bs, child_bs, indexstr, &child_format, errp);
if (child == NULL) {
s->next_child_index--;
bdrv_unref(child_bs);
goto out;
}
s->children = g_renew(BdrvChild *, s->children, s->num_children + 1);
s->children[s->num_children++] = child;
out:
bdrv_drained_end(bs);
}
@@ -1126,6 +1133,8 @@ static BlockDriver bdrv_quorum = {
.bdrv_add_child = quorum_add_child,
.bdrv_del_child = quorum_del_child,
.bdrv_child_perm = bdrv_filter_default_perms,
.is_filter = true,
.bdrv_recurse_is_first_non_filter = quorum_recurse_is_first_non_filter,
};

View File

@@ -467,6 +467,7 @@ BlockDriver bdrv_raw = {
.bdrv_reopen_abort = &raw_reopen_abort,
.bdrv_open = &raw_open,
.bdrv_close = &raw_close,
.bdrv_child_perm = bdrv_filter_default_perms,
.bdrv_create = &raw_create,
.bdrv_co_preadv = &raw_co_preadv,
.bdrv_co_pwritev = &raw_co_pwritev,

View File

@@ -644,7 +644,7 @@ static void replication_stop(ReplicationState *rs, bool failover, Error **errp)
s->replication_state = BLOCK_REPLICATION_FAILOVER;
commit_active_start(NULL, s->active_disk->bs, s->secondary_disk->bs,
BLOCK_JOB_INTERNAL, 0, BLOCKDEV_ON_ERROR_REPORT,
replication_done, bs, errp, true);
NULL, replication_done, bs, errp, true);
break;
default:
aio_context_release(aio_context);
@@ -660,6 +660,7 @@ BlockDriver bdrv_replication = {
.bdrv_open = replication_open,
.bdrv_close = replication_close,
.bdrv_child_perm = bdrv_filter_default_perms,
.bdrv_getlength = replication_getlength,
.bdrv_co_readv = replication_co_readv,


@@ -1609,7 +1609,7 @@ static int sd_prealloc(const char *filename, Error **errp)
int ret;
blk = blk_new_open(filename, NULL, NULL,
BDRV_O_RDWR | BDRV_O_PROTOCOL, errp);
BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
if (blk == NULL) {
ret = -EIO;
goto out_with_err_set;


@@ -68,6 +68,7 @@ static void stream_complete(BlockJob *job, void *opaque)
StreamCompleteData *data = opaque;
BlockDriverState *bs = blk_bs(job->blk);
BlockDriverState *base = s->base;
Error *local_err = NULL;
if (!block_job_is_cancelled(&s->common) && data->reached_end &&
data->ret == 0) {
@@ -79,11 +80,19 @@ static void stream_complete(BlockJob *job, void *opaque)
}
}
data->ret = bdrv_change_backing_file(bs, base_id, base_fmt);
bdrv_set_backing_hd(bs, base);
bdrv_set_backing_hd(bs, base, &local_err);
if (local_err) {
error_report_err(local_err);
data->ret = -EPERM;
goto out;
}
}
out:
/* Reopen the image back in read-only mode if necessary */
if (s->bs_flags != bdrv_get_flags(bs)) {
/* Give up write permissions before making it read-only */
blk_set_perm(job->blk, 0, BLK_PERM_ALL, &error_abort);
bdrv_reopen(bs, s->bs_flags, NULL);
}
@@ -229,25 +238,35 @@ void stream_start(const char *job_id, BlockDriverState *bs,
BlockDriverState *iter;
int orig_bs_flags;
s = block_job_create(job_id, &stream_job_driver, bs, speed,
BLOCK_JOB_DEFAULT, NULL, NULL, errp);
if (!s) {
return;
}
/* Make sure that the image is opened in read-write mode */
orig_bs_flags = bdrv_get_flags(bs);
if (!(orig_bs_flags & BDRV_O_RDWR)) {
if (bdrv_reopen(bs, orig_bs_flags | BDRV_O_RDWR, errp) != 0) {
block_job_unref(&s->common);
return;
}
}
/* Block all intermediate nodes between bs and base, because they
* will disappear from the chain after this operation */
/* Prevent concurrent jobs trying to modify the graph structure here, we
* already have our own plans. Also don't allow resize as the image size is
* queried only at the job start and then cached. */
s = block_job_create(job_id, &stream_job_driver, bs,
BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
BLK_PERM_GRAPH_MOD,
BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
BLK_PERM_WRITE,
speed, BLOCK_JOB_DEFAULT, NULL, NULL, errp);
if (!s) {
goto fail;
}
/* Block all intermediate nodes between bs and base, because they will
* disappear from the chain after this operation. The streaming job reads
* every block only once, assuming that it doesn't change, so block writes
* and resizes. */
for (iter = backing_bs(bs); iter && iter != base; iter = backing_bs(iter)) {
block_job_add_bdrv(&s->common, iter);
block_job_add_bdrv(&s->common, "intermediate node", iter, 0,
BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED,
&error_abort);
}
s->base = base;
@@ -257,4 +276,10 @@ void stream_start(const char *job_id, BlockDriverState *bs,
s->on_error = on_error;
trace_stream_start(bs, base, s);
block_job_start(&s->common);
return;
fail:
if (orig_bs_flags != bdrv_get_flags(bs)) {
bdrv_reopen(bs, s->bs_flags, NULL);
}
}


@@ -763,7 +763,8 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
}
blk = blk_new_open(filename, NULL, NULL,
BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
&local_err);
if (blk == NULL) {
error_propagate(errp, local_err);
ret = -EIO;
@@ -891,6 +892,7 @@ static BlockDriver bdrv_vdi = {
.bdrv_open = vdi_open,
.bdrv_close = vdi_close,
.bdrv_reopen_prepare = vdi_reopen_prepare,
.bdrv_child_perm = bdrv_format_default_perms,
.bdrv_create = vdi_create,
.bdrv_has_zero_init = bdrv_has_zero_init_1,
.bdrv_co_get_block_status = vdi_co_get_block_status,


@@ -1859,7 +1859,8 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp)
}
blk = blk_new_open(filename, NULL, NULL,
BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
&local_err);
if (blk == NULL) {
error_propagate(errp, local_err);
ret = -EIO;
@@ -1983,6 +1984,7 @@ static BlockDriver bdrv_vhdx = {
.bdrv_open = vhdx_open,
.bdrv_close = vhdx_close,
.bdrv_reopen_prepare = vhdx_reopen_prepare,
.bdrv_child_perm = bdrv_format_default_perms,
.bdrv_co_readv = vhdx_co_readv,
.bdrv_co_writev = vhdx_co_writev,
.bdrv_create = vhdx_create,


@@ -1703,7 +1703,8 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
}
blk = blk_new_open(filename, NULL, NULL,
BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
&local_err);
if (blk == NULL) {
error_propagate(errp, local_err);
ret = -EIO;
@@ -2071,7 +2072,8 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
}
new_blk = blk_new_open(filename, NULL, NULL,
BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
&local_err);
if (new_blk == NULL) {
error_propagate(errp, local_err);
ret = -EIO;
@@ -2359,6 +2361,7 @@ static BlockDriver bdrv_vmdk = {
.bdrv_open = vmdk_open,
.bdrv_check = vmdk_check,
.bdrv_reopen_prepare = vmdk_reopen_prepare,
.bdrv_child_perm = bdrv_format_default_perms,
.bdrv_co_preadv = vmdk_co_preadv,
.bdrv_co_pwritev = vmdk_co_pwritev,
.bdrv_co_pwritev_compressed = vmdk_co_pwritev_compressed,


@@ -915,7 +915,8 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
}
blk = blk_new_open(filename, NULL, NULL,
BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
&local_err);
if (blk == NULL) {
error_propagate(errp, local_err);
ret = -EIO;
@@ -1067,6 +1068,7 @@ static BlockDriver bdrv_vpc = {
.bdrv_open = vpc_open,
.bdrv_close = vpc_close,
.bdrv_reopen_prepare = vpc_reopen_prepare,
.bdrv_child_perm = bdrv_format_default_perms,
.bdrv_create = vpc_create,
.bdrv_co_preadv = vpc_co_preadv,


@@ -3041,7 +3041,7 @@ static int enable_write_target(BlockDriverState *bs, Error **errp)
&error_abort);
*(void**) backing->opaque = s;
bdrv_set_backing_hd(s->bs, backing);
bdrv_set_backing_hd(s->bs, backing, &error_abort);
bdrv_unref(backing);
return 0;
@@ -3052,6 +3052,27 @@ err:
return ret;
}
static void vvfat_child_perm(BlockDriverState *bs, BdrvChild *c,
const BdrvChildRole *role,
uint64_t perm, uint64_t shared,
uint64_t *nperm, uint64_t *nshared)
{
BDRVVVFATState *s = bs->opaque;
assert(c == s->qcow || role == &child_backing);
if (c == s->qcow) {
/* This is a private node, nobody should try to attach to it */
*nperm = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE;
*nshared = BLK_PERM_WRITE_UNCHANGED;
} else {
/* The backing file is there so 'commit' can use it. vvfat doesn't
* access it in any way. */
*nperm = 0;
*nshared = BLK_PERM_ALL;
}
}
static void vvfat_close(BlockDriverState *bs)
{
BDRVVVFATState *s = bs->opaque;
@@ -3077,6 +3098,7 @@ static BlockDriver bdrv_vvfat = {
.bdrv_file_open = vvfat_open,
.bdrv_refresh_limits = vvfat_refresh_limits,
.bdrv_close = vvfat_close,
.bdrv_child_perm = vvfat_child_perm,
.bdrv_co_preadv = vvfat_co_preadv,
.bdrv_co_pwritev = vvfat_co_pwritev,


@@ -558,7 +558,7 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts,
if ((!file || !*file) && !qdict_size(bs_opts)) {
BlockBackendRootState *blk_rs;
blk = blk_new();
blk = blk_new(0, BLK_PERM_ALL);
blk_rs = blk_get_root_state(blk);
blk_rs->open_flags = bdrv_flags;
blk_rs->read_only = read_only;
@@ -1768,6 +1768,17 @@ static void external_snapshot_prepare(BlkActionState *common,
if (!state->new_bs->drv->supports_backing) {
error_setg(errp, "The snapshot does not support backing images");
return;
}
/* This removes our old bs and adds the new bs. This is an operation that
* can fail, so we need to do it in .prepare; undoing it for abort is
* always possible. */
bdrv_ref(state->new_bs);
bdrv_append(state->new_bs, state->old_bs, &local_err);
if (local_err) {
error_propagate(errp, local_err);
return;
}
}
@@ -1778,8 +1789,6 @@ static void external_snapshot_commit(BlkActionState *common)
bdrv_set_aio_context(state->new_bs, state->aio_context);
/* This removes our old bs and adds the new bs */
bdrv_append(state->new_bs, state->old_bs);
/* We don't need (or want) to use the transactional
* bdrv_reopen_multiple() across all the entries at once, because we
* don't want to abort all of them if one of them fails the reopen */
@@ -1794,7 +1803,9 @@ static void external_snapshot_abort(BlkActionState *common)
ExternalSnapshotState *state =
DO_UPCAST(ExternalSnapshotState, common, common);
if (state->new_bs) {
bdrv_unref(state->new_bs);
if (state->new_bs->backing) {
bdrv_replace_in_backing_chain(state->new_bs, state->old_bs);
}
}
}
@@ -1805,6 +1816,7 @@ static void external_snapshot_clean(BlkActionState *common)
if (state->aio_context) {
bdrv_drained_end(state->old_bs);
aio_context_release(state->aio_context);
bdrv_unref(state->new_bs);
}
}
@@ -2311,7 +2323,7 @@ static int do_open_tray(const char *blk_name, const char *qdev_id,
}
if (!locked || force) {
blk_dev_change_media_cb(blk, false);
blk_dev_change_media_cb(blk, false, &error_abort);
}
if (locked && !force) {
@@ -2349,6 +2361,7 @@ void qmp_blockdev_close_tray(bool has_device, const char *device,
Error **errp)
{
BlockBackend *blk;
Error *local_err = NULL;
device = has_device ? device : NULL;
id = has_id ? id : NULL;
@@ -2372,7 +2385,11 @@ void qmp_blockdev_close_tray(bool has_device, const char *device,
return;
}
blk_dev_change_media_cb(blk, true);
blk_dev_change_media_cb(blk, true, &local_err);
if (local_err) {
error_propagate(errp, local_err);
return;
}
}
void qmp_x_blockdev_remove_medium(bool has_device, const char *device,
@@ -2425,7 +2442,7 @@ void qmp_x_blockdev_remove_medium(bool has_device, const char *device,
* called at all); therefore, the medium needs to be ejected here.
* Do it after blk_remove_bs() so blk_is_inserted(blk) returns the @load
* value passed here (i.e. false). */
blk_dev_change_media_cb(blk, false);
blk_dev_change_media_cb(blk, false, &error_abort);
}
out:
@@ -2435,7 +2452,9 @@ out:
static void qmp_blockdev_insert_anon_medium(BlockBackend *blk,
BlockDriverState *bs, Error **errp)
{
Error *local_err = NULL;
bool has_device;
int ret;
/* For BBs without a device, we can exchange the BDS tree at will */
has_device = blk_get_attached_dev(blk);
@@ -2455,7 +2474,10 @@ static void qmp_blockdev_insert_anon_medium(BlockBackend *blk,
return;
}
blk_insert_bs(blk, bs);
ret = blk_insert_bs(blk, bs, errp);
if (ret < 0) {
return;
}
if (!blk_dev_has_tray(blk)) {
/* For tray-less devices, blockdev-close-tray is a no-op (or may not be
@@ -2463,7 +2485,12 @@ static void qmp_blockdev_insert_anon_medium(BlockBackend *blk,
* slot here.
* Do it after blk_insert_bs() so blk_is_inserted(blk) returns the @load
* value passed here (i.e. true). */
blk_dev_change_media_cb(blk, true);
blk_dev_change_media_cb(blk, true, &local_err);
if (local_err) {
error_propagate(errp, local_err);
blk_remove_bs(blk);
return;
}
}
}
@@ -2890,8 +2917,11 @@ void qmp_block_resize(bool has_device, const char *device,
goto out;
}
blk = blk_new();
blk_insert_bs(blk, bs);
blk = blk_new(BLK_PERM_RESIZE, BLK_PERM_ALL);
ret = blk_insert_bs(blk, bs, errp);
if (ret < 0) {
goto out;
}
/* complete all in-flight operations before resizing the device */
bdrv_drain_all();
@@ -3014,6 +3044,7 @@ void qmp_block_commit(bool has_job_id, const char *job_id, const char *device,
bool has_top, const char *top,
bool has_backing_file, const char *backing_file,
bool has_speed, int64_t speed,
bool has_filter_node_name, const char *filter_node_name,
Error **errp)
{
BlockDriverState *bs;
@@ -3029,6 +3060,9 @@ void qmp_block_commit(bool has_job_id, const char *job_id, const char *device,
if (!has_speed) {
speed = 0;
}
if (!has_filter_node_name) {
filter_node_name = NULL;
}
/* Important Note:
* libvirt relies on the DeviceNotFound error class in order to probe for
@@ -3103,8 +3137,8 @@ void qmp_block_commit(bool has_job_id, const char *job_id, const char *device,
goto out;
}
commit_active_start(has_job_id ? job_id : NULL, bs, base_bs,
BLOCK_JOB_DEFAULT, speed, on_error, NULL, NULL,
&local_err, false);
BLOCK_JOB_DEFAULT, speed, on_error,
filter_node_name, NULL, NULL, &local_err, false);
} else {
BlockDriverState *overlay_bs = bdrv_find_overlay(bs, top_bs);
if (bdrv_op_is_blocked(overlay_bs, BLOCK_OP_TYPE_COMMIT_TARGET, errp)) {
@@ -3112,7 +3146,7 @@ void qmp_block_commit(bool has_job_id, const char *job_id, const char *device,
}
commit_start(has_job_id ? job_id : NULL, bs, base_bs, top_bs, speed,
on_error, has_backing_file ? backing_file : NULL,
&local_err);
filter_node_name, &local_err);
}
if (local_err != NULL) {
error_propagate(errp, local_err);
@@ -3348,6 +3382,8 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
bool has_on_target_error,
BlockdevOnError on_target_error,
bool has_unmap, bool unmap,
bool has_filter_node_name,
const char *filter_node_name,
Error **errp)
{
@@ -3369,6 +3405,9 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
if (!has_unmap) {
unmap = true;
}
if (!has_filter_node_name) {
filter_node_name = NULL;
}
if (granularity != 0 && (granularity < 512 || granularity > 1048576 * 64)) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "granularity",
@@ -3398,7 +3437,8 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
mirror_start(job_id, bs, target,
has_replaces ? replaces : NULL,
speed, granularity, buf_size, sync, backing_mode,
on_source_error, on_target_error, unmap, errp);
on_source_error, on_target_error, unmap, filter_node_name,
errp);
}
void qmp_drive_mirror(DriveMirror *arg, Error **errp)
@@ -3536,6 +3576,7 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
arg->has_on_source_error, arg->on_source_error,
arg->has_on_target_error, arg->on_target_error,
arg->has_unmap, arg->unmap,
false, NULL,
&local_err);
bdrv_unref(target_bs);
error_propagate(errp, local_err);
@@ -3554,6 +3595,8 @@ void qmp_blockdev_mirror(bool has_job_id, const char *job_id,
BlockdevOnError on_source_error,
bool has_on_target_error,
BlockdevOnError on_target_error,
bool has_filter_node_name,
const char *filter_node_name,
Error **errp)
{
BlockDriverState *bs;
@@ -3585,6 +3628,7 @@ void qmp_blockdev_mirror(bool has_job_id, const char *job_id,
has_on_source_error, on_source_error,
has_on_target_error, on_target_error,
true, true,
has_filter_node_name, filter_node_name,
&local_err);
error_propagate(errp, local_err);


@@ -55,6 +55,19 @@ struct BlockJobTxn {
static QLIST_HEAD(, BlockJob) block_jobs = QLIST_HEAD_INITIALIZER(block_jobs);
static char *child_job_get_parent_desc(BdrvChild *c)
{
BlockJob *job = c->opaque;
return g_strdup_printf("%s job '%s'",
BlockJobType_lookup[job->driver->job_type],
job->id);
}
static const BdrvChildRole child_job = {
.get_parent_desc = child_job_get_parent_desc,
.stay_at_node = true,
};
BlockJob *block_job_next(BlockJob *job)
{
if (!job) {
@@ -115,19 +128,44 @@ static void block_job_detach_aio_context(void *opaque)
block_job_unref(job);
}
void block_job_add_bdrv(BlockJob *job, BlockDriverState *bs)
void block_job_remove_all_bdrv(BlockJob *job)
{
job->nodes = g_slist_prepend(job->nodes, bs);
GSList *l;
for (l = job->nodes; l; l = l->next) {
BdrvChild *c = l->data;
bdrv_op_unblock_all(c->bs, job->blocker);
bdrv_root_unref_child(c);
}
g_slist_free(job->nodes);
job->nodes = NULL;
}
int block_job_add_bdrv(BlockJob *job, const char *name, BlockDriverState *bs,
uint64_t perm, uint64_t shared_perm, Error **errp)
{
BdrvChild *c;
c = bdrv_root_attach_child(bs, name, &child_job, perm, shared_perm,
job, errp);
if (c == NULL) {
return -EPERM;
}
job->nodes = g_slist_prepend(job->nodes, c);
bdrv_ref(bs);
bdrv_op_block_all(bs, job->blocker);
return 0;
}
void *block_job_create(const char *job_id, const BlockJobDriver *driver,
BlockDriverState *bs, int64_t speed, int flags,
BlockDriverState *bs, uint64_t perm,
uint64_t shared_perm, int64_t speed, int flags,
BlockCompletionFunc *cb, void *opaque, Error **errp)
{
BlockBackend *blk;
BlockJob *job;
int ret;
if (bs->job) {
error_setg(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
@@ -159,13 +197,17 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
}
}
blk = blk_new();
blk_insert_bs(blk, bs);
blk = blk_new(perm, shared_perm);
ret = blk_insert_bs(blk, bs, errp);
if (ret < 0) {
blk_unref(blk);
return NULL;
}
job = g_malloc0(driver->instance_size);
error_setg(&job->blocker, "block device is in use by block job: %s",
BlockJobType_lookup[driver->job_type]);
block_job_add_bdrv(job, bs);
block_job_add_bdrv(job, "main node", bs, 0, BLK_PERM_ALL, &error_abort);
bdrv_op_unblock(bs, BLOCK_OP_TYPE_DATAPLANE, job->blocker);
job->driver = driver;
@@ -228,15 +270,9 @@ void block_job_ref(BlockJob *job)
void block_job_unref(BlockJob *job)
{
if (--job->refcnt == 0) {
GSList *l;
BlockDriverState *bs = blk_bs(job->blk);
bs->job = NULL;
for (l = job->nodes; l; l = l->next) {
bs = l->data;
bdrv_op_unblock_all(bs, job->blocker);
bdrv_unref(bs);
}
g_slist_free(job->nodes);
block_job_remove_all_bdrv(job);
blk_remove_aio_context_notifier(job->blk,
block_job_attached_aio_context,
block_job_detach_aio_context, job);


@@ -42,6 +42,8 @@ CONFIG_ARM11MPCORE=y
CONFIG_A9MPCORE=y
CONFIG_A15MPCORE=y
CONFIG_ARM_V7M=y
CONFIG_ARM_GIC=y
CONFIG_ARM_GIC_KVM=$(CONFIG_KVM)
CONFIG_ARM_TIMER=y


@@ -161,6 +161,11 @@ include/hw/hw.h.
=== More about versions ===
Version numbers are intended for major incompatible changes to the
migration of a device, and using them breaks backwards-migration
compatibility; in general most changes can be made by adding Subsections
(see below) or _TEST macros (see below) which won't break compatibility.
You can see that there are several version fields:
- version_id: the maximum version_id supported by VMState for that device.
@@ -175,6 +180,9 @@ version_id. And the function load_state_old() (if present) is able to
load state from minimum_version_id_old to minimum_version_id. This
function is deprecated and will be removed when no more users are left.
Saving state will always create a section with the 'version_id' value
and thus can't be loaded by any older QEMU.
=== Massaging functions ===
Sometimes, it is not enough to be able to save the state directly
@@ -292,6 +300,56 @@ save/send this state when we are in the middle of a pio operation
not enabled, the values in those fields are garbage and don't need to
be sent.
Using a condition function that checks a 'property' to determine whether
to send a subsection allows backwards migration compatibility when
new subsections are added.
For example (a code sketch follows below):
a) Add a new property using DEFINE_PROP_BOOL - e.g. support-foo and
default it to true.
b) Add an entry to the HW_COMPAT_ for the previous version
that sets the property to false.
c) Add a static bool support_foo function that tests the property.
d) Add a subsection with a .needed set to the support_foo function
e) (potentially) Add a pre_load that sets up a default value for 'foo'
to be used if the subsection isn't loaded.
Now that subsection will not be generated when using an older
machine type and the migration stream will be accepted by older
QEMU versions. Pre-load functions can be used to initialise state
on the newer version so that they default to suitable values
when loading streams created by older QEMU versions that do not
generate the subsection.
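A minimal sketch of steps (a) to (d), with made-up names (support-foo,
foo_state_needed, MyDeviceState) used purely for illustration:

    static bool foo_state_needed(void *opaque)
    {
        MyDeviceState *s = opaque;
        /* Send the subsection only when the compat property is enabled */
        return s->support_foo;
    }

    static const VMStateDescription vmstate_foo = {
        .name = "mydevice/foo",
        .version_id = 1,
        .minimum_version_id = 1,
        .needed = foo_state_needed,
        .fields = (VMStateField[]) {
            VMSTATE_UINT32(foo, MyDeviceState),
            VMSTATE_END_OF_LIST()
        }
    };

    /* ...and hooked into the device's main VMStateDescription: */
    .subsections = (const VMStateDescription*[]) {
        &vmstate_foo,
        NULL
    }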
In some cases subsections are added for data that had been accidentally
omitted by earlier versions; if the missing data causes the migration
process to succeed but the guest to behave badly then it may be better
to send the subsection and cause the migration to explicitly fail
with the unknown subsection error. If the bad behaviour only happens
with certain data values, making the subsection conditional on
the data value (rather than the machine type) allows migrations to succeed
in most cases. In general the preference is to tie the subsection to
the machine type, and allow reliable migrations, unless the behaviour
from omission of the subsection is really bad.
= Not sending existing elements =
Sometimes members of the VMState are no longer needed:
- removing them will break migration compatibility
- making them version dependent and bumping the version will break backwards
  migration compatibility.
The best way is to:
a) Add a new property/compatibility/function in the same way for subsections
above.
b) replace the VMSTATE macro with the _TEST version of the macro, e.g.:
VMSTATE_UINT32(foo, barstruct)
becomes
VMSTATE_UINT32_TEST(foo, barstruct, pre_version_baz)
Sometime in the future when we no longer care about the ancient
versions these can be killed off.
= Return path =
In most migration scenarios there is only a single data path that runs
@@ -482,3 +540,16 @@ request for a page that has already been sent is ignored. Duplicate requests
such as this can happen as a page is sent at about the same time the
destination accesses it.
=== Postcopy with hugepages ===
Postcopy now works with hugetlbfs backed memory:
a) The Linux kernel on the destination must support userfault on hugepages.
b) The huge-page configuration on the source and destination VMs must be
identical; i.e. RAMBlocks on both sides must use the same page size.
c) Note that -mem-path /dev/hugepages will fall back to allocating normal
RAM if it doesn't have enough hugepages, triggering (b) to fail.
Using -mem-prealloc enforces the allocation using hugepages (see the example command line below).
d) Care should be taken with the size of hugepage used; postcopy with 2MB
hugepages works well, however 1GB hugepages are likely to be problematic
since it takes ~1 second to transfer a 1GB hugepage across a 10Gbps link,
and until the full page is transferred the destination thread is blocked.
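For instance (memory size and path purely illustrative), both source and
destination could be started with hugepage-backed guest RAM like this:

    qemu-system-x86_64 -m 4G -mem-path /dev/hugepages -mem-prealloc ...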


@@ -225,3 +225,10 @@ recording the virtual machine this filter puts all packets coming from
the outer world into the log. In replay mode packets from the log are
injected into the network device. All interactions with network backend
in replay mode are disabled.
Audio devices
-------------
Audio data is recorded and replayed automatically. The command line for recording
and replaying must contain identical specifications of audio hardware, e.g.:
-soundhw ac97
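For instance, a recording run and the matching replay run might look like
this (illustrative only; apart from rr=record/rr=replay the command lines
must be identical):

    qemu-system-i386 -icount shift=7,rr=record,rrfile=replay.bin -soundhw ac97 ...
    qemu-system-i386 -icount shift=7,rr=replay,rrfile=replay.bin -soundhw ac97 ...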

dtc

Submodule dtc updated: ec02b34c05...fa8bc7f928

exec.c

@@ -45,6 +45,12 @@
#include "exec/address-spaces.h"
#include "sysemu/xen-mapcache.h"
#include "trace-root.h"
#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
#include <fcntl.h>
#include <linux/falloc.h>
#endif
#endif
#include "exec/cpu-all.h"
#include "qemu/rcu_queue.h"
@@ -1518,6 +1524,19 @@ size_t qemu_ram_pagesize(RAMBlock *rb)
return rb->page_size;
}
/* Returns the largest size of page in use */
size_t qemu_ram_pagesize_largest(void)
{
RAMBlock *block;
size_t largest = 0;
QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
largest = MAX(largest, qemu_ram_pagesize(block));
}
return largest;
}
static int memory_try_enable_merging(void *addr, size_t len)
{
if (!machine_mem_merge(current_machine)) {
@@ -3294,4 +3313,68 @@ int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
rcu_read_unlock();
return ret;
}
/*
* Unmap pages of memory from start to start+length such that
* they a) read as 0, b) Trigger whatever fault mechanism
* the OS provides for postcopy.
* The pages must be unmapped by the end of the function.
* Returns: 0 on success, non-zero on failure
*
*/
int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
{
int ret = -1;
uint8_t *host_startaddr = rb->host + start;
if ((uintptr_t)host_startaddr & (rb->page_size - 1)) {
error_report("ram_block_discard_range: Unaligned start address: %p",
host_startaddr);
goto err;
}
if ((start + length) <= rb->used_length) {
uint8_t *host_endaddr = host_startaddr + length;
if ((uintptr_t)host_endaddr & (rb->page_size - 1)) {
error_report("ram_block_discard_range: Unaligned end address: %p",
host_endaddr);
goto err;
}
errno = ENOTSUP; /* If we are missing MADVISE etc */
if (rb->page_size == qemu_host_page_size) {
#if defined(CONFIG_MADVISE)
/* Note: We need the madvise MADV_DONTNEED behaviour of definitely
* freeing the page.
*/
ret = madvise(host_startaddr, length, MADV_DONTNEED);
#endif
} else {
/* Huge page case - unfortunately it can't do DONTNEED, but
* it can do the equivalent by FALLOC_FL_PUNCH_HOLE in the
* huge page file.
*/
#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
start, length);
#endif
}
if (ret) {
ret = -errno;
error_report("ram_block_discard_range: Failed to discard range "
"%s:%" PRIx64 " +%zx (%d)",
rb->idstr, start, length, ret);
}
} else {
error_report("ram_block_discard_range: Overrun block '%s' (%" PRIu64
"/%zx/" RAM_ADDR_FMT")",
rb->idstr, start, length, rb->used_length);
}
err:
return ret;
}
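/* Illustrative (hypothetical) caller: discard a single, page-aligned page of
 * a RAMBlock, e.g. so that postcopy can re-populate it on demand:
 *
 *     ret = ram_block_discard_range(rb, offset, qemu_ram_pagesize(rb));
 *     if (ret < 0) {
 *         error handling; the range stays mapped
 *     }
 */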
#endif

hmp.c

@@ -2045,13 +2045,17 @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
const char* device = qdict_get_str(qdict, "device");
const char* command = qdict_get_str(qdict, "command");
Error *err = NULL;
int ret;
blk = blk_by_name(device);
if (!blk) {
BlockDriverState *bs = bdrv_lookup_bs(NULL, device, &err);
if (bs) {
blk = local_blk = blk_new();
blk_insert_bs(blk, bs);
blk = local_blk = blk_new(0, BLK_PERM_ALL);
ret = blk_insert_bs(blk, bs, &err);
if (ret < 0) {
goto fail;
}
} else {
goto fail;
}
@@ -2060,6 +2064,31 @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
aio_context = blk_get_aio_context(blk);
aio_context_acquire(aio_context);
/*
* Notably absent: Proper permission management. This is sad, but it seems
* almost impossible to achieve without changing the semantics and thereby
* limiting the use cases of the qemu-io HMP command.
*
* In an ideal world we would unconditionally create a new BlockBackend for
* qemuio_command(), but we have commands like 'reopen' and want them to
* take effect on the exact BlockBackend whose name the user passed instead
* of just on a temporary copy of it.
*
* Another problem is that deleting the temporary BlockBackend involves
* draining all requests on it first, but some qemu-iotests cases want to
* issue multiple aio_read/write requests and expect them to complete in
* the background while the monitor has already returned.
*
* This is also what prevents us from saving the original permissions and
* restoring them later: We can't revoke permissions until all requests
* have completed, and we don't know when that is nor can we really let
* anything else run before we have revoked them to avoid race conditions.
*
* What happens now is that command() in qemu-io-cmds.c can extend the
* permissions if necessary for the qemu-io command. And they simply stay
* extended, possibly resulting in a read-only guest device keeping write
* permissions. Ugly, but it appears to be the lesser evil.
*/
qemuio_command(blk, command);
aio_context_release(aio_context);

File diff suppressed because it is too large.

hw/9pfs/9p-local.h (new file)

@@ -0,0 +1,20 @@
/*
* 9p local backend utilities
*
* Copyright IBM, Corp. 2017
*
* Authors:
* Greg Kurz <groug@kaod.org>
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*/
#ifndef QEMU_9P_LOCAL_H
#define QEMU_9P_LOCAL_H
int local_open_nofollow(FsContext *fs_ctx, const char *path, int flags,
mode_t mode);
int local_opendir_nofollow(FsContext *fs_ctx, const char *path);
#endif


@@ -25,13 +25,7 @@
static ssize_t mp_pacl_getxattr(FsContext *ctx, const char *path,
const char *name, void *value, size_t size)
{
char *buffer;
ssize_t ret;
buffer = rpath(ctx, path);
ret = lgetxattr(buffer, MAP_ACL_ACCESS, value, size);
g_free(buffer);
return ret;
return local_getxattr_nofollow(ctx, path, MAP_ACL_ACCESS, value, size);
}
static ssize_t mp_pacl_listxattr(FsContext *ctx, const char *path,
@@ -56,23 +50,16 @@ static ssize_t mp_pacl_listxattr(FsContext *ctx, const char *path,
static int mp_pacl_setxattr(FsContext *ctx, const char *path, const char *name,
void *value, size_t size, int flags)
{
char *buffer;
int ret;
buffer = rpath(ctx, path);
ret = lsetxattr(buffer, MAP_ACL_ACCESS, value, size, flags);
g_free(buffer);
return ret;
return local_setxattr_nofollow(ctx, path, MAP_ACL_ACCESS, value, size,
flags);
}
static int mp_pacl_removexattr(FsContext *ctx,
const char *path, const char *name)
{
int ret;
char *buffer;
buffer = rpath(ctx, path);
ret = lremovexattr(buffer, MAP_ACL_ACCESS);
ret = local_removexattr_nofollow(ctx, path, MAP_ACL_ACCESS);
if (ret == -1 && errno == ENODATA) {
/*
* We don't get ENODATA error when trying to remove a
@@ -82,20 +69,13 @@ static int mp_pacl_removexattr(FsContext *ctx,
errno = 0;
ret = 0;
}
g_free(buffer);
return ret;
}
static ssize_t mp_dacl_getxattr(FsContext *ctx, const char *path,
const char *name, void *value, size_t size)
{
char *buffer;
ssize_t ret;
buffer = rpath(ctx, path);
ret = lgetxattr(buffer, MAP_ACL_DEFAULT, value, size);
g_free(buffer);
return ret;
return local_getxattr_nofollow(ctx, path, MAP_ACL_DEFAULT, value, size);
}
static ssize_t mp_dacl_listxattr(FsContext *ctx, const char *path,
@@ -120,23 +100,16 @@ static ssize_t mp_dacl_listxattr(FsContext *ctx, const char *path,
static int mp_dacl_setxattr(FsContext *ctx, const char *path, const char *name,
void *value, size_t size, int flags)
{
char *buffer;
int ret;
buffer = rpath(ctx, path);
ret = lsetxattr(buffer, MAP_ACL_DEFAULT, value, size, flags);
g_free(buffer);
return ret;
return local_setxattr_nofollow(ctx, path, MAP_ACL_DEFAULT, value, size,
flags);
}
static int mp_dacl_removexattr(FsContext *ctx,
const char *path, const char *name)
{
int ret;
char *buffer;
buffer = rpath(ctx, path);
ret = lremovexattr(buffer, MAP_ACL_DEFAULT);
ret = local_removexattr_nofollow(ctx, path, MAP_ACL_DEFAULT);
if (ret == -1 && errno == ENODATA) {
/*
* We don't get ENODATA error when trying to remove a
@@ -146,7 +119,6 @@ static int mp_dacl_removexattr(FsContext *ctx,
errno = 0;
ret = 0;
}
g_free(buffer);
return ret;
}

hw/9pfs/9p-util.c (new file)

@@ -0,0 +1,69 @@
/*
* 9p utilities
*
* Copyright IBM, Corp. 2017
*
* Authors:
* Greg Kurz <groug@kaod.org>
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*/
#include "qemu/osdep.h"
#include "qemu/xattr.h"
#include "9p-util.h"
int relative_openat_nofollow(int dirfd, const char *path, int flags,
mode_t mode)
{
int fd;
fd = dup(dirfd);
if (fd == -1) {
return -1;
}
while (*path) {
const char *c;
int next_fd;
char *head;
/* Only relative paths without consecutive slashes */
assert(path[0] != '/');
head = g_strdup(path);
c = strchr(path, '/');
if (c) {
head[c - path] = 0;
next_fd = openat_dir(fd, head);
} else {
next_fd = openat_file(fd, head, flags, mode);
}
g_free(head);
if (next_fd == -1) {
close_preserve_errno(fd);
return -1;
}
close(fd);
fd = next_fd;
if (!c) {
break;
}
path = c + 1;
}
return fd;
}
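/*
 * Example (hypothetical): open "a/b/file" relative to dirfd one component at
 * a time; the final component is opened with O_NOFOLLOW, so the call fails if
 * it is a symbolic link:
 *
 *     int fd = relative_openat_nofollow(dirfd, "a/b/file", O_RDONLY, 0);
 */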
ssize_t fgetxattrat_nofollow(int dirfd, const char *filename, const char *name,
void *value, size_t size)
{
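    /* There is no fgetxattrat() syscall; emulate it by addressing the file
     * through /proc/self/fd/<dirfd>/<filename> and using lgetxattr(), so that
     * a symlink as the final component is not followed. */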
char *proc_path = g_strdup_printf("/proc/self/fd/%d/%s", dirfd, filename);
int ret;
ret = lgetxattr(proc_path, name, value, size);
g_free(proc_path);
return ret;
}

hw/9pfs/9p-util.h (new file)

@@ -0,0 +1,54 @@
/*
* 9p utilities
*
* Copyright IBM, Corp. 2017
*
* Authors:
* Greg Kurz <groug@kaod.org>
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*/
#ifndef QEMU_9P_UTIL_H
#define QEMU_9P_UTIL_H
static inline void close_preserve_errno(int fd)
{
int serrno = errno;
close(fd);
errno = serrno;
}
static inline int openat_dir(int dirfd, const char *name)
{
return openat(dirfd, name, O_DIRECTORY | O_RDONLY | O_PATH);
}
static inline int openat_file(int dirfd, const char *name, int flags,
mode_t mode)
{
int fd, serrno, ret;
fd = openat(dirfd, name, flags | O_NOFOLLOW | O_NOCTTY | O_NONBLOCK,
mode);
if (fd == -1) {
return -1;
}
serrno = errno;
/* O_NONBLOCK was only needed to open the file. Let's drop it. */
ret = fcntl(fd, F_SETFL, flags);
assert(!ret);
errno = serrno;
return fd;
}
int relative_openat_nofollow(int dirfd, const char *path, int flags,
mode_t mode);
ssize_t fgetxattrat_nofollow(int dirfd, const char *path, const char *name,
void *value, size_t size);
int fsetxattrat_nofollow(int dirfd, const char *path, const char *name,
void *value, size_t size, int flags);
#endif


@@ -20,9 +20,6 @@
static ssize_t mp_user_getxattr(FsContext *ctx, const char *path,
const char *name, void *value, size_t size)
{
char *buffer;
ssize_t ret;
if (strncmp(name, "user.virtfs.", 12) == 0) {
/*
* Don't allow fetch of user.virtfs namespace
@@ -31,10 +28,7 @@ static ssize_t mp_user_getxattr(FsContext *ctx, const char *path,
errno = ENOATTR;
return -1;
}
buffer = rpath(ctx, path);
ret = lgetxattr(buffer, name, value, size);
g_free(buffer);
return ret;
return local_getxattr_nofollow(ctx, path, name, value, size);
}
static ssize_t mp_user_listxattr(FsContext *ctx, const char *path,
@@ -73,9 +67,6 @@ static ssize_t mp_user_listxattr(FsContext *ctx, const char *path,
static int mp_user_setxattr(FsContext *ctx, const char *path, const char *name,
void *value, size_t size, int flags)
{
char *buffer;
int ret;
if (strncmp(name, "user.virtfs.", 12) == 0) {
/*
* Don't allow fetch of user.virtfs namespace
@@ -84,18 +75,12 @@ static int mp_user_setxattr(FsContext *ctx, const char *path, const char *name,
errno = EACCES;
return -1;
}
buffer = rpath(ctx, path);
ret = lsetxattr(buffer, name, value, size, flags);
g_free(buffer);
return ret;
return local_setxattr_nofollow(ctx, path, name, value, size, flags);
}
static int mp_user_removexattr(FsContext *ctx,
const char *path, const char *name)
{
char *buffer;
int ret;
if (strncmp(name, "user.virtfs.", 12) == 0) {
/*
* Don't allow fetch of user.virtfs namespace
@@ -104,10 +89,7 @@ static int mp_user_removexattr(FsContext *ctx,
errno = EACCES;
return -1;
}
buffer = rpath(ctx, path);
ret = lremovexattr(buffer, name);
g_free(buffer);
return ret;
return local_removexattr_nofollow(ctx, path, name);
}
XattrOperations mapped_user_xattr = {


@@ -15,6 +15,8 @@
#include "9p.h"
#include "fsdev/file-op-9p.h"
#include "9p-xattr.h"
#include "9p-util.h"
#include "9p-local.h"
static XattrOperations *get_xattr_operations(XattrOperations **h,
@@ -58,6 +60,16 @@ ssize_t pt_listxattr(FsContext *ctx, const char *path,
return name_size;
}
static ssize_t flistxattrat_nofollow(int dirfd, const char *filename,
char *list, size_t size)
{
char *proc_path = g_strdup_printf("/proc/self/fd/%d/%s", dirfd, filename);
int ret;
ret = llistxattr(proc_path, list, size);
g_free(proc_path);
return ret;
}
/*
* Get the list and pass to each layer to find out whether
@@ -67,24 +79,37 @@ ssize_t v9fs_list_xattr(FsContext *ctx, const char *path,
void *value, size_t vsize)
{
ssize_t size = 0;
char *buffer;
void *ovalue = value;
XattrOperations *xops;
char *orig_value, *orig_value_start;
ssize_t xattr_len, parsed_len = 0, attr_len;
char *dirpath, *name;
int dirfd;
/* Get the actual len */
buffer = rpath(ctx, path);
xattr_len = llistxattr(buffer, value, 0);
dirpath = g_path_get_dirname(path);
dirfd = local_opendir_nofollow(ctx, dirpath);
g_free(dirpath);
if (dirfd == -1) {
return -1;
}
name = g_path_get_basename(path);
xattr_len = flistxattrat_nofollow(dirfd, name, value, 0);
if (xattr_len <= 0) {
g_free(buffer);
g_free(name);
close_preserve_errno(dirfd);
return xattr_len;
}
/* Now fetch the xattr and find the actual size */
orig_value = g_malloc(xattr_len);
xattr_len = llistxattr(buffer, orig_value, xattr_len);
g_free(buffer);
xattr_len = flistxattrat_nofollow(dirfd, name, orig_value, xattr_len);
g_free(name);
close_preserve_errno(dirfd);
if (xattr_len < 0) {
return -1;
}
/* store the orig pointer */
orig_value_start = orig_value;
@@ -143,6 +168,135 @@ int v9fs_remove_xattr(FsContext *ctx,
}
ssize_t local_getxattr_nofollow(FsContext *ctx, const char *path,
const char *name, void *value, size_t size)
{
char *dirpath = g_path_get_dirname(path);
char *filename = g_path_get_basename(path);
int dirfd;
ssize_t ret = -1;
dirfd = local_opendir_nofollow(ctx, dirpath);
if (dirfd == -1) {
goto out;
}
ret = fgetxattrat_nofollow(dirfd, filename, name, value, size);
close_preserve_errno(dirfd);
out:
g_free(dirpath);
g_free(filename);
return ret;
}
ssize_t pt_getxattr(FsContext *ctx, const char *path, const char *name,
void *value, size_t size)
{
return local_getxattr_nofollow(ctx, path, name, value, size);
}
int fsetxattrat_nofollow(int dirfd, const char *filename, const char *name,
void *value, size_t size, int flags)
{
char *proc_path = g_strdup_printf("/proc/self/fd/%d/%s", dirfd, filename);
int ret;
ret = lsetxattr(proc_path, name, value, size, flags);
g_free(proc_path);
return ret;
}
ssize_t local_setxattr_nofollow(FsContext *ctx, const char *path,
const char *name, void *value, size_t size,
int flags)
{
char *dirpath = g_path_get_dirname(path);
char *filename = g_path_get_basename(path);
int dirfd;
ssize_t ret = -1;
dirfd = local_opendir_nofollow(ctx, dirpath);
if (dirfd == -1) {
goto out;
}
ret = fsetxattrat_nofollow(dirfd, filename, name, value, size, flags);
close_preserve_errno(dirfd);
out:
g_free(dirpath);
g_free(filename);
return ret;
}
int pt_setxattr(FsContext *ctx, const char *path, const char *name, void *value,
size_t size, int flags)
{
return local_setxattr_nofollow(ctx, path, name, value, size, flags);
}
static ssize_t fremovexattrat_nofollow(int dirfd, const char *filename,
const char *name)
{
char *proc_path = g_strdup_printf("/proc/self/fd/%d/%s", dirfd, filename);
int ret;
ret = lremovexattr(proc_path, name);
g_free(proc_path);
return ret;
}
ssize_t local_removexattr_nofollow(FsContext *ctx, const char *path,
const char *name)
{
char *dirpath = g_path_get_dirname(path);
char *filename = g_path_get_basename(path);
int dirfd;
ssize_t ret = -1;
dirfd = local_opendir_nofollow(ctx, dirpath);
if (dirfd == -1) {
goto out;
}
ret = fremovexattrat_nofollow(dirfd, filename, name);
close_preserve_errno(dirfd);
out:
g_free(dirpath);
g_free(filename);
return ret;
}
int pt_removexattr(FsContext *ctx, const char *path, const char *name)
{
return local_removexattr_nofollow(ctx, path, name);
}
ssize_t notsup_getxattr(FsContext *ctx, const char *path, const char *name,
void *value, size_t size)
{
errno = ENOTSUP;
return -1;
}
int notsup_setxattr(FsContext *ctx, const char *path, const char *name,
void *value, size_t size, int flags)
{
errno = ENOTSUP;
return -1;
}
ssize_t notsup_listxattr(FsContext *ctx, const char *path, char *name,
void *value, size_t size)
{
return 0;
}
int notsup_removexattr(FsContext *ctx, const char *path, const char *name)
{
errno = ENOTSUP;
return -1;
}
XattrOperations *mapped_xattr_ops[] = {
&mapped_user_xattr,
&mapped_pacl_xattr,


@@ -29,6 +29,13 @@ typedef struct xattr_operations
const char *path, const char *name);
} XattrOperations;
ssize_t local_getxattr_nofollow(FsContext *ctx, const char *path,
const char *name, void *value, size_t size);
ssize_t local_setxattr_nofollow(FsContext *ctx, const char *path,
const char *name, void *value, size_t size,
int flags);
ssize_t local_removexattr_nofollow(FsContext *ctx, const char *path,
const char *name);
extern XattrOperations mapped_user_xattr;
extern XattrOperations passthrough_user_xattr;
@@ -49,73 +56,21 @@ ssize_t v9fs_list_xattr(FsContext *ctx, const char *path, void *value,
int v9fs_set_xattr(FsContext *ctx, const char *path, const char *name,
void *value, size_t size, int flags);
int v9fs_remove_xattr(FsContext *ctx, const char *path, const char *name);
ssize_t pt_listxattr(FsContext *ctx, const char *path, char *name, void *value,
size_t size);
ssize_t pt_getxattr(FsContext *ctx, const char *path, const char *name,
void *value, size_t size);
int pt_setxattr(FsContext *ctx, const char *path, const char *name, void *value,
size_t size, int flags);
int pt_removexattr(FsContext *ctx, const char *path, const char *name);
static inline ssize_t pt_getxattr(FsContext *ctx, const char *path,
const char *name, void *value, size_t size)
{
char *buffer;
ssize_t ret;
buffer = rpath(ctx, path);
ret = lgetxattr(buffer, name, value, size);
g_free(buffer);
return ret;
}
static inline int pt_setxattr(FsContext *ctx, const char *path,
const char *name, void *value,
size_t size, int flags)
{
char *buffer;
int ret;
buffer = rpath(ctx, path);
ret = lsetxattr(buffer, name, value, size, flags);
g_free(buffer);
return ret;
}
static inline int pt_removexattr(FsContext *ctx,
const char *path, const char *name)
{
char *buffer;
int ret;
buffer = rpath(ctx, path);
ret = lremovexattr(path, name);
g_free(buffer);
return ret;
}
static inline ssize_t notsup_getxattr(FsContext *ctx, const char *path,
const char *name, void *value,
size_t size)
{
errno = ENOTSUP;
return -1;
}
static inline int notsup_setxattr(FsContext *ctx, const char *path,
const char *name, void *value,
size_t size, int flags)
{
errno = ENOTSUP;
return -1;
}
static inline ssize_t notsup_listxattr(FsContext *ctx, const char *path,
char *name, void *value, size_t size)
{
return 0;
}
static inline int notsup_removexattr(FsContext *ctx,
const char *path, const char *name)
{
errno = ENOTSUP;
return -1;
}
ssize_t notsup_getxattr(FsContext *ctx, const char *path, const char *name,
void *value, size_t size);
int notsup_setxattr(FsContext *ctx, const char *path, const char *name,
void *value, size_t size, int flags);
ssize_t notsup_listxattr(FsContext *ctx, const char *path, char *name,
void *value, size_t size);
int notsup_removexattr(FsContext *ctx, const char *path, const char *name);
#endif


@@ -1,4 +1,4 @@
common-obj-y = 9p.o
common-obj-y = 9p.o 9p-util.o
common-obj-y += 9p-local.o 9p-xattr.o
common-obj-y += 9p-xattr-user.o 9p-posix-acl.o
common-obj-y += coth.o cofs.o codir.o cofile.o


@@ -49,7 +49,6 @@
#define ACPI_PCIHP_ADDR 0xae00
#define ACPI_PCIHP_SIZE 0x0014
#define ACPI_PCIHP_LEGACY_SIZE 0x000f
#define PCI_UP_BASE 0x0000
#define PCI_DOWN_BASE 0x0004
#define PCI_EJ_BASE 0x0008
@@ -302,16 +301,6 @@ void acpi_pcihp_init(Object *owner, AcpiPciHpState *s, PCIBus *root_bus,
s->root = root_bus;
s->legacy_piix = !bridges_enabled;
if (s->legacy_piix) {
unsigned *bus_bsel = g_malloc(sizeof *bus_bsel);
s->io_len = ACPI_PCIHP_LEGACY_SIZE;
*bus_bsel = ACPI_PCIHP_BSEL_DEFAULT;
object_property_add_uint32_ptr(OBJECT(root_bus), ACPI_PCIHP_PROP_BSEL,
bus_bsel, NULL);
}
memory_region_init_io(&s->io, owner, &acpi_pcihp_io_ops, s,
"acpi-pci-hotplug", s->io_len);
memory_region_add_subregion(address_space_io, s->io_base, &s->io);


@@ -440,6 +440,8 @@ static void piix4_update_bus_hotplug(PCIBus *pci_bus, void *opaque)
{
PIIX4PMState *s = opaque;
/* pci_bus cannot outlive PIIX4PMState, because /machine keeps it alive
* and it's not hot-unpluggable */
qbus_set_hotplug_handler(BUS(pci_bus), DEVICE(s), &error_abort);
}


@@ -8,6 +8,7 @@
*/
#include "qemu/osdep.h"
#include "hw/arm/armv7m.h"
#include "qapi/error.h"
#include "qemu-common.h"
#include "cpu.h"
@@ -17,148 +18,261 @@
#include "elf.h"
#include "sysemu/qtest.h"
#include "qemu/error-report.h"
#include "exec/address-spaces.h"
/* Bitbanded IO. Each word corresponds to a single bit. */
/* Get the byte address of the real memory for a bitband access. */
static inline uint32_t bitband_addr(void * opaque, uint32_t addr)
static inline hwaddr bitband_addr(BitBandState *s, hwaddr offset)
{
uint32_t res;
res = *(uint32_t *)opaque;
res |= (addr & 0x1ffffff) >> 5;
return res;
return s->base | (offset & 0x1ffffff) >> 5;
}
static uint32_t bitband_readb(void *opaque, hwaddr offset)
static MemTxResult bitband_read(void *opaque, hwaddr offset,
uint64_t *data, unsigned size, MemTxAttrs attrs)
{
uint8_t v;
cpu_physical_memory_read(bitband_addr(opaque, offset), &v, 1);
return (v & (1 << ((offset >> 2) & 7))) != 0;
BitBandState *s = opaque;
uint8_t buf[4];
MemTxResult res;
int bitpos, bit;
hwaddr addr;
assert(size <= 4);
/* Find address in underlying memory and round down to multiple of size */
addr = bitband_addr(s, offset) & (-size);
res = address_space_read(s->source_as, addr, attrs, buf, size);
if (res) {
return res;
}
/* Bit position in the N bytes read... */
bitpos = (offset >> 2) & ((size * 8) - 1);
/* ...converted to byte in buffer and bit in byte */
bit = (buf[bitpos >> 3] >> (bitpos & 7)) & 1;
*data = bit;
return MEMTX_OK;
}
static void bitband_writeb(void *opaque, hwaddr offset,
uint32_t value)
static MemTxResult bitband_write(void *opaque, hwaddr offset, uint64_t value,
unsigned size, MemTxAttrs attrs)
{
uint32_t addr;
uint8_t mask;
uint8_t v;
addr = bitband_addr(opaque, offset);
mask = (1 << ((offset >> 2) & 7));
cpu_physical_memory_read(addr, &v, 1);
if (value & 1)
v |= mask;
else
v &= ~mask;
cpu_physical_memory_write(addr, &v, 1);
}
BitBandState *s = opaque;
uint8_t buf[4];
MemTxResult res;
int bitpos, bit;
hwaddr addr;
static uint32_t bitband_readw(void *opaque, hwaddr offset)
{
uint32_t addr;
uint16_t mask;
uint16_t v;
addr = bitband_addr(opaque, offset) & ~1;
mask = (1 << ((offset >> 2) & 15));
mask = tswap16(mask);
cpu_physical_memory_read(addr, &v, 2);
return (v & mask) != 0;
}
assert(size <= 4);
static void bitband_writew(void *opaque, hwaddr offset,
uint32_t value)
{
uint32_t addr;
uint16_t mask;
uint16_t v;
addr = bitband_addr(opaque, offset) & ~1;
mask = (1 << ((offset >> 2) & 15));
mask = tswap16(mask);
cpu_physical_memory_read(addr, &v, 2);
if (value & 1)
v |= mask;
else
v &= ~mask;
cpu_physical_memory_write(addr, &v, 2);
}
static uint32_t bitband_readl(void *opaque, hwaddr offset)
{
uint32_t addr;
uint32_t mask;
uint32_t v;
addr = bitband_addr(opaque, offset) & ~3;
mask = (1 << ((offset >> 2) & 31));
mask = tswap32(mask);
cpu_physical_memory_read(addr, &v, 4);
return (v & mask) != 0;
}
static void bitband_writel(void *opaque, hwaddr offset,
uint32_t value)
{
uint32_t addr;
uint32_t mask;
uint32_t v;
addr = bitband_addr(opaque, offset) & ~3;
mask = (1 << ((offset >> 2) & 31));
mask = tswap32(mask);
cpu_physical_memory_read(addr, &v, 4);
if (value & 1)
v |= mask;
else
v &= ~mask;
cpu_physical_memory_write(addr, &v, 4);
/* Find address in underlying memory and round down to multiple of size */
addr = bitband_addr(s, offset) & (-size);
res = address_space_read(s->source_as, addr, attrs, buf, size);
if (res) {
return res;
}
/* Bit position in the N bytes read... */
bitpos = (offset >> 2) & ((size * 8) - 1);
/* ...converted to byte in buffer and bit in byte */
bit = 1 << (bitpos & 7);
if (value & 1) {
buf[bitpos >> 3] |= bit;
} else {
buf[bitpos >> 3] &= ~bit;
}
return address_space_write(s->source_as, addr, attrs, buf, size);
}
static const MemoryRegionOps bitband_ops = {
.old_mmio = {
.read = { bitband_readb, bitband_readw, bitband_readl, },
.write = { bitband_writeb, bitband_writew, bitband_writel, },
},
.read_with_attrs = bitband_read,
.write_with_attrs = bitband_write,
.endianness = DEVICE_NATIVE_ENDIAN,
.impl.min_access_size = 1,
.impl.max_access_size = 4,
.valid.min_access_size = 1,
.valid.max_access_size = 4,
};
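/* Worked example of the mapping implemented above: with s->base = 0x20000000
 * and the alias region mapped at 0x22000000, a one-byte access at alias offset
 *     offset = byte_offset * 32 + bit_number * 4
 * targets bit 'bit_number' of the byte at 0x20000000 + byte_offset; e.g.
 * offset 0x24 (1 * 32 + 1 * 4) accesses bit 1 of the byte at 0x20000001.
 */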
#define TYPE_BITBAND "ARM,bitband-memory"
#define BITBAND(obj) OBJECT_CHECK(BitBandState, (obj), TYPE_BITBAND)
typedef struct {
/*< private >*/
SysBusDevice parent_obj;
/*< public >*/
MemoryRegion iomem;
uint32_t base;
} BitBandState;
static void bitband_init(Object *obj)
{
BitBandState *s = BITBAND(obj);
SysBusDevice *dev = SYS_BUS_DEVICE(obj);
memory_region_init_io(&s->iomem, obj, &bitband_ops, &s->base,
object_property_add_link(obj, "source-memory",
TYPE_MEMORY_REGION,
(Object **)&s->source_memory,
qdev_prop_allow_set_link_before_realize,
OBJ_PROP_LINK_UNREF_ON_RELEASE,
&error_abort);
memory_region_init_io(&s->iomem, obj, &bitband_ops, s,
"bitband", 0x02000000);
sysbus_init_mmio(dev, &s->iomem);
}
static void armv7m_bitband_init(void)
static void bitband_realize(DeviceState *dev, Error **errp)
{
DeviceState *dev;
BitBandState *s = BITBAND(dev);
dev = qdev_create(NULL, TYPE_BITBAND);
qdev_prop_set_uint32(dev, "base", 0x20000000);
qdev_init_nofail(dev);
sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, 0x22000000);
if (!s->source_memory) {
error_setg(errp, "source-memory property not set");
return;
}
dev = qdev_create(NULL, TYPE_BITBAND);
qdev_prop_set_uint32(dev, "base", 0x40000000);
qdev_init_nofail(dev);
sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, 0x42000000);
s->source_as = address_space_init_shareable(s->source_memory,
"bitband-source");
}
/* Board init. */
static const hwaddr bitband_input_addr[ARMV7M_NUM_BITBANDS] = {
0x20000000, 0x40000000
};
static const hwaddr bitband_output_addr[ARMV7M_NUM_BITBANDS] = {
0x22000000, 0x42000000
};
static void armv7m_instance_init(Object *obj)
{
ARMv7MState *s = ARMV7M(obj);
int i;
/* Can't init the cpu here, we don't yet know which model to use */
object_property_add_link(obj, "memory",
TYPE_MEMORY_REGION,
(Object **)&s->board_memory,
qdev_prop_allow_set_link_before_realize,
OBJ_PROP_LINK_UNREF_ON_RELEASE,
&error_abort);
memory_region_init(&s->container, obj, "armv7m-container", UINT64_MAX);
object_initialize(&s->nvic, sizeof(s->nvic), "armv7m_nvic");
qdev_set_parent_bus(DEVICE(&s->nvic), sysbus_get_default());
object_property_add_alias(obj, "num-irq",
OBJECT(&s->nvic), "num-irq", &error_abort);
for (i = 0; i < ARRAY_SIZE(s->bitband); i++) {
object_initialize(&s->bitband[i], sizeof(s->bitband[i]), TYPE_BITBAND);
qdev_set_parent_bus(DEVICE(&s->bitband[i]), sysbus_get_default());
}
}
static void armv7m_realize(DeviceState *dev, Error **errp)
{
ARMv7MState *s = ARMV7M(dev);
SysBusDevice *sbd;
Error *err = NULL;
int i;
char **cpustr;
ObjectClass *oc;
const char *typename;
CPUClass *cc;
if (!s->board_memory) {
error_setg(errp, "memory property was not set");
return;
}
memory_region_add_subregion_overlap(&s->container, 0, s->board_memory, -1);
cpustr = g_strsplit(s->cpu_model, ",", 2);
oc = cpu_class_by_name(TYPE_ARM_CPU, cpustr[0]);
if (!oc) {
error_setg(errp, "Unknown CPU model %s", cpustr[0]);
g_strfreev(cpustr);
return;
}
cc = CPU_CLASS(oc);
typename = object_class_get_name(oc);
cc->parse_features(typename, cpustr[1], &err);
g_strfreev(cpustr);
if (err) {
error_propagate(errp, err);
return;
}
s->cpu = ARM_CPU(object_new(typename));
if (!s->cpu) {
error_setg(errp, "Unknown CPU model %s", s->cpu_model);
return;
}
object_property_set_link(OBJECT(s->cpu), OBJECT(&s->container), "memory",
&error_abort);
object_property_set_bool(OBJECT(s->cpu), true, "realized", &err);
if (err != NULL) {
error_propagate(errp, err);
return;
}
/* Note that we must realize the NVIC after the CPU */
object_property_set_bool(OBJECT(&s->nvic), true, "realized", &err);
if (err != NULL) {
error_propagate(errp, err);
return;
}
/* Alias the NVIC's input and output GPIOs as our own so the board
* code can wire them up. (We do this in realize because the
* NVIC doesn't create the input GPIO array until realize.)
*/
qdev_pass_gpios(DEVICE(&s->nvic), dev, NULL);
qdev_pass_gpios(DEVICE(&s->nvic), dev, "SYSRESETREQ");
/* Wire the NVIC up to the CPU */
sbd = SYS_BUS_DEVICE(&s->nvic);
sysbus_connect_irq(sbd, 0,
qdev_get_gpio_in(DEVICE(s->cpu), ARM_CPU_IRQ));
s->cpu->env.nvic = &s->nvic;
memory_region_add_subregion(&s->container, 0xe000e000,
sysbus_mmio_get_region(sbd, 0));
for (i = 0; i < ARRAY_SIZE(s->bitband); i++) {
Object *obj = OBJECT(&s->bitband[i]);
SysBusDevice *sbd = SYS_BUS_DEVICE(&s->bitband[i]);
object_property_set_int(obj, bitband_input_addr[i], "base", &err);
if (err != NULL) {
error_propagate(errp, err);
return;
}
object_property_set_link(obj, OBJECT(s->board_memory),
"source-memory", &error_abort);
object_property_set_bool(obj, true, "realized", &err);
if (err != NULL) {
error_propagate(errp, err);
return;
}
memory_region_add_subregion(&s->container, bitband_output_addr[i],
sysbus_mmio_get_region(sbd, 0));
}
}
static Property armv7m_properties[] = {
DEFINE_PROP_STRING("cpu-model", ARMv7MState, cpu_model),
DEFINE_PROP_END_OF_LIST(),
};
static void armv7m_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
dc->realize = armv7m_realize;
dc->props = armv7m_properties;
}
static const TypeInfo armv7m_info = {
.name = TYPE_ARMV7M,
.parent = TYPE_SYS_BUS_DEVICE,
.instance_size = sizeof(ARMv7MState),
.instance_init = armv7m_instance_init,
.class_init = armv7m_class_init,
};
static void armv7m_reset(void *opaque)
{
ARMCPU *cpu = opaque;
@@ -168,38 +282,36 @@ static void armv7m_reset(void *opaque)
/* Init CPU and memory for a v7-M based board.
mem_size is in bytes.
Returns the NVIC array. */
Returns the ARMv7M device. */
DeviceState *armv7m_init(MemoryRegion *system_memory, int mem_size, int num_irq,
const char *kernel_filename, const char *cpu_model)
{
ARMCPU *cpu;
CPUARMState *env;
DeviceState *nvic;
DeviceState *armv7m;
if (cpu_model == NULL) {
cpu_model = "cortex-m3";
}
armv7m = qdev_create(NULL, "armv7m");
qdev_prop_set_uint32(armv7m, "num-irq", num_irq);
qdev_prop_set_string(armv7m, "cpu-model", cpu_model);
object_property_set_link(OBJECT(armv7m), OBJECT(get_system_memory()),
"memory", &error_abort);
/* This will exit with an error if the user passed us a bad cpu_model */
qdev_init_nofail(armv7m);
armv7m_load_kernel(ARM_CPU(first_cpu), kernel_filename, mem_size);
return armv7m;
}
void armv7m_load_kernel(ARMCPU *cpu, const char *kernel_filename, int mem_size)
{
int image_size;
uint64_t entry;
uint64_t lowaddr;
int big_endian;
if (cpu_model == NULL) {
cpu_model = "cortex-m3";
}
cpu = cpu_arm_init(cpu_model);
if (cpu == NULL) {
fprintf(stderr, "Unable to find CPU definition\n");
exit(1);
}
env = &cpu->env;
armv7m_bitband_init();
nvic = qdev_create(NULL, "armv7m_nvic");
qdev_prop_set_uint32(nvic, "num-irq", num_irq);
env->nvic = nvic;
qdev_init_nofail(nvic);
sysbus_connect_irq(SYS_BUS_DEVICE(nvic), 0,
qdev_get_gpio_in(DEVICE(cpu), ARM_CPU_IRQ));
#ifdef TARGET_WORDS_BIGENDIAN
big_endian = 1;
#else
@@ -224,8 +336,15 @@ DeviceState *armv7m_init(MemoryRegion *system_memory, int mem_size, int num_irq,
}
}
/* CPU objects (unlike devices) are not automatically reset on system
* reset, so we must always register a handler to do so. Unlike
* A-profile CPUs, we don't need to do anything special in the
* handler to arrange that it starts correctly.
* This is arguably the wrong place to do this, but it matches the
* way A-profile does it. Note that this means that every M profile
* board must call this function!
*/
qemu_register_reset(armv7m_reset, cpu);
return nvic;
}
static Property bitband_properties[] = {
@@ -237,6 +356,7 @@ static void bitband_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
dc->realize = bitband_realize;
dc->props = bitband_properties;
}
@@ -251,6 +371,7 @@ static const TypeInfo bitband_info = {
static void armv7m_register_types(void)
{
type_register_static(&bitband_info);
type_register_static(&armv7m_info);
}
type_init(armv7m_register_types)


@@ -96,6 +96,11 @@ static void bcm2835_peripherals_init(Object *obj)
object_property_add_child(obj, "sdhci", OBJECT(&s->sdhci), NULL);
qdev_set_parent_bus(DEVICE(&s->sdhci), sysbus_get_default());
/* SDHOST */
object_initialize(&s->sdhost, sizeof(s->sdhost), TYPE_BCM2835_SDHOST);
object_property_add_child(obj, "sdhost", OBJECT(&s->sdhost), NULL);
qdev_set_parent_bus(DEVICE(&s->sdhost), sysbus_get_default());
/* DMA Channels */
object_initialize(&s->dma, sizeof(s->dma), TYPE_BCM2835_DMA);
object_property_add_child(obj, "dma", OBJECT(&s->dma), NULL);
@@ -103,6 +108,16 @@ static void bcm2835_peripherals_init(Object *obj)
object_property_add_const_link(OBJECT(&s->dma), "dma-mr",
OBJECT(&s->gpu_bus_mr), &error_abort);
/* GPIO */
object_initialize(&s->gpio, sizeof(s->gpio), TYPE_BCM2835_GPIO);
object_property_add_child(obj, "gpio", OBJECT(&s->gpio), NULL);
qdev_set_parent_bus(DEVICE(&s->gpio), sysbus_get_default());
object_property_add_const_link(OBJECT(&s->gpio), "sdbus-sdhci",
OBJECT(&s->sdhci.sdbus), &error_abort);
object_property_add_const_link(OBJECT(&s->gpio), "sdbus-sdhost",
OBJECT(&s->sdhost.sdbus), &error_abort);
}
static void bcm2835_peripherals_realize(DeviceState *dev, Error **errp)
@@ -267,13 +282,20 @@ static void bcm2835_peripherals_realize(DeviceState *dev, Error **errp)
sysbus_connect_irq(SYS_BUS_DEVICE(&s->sdhci), 0,
qdev_get_gpio_in_named(DEVICE(&s->ic), BCM2835_IC_GPU_IRQ,
INTERRUPT_ARASANSDIO));
object_property_add_alias(OBJECT(s), "sd-bus", OBJECT(&s->sdhci), "sd-bus",
&err);
/* SDHOST */
object_property_set_bool(OBJECT(&s->sdhost), true, "realized", &err);
if (err) {
error_propagate(errp, err);
return;
}
memory_region_add_subregion(&s->peri_mr, MMCI0_OFFSET,
sysbus_mmio_get_region(SYS_BUS_DEVICE(&s->sdhost), 0));
sysbus_connect_irq(SYS_BUS_DEVICE(&s->sdhost), 0,
qdev_get_gpio_in_named(DEVICE(&s->ic), BCM2835_IC_GPU_IRQ,
INTERRUPT_SDIO));
/* DMA Channels */
object_property_set_bool(OBJECT(&s->dma), true, "realized", &err);
if (err) {
@@ -292,6 +314,23 @@ static void bcm2835_peripherals_realize(DeviceState *dev, Error **errp)
BCM2835_IC_GPU_IRQ,
INTERRUPT_DMA0 + n));
}
/* GPIO */
object_property_set_bool(OBJECT(&s->gpio), true, "realized", &err);
if (err) {
error_propagate(errp, err);
return;
}
memory_region_add_subregion(&s->peri_mr, GPIO_OFFSET,
sysbus_mmio_get_region(SYS_BUS_DEVICE(&s->gpio), 0));
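/* Expose the GPIO controller's "sd-bus" (rather than the SDHCI's) as the
* SoC-level alias: the GPIO block reparents the SD card between the SDHCI
* and SDHost controllers according to the guest's pin function selection.
*/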
object_property_add_alias(OBJECT(s), "sd-bus", OBJECT(&s->gpio), "sd-bus",
&err);
if (err) {
error_propagate(errp, err);
return;
}
}
static void bcm2835_peripherals_class_init(ObjectClass *oc, void *data)

View File

@@ -27,17 +27,18 @@
#include "hw/boards.h"
#include "qemu/error-report.h"
#include "hw/arm/stm32f205_soc.h"
#include "hw/arm/arm.h"
static void netduino2_init(MachineState *machine)
{
DeviceState *dev;
dev = qdev_create(NULL, TYPE_STM32F205_SOC);
if (machine->kernel_filename) {
qdev_prop_set_string(dev, "kernel-filename", machine->kernel_filename);
}
qdev_prop_set_string(dev, "cpu-model", "cortex-m3");
object_property_set_bool(OBJECT(dev), true, "realized", &error_fatal);
armv7m_load_kernel(ARM_CPU(first_cpu), machine->kernel_filename,
FLASH_SIZE);
}
static void netduino2_machine_init(MachineClass *mc)

View File

@@ -49,6 +49,9 @@ static void stm32f205_soc_initfn(Object *obj)
STM32F205State *s = STM32F205_SOC(obj);
int i;
object_initialize(&s->armv7m, sizeof(s->armv7m), TYPE_ARMV7M);
qdev_set_parent_bus(DEVICE(&s->armv7m), sysbus_get_default());
object_initialize(&s->syscfg, sizeof(s->syscfg), TYPE_STM32F2XX_SYSCFG);
qdev_set_parent_bus(DEVICE(&s->syscfg), sysbus_get_default());
@@ -82,7 +85,7 @@ static void stm32f205_soc_initfn(Object *obj)
static void stm32f205_soc_realize(DeviceState *dev_soc, Error **errp)
{
STM32F205State *s = STM32F205_SOC(dev_soc);
DeviceState *dev, *nvic;
DeviceState *dev, *armv7m;
SysBusDevice *busdev;
Error *err = NULL;
int i;
@@ -110,8 +113,16 @@ static void stm32f205_soc_realize(DeviceState *dev_soc, Error **errp)
vmstate_register_ram_global(sram);
memory_region_add_subregion(system_memory, SRAM_BASE_ADDRESS, sram);
nvic = armv7m_init(get_system_memory(), FLASH_SIZE, 96,
s->kernel_filename, s->cpu_model);
armv7m = DEVICE(&s->armv7m);
qdev_prop_set_uint32(armv7m, "num-irq", 96);
qdev_prop_set_string(armv7m, "cpu-model", s->cpu_model);
object_property_set_link(OBJECT(&s->armv7m), OBJECT(get_system_memory()),
"memory", &error_abort);
object_property_set_bool(OBJECT(&s->armv7m), true, "realized", &err);
if (err != NULL) {
error_propagate(errp, err);
return;
}
/* System configuration controller */
dev = DEVICE(&s->syscfg);
@@ -122,7 +133,7 @@ static void stm32f205_soc_realize(DeviceState *dev_soc, Error **errp)
}
busdev = SYS_BUS_DEVICE(dev);
sysbus_mmio_map(busdev, 0, 0x40013800);
sysbus_connect_irq(busdev, 0, qdev_get_gpio_in(nvic, 71));
sysbus_connect_irq(busdev, 0, qdev_get_gpio_in(armv7m, 71));
/* Attach UART (uses USART registers) and USART controllers */
for (i = 0; i < STM_NUM_USARTS; i++) {
@@ -136,7 +147,7 @@ static void stm32f205_soc_realize(DeviceState *dev_soc, Error **errp)
}
busdev = SYS_BUS_DEVICE(dev);
sysbus_mmio_map(busdev, 0, usart_addr[i]);
sysbus_connect_irq(busdev, 0, qdev_get_gpio_in(nvic, usart_irq[i]));
sysbus_connect_irq(busdev, 0, qdev_get_gpio_in(armv7m, usart_irq[i]));
}
/* Timer 2 to 5 */
@@ -150,7 +161,7 @@ static void stm32f205_soc_realize(DeviceState *dev_soc, Error **errp)
}
busdev = SYS_BUS_DEVICE(dev);
sysbus_mmio_map(busdev, 0, timer_addr[i]);
sysbus_connect_irq(busdev, 0, qdev_get_gpio_in(nvic, timer_irq[i]));
sysbus_connect_irq(busdev, 0, qdev_get_gpio_in(armv7m, timer_irq[i]));
}
/* ADC 1 to 3 */
@@ -162,7 +173,7 @@ static void stm32f205_soc_realize(DeviceState *dev_soc, Error **errp)
return;
}
qdev_connect_gpio_out(DEVICE(s->adc_irqs), 0,
qdev_get_gpio_in(nvic, ADC_IRQ));
qdev_get_gpio_in(armv7m, ADC_IRQ));
for (i = 0; i < STM_NUM_ADCS; i++) {
dev = DEVICE(&(s->adc[i]));
@@ -187,12 +198,11 @@ static void stm32f205_soc_realize(DeviceState *dev_soc, Error **errp)
}
busdev = SYS_BUS_DEVICE(dev);
sysbus_mmio_map(busdev, 0, spi_addr[i]);
sysbus_connect_irq(busdev, 0, qdev_get_gpio_in(nvic, spi_irq[i]));
sysbus_connect_irq(busdev, 0, qdev_get_gpio_in(armv7m, spi_irq[i]));
}
}
static Property stm32f205_soc_properties[] = {
DEFINE_PROP_STRING("kernel-filename", STM32F205State, kernel_filename),
DEFINE_PROP_STRING("cpu-model", STM32F205State, cpu_model),
DEFINE_PROP_END_OF_LIST(),
};

View File

@@ -51,11 +51,33 @@ void blkconf_blocksizes(BlockConf *conf)
}
}
void blkconf_apply_backend_options(BlockConf *conf)
void blkconf_apply_backend_options(BlockConf *conf, bool readonly,
bool resizable, Error **errp)
{
BlockBackend *blk = conf->blk;
BlockdevOnError rerror, werror;
uint64_t perm, shared_perm;
bool wce;
int ret;
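/* perm is what this device itself needs on the image; shared_perm is what
* it will tolerate other users of the same image requesting concurrently.
*/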
perm = BLK_PERM_CONSISTENT_READ;
if (!readonly) {
perm |= BLK_PERM_WRITE;
}
shared_perm = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
BLK_PERM_GRAPH_MOD;
if (resizable) {
shared_perm |= BLK_PERM_RESIZE;
}
if (conf->share_rw) {
shared_perm |= BLK_PERM_WRITE;
}
ret = blk_set_perm(blk, perm, shared_perm, errp);
if (ret < 0) {
return;
}
switch (conf->wce) {
case ON_OFF_AUTO_ON: wce = true; break;

View File

@@ -186,6 +186,7 @@ typedef enum FDiskFlags {
struct FDrive {
FDCtrl *fdctrl;
BlockBackend *blk;
BlockConf *conf;
/* Drive status */
FloppyDriveType drive; /* CMOS drive type */
uint8_t perpendicular; /* 2.88 MB access mode */
@@ -469,9 +470,22 @@ static void fd_revalidate(FDrive *drv)
}
}
static void fd_change_cb(void *opaque, bool load)
static void fd_change_cb(void *opaque, bool load, Error **errp)
{
FDrive *drive = opaque;
Error *local_err = NULL;
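/* On eject, give up all permissions on the now-empty backend; on insert,
* re-acquire them by re-applying the configured backend options.
*/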
if (!load) {
blk_set_perm(drive->blk, 0, BLK_PERM_ALL, &error_abort);
} else {
blkconf_apply_backend_options(drive->conf,
blk_is_read_only(drive->blk), false,
&local_err);
if (local_err) {
error_propagate(errp, local_err);
return;
}
}
drive->media_changed = 1;
drive->media_validated = false;
@@ -508,6 +522,7 @@ static int floppy_drive_init(DeviceState *qdev)
FloppyDrive *dev = FLOPPY_DRIVE(qdev);
FloppyBus *bus = FLOPPY_BUS(qdev->parent_bus);
FDrive *drive;
Error *local_err = NULL;
int ret;
if (dev->unit == -1) {
@@ -533,7 +548,7 @@ static int floppy_drive_init(DeviceState *qdev)
if (!dev->conf.blk) {
/* Anonymous BlockBackend for an empty drive */
dev->conf.blk = blk_new();
dev->conf.blk = blk_new(0, BLK_PERM_ALL);
ret = blk_attach_dev(dev->conf.blk, qdev);
assert(ret == 0);
}
@@ -551,7 +566,13 @@ static int floppy_drive_init(DeviceState *qdev)
* blkconf_apply_backend_options(). */
dev->conf.rerror = BLOCKDEV_ON_ERROR_AUTO;
dev->conf.werror = BLOCKDEV_ON_ERROR_AUTO;
blkconf_apply_backend_options(&dev->conf);
blkconf_apply_backend_options(&dev->conf, blk_is_read_only(dev->conf.blk),
false, &local_err);
if (local_err) {
error_report_err(local_err);
return -1;
}
/* 'enospc' is the default for -drive, 'report' is what blk_new() gives us
* for empty drives. */
@@ -565,6 +586,7 @@ static int floppy_drive_init(DeviceState *qdev)
return -1;
}
drive->conf = &dev->conf;
drive->blk = dev->conf.blk;
drive->fdctrl = bus->fdc;

View File

@@ -1215,6 +1215,7 @@ static void m25p80_realize(SSISlave *ss, Error **errp)
{
Flash *s = M25P80(ss);
M25P80Class *mc = M25P80_GET_CLASS(s);
int ret;
s->pi = mc->pi;
@@ -1222,6 +1223,13 @@ static void m25p80_realize(SSISlave *ss, Error **errp)
s->dirty_page = -1;
if (s->blk) {
uint64_t perm = BLK_PERM_CONSISTENT_READ |
(blk_is_read_only(s->blk) ? 0 : BLK_PERM_WRITE);
ret = blk_set_perm(s->blk, perm, BLK_PERM_ALL, errp);
if (ret < 0) {
return;
}
DB_PRINT_L(0, "Binding to IF_MTD drive\n");
s->storage = blk_blockalign(s->blk, s->size);

View File

@@ -373,6 +373,8 @@ static void nand_realize(DeviceState *dev, Error **errp)
{
int pagesize;
NANDFlashState *s = NAND(dev);
int ret;
s->buswidth = nand_flash_ids[s->chip_id].width >> 3;
s->size = nand_flash_ids[s->chip_id].size << 20;
@@ -407,6 +409,11 @@ static void nand_realize(DeviceState *dev, Error **errp)
error_setg(errp, "Can't use a read-only drive");
return;
}
ret = blk_set_perm(s->blk, BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE,
BLK_PERM_ALL, errp);
if (ret < 0) {
return;
}
if (blk_getlength(s->blk) >=
(s->pages << s->page_shift) + (s->pages << s->oob_shift)) {
pagesize = 0;

View File

@@ -835,6 +835,7 @@ static int nvme_init(PCIDevice *pci_dev)
int i;
int64_t bs_size;
uint8_t *pci_conf;
Error *local_err = NULL;
if (!n->conf.blk) {
return -1;
@@ -850,7 +851,12 @@ static int nvme_init(PCIDevice *pci_dev)
return -1;
}
blkconf_blocksizes(&n->conf);
blkconf_apply_backend_options(&n->conf);
blkconf_apply_backend_options(&n->conf, blk_is_read_only(n->conf.blk),
false, &local_err);
if (local_err) {
error_report_err(local_err);
return -1;
}
pci_conf = pci_dev->config;
pci_conf[PCI_INTERRUPT_PIN] = 1;

View File

@@ -778,6 +778,7 @@ static int onenand_initfn(SysBusDevice *sbd)
OneNANDState *s = ONE_NAND(dev);
uint32_t size = 1 << (24 + ((s->id.dev >> 4) & 7));
void *ram;
Error *local_err = NULL;
s->base = (hwaddr)-1;
s->rdy = NULL;
@@ -796,6 +797,12 @@ static int onenand_initfn(SysBusDevice *sbd)
error_report("Can't use a read-only drive");
return -1;
}
blk_set_perm(s->blk, BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE,
BLK_PERM_ALL, &local_err);
if (local_err) {
error_report_err(local_err);
return -1;
}
s->blk_cur = s->blk;
}
s->otp = memset(g_malloc((64 + 2) << PAGE_SHIFT),

View File

@@ -757,6 +757,18 @@ static void pflash_cfi01_realize(DeviceState *dev, Error **errp)
pfl->storage = memory_region_get_ram_ptr(&pfl->mem);
sysbus_init_mmio(SYS_BUS_DEVICE(dev), &pfl->mem);
if (pfl->blk) {
uint64_t perm;
pfl->ro = blk_is_read_only(pfl->blk);
perm = BLK_PERM_CONSISTENT_READ | (pfl->ro ? 0 : BLK_PERM_WRITE);
ret = blk_set_perm(pfl->blk, perm, BLK_PERM_ALL, errp);
if (ret < 0) {
return;
}
} else {
pfl->ro = 0;
}
if (pfl->blk) {
/* read the initial flash content */
ret = blk_pread(pfl->blk, 0, pfl->storage, total_len);
@@ -768,12 +780,6 @@ static void pflash_cfi01_realize(DeviceState *dev, Error **errp)
}
}
if (pfl->blk) {
pfl->ro = blk_is_read_only(pfl->blk);
} else {
pfl->ro = 0;
}
/* Default to devices being used at their maximum device width. This was
* assumed before the device_width support was added.
*/

View File

@@ -632,6 +632,19 @@ static void pflash_cfi02_realize(DeviceState *dev, Error **errp)
vmstate_register_ram(&pfl->orig_mem, DEVICE(pfl));
pfl->storage = memory_region_get_ram_ptr(&pfl->orig_mem);
pfl->chip_len = chip_len;
if (pfl->blk) {
uint64_t perm;
pfl->ro = blk_is_read_only(pfl->blk);
perm = BLK_PERM_CONSISTENT_READ | (pfl->ro ? 0 : BLK_PERM_WRITE);
ret = blk_set_perm(pfl->blk, perm, BLK_PERM_ALL, errp);
if (ret < 0) {
return;
}
} else {
pfl->ro = 0;
}
if (pfl->blk) {
/* read the initial flash content */
ret = blk_pread(pfl->blk, 0, pfl->storage, chip_len);
@@ -646,12 +659,6 @@ static void pflash_cfi02_realize(DeviceState *dev, Error **errp)
pfl->rom_mode = 1;
sysbus_init_mmio(SYS_BUS_DEVICE(dev), &pfl->mem);
if (pfl->blk) {
pfl->ro = blk_is_read_only(pfl->blk);
} else {
pfl->ro = 0;
}
pfl->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, pflash_timer, pfl);
pfl->wcycle = 0;
pfl->cmd = 0;

View File

@@ -928,7 +928,13 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
}
blkconf_serial(&conf->conf, &conf->serial);
blkconf_apply_backend_options(&conf->conf);
blkconf_apply_backend_options(&conf->conf,
blk_is_read_only(conf->conf.blk), true,
&err);
if (err) {
error_propagate(errp, err);
return;
}
s->original_wce = blk_enable_write_cache(conf->conf.blk);
blkconf_geometry(&conf->conf, NULL, 65535, 255, 255, &err);
if (err) {

View File

@@ -197,7 +197,7 @@ static void qbus_initfn(Object *obj)
TYPE_HOTPLUG_HANDLER,
(Object **)&bus->hotplug_handler,
object_property_allow_set_link,
OBJ_PROP_LINK_UNREF_ON_RELEASE,
0,
NULL);
object_property_add_bool(obj, "realized",
bus_get_realized, bus_set_realized, NULL);

View File

@@ -434,6 +434,19 @@ int load_elf_as(const char *filename,
void *translate_opaque, uint64_t *pentry, uint64_t *lowaddr,
uint64_t *highaddr, int big_endian, int elf_machine,
int clear_lsb, int data_swab, AddressSpace *as)
{
return load_elf_ram(filename, translate_fn, translate_opaque,
pentry, lowaddr, highaddr, big_endian, elf_machine,
clear_lsb, data_swab, as, true);
}
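/* load_elf_ram() additionally takes a load_rom flag: when it is false the
* image is written directly into the destination memory instead of being
* registered as a ROM blob, so it is not re-installed on system reset.
*/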
/* return < 0 if error, otherwise the number of bytes loaded in memory */
int load_elf_ram(const char *filename,
uint64_t (*translate_fn)(void *, uint64_t),
void *translate_opaque, uint64_t *pentry, uint64_t *lowaddr,
uint64_t *highaddr, int big_endian, int elf_machine,
int clear_lsb, int data_swab, AddressSpace *as,
bool load_rom)
{
int fd, data_order, target_data_order, must_swab, ret = ELF_LOAD_FAILED;
uint8_t e_ident[EI_NIDENT];
@@ -473,11 +486,11 @@ int load_elf_as(const char *filename,
if (e_ident[EI_CLASS] == ELFCLASS64) {
ret = load_elf64(filename, fd, translate_fn, translate_opaque, must_swab,
pentry, lowaddr, highaddr, elf_machine, clear_lsb,
data_swab, as);
data_swab, as, load_rom);
} else {
ret = load_elf32(filename, fd, translate_fn, translate_opaque, must_swab,
pentry, lowaddr, highaddr, elf_machine, clear_lsb,
data_swab, as);
data_swab, as, load_rom);
}
fail:

View File

@@ -12,6 +12,7 @@
#include "qemu/host-utils.h"
#include "sysemu/replay.h"
#include "sysemu/qtest.h"
#include "block/aio.h"
#define DELTA_ADJUST 1
#define DELTA_NO_ADJUST -1
@@ -353,3 +354,10 @@ ptimer_state *ptimer_init(QEMUBH *bh, uint8_t policy_mask)
s->policy_mask = policy_mask;
return s;
}
void ptimer_free(ptimer_state *s)
{
qemu_bh_delete(s->bh);
timer_free(s->timer);
g_free(s);
}

View File

@@ -73,14 +73,19 @@ static void parse_drive(DeviceState *dev, const char *str, void **ptr,
{
BlockBackend *blk;
bool blk_created = false;
int ret;
blk = blk_by_name(str);
if (!blk) {
BlockDriverState *bs = bdrv_lookup_bs(NULL, str, NULL);
if (bs) {
blk = blk_new();
blk_insert_bs(blk, bs);
blk = blk_new(0, BLK_PERM_ALL);
blk_created = true;
ret = blk_insert_bs(blk, bs, errp);
if (ret < 0) {
goto fail;
}
}
}
if (!blk) {

View File

@@ -37,6 +37,7 @@
#include "hw/boards.h"
#include "hw/sysbus.h"
#include "qapi-event.h"
#include "migration/migration.h"
int qdev_hotplug = 0;
static bool qdev_hot_added = false;
@@ -102,9 +103,23 @@ static void bus_add_child(BusState *bus, DeviceState *child)
void qdev_set_parent_bus(DeviceState *dev, BusState *bus)
{
bool replugging = dev->parent_bus != NULL;
if (replugging) {
/* Keep a reference to the device while it's not plugged into
* any bus, to avoid it potentially evaporating when it is
* dereffed in bus_remove_child().
*/
object_ref(OBJECT(dev));
bus_remove_child(dev->parent_bus, dev);
object_unref(OBJECT(dev->parent_bus));
}
dev->parent_bus = bus;
object_ref(OBJECT(bus));
bus_add_child(bus, dev);
if (replugging) {
object_unref(OBJECT(dev));
}
}
/* Create a new device. This only initializes the device state
@@ -889,6 +904,7 @@ static void device_set_realized(Object *obj, bool value, Error **errp)
Error *local_err = NULL;
bool unattached_parent = false;
static int unattached_count;
int ret;
if (dev->hotplugged && !dc->hotpluggable) {
error_setg(errp, QERR_DEVICE_NO_HOTPLUG, object_get_typename(obj));
@@ -896,6 +912,11 @@ static void device_set_realized(Object *obj, bool value, Error **errp)
}
if (value && !dev->realized) {
ret = check_migratable(obj, &local_err);
if (ret < 0) {
goto fail;
}
if (!obj->parent) {
gchar *name = g_strdup_printf("device[%d]", unattached_count++);

View File

@@ -7,3 +7,4 @@ common-obj-$(CONFIG_GPIO_KEY) += gpio_key.o
obj-$(CONFIG_OMAP) += omap_gpio.o
obj-$(CONFIG_IMX) += imx_gpio.o
obj-$(CONFIG_RASPI) += bcm2835_gpio.o

hw/gpio/bcm2835_gpio.c Normal file
View File

@@ -0,0 +1,353 @@
/*
* Raspberry Pi (BCM2835) GPIO Controller
*
* Copyright (c) 2017 Antfield SAS
*
* Authors:
* Clement Deschamps <clement.deschamps@antfield.fr>
* Luc Michel <luc.michel@antfield.fr>
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*/
#include "qemu/osdep.h"
#include "qemu/log.h"
#include "qemu/timer.h"
#include "qapi/error.h"
#include "hw/sysbus.h"
#include "hw/sd/sd.h"
#include "hw/gpio/bcm2835_gpio.h"
#define GPFSEL0 0x00
#define GPFSEL1 0x04
#define GPFSEL2 0x08
#define GPFSEL3 0x0C
#define GPFSEL4 0x10
#define GPFSEL5 0x14
#define GPSET0 0x1C
#define GPSET1 0x20
#define GPCLR0 0x28
#define GPCLR1 0x2C
#define GPLEV0 0x34
#define GPLEV1 0x38
#define GPEDS0 0x40
#define GPEDS1 0x44
#define GPREN0 0x4C
#define GPREN1 0x50
#define GPFEN0 0x58
#define GPFEN1 0x5C
#define GPHEN0 0x64
#define GPHEN1 0x68
#define GPLEN0 0x70
#define GPLEN1 0x74
#define GPAREN0 0x7C
#define GPAREN1 0x80
#define GPAFEN0 0x88
#define GPAFEN1 0x8C
#define GPPUD 0x94
#define GPPUDCLK0 0x98
#define GPPUDCLK1 0x9C
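/* Each GPFSELn register packs the 3-bit function select for ten pins
* (pin 10*n + i lives in bits [3i+2:3i]): 0 is input, 1 is output and
* values 2-7 select the alternate functions. gpfsel_set() below watches
* pins 48-53 to reroute the SD card between the SDHCI and SDHost buses.
*/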
static uint32_t gpfsel_get(BCM2835GpioState *s, uint8_t reg)
{
int i;
uint32_t value = 0;
for (i = 0; i < 10; i++) {
uint32_t index = 10 * reg + i;
if (index < sizeof(s->fsel)) {
value |= (s->fsel[index] & 0x7) << (3 * i);
}
}
return value;
}
static void gpfsel_set(BCM2835GpioState *s, uint8_t reg, uint32_t value)
{
int i;
for (i = 0; i < 10; i++) {
uint32_t index = 10 * reg + i;
if (index < sizeof(s->fsel)) {
int fsel = (value >> (3 * i)) & 0x7;
s->fsel[index] = fsel;
}
}
/* SD controller selection (48-53) */
if (s->sd_fsel != 0
&& (s->fsel[48] == 0) /* SD_CLK_R */
&& (s->fsel[49] == 0) /* SD_CMD_R */
&& (s->fsel[50] == 0) /* SD_DATA0_R */
&& (s->fsel[51] == 0) /* SD_DATA1_R */
&& (s->fsel[52] == 0) /* SD_DATA2_R */
&& (s->fsel[53] == 0) /* SD_DATA3_R */
) {
/* SDHCI controller selected */
sdbus_reparent_card(s->sdbus_sdhost, s->sdbus_sdhci);
s->sd_fsel = 0;
} else if (s->sd_fsel != 4
&& (s->fsel[48] == 4) /* SD_CLK_R */
&& (s->fsel[49] == 4) /* SD_CMD_R */
&& (s->fsel[50] == 4) /* SD_DATA0_R */
&& (s->fsel[51] == 4) /* SD_DATA1_R */
&& (s->fsel[52] == 4) /* SD_DATA2_R */
&& (s->fsel[53] == 4) /* SD_DATA3_R */
) {
/* SDHost controller selected */
sdbus_reparent_card(s->sdbus_sdhci, s->sdbus_sdhost);
s->sd_fsel = 4;
}
}
static int gpfsel_is_out(BCM2835GpioState *s, int index)
{
if (index >= 0 && index < 54) {
return s->fsel[index] == 1;
}
return 0;
}
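/* GPSET/GPCLR are write-1-to-set / write-1-to-clear: only bits that change
* the current level, and whose pin is configured as an output, toggle the
* corresponding qemu_irq output line.
*/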
static void gpset(BCM2835GpioState *s,
uint32_t val, uint8_t start, uint8_t count, uint32_t *lev)
{
uint32_t changes = val & ~*lev;
uint32_t cur = 1;
int i;
for (i = 0; i < count; i++) {
if ((changes & cur) && (gpfsel_is_out(s, start + i))) {
qemu_set_irq(s->out[start + i], 1);
}
cur <<= 1;
}
*lev |= val;
}
static void gpclr(BCM2835GpioState *s,
uint32_t val, uint8_t start, uint8_t count, uint32_t *lev)
{
uint32_t changes = val & *lev;
uint32_t cur = 1;
int i;
for (i = 0; i < count; i++) {
if ((changes & cur) && (gpfsel_is_out(s, start + i))) {
qemu_set_irq(s->out[start + i], 0);
}
cur <<= 1;
}
*lev &= ~val;
}
static uint64_t bcm2835_gpio_read(void *opaque, hwaddr offset,
unsigned size)
{
BCM2835GpioState *s = (BCM2835GpioState *)opaque;
switch (offset) {
case GPFSEL0:
case GPFSEL1:
case GPFSEL2:
case GPFSEL3:
case GPFSEL4:
case GPFSEL5:
return gpfsel_get(s, offset / 4);
case GPSET0:
case GPSET1:
/* Write Only */
return 0;
case GPCLR0:
case GPCLR1:
/* Write Only */
return 0;
case GPLEV0:
return s->lev0;
case GPLEV1:
return s->lev1;
case GPEDS0:
case GPEDS1:
case GPREN0:
case GPREN1:
case GPFEN0:
case GPFEN1:
case GPHEN0:
case GPHEN1:
case GPLEN0:
case GPLEN1:
case GPAREN0:
case GPAREN1:
case GPAFEN0:
case GPAFEN1:
case GPPUD:
case GPPUDCLK0:
case GPPUDCLK1:
/* Not implemented */
return 0;
default:
qemu_log_mask(LOG_GUEST_ERROR, "%s: Bad offset %"HWADDR_PRIx"\n",
__func__, offset);
break;
}
return 0;
}
static void bcm2835_gpio_write(void *opaque, hwaddr offset,
uint64_t value, unsigned size)
{
BCM2835GpioState *s = (BCM2835GpioState *)opaque;
switch (offset) {
case GPFSEL0:
case GPFSEL1:
case GPFSEL2:
case GPFSEL3:
case GPFSEL4:
case GPFSEL5:
gpfsel_set(s, offset / 4, value);
break;
case GPSET0:
gpset(s, value, 0, 32, &s->lev0);
break;
case GPSET1:
gpset(s, value, 32, 22, &s->lev1);
break;
case GPCLR0:
gpclr(s, value, 0, 32, &s->lev0);
break;
case GPCLR1:
gpclr(s, value, 32, 22, &s->lev1);
break;
case GPLEV0:
case GPLEV1:
/* Read Only */
break;
case GPEDS0:
case GPEDS1:
case GPREN0:
case GPREN1:
case GPFEN0:
case GPFEN1:
case GPHEN0:
case GPHEN1:
case GPLEN0:
case GPLEN1:
case GPAREN0:
case GPAREN1:
case GPAFEN0:
case GPAFEN1:
case GPPUD:
case GPPUDCLK0:
case GPPUDCLK1:
/* Not implemented */
break;
default:
goto err_out;
}
return;
err_out:
qemu_log_mask(LOG_GUEST_ERROR, "%s: Bad offset %"HWADDR_PRIx"\n",
__func__, offset);
}
static void bcm2835_gpio_reset(DeviceState *dev)
{
BCM2835GpioState *s = BCM2835_GPIO(dev);
int i;
for (i = 0; i < 6; i++) {
gpfsel_set(s, i, 0);
}
s->sd_fsel = 0;
/* SDHCI is selected by default */
sdbus_reparent_card(&s->sdbus, s->sdbus_sdhci);
s->lev0 = 0;
s->lev1 = 0;
}
static const MemoryRegionOps bcm2835_gpio_ops = {
.read = bcm2835_gpio_read,
.write = bcm2835_gpio_write,
.endianness = DEVICE_NATIVE_ENDIAN,
};
static const VMStateDescription vmstate_bcm2835_gpio = {
.name = "bcm2835_gpio",
.version_id = 1,
.minimum_version_id = 1,
.fields = (VMStateField[]) {
VMSTATE_UINT8_ARRAY(fsel, BCM2835GpioState, 54),
VMSTATE_UINT32(lev0, BCM2835GpioState),
VMSTATE_UINT32(lev1, BCM2835GpioState),
VMSTATE_UINT8(sd_fsel, BCM2835GpioState),
VMSTATE_END_OF_LIST()
}
};
static void bcm2835_gpio_init(Object *obj)
{
BCM2835GpioState *s = BCM2835_GPIO(obj);
DeviceState *dev = DEVICE(obj);
SysBusDevice *sbd = SYS_BUS_DEVICE(obj);
qbus_create_inplace(&s->sdbus, sizeof(s->sdbus),
TYPE_SD_BUS, DEVICE(s), "sd-bus");
memory_region_init_io(&s->iomem, obj,
&bcm2835_gpio_ops, s, "bcm2835_gpio", 0x1000);
sysbus_init_mmio(sbd, &s->iomem);
qdev_init_gpio_out(dev, s->out, 54);
}
static void bcm2835_gpio_realize(DeviceState *dev, Error **errp)
{
BCM2835GpioState *s = BCM2835_GPIO(dev);
Object *obj;
Error *err = NULL;
obj = object_property_get_link(OBJECT(dev), "sdbus-sdhci", &err);
if (obj == NULL) {
error_setg(errp, "%s: required sdhci link not found: %s",
__func__, error_get_pretty(err));
return;
}
s->sdbus_sdhci = SD_BUS(obj);
obj = object_property_get_link(OBJECT(dev), "sdbus-sdhost", &err);
if (obj == NULL) {
error_setg(errp, "%s: required sdhost link not found: %s",
__func__, error_get_pretty(err));
return;
}
s->sdbus_sdhost = SD_BUS(obj);
}
static void bcm2835_gpio_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
dc->vmsd = &vmstate_bcm2835_gpio;
dc->realize = &bcm2835_gpio_realize;
dc->reset = &bcm2835_gpio_reset;
}
static const TypeInfo bcm2835_gpio_info = {
.name = TYPE_BCM2835_GPIO,
.parent = TYPE_SYS_BUS_DEVICE,
.instance_size = sizeof(BCM2835GpioState),
.instance_init = bcm2835_gpio_init,
.class_init = bcm2835_gpio_class_init,
};
static void bcm2835_gpio_register_types(void)
{
type_register_static(&bcm2835_gpio_info);
}
type_init(bcm2835_gpio_register_types)

View File

@@ -462,7 +462,7 @@ static void *acpi_set_bsel(PCIBus *bus, void *opaque)
*bus_bsel = (*bsel_alloc)++;
object_property_add_uint32_ptr(OBJECT(bus), ACPI_PCIHP_PROP_BSEL,
bus_bsel, NULL);
bus_bsel, &error_abort);
}
return bsel_alloc;
@@ -471,7 +471,7 @@ static void *acpi_set_bsel(PCIBus *bus, void *opaque)
static void acpi_set_pci_info(void)
{
PCIBus *bus = find_i440fx(); /* TODO: Q35 support */
unsigned bsel_alloc = 0;
unsigned bsel_alloc = ACPI_PCIHP_BSEL_DEFAULT;
if (bus) {
/* Scan all PCI buses. Set property to enable acpi based hotplug. */

View File

@@ -1120,7 +1120,7 @@ static void ide_cfata_metadata_write(IDEState *s)
}
/* called when the inserted state of the media has changed */
static void ide_cd_change_cb(void *opaque, bool load)
static void ide_cd_change_cb(void *opaque, bool load, Error **errp)
{
IDEState *s = opaque;
uint64_t nb_sectors;

View File

@@ -170,7 +170,7 @@ static int ide_dev_initfn(IDEDevice *dev, IDEDriveKind kind)
return -1;
} else {
/* Anonymous BlockBackend for an empty drive */
dev->conf.blk = blk_new();
dev->conf.blk = blk_new(0, BLK_PERM_ALL);
}
}
@@ -196,7 +196,12 @@ static int ide_dev_initfn(IDEDevice *dev, IDEDriveKind kind)
return -1;
}
}
blkconf_apply_backend_options(&dev->conf);
blkconf_apply_backend_options(&dev->conf, kind == IDE_CD, kind != IDE_CD,
&err);
if (err) {
error_report_err(err);
return -1;
}
if (ide_init_drive(s, dev->conf.blk, kind,
dev->version, dev->serial, dev->model, dev->wwn,

View File

@@ -24,7 +24,7 @@ obj-$(CONFIG_APIC) += apic.o apic_common.o
obj-$(CONFIG_ARM_GIC_KVM) += arm_gic_kvm.o
obj-$(call land,$(CONFIG_ARM_GIC_KVM),$(TARGET_AARCH64)) += arm_gicv3_kvm.o
obj-$(call land,$(CONFIG_ARM_GIC_KVM),$(TARGET_AARCH64)) += arm_gicv3_its_kvm.o
obj-$(CONFIG_STELLARIS) += armv7m_nvic.o
obj-$(CONFIG_ARM_V7M) += armv7m_nvic.o
obj-$(CONFIG_EXYNOS4) += exynos4210_gic.o exynos4210_combiner.o
obj-$(CONFIG_GRLIB) += grlib_irqmp.o
obj-$(CONFIG_IOAPIC) += ioapic.o

View File

@@ -70,6 +70,38 @@ static const VMStateDescription vmstate_gicv3_cpu_virt = {
}
};
static int icc_sre_el1_reg_pre_load(void *opaque)
{
GICv3CPUState *cs = opaque;
/*
* If the sre_el1 subsection is not transferred this
* means SRE_EL1 is 0x7 (which might not be the same as
* our reset value).
*/
cs->icc_sre_el1 = 0x7;
return 0;
}
static bool icc_sre_el1_reg_needed(void *opaque)
{
GICv3CPUState *cs = opaque;
return cs->icc_sre_el1 != 7;
}
const VMStateDescription vmstate_gicv3_cpu_sre_el1 = {
.name = "arm_gicv3_cpu/sre_el1",
.version_id = 1,
.minimum_version_id = 1,
.pre_load = icc_sre_el1_reg_pre_load,
.needed = icc_sre_el1_reg_needed,
.fields = (VMStateField[]) {
VMSTATE_UINT64(icc_sre_el1, GICv3CPUState),
VMSTATE_END_OF_LIST()
}
};
static const VMStateDescription vmstate_gicv3_cpu = {
.name = "arm_gicv3_cpu",
.version_id = 1,
@@ -100,6 +132,10 @@ static const VMStateDescription vmstate_gicv3_cpu = {
.subsections = (const VMStateDescription * []) {
&vmstate_gicv3_cpu_virt,
&vmstate_gicv3_cpu_sre_el1,
NULL
}
};
@@ -216,6 +252,8 @@ static void arm_gicv3_common_realize(DeviceState *dev, Error **errp)
s->cpu[i].cpu = cpu;
s->cpu[i].gic = s;
/* Store GICv3CPUState in CPUARMState gicv3state pointer */
gicv3_set_gicv3state(cpu, &s->cpu[i]);
/* Pre-construct the GICR_TYPER:
* For our implementation:

View File

@@ -19,6 +19,14 @@
#include "gicv3_internal.h"
#include "cpu.h"
void gicv3_set_gicv3state(CPUState *cpu, GICv3CPUState *s)
{
ARMCPU *arm_cpu = ARM_CPU(cpu);
CPUARMState *env = &arm_cpu->env;
env->gicv3state = (void *)s;
};
static GICv3CPUState *icc_cs_from_env(CPUARMState *env)
{
/* Given the CPU, find the right GICv3CPUState struct.

View File

@@ -23,8 +23,10 @@
#include "qapi/error.h"
#include "hw/intc/arm_gicv3_common.h"
#include "hw/sysbus.h"
#include "qemu/error-report.h"
#include "sysemu/kvm.h"
#include "kvm_arm.h"
#include "gicv3_internal.h"
#include "vgic_common.h"
#include "migration/migration.h"
@@ -44,6 +46,32 @@
#define KVM_ARM_GICV3_GET_CLASS(obj) \
OBJECT_GET_CLASS(KVMARMGICv3Class, (obj), TYPE_KVM_ARM_GICV3)
#define KVM_DEV_ARM_VGIC_SYSREG(op0, op1, crn, crm, op2) \
(ARM64_SYS_REG_SHIFT_MASK(op0, OP0) | \
ARM64_SYS_REG_SHIFT_MASK(op1, OP1) | \
ARM64_SYS_REG_SHIFT_MASK(crn, CRN) | \
ARM64_SYS_REG_SHIFT_MASK(crm, CRM) | \
ARM64_SYS_REG_SHIFT_MASK(op2, OP2))
#define ICC_PMR_EL1 \
KVM_DEV_ARM_VGIC_SYSREG(3, 0, 4, 6, 0)
#define ICC_BPR0_EL1 \
KVM_DEV_ARM_VGIC_SYSREG(3, 0, 12, 8, 3)
#define ICC_AP0R_EL1(n) \
KVM_DEV_ARM_VGIC_SYSREG(3, 0, 12, 8, 4 | n)
#define ICC_AP1R_EL1(n) \
KVM_DEV_ARM_VGIC_SYSREG(3, 0, 12, 9, n)
#define ICC_BPR1_EL1 \
KVM_DEV_ARM_VGIC_SYSREG(3, 0, 12, 12, 3)
#define ICC_CTLR_EL1 \
KVM_DEV_ARM_VGIC_SYSREG(3, 0, 12, 12, 4)
#define ICC_SRE_EL1 \
KVM_DEV_ARM_VGIC_SYSREG(3, 0, 12, 12, 5)
#define ICC_IGRPEN0_EL1 \
KVM_DEV_ARM_VGIC_SYSREG(3, 0, 12, 12, 6)
#define ICC_IGRPEN1_EL1 \
KVM_DEV_ARM_VGIC_SYSREG(3, 0, 12, 12, 7)
typedef struct KVMARMGICv3Class {
ARMGICv3CommonClass parent_class;
DeviceRealize parent_realize;
@@ -57,16 +85,549 @@ static void kvm_arm_gicv3_set_irq(void *opaque, int irq, int level)
kvm_arm_gic_set_irq(s->num_irq, irq, level);
}
#define KVM_VGIC_ATTR(reg, typer) \
((typer & KVM_DEV_ARM_VGIC_V3_MPIDR_MASK) | (reg))
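/* The device attribute encodes the register offset in its low bits and, for
* per-CPU (redistributor/sysreg) accesses, the target CPU's affinity value
* taken from its GICR_TYPER.
*/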
static inline void kvm_gicd_access(GICv3State *s, int offset,
uint32_t *val, bool write)
{
kvm_device_access(s->dev_fd, KVM_DEV_ARM_VGIC_GRP_DIST_REGS,
KVM_VGIC_ATTR(offset, 0),
val, write);
}
static inline void kvm_gicr_access(GICv3State *s, int offset, int cpu,
uint32_t *val, bool write)
{
kvm_device_access(s->dev_fd, KVM_DEV_ARM_VGIC_GRP_REDIST_REGS,
KVM_VGIC_ATTR(offset, s->cpu[cpu].gicr_typer),
val, write);
}
static inline void kvm_gicc_access(GICv3State *s, uint64_t reg, int cpu,
uint64_t *val, bool write)
{
kvm_device_access(s->dev_fd, KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS,
KVM_VGIC_ATTR(reg, s->cpu[cpu].gicr_typer),
val, write);
}
static inline void kvm_gic_line_level_access(GICv3State *s, int irq, int cpu,
uint32_t *val, bool write)
{
kvm_device_access(s->dev_fd, KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO,
KVM_VGIC_ATTR(irq, s->cpu[cpu].gicr_typer) |
(VGIC_LEVEL_INFO_LINE_LEVEL <<
KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT),
val, write);
}
/* Loop through each distributor IRQ related register; since bits
* corresponding to SGIs and PPIs are RAZ/WI when affinity routing
* is enabled, we skip those.
*/
#define for_each_dist_irq_reg(_irq, _max, _field_width) \
for (_irq = GIC_INTERNAL; _irq < _max; _irq += (32 / _field_width))
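/* e.g. with 8-bit priority fields this steps four interrupts (one 32-bit
* GICD_IPRIORITYR<n> register) per iteration, starting at the first SPI.
*/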
static void kvm_dist_get_priority(GICv3State *s, uint32_t offset, uint8_t *bmp)
{
uint32_t reg, *field;
int irq;
field = (uint32_t *)bmp;
for_each_dist_irq_reg(irq, s->num_irq, 8) {
kvm_gicd_access(s, offset, &reg, false);
*field = reg;
offset += 4;
field++;
}
}
static void kvm_dist_put_priority(GICv3State *s, uint32_t offset, uint8_t *bmp)
{
uint32_t reg, *field;
int irq;
field = (uint32_t *)bmp;
for_each_dist_irq_reg(irq, s->num_irq, 8) {
reg = *field;
kvm_gicd_access(s, offset, &reg, true);
offset += 4;
field++;
}
}
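/* GICD_ICFGRn holds two configuration bits per interrupt, with the
* edge/level bit in the odd position; the shuffle/unshuffle helpers below
* convert between that layout and the one-bit-per-interrupt edge_trigger
* bitmap kept in the QEMU GICv3 state.
*/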
static void kvm_dist_get_edge_trigger(GICv3State *s, uint32_t offset,
uint32_t *bmp)
{
uint32_t reg;
int irq;
for_each_dist_irq_reg(irq, s->num_irq, 2) {
kvm_gicd_access(s, offset, &reg, false);
reg = half_unshuffle32(reg >> 1);
if (irq % 32 != 0) {
reg = (reg << 16);
}
*gic_bmp_ptr32(bmp, irq) |= reg;
offset += 4;
}
}
static void kvm_dist_put_edge_trigger(GICv3State *s, uint32_t offset,
uint32_t *bmp)
{
uint32_t reg;
int irq;
for_each_dist_irq_reg(irq, s->num_irq, 2) {
reg = *gic_bmp_ptr32(bmp, irq);
if (irq % 32 != 0) {
reg = (reg & 0xffff0000) >> 16;
} else {
reg = reg & 0xffff;
}
reg = half_shuffle32(reg) << 1;
kvm_gicd_access(s, offset, &reg, true);
offset += 4;
}
}
static void kvm_gic_get_line_level_bmp(GICv3State *s, uint32_t *bmp)
{
uint32_t reg;
int irq;
for_each_dist_irq_reg(irq, s->num_irq, 1) {
kvm_gic_line_level_access(s, irq, 0, &reg, false);
*gic_bmp_ptr32(bmp, irq) = reg;
}
}
static void kvm_gic_put_line_level_bmp(GICv3State *s, uint32_t *bmp)
{
uint32_t reg;
int irq;
for_each_dist_irq_reg(irq, s->num_irq, 1) {
reg = *gic_bmp_ptr32(bmp, irq);
kvm_gic_line_level_access(s, irq, 0, &reg, true);
}
}
/* Read a bitmap register group from the kernel VGIC. */
static void kvm_dist_getbmp(GICv3State *s, uint32_t offset, uint32_t *bmp)
{
uint32_t reg;
int irq;
for_each_dist_irq_reg(irq, s->num_irq, 1) {
kvm_gicd_access(s, offset, &reg, false);
*gic_bmp_ptr32(bmp, irq) = reg;
offset += 4;
}
}
static void kvm_dist_putbmp(GICv3State *s, uint32_t offset,
uint32_t clroffset, uint32_t *bmp)
{
uint32_t reg;
int irq;
for_each_dist_irq_reg(irq, s->num_irq, 1) {
/* If this bitmap is a set/clear register pair, first write to the
* clear-reg to clear all bits before using the set-reg to write
* the 1 bits.
*/
if (clroffset != 0) {
reg = 0;
kvm_gicd_access(s, clroffset, &reg, true);
}
reg = *gic_bmp_ptr32(bmp, irq);
kvm_gicd_access(s, offset, &reg, true);
offset += 4;
}
}
static void kvm_arm_gicv3_check(GICv3State *s)
{
uint32_t reg;
uint32_t num_irq;
/* Sanity checking s->num_irq */
kvm_gicd_access(s, GICD_TYPER, &reg, false);
num_irq = ((reg & 0x1f) + 1) * 32;
if (num_irq < s->num_irq) {
error_report("Model requests %u IRQs, but kernel supports max %u",
s->num_irq, num_irq);
abort();
}
}
static void kvm_arm_gicv3_put(GICv3State *s)
{
/* TODO */
DPRINTF("Cannot put kernel gic state, no kernel interface\n");
uint32_t regl, regh, reg;
uint64_t reg64, redist_typer;
int ncpu, i;
kvm_arm_gicv3_check(s);
kvm_gicr_access(s, GICR_TYPER, 0, &regl, false);
kvm_gicr_access(s, GICR_TYPER + 4, 0, &regh, false);
redist_typer = ((uint64_t)regh << 32) | regl;
reg = s->gicd_ctlr;
kvm_gicd_access(s, GICD_CTLR, &reg, true);
if (redist_typer & GICR_TYPER_PLPIS) {
/* Set base addresses before LPIs are enabled by GICR_CTLR write */
for (ncpu = 0; ncpu < s->num_cpu; ncpu++) {
GICv3CPUState *c = &s->cpu[ncpu];
reg64 = c->gicr_propbaser;
regl = (uint32_t)reg64;
kvm_gicr_access(s, GICR_PROPBASER, ncpu, &regl, true);
regh = (uint32_t)(reg64 >> 32);
kvm_gicr_access(s, GICR_PROPBASER + 4, ncpu, &regh, true);
reg64 = c->gicr_pendbaser;
if (!(c->gicr_ctlr & GICR_CTLR_ENABLE_LPIS)) {
/* Setting PTZ is advised if LPIs are disabled, to reduce
* GIC initialization time.
*/
reg64 |= GICR_PENDBASER_PTZ;
}
regl = (uint32_t)reg64;
kvm_gicr_access(s, GICR_PENDBASER, ncpu, &regl, true);
regh = (uint32_t)(reg64 >> 32);
kvm_gicr_access(s, GICR_PENDBASER + 4, ncpu, &regh, true);
}
}
/* Redistributor state (one per CPU) */
for (ncpu = 0; ncpu < s->num_cpu; ncpu++) {
GICv3CPUState *c = &s->cpu[ncpu];
reg = c->gicr_ctlr;
kvm_gicr_access(s, GICR_CTLR, ncpu, &reg, true);
reg = c->gicr_statusr[GICV3_NS];
kvm_gicr_access(s, GICR_STATUSR, ncpu, &reg, true);
reg = c->gicr_waker;
kvm_gicr_access(s, GICR_WAKER, ncpu, &reg, true);
reg = c->gicr_igroupr0;
kvm_gicr_access(s, GICR_IGROUPR0, ncpu, &reg, true);
reg = ~0;
kvm_gicr_access(s, GICR_ICENABLER0, ncpu, &reg, true);
reg = c->gicr_ienabler0;
kvm_gicr_access(s, GICR_ISENABLER0, ncpu, &reg, true);
/* Restore config before pending so we treat level/edge correctly */
reg = half_shuffle32(c->edge_trigger >> 16) << 1;
kvm_gicr_access(s, GICR_ICFGR1, ncpu, &reg, true);
reg = c->level;
kvm_gic_line_level_access(s, 0, ncpu, &reg, true);
reg = ~0;
kvm_gicr_access(s, GICR_ICPENDR0, ncpu, &reg, true);
reg = c->gicr_ipendr0;
kvm_gicr_access(s, GICR_ISPENDR0, ncpu, &reg, true);
reg = ~0;
kvm_gicr_access(s, GICR_ICACTIVER0, ncpu, &reg, true);
reg = c->gicr_iactiver0;
kvm_gicr_access(s, GICR_ISACTIVER0, ncpu, &reg, true);
for (i = 0; i < GIC_INTERNAL; i += 4) {
reg = c->gicr_ipriorityr[i] |
(c->gicr_ipriorityr[i + 1] << 8) |
(c->gicr_ipriorityr[i + 2] << 16) |
(c->gicr_ipriorityr[i + 3] << 24);
kvm_gicr_access(s, GICR_IPRIORITYR + i, ncpu, &reg, true);
}
}
/* Distributor state (shared between all CPUs) */
reg = s->gicd_statusr[GICV3_NS];
kvm_gicd_access(s, GICD_STATUSR, &reg, true);
/* s->enable bitmap -> GICD_ISENABLERn */
kvm_dist_putbmp(s, GICD_ISENABLER, GICD_ICENABLER, s->enabled);
/* s->group bitmap -> GICD_IGROUPRn */
kvm_dist_putbmp(s, GICD_IGROUPR, 0, s->group);
/* Restore targets before pending to ensure the pending state is set on
* the appropriate CPU interfaces in the kernel
*/
/* s->gicd_irouter[irq] -> GICD_IROUTERn
* We can't use kvm_dist_put() here because the registers are 64-bit
*/
for (i = GIC_INTERNAL; i < s->num_irq; i++) {
uint32_t offset;
offset = GICD_IROUTER + (sizeof(uint64_t) * i);
reg = (uint32_t)s->gicd_irouter[i];
kvm_gicd_access(s, offset, &reg, true);
offset = GICD_IROUTER + (sizeof(uint64_t) * i) + 4;
reg = (uint32_t)(s->gicd_irouter[i] >> 32);
kvm_gicd_access(s, offset, &reg, true);
}
/* s->trigger bitmap -> GICD_ICFGRn
* (restore configuration registers before pending IRQs so we treat
* level/edge correctly)
*/
kvm_dist_put_edge_trigger(s, GICD_ICFGR, s->edge_trigger);
/* s->level bitmap -> line_level */
kvm_gic_put_line_level_bmp(s, s->level);
/* s->pending bitmap -> GICD_ISPENDRn */
kvm_dist_putbmp(s, GICD_ISPENDR, GICD_ICPENDR, s->pending);
/* s->active bitmap -> GICD_ISACTIVERn */
kvm_dist_putbmp(s, GICD_ISACTIVER, GICD_ICACTIVER, s->active);
/* s->gicd_ipriority[] -> GICD_IPRIORITYRn */
kvm_dist_put_priority(s, GICD_IPRIORITYR, s->gicd_ipriority);
/* CPU Interface state (one per CPU) */
for (ncpu = 0; ncpu < s->num_cpu; ncpu++) {
GICv3CPUState *c = &s->cpu[ncpu];
int num_pri_bits;
kvm_gicc_access(s, ICC_SRE_EL1, ncpu, &c->icc_sre_el1, true);
kvm_gicc_access(s, ICC_CTLR_EL1, ncpu,
&c->icc_ctlr_el1[GICV3_NS], true);
kvm_gicc_access(s, ICC_IGRPEN0_EL1, ncpu,
&c->icc_igrpen[GICV3_G0], true);
kvm_gicc_access(s, ICC_IGRPEN1_EL1, ncpu,
&c->icc_igrpen[GICV3_G1NS], true);
kvm_gicc_access(s, ICC_PMR_EL1, ncpu, &c->icc_pmr_el1, true);
kvm_gicc_access(s, ICC_BPR0_EL1, ncpu, &c->icc_bpr[GICV3_G0], true);
kvm_gicc_access(s, ICC_BPR1_EL1, ncpu, &c->icc_bpr[GICV3_G1NS], true);
num_pri_bits = ((c->icc_ctlr_el1[GICV3_NS] &
ICC_CTLR_EL1_PRIBITS_MASK) >>
ICC_CTLR_EL1_PRIBITS_SHIFT) + 1;
switch (num_pri_bits) {
case 7:
reg64 = c->icc_apr[GICV3_G0][3];
kvm_gicc_access(s, ICC_AP0R_EL1(3), ncpu, &reg64, true);
reg64 = c->icc_apr[GICV3_G0][2];
kvm_gicc_access(s, ICC_AP0R_EL1(2), ncpu, &reg64, true);
case 6:
reg64 = c->icc_apr[GICV3_G0][1];
kvm_gicc_access(s, ICC_AP0R_EL1(1), ncpu, &reg64, true);
default:
reg64 = c->icc_apr[GICV3_G0][0];
kvm_gicc_access(s, ICC_AP0R_EL1(0), ncpu, &reg64, true);
}
switch (num_pri_bits) {
case 7:
reg64 = c->icc_apr[GICV3_G1NS][3];
kvm_gicc_access(s, ICC_AP1R_EL1(3), ncpu, &reg64, true);
reg64 = c->icc_apr[GICV3_G1NS][2];
kvm_gicc_access(s, ICC_AP1R_EL1(2), ncpu, &reg64, true);
case 6:
reg64 = c->icc_apr[GICV3_G1NS][1];
kvm_gicc_access(s, ICC_AP1R_EL1(1), ncpu, &reg64, true);
default:
reg64 = c->icc_apr[GICV3_G1NS][0];
kvm_gicc_access(s, ICC_AP1R_EL1(0), ncpu, &reg64, true);
}
}
}
static void kvm_arm_gicv3_get(GICv3State *s)
{
/* TODO */
DPRINTF("Cannot get kernel gic state, no kernel interface\n");
uint32_t regl, regh, reg;
uint64_t reg64, redist_typer;
int ncpu, i;
kvm_arm_gicv3_check(s);
kvm_gicr_access(s, GICR_TYPER, 0, &regl, false);
kvm_gicr_access(s, GICR_TYPER + 4, 0, &regh, false);
redist_typer = ((uint64_t)regh << 32) | regl;
kvm_gicd_access(s, GICD_CTLR, &reg, false);
s->gicd_ctlr = reg;
/* Redistributor state (one per CPU) */
for (ncpu = 0; ncpu < s->num_cpu; ncpu++) {
GICv3CPUState *c = &s->cpu[ncpu];
kvm_gicr_access(s, GICR_CTLR, ncpu, &reg, false);
c->gicr_ctlr = reg;
kvm_gicr_access(s, GICR_STATUSR, ncpu, &reg, false);
c->gicr_statusr[GICV3_NS] = reg;
kvm_gicr_access(s, GICR_WAKER, ncpu, &reg, false);
c->gicr_waker = reg;
kvm_gicr_access(s, GICR_IGROUPR0, ncpu, &reg, false);
c->gicr_igroupr0 = reg;
kvm_gicr_access(s, GICR_ISENABLER0, ncpu, &reg, false);
c->gicr_ienabler0 = reg;
kvm_gicr_access(s, GICR_ICFGR1, ncpu, &reg, false);
c->edge_trigger = half_unshuffle32(reg >> 1) << 16;
kvm_gic_line_level_access(s, 0, ncpu, &reg, false);
c->level = reg;
kvm_gicr_access(s, GICR_ISPENDR0, ncpu, &reg, false);
c->gicr_ipendr0 = reg;
kvm_gicr_access(s, GICR_ISACTIVER0, ncpu, &reg, false);
c->gicr_iactiver0 = reg;
for (i = 0; i < GIC_INTERNAL; i += 4) {
kvm_gicr_access(s, GICR_IPRIORITYR + i, ncpu, &reg, false);
c->gicr_ipriorityr[i] = extract32(reg, 0, 8);
c->gicr_ipriorityr[i + 1] = extract32(reg, 8, 8);
c->gicr_ipriorityr[i + 2] = extract32(reg, 16, 8);
c->gicr_ipriorityr[i + 3] = extract32(reg, 24, 8);
}
}
if (redist_typer & GICR_TYPER_PLPIS) {
for (ncpu = 0; ncpu < s->num_cpu; ncpu++) {
GICv3CPUState *c = &s->cpu[ncpu];
kvm_gicr_access(s, GICR_PROPBASER, ncpu, &regl, false);
kvm_gicr_access(s, GICR_PROPBASER + 4, ncpu, &regh, false);
c->gicr_propbaser = ((uint64_t)regh << 32) | regl;
kvm_gicr_access(s, GICR_PENDBASER, ncpu, &regl, false);
kvm_gicr_access(s, GICR_PENDBASER + 4, ncpu, &regh, false);
c->gicr_pendbaser = ((uint64_t)regh << 32) | regl;
}
}
/* Distributor state (shared between all CPUs) */
kvm_gicd_access(s, GICD_STATUSR, &reg, false);
s->gicd_statusr[GICV3_NS] = reg;
/* GICD_IGROUPRn -> s->group bitmap */
kvm_dist_getbmp(s, GICD_IGROUPR, s->group);
/* GICD_ISENABLERn -> s->enabled bitmap */
kvm_dist_getbmp(s, GICD_ISENABLER, s->enabled);
/* Line level of irq */
kvm_gic_get_line_level_bmp(s, s->level);
/* GICD_ISPENDRn -> s->pending bitmap */
kvm_dist_getbmp(s, GICD_ISPENDR, s->pending);
/* GICD_ISACTIVERn -> s->active bitmap */
kvm_dist_getbmp(s, GICD_ISACTIVER, s->active);
/* GICD_ICFGRn -> s->trigger bitmap */
kvm_dist_get_edge_trigger(s, GICD_ICFGR, s->edge_trigger);
/* GICD_IPRIORITYRn -> s->gicd_ipriority[] */
kvm_dist_get_priority(s, GICD_IPRIORITYR, s->gicd_ipriority);
/* GICD_IROUTERn -> s->gicd_irouter[irq] */
for (i = GIC_INTERNAL; i < s->num_irq; i++) {
uint32_t offset;
offset = GICD_IROUTER + (sizeof(uint64_t) * i);
kvm_gicd_access(s, offset, &regl, false);
offset = GICD_IROUTER + (sizeof(uint64_t) * i) + 4;
kvm_gicd_access(s, offset, &regh, false);
s->gicd_irouter[i] = ((uint64_t)regh << 32) | regl;
}
/*****************************************************************
* CPU Interface(s) State
*/
for (ncpu = 0; ncpu < s->num_cpu; ncpu++) {
GICv3CPUState *c = &s->cpu[ncpu];
int num_pri_bits;
kvm_gicc_access(s, ICC_SRE_EL1, ncpu, &c->icc_sre_el1, false);
kvm_gicc_access(s, ICC_CTLR_EL1, ncpu,
&c->icc_ctlr_el1[GICV3_NS], false);
kvm_gicc_access(s, ICC_IGRPEN0_EL1, ncpu,
&c->icc_igrpen[GICV3_G0], false);
kvm_gicc_access(s, ICC_IGRPEN1_EL1, ncpu,
&c->icc_igrpen[GICV3_G1NS], false);
kvm_gicc_access(s, ICC_PMR_EL1, ncpu, &c->icc_pmr_el1, false);
kvm_gicc_access(s, ICC_BPR0_EL1, ncpu, &c->icc_bpr[GICV3_G0], false);
kvm_gicc_access(s, ICC_BPR1_EL1, ncpu, &c->icc_bpr[GICV3_G1NS], false);
num_pri_bits = ((c->icc_ctlr_el1[GICV3_NS] &
ICC_CTLR_EL1_PRIBITS_MASK) >>
ICC_CTLR_EL1_PRIBITS_SHIFT) + 1;
switch (num_pri_bits) {
case 7:
kvm_gicc_access(s, ICC_AP0R_EL1(3), ncpu, &reg64, false);
c->icc_apr[GICV3_G0][3] = reg64;
kvm_gicc_access(s, ICC_AP0R_EL1(2), ncpu, &reg64, false);
c->icc_apr[GICV3_G0][2] = reg64;
case 6:
kvm_gicc_access(s, ICC_AP0R_EL1(1), ncpu, &reg64, false);
c->icc_apr[GICV3_G0][1] = reg64;
default:
kvm_gicc_access(s, ICC_AP0R_EL1(0), ncpu, &reg64, false);
c->icc_apr[GICV3_G0][0] = reg64;
}
switch (num_pri_bits) {
case 7:
kvm_gicc_access(s, ICC_AP1R_EL1(3), ncpu, &reg64, false);
c->icc_apr[GICV3_G1NS][3] = reg64;
kvm_gicc_access(s, ICC_AP1R_EL1(2), ncpu, &reg64, false);
c->icc_apr[GICV3_G1NS][2] = reg64;
case 6:
kvm_gicc_access(s, ICC_AP1R_EL1(1), ncpu, &reg64, false);
c->icc_apr[GICV3_G1NS][1] = reg64;
default:
kvm_gicc_access(s, ICC_AP1R_EL1(0), ncpu, &reg64, false);
c->icc_apr[GICV3_G1NS][0] = reg64;
}
}
}
static void arm_gicv3_icc_reset(CPUARMState *env, const ARMCPRegInfo *ri)
{
ARMCPU *cpu;
GICv3State *s;
GICv3CPUState *c;
c = (GICv3CPUState *)env->gicv3state;
s = c->gic;
cpu = ARM_CPU(c->cpu);
/* Initialize to actual HW supported configuration */
kvm_device_access(s->dev_fd, KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS,
KVM_VGIC_ATTR(ICC_CTLR_EL1, cpu->mp_affinity),
&c->icc_ctlr_el1[GICV3_NS], false);
c->icc_ctlr_el1[GICV3_S] = c->icc_ctlr_el1[GICV3_NS];
c->icc_pmr_el1 = 0;
c->icc_bpr[GICV3_G0] = GIC_MIN_BPR;
c->icc_bpr[GICV3_G1] = GIC_MIN_BPR;
c->icc_bpr[GICV3_G1NS] = GIC_MIN_BPR;
c->icc_sre_el1 = 0x7;
memset(c->icc_apr, 0, sizeof(c->icc_apr));
memset(c->icc_igrpen, 0, sizeof(c->icc_igrpen));
}
static void kvm_arm_gicv3_reset(DeviceState *dev)
@@ -77,9 +638,43 @@ static void kvm_arm_gicv3_reset(DeviceState *dev)
DPRINTF("Reset\n");
kgc->parent_reset(dev);
if (s->migration_blocker) {
DPRINTF("Cannot put kernel gic state, no kernel interface\n");
return;
}
kvm_arm_gicv3_put(s);
}
/*
* The CPU interface registers of the GIC need to be reset on CPU reset.
* To get arm_gicv3_icc_reset() called on CPU reset we register the
* ARMCPRegInfo below. Since we reset the whole CPU interface from a single
* register's reset hook, we define only one CPU interface register here
* instead of defining all of them.
*/
static const ARMCPRegInfo gicv3_cpuif_reginfo[] = {
{ .name = "ICC_CTLR_EL1", .state = ARM_CP_STATE_BOTH,
.opc0 = 3, .opc1 = 0, .crn = 12, .crm = 12, .opc2 = 4,
/*
* If ARM_CP_NOP is used, resetfn is not called, so ARM_CP_NO_RAW
* is the appropriate type.
*/
.type = ARM_CP_NO_RAW,
.access = PL1_RW,
.readfn = arm_cp_read_zero,
.writefn = arm_cp_write_ignore,
/*
* We hang the whole cpu interface reset routine off here
* rather than parcelling it out into one little function
* per register
*/
.resetfn = arm_gicv3_icc_reset,
},
REGINFO_SENTINEL
};
static void kvm_arm_gicv3_realize(DeviceState *dev, Error **errp)
{
GICv3State *s = KVM_ARM_GICV3(dev);
@@ -103,16 +698,10 @@ static void kvm_arm_gicv3_realize(DeviceState *dev, Error **errp)
gicv3_init_irqs_and_mmio(s, kvm_arm_gicv3_set_irq, NULL);
/* Block migration of a KVM GICv3 device: the API for saving and restoring
* the state in the kernel is not yet finalised in the kernel or
* implemented in QEMU.
*/
error_setg(&s->migration_blocker, "vGICv3 migration is not implemented");
migrate_add_blocker(s->migration_blocker, &local_err);
if (local_err) {
error_propagate(errp, local_err);
error_free(s->migration_blocker);
return;
for (i = 0; i < s->num_cpu; i++) {
ARMCPU *cpu = ARM_CPU(qemu_get_cpu(i));
define_arm_cp_regs(cpu, gicv3_cpuif_reginfo);
}
/* Try to create the device via the device control API */
@@ -145,6 +734,18 @@ static void kvm_arm_gicv3_realize(DeviceState *dev, Error **errp)
kvm_irqchip_commit_routes(kvm_state);
}
if (!kvm_device_check_attr(s->dev_fd, KVM_DEV_ARM_VGIC_GRP_DIST_REGS,
GICD_CTLR)) {
error_setg(&s->migration_blocker, "This operating system kernel does "
"not support vGICv3 migration");
migrate_add_blocker(s->migration_blocker, &local_err);
if (local_err) {
error_propagate(errp, local_err);
error_free(s->migration_blocker);
return;
}
}
}
static void kvm_arm_gicv3_class_init(ObjectClass *klass, void *data)

View File

@@ -17,8 +17,8 @@
#include "hw/sysbus.h"
#include "qemu/timer.h"
#include "hw/arm/arm.h"
#include "hw/arm/armv7m_nvic.h"
#include "target/arm/cpu.h"
#include "exec/address-spaces.h"
#include "qemu/log.h"
#include "trace.h"
@@ -47,7 +47,6 @@
* "exception" more or less interchangeably.
*/
#define NVIC_FIRST_IRQ 16
#define NVIC_MAX_VECTORS 512
#define NVIC_MAX_IRQ (NVIC_MAX_VECTORS - NVIC_FIRST_IRQ)
/* Effective running priority of the CPU when no exception is active
@@ -55,116 +54,10 @@
*/
#define NVIC_NOEXC_PRIO 0x100
typedef struct VecInfo {
/* Exception priorities can range from -3 to 255; only the unmodifiable
* priority values for RESET, NMI and HardFault can be negative.
*/
int16_t prio;
uint8_t enabled;
uint8_t pending;
uint8_t active;
uint8_t level; /* exceptions <=15 never set level */
} VecInfo;
typedef struct NVICState {
/*< private >*/
SysBusDevice parent_obj;
/*< public >*/
ARMCPU *cpu;
VecInfo vectors[NVIC_MAX_VECTORS];
uint32_t prigroup;
/* vectpending and exception_prio are both cached state that can
* be recalculated from the vectors[] array and the prigroup field.
*/
unsigned int vectpending; /* highest prio pending enabled exception */
int exception_prio; /* group prio of the highest prio active exception */
struct {
uint32_t control;
uint32_t reload;
int64_t tick;
QEMUTimer *timer;
} systick;
MemoryRegion sysregmem;
MemoryRegion container;
uint32_t num_irq;
qemu_irq excpout;
qemu_irq sysresetreq;
} NVICState;
#define TYPE_NVIC "armv7m_nvic"
#define NVIC(obj) \
OBJECT_CHECK(NVICState, (obj), TYPE_NVIC)
static const uint8_t nvic_id[] = {
0x00, 0xb0, 0x1b, 0x00, 0x0d, 0xe0, 0x05, 0xb1
};
/* qemu timers run at 1GHz. We want something closer to 1MHz. */
#define SYSTICK_SCALE 1000ULL
#define SYSTICK_ENABLE (1 << 0)
#define SYSTICK_TICKINT (1 << 1)
#define SYSTICK_CLKSOURCE (1 << 2)
#define SYSTICK_COUNTFLAG (1 << 16)
int system_clock_scale;
/* Conversion factor from qemu timer to SysTick frequencies. */
static inline int64_t systick_scale(NVICState *s)
{
if (s->systick.control & SYSTICK_CLKSOURCE)
return system_clock_scale;
else
return 1000;
}
static void systick_reload(NVICState *s, int reset)
{
/* The Cortex-M3 Devices Generic User Guide says that "When the
* ENABLE bit is set to 1, the counter loads the RELOAD value from the
* SYST RVR register and then counts down". So, we need to check the
* ENABLE bit before reloading the value.
*/
if ((s->systick.control & SYSTICK_ENABLE) == 0) {
return;
}
if (reset)
s->systick.tick = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
s->systick.tick += (s->systick.reload + 1) * systick_scale(s);
timer_mod(s->systick.timer, s->systick.tick);
}
static void systick_timer_tick(void * opaque)
{
NVICState *s = (NVICState *)opaque;
s->systick.control |= SYSTICK_COUNTFLAG;
if (s->systick.control & SYSTICK_TICKINT) {
/* Trigger the interrupt. */
armv7m_nvic_set_pending(s, ARMV7M_EXCP_SYSTICK);
}
if (s->systick.reload == 0) {
s->systick.control &= ~SYSTICK_ENABLE;
} else {
systick_reload(s, 0);
}
}
static void systick_reset(NVICState *s)
{
s->systick.control = 0;
s->systick.reload = 0;
s->systick.tick = 0;
timer_del(s->systick.timer);
}
static int nvic_pending_prio(NVICState *s)
{
/* return the priority of the current pending interrupt,
@@ -510,30 +403,6 @@ static uint32_t nvic_readl(NVICState *s, uint32_t offset)
switch (offset) {
case 4: /* Interrupt Control Type. */
return ((s->num_irq - NVIC_FIRST_IRQ) / 32) - 1;
case 0x10: /* SysTick Control and Status. */
val = s->systick.control;
s->systick.control &= ~SYSTICK_COUNTFLAG;
return val;
case 0x14: /* SysTick Reload Value. */
return s->systick.reload;
case 0x18: /* SysTick Current Value. */
{
int64_t t;
if ((s->systick.control & SYSTICK_ENABLE) == 0)
return 0;
t = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
if (t >= s->systick.tick)
return 0;
val = ((s->systick.tick - (t + 1)) / systick_scale(s)) + 1;
/* The interrupt is triggered when the timer reaches zero.
However, the counter is not reloaded until the next clock
tick. This is a hack to return zero during the first tick. */
if (val > s->systick.reload)
val = 0;
return val;
}
case 0x1c: /* SysTick Calibration Value. */
return 10000;
case 0xd00: /* CPUID Base. */
return cpu->midr;
case 0xd04: /* Interrupt Control State. */
@@ -668,40 +537,8 @@ static uint32_t nvic_readl(NVICState *s, uint32_t offset)
static void nvic_writel(NVICState *s, uint32_t offset, uint32_t value)
{
ARMCPU *cpu = s->cpu;
uint32_t oldval;
switch (offset) {
case 0x10: /* SysTick Control and Status. */
oldval = s->systick.control;
s->systick.control &= 0xfffffff8;
s->systick.control |= value & 7;
if ((oldval ^ value) & SYSTICK_ENABLE) {
int64_t now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
if (value & SYSTICK_ENABLE) {
if (s->systick.tick) {
s->systick.tick += now;
timer_mod(s->systick.timer, s->systick.tick);
} else {
systick_reload(s, 1);
}
} else {
timer_del(s->systick.timer);
s->systick.tick -= now;
if (s->systick.tick < 0)
s->systick.tick = 0;
}
} else if ((oldval ^ value) & SYSTICK_CLKSOURCE) {
/* This is a hack. Force the timer to be reloaded
when the reference clock is changed. */
systick_reload(s, 1);
}
break;
case 0x14: /* SysTick Reload Value. */
s->systick.reload = value;
break;
case 0x18: /* SysTick Current Value. Writes reload the timer. */
systick_reload(s, 1);
s->systick.control &= ~SYSTICK_COUNTFLAG;
break;
case 0xd04: /* Interrupt Control State. */
if (value & (1 << 31)) {
armv7m_nvic_set_pending(s, ARMV7M_EXCP_NMI);
@@ -1000,16 +837,12 @@ static const VMStateDescription vmstate_VecInfo = {
static const VMStateDescription vmstate_nvic = {
.name = "armv7m_nvic",
.version_id = 3,
.minimum_version_id = 3,
.version_id = 4,
.minimum_version_id = 4,
.post_load = &nvic_post_load,
.fields = (VMStateField[]) {
VMSTATE_STRUCT_ARRAY(vectors, NVICState, NVIC_MAX_VECTORS, 1,
vmstate_VecInfo, VecInfo),
VMSTATE_UINT32(systick.control, NVICState),
VMSTATE_UINT32(systick.reload, NVICState),
VMSTATE_INT64(systick.tick, NVICState),
VMSTATE_TIMER_PTR(systick.timer, NVICState),
VMSTATE_UINT32(prigroup, NVICState),
VMSTATE_END_OF_LIST()
}
@@ -1047,13 +880,26 @@ static void armv7m_nvic_reset(DeviceState *dev)
s->exception_prio = NVIC_NOEXC_PRIO;
s->vectpending = 0;
}
systick_reset(s);
static void nvic_systick_trigger(void *opaque, int n, int level)
{
NVICState *s = opaque;
if (level) {
/* SysTick just asked us to pend its exception.
* (This is different from an external interrupt line's
* behaviour.)
*/
armv7m_nvic_set_pending(s, ARMV7M_EXCP_SYSTICK);
}
}
static void armv7m_nvic_realize(DeviceState *dev, Error **errp)
{
NVICState *s = NVIC(dev);
SysBusDevice *systick_sbd;
Error *err = NULL;
s->cpu = ARM_CPU(qemu_get_cpu(0));
assert(s->cpu);
@@ -1068,10 +914,19 @@ static void armv7m_nvic_realize(DeviceState *dev, Error **errp)
/* include space for internal exception vectors */
s->num_irq += NVIC_FIRST_IRQ;
object_property_set_bool(OBJECT(&s->systick), true, "realized", &err);
if (err != NULL) {
error_propagate(errp, err);
return;
}
systick_sbd = SYS_BUS_DEVICE(&s->systick);
sysbus_connect_irq(systick_sbd, 0,
qdev_get_gpio_in_named(dev, "systick-trigger", 0));
/* The NVIC and System Control Space (SCS) starts at 0xe000e000
* and looks like this:
* 0x004 - ICTR
* 0x010 - 0x1c - systick
* 0x010 - 0xff - systick
* 0x100..0x7ec - NVIC
* 0x7f0..0xcff - Reserved
* 0xd00..0xd3c - SCS registers
@@ -1089,12 +944,11 @@ static void armv7m_nvic_realize(DeviceState *dev, Error **errp)
memory_region_init_io(&s->sysregmem, OBJECT(s), &nvic_sysreg_ops, s,
"nvic_sysregs", 0x1000);
memory_region_add_subregion(&s->container, 0, &s->sysregmem);
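/* The SysTick register block is overlaid on the NVIC sysreg region at
* offset 0x10 with higher priority, so its registers take precedence there.
*/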
memory_region_add_subregion_overlap(&s->container, 0x10,
sysbus_mmio_get_region(systick_sbd, 0),
1);
/* Map the whole thing into system memory at the location required
* by the v7M architecture.
*/
memory_region_add_subregion(get_system_memory(), 0xe000e000, &s->container);
s->systick.timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, systick_timer_tick, s);
sysbus_init_mmio(SYS_BUS_DEVICE(dev), &s->container);
}
static void armv7m_nvic_instance_init(Object *obj)
@@ -1109,8 +963,12 @@ static void armv7m_nvic_instance_init(Object *obj)
NVICState *nvic = NVIC(obj);
SysBusDevice *sbd = SYS_BUS_DEVICE(obj);
object_initialize(&nvic->systick, sizeof(nvic->systick), TYPE_SYSTICK);
qdev_set_parent_bus(DEVICE(&nvic->systick), sysbus_get_default());
sysbus_init_irq(sbd, &nvic->excpout);
qdev_init_gpio_out_named(dev, &nvic->sysresetreq, "SYSRESETREQ", 1);
qdev_init_gpio_in_named(dev, nvic_systick_trigger, "systick-trigger", 1);
}
static void armv7m_nvic_class_init(ObjectClass *klass, void *data)

View File

@@ -138,6 +138,7 @@
#define ICC_CTLR_EL1_EOIMODE (1U << 1)
#define ICC_CTLR_EL1_PMHE (1U << 6)
#define ICC_CTLR_EL1_PRIBITS_SHIFT 8
#define ICC_CTLR_EL1_PRIBITS_MASK (7U << ICC_CTLR_EL1_PRIBITS_SHIFT)
#define ICC_CTLR_EL1_IDBITS_SHIFT 11
#define ICC_CTLR_EL1_SEIS (1U << 14)
#define ICC_CTLR_EL1_A3V (1U << 15)
@@ -407,4 +408,6 @@ static inline void gicv3_cache_all_target_cpustates(GICv3State *s)
}
}
void gicv3_set_gicv3state(CPUState *cpu, GICv3CPUState *s);
#endif /* QEMU_ARM_GICV3_INTERNAL_H */

View File

@@ -49,40 +49,41 @@ int xics_get_cpu_index_by_dt_id(int cpu_dt_id)
return -1;
}
void xics_cpu_destroy(XICSState *xics, PowerPCCPU *cpu)
void xics_cpu_destroy(XICSFabric *xi, PowerPCCPU *cpu)
{
CPUState *cs = CPU(cpu);
ICPState *ss = &xics->ss[cs->cpu_index];
ICPState *icp = xics_icp_get(xi, cs->cpu_index);
assert(cs->cpu_index < xics->nr_servers);
assert(cs == ss->cs);
assert(icp);
assert(cs == icp->cs);
ss->output = NULL;
ss->cs = NULL;
icp->output = NULL;
icp->cs = NULL;
}
void xics_cpu_setup(XICSState *xics, PowerPCCPU *cpu)
void xics_cpu_setup(XICSFabric *xi, PowerPCCPU *cpu)
{
CPUState *cs = CPU(cpu);
CPUPPCState *env = &cpu->env;
ICPState *ss = &xics->ss[cs->cpu_index];
XICSStateClass *info = XICS_COMMON_GET_CLASS(xics);
ICPState *icp = xics_icp_get(xi, cs->cpu_index);
ICPStateClass *icpc;
assert(cs->cpu_index < xics->nr_servers);
assert(icp);
ss->cs = cs;
icp->cs = cs;
if (info->cpu_setup) {
info->cpu_setup(xics, cpu);
icpc = ICP_GET_CLASS(icp);
if (icpc->cpu_setup) {
icpc->cpu_setup(icp, cpu);
}
switch (PPC_INPUT(env)) {
case PPC_FLAGS_INPUT_POWER7:
ss->output = env->irq_inputs[POWER7_INPUT_INT];
icp->output = env->irq_inputs[POWER7_INPUT_INT];
break;
case PPC_FLAGS_INPUT_970:
ss->output = env->irq_inputs[PPC970_INPUT_INT];
icp->output = env->irq_inputs[PPC970_INPUT_INT];
break;
default:
@@ -92,185 +93,43 @@ void xics_cpu_setup(XICSState *xics, PowerPCCPU *cpu)
}
}
static void xics_common_pic_print_info(InterruptStatsProvider *obj,
Monitor *mon)
void icp_pic_print_info(ICPState *icp, Monitor *mon)
{
int cpu_index = icp->cs ? icp->cs->cpu_index : -1;
if (!icp->output) {
return;
}
monitor_printf(mon, "CPU %d XIRR=%08x (%p) PP=%02x MFRR=%02x\n",
cpu_index, icp->xirr, icp->xirr_owner,
icp->pending_priority, icp->mfrr);
}
void ics_pic_print_info(ICSState *ics, Monitor *mon)
{
XICSState *xics = XICS_COMMON(obj);
ICSState *ics;
uint32_t i;
for (i = 0; i < xics->nr_servers; i++) {
ICPState *icp = &xics->ss[i];
monitor_printf(mon, "ICS %4x..%4x %p\n",
ics->offset, ics->offset + ics->nr_irqs - 1, ics);
if (!icp->output) {
if (!ics->irqs) {
return;
}
for (i = 0; i < ics->nr_irqs; i++) {
ICSIRQState *irq = ics->irqs + i;
if (!(irq->flags & XICS_FLAGS_IRQ_MASK)) {
continue;
}
monitor_printf(mon, "CPU %d XIRR=%08x (%p) PP=%02x MFRR=%02x\n",
i, icp->xirr, icp->xirr_owner,
icp->pending_priority, icp->mfrr);
}
QLIST_FOREACH(ics, &xics->ics, list) {
monitor_printf(mon, "ICS %4x..%4x %p\n",
ics->offset, ics->offset + ics->nr_irqs - 1, ics);
if (!ics->irqs) {
continue;
}
for (i = 0; i < ics->nr_irqs; i++) {
ICSIRQState *irq = ics->irqs + i;
if (!(irq->flags & XICS_FLAGS_IRQ_MASK)) {
continue;
}
monitor_printf(mon, " %4x %s %02x %02x\n",
ics->offset + i,
(irq->flags & XICS_FLAGS_IRQ_LSI) ?
"LSI" : "MSI",
irq->priority, irq->status);
}
monitor_printf(mon, " %4x %s %02x %02x\n",
ics->offset + i,
(irq->flags & XICS_FLAGS_IRQ_LSI) ?
"LSI" : "MSI",
irq->priority, irq->status);
}
}
/*
* XICS Common class - parent for emulated XICS and KVM-XICS
*/
static void xics_common_reset(DeviceState *d)
{
XICSState *xics = XICS_COMMON(d);
ICSState *ics;
int i;
for (i = 0; i < xics->nr_servers; i++) {
device_reset(DEVICE(&xics->ss[i]));
}
QLIST_FOREACH(ics, &xics->ics, list) {
device_reset(DEVICE(ics));
}
}
static void xics_prop_get_nr_irqs(Object *obj, Visitor *v, const char *name,
void *opaque, Error **errp)
{
XICSState *xics = XICS_COMMON(obj);
int64_t value = xics->nr_irqs;
visit_type_int(v, name, &value, errp);
}
static void xics_prop_set_nr_irqs(Object *obj, Visitor *v, const char *name,
void *opaque, Error **errp)
{
XICSState *xics = XICS_COMMON(obj);
XICSStateClass *info = XICS_COMMON_GET_CLASS(xics);
Error *error = NULL;
int64_t value;
visit_type_int(v, name, &value, &error);
if (error) {
error_propagate(errp, error);
return;
}
if (xics->nr_irqs) {
error_setg(errp, "Number of interrupts is already set to %u",
xics->nr_irqs);
return;
}
assert(info->set_nr_irqs);
info->set_nr_irqs(xics, value, errp);
}
void xics_set_nr_servers(XICSState *xics, uint32_t nr_servers,
const char *typename, Error **errp)
{
int i;
xics->nr_servers = nr_servers;
xics->ss = g_malloc0(xics->nr_servers * sizeof(ICPState));
for (i = 0; i < xics->nr_servers; i++) {
char name[32];
ICPState *icp = &xics->ss[i];
object_initialize(icp, sizeof(*icp), typename);
snprintf(name, sizeof(name), "icp[%d]", i);
object_property_add_child(OBJECT(xics), name, OBJECT(icp), errp);
icp->xics = xics;
}
}
static void xics_prop_get_nr_servers(Object *obj, Visitor *v,
const char *name, void *opaque,
Error **errp)
{
XICSState *xics = XICS_COMMON(obj);
int64_t value = xics->nr_servers;
visit_type_int(v, name, &value, errp);
}
static void xics_prop_set_nr_servers(Object *obj, Visitor *v,
const char *name, void *opaque,
Error **errp)
{
XICSState *xics = XICS_COMMON(obj);
XICSStateClass *xsc = XICS_COMMON_GET_CLASS(xics);
Error *error = NULL;
int64_t value;
visit_type_int(v, name, &value, &error);
if (error) {
error_propagate(errp, error);
return;
}
if (xics->nr_servers) {
error_setg(errp, "Number of servers is already set to %u",
xics->nr_servers);
return;
}
assert(xsc->set_nr_servers);
xsc->set_nr_servers(xics, value, errp);
}
static void xics_common_initfn(Object *obj)
{
XICSState *xics = XICS_COMMON(obj);
QLIST_INIT(&xics->ics);
object_property_add(obj, "nr_irqs", "int",
xics_prop_get_nr_irqs, xics_prop_set_nr_irqs,
NULL, NULL, NULL);
object_property_add(obj, "nr_servers", "int",
xics_prop_get_nr_servers, xics_prop_set_nr_servers,
NULL, NULL, NULL);
}
static void xics_common_class_init(ObjectClass *oc, void *data)
{
DeviceClass *dc = DEVICE_CLASS(oc);
InterruptStatsProviderClass *ic = INTERRUPT_STATS_PROVIDER_CLASS(oc);
dc->reset = xics_common_reset;
ic->print_info = xics_common_pic_print_info;
}
static const TypeInfo xics_common_info = {
.name = TYPE_XICS_COMMON,
.parent = TYPE_SYS_BUS_DEVICE,
.instance_size = sizeof(XICSState),
.class_size = sizeof(XICSStateClass),
.instance_init = xics_common_initfn,
.class_init = xics_common_class_init,
.interfaces = (InterfaceInfo[]) {
{ TYPE_INTERRUPT_STATS_PROVIDER },
{ }
},
};
/*
* ICP: Presentation layer
*/
@@ -278,8 +137,8 @@ static const TypeInfo xics_common_info = {
#define XISR_MASK 0x00ffffff
#define CPPR_MASK 0xff000000
#define XISR(ss) (((ss)->xirr) & XISR_MASK)
#define CPPR(ss) (((ss)->xirr) >> 24)
#define XISR(icp) (((icp)->xirr) & XISR_MASK)
#define CPPR(icp) (((icp)->xirr) >> 24)
static void ics_reject(ICSState *ics, uint32_t nr)
{
@@ -290,7 +149,7 @@ static void ics_reject(ICSState *ics, uint32_t nr)
}
}
static void ics_resend(ICSState *ics)
void ics_resend(ICSState *ics)
{
ICSStateClass *k = ICS_BASE_GET_CLASS(ics);
@@ -308,151 +167,152 @@ static void ics_eoi(ICSState *ics, int nr)
}
}
static void icp_check_ipi(ICPState *ss)
static void icp_check_ipi(ICPState *icp)
{
if (XISR(ss) && (ss->pending_priority <= ss->mfrr)) {
if (XISR(icp) && (icp->pending_priority <= icp->mfrr)) {
return;
}
trace_xics_icp_check_ipi(ss->cs->cpu_index, ss->mfrr);
trace_xics_icp_check_ipi(icp->cs->cpu_index, icp->mfrr);
if (XISR(ss) && ss->xirr_owner) {
ics_reject(ss->xirr_owner, XISR(ss));
if (XISR(icp) && icp->xirr_owner) {
ics_reject(icp->xirr_owner, XISR(icp));
}
ss->xirr = (ss->xirr & ~XISR_MASK) | XICS_IPI;
ss->pending_priority = ss->mfrr;
ss->xirr_owner = NULL;
qemu_irq_raise(ss->output);
icp->xirr = (icp->xirr & ~XISR_MASK) | XICS_IPI;
icp->pending_priority = icp->mfrr;
icp->xirr_owner = NULL;
qemu_irq_raise(icp->output);
}
static void icp_resend(ICPState *ss)
void icp_resend(ICPState *icp)
{
ICSState *ics;
XICSFabric *xi = icp->xics;
XICSFabricClass *xic = XICS_FABRIC_GET_CLASS(xi);
if (ss->mfrr < CPPR(ss)) {
icp_check_ipi(ss);
}
QLIST_FOREACH(ics, &ss->xics->ics, list) {
ics_resend(ics);
if (icp->mfrr < CPPR(icp)) {
icp_check_ipi(icp);
}
xic->ics_resend(xi);
}
void icp_set_cppr(ICPState *ss, uint8_t cppr)
void icp_set_cppr(ICPState *icp, uint8_t cppr)
{
uint8_t old_cppr;
uint32_t old_xisr;
old_cppr = CPPR(ss);
ss->xirr = (ss->xirr & ~CPPR_MASK) | (cppr << 24);
old_cppr = CPPR(icp);
icp->xirr = (icp->xirr & ~CPPR_MASK) | (cppr << 24);
if (cppr < old_cppr) {
if (XISR(ss) && (cppr <= ss->pending_priority)) {
old_xisr = XISR(ss);
ss->xirr &= ~XISR_MASK; /* Clear XISR */
ss->pending_priority = 0xff;
qemu_irq_lower(ss->output);
if (ss->xirr_owner) {
ics_reject(ss->xirr_owner, old_xisr);
ss->xirr_owner = NULL;
if (XISR(icp) && (cppr <= icp->pending_priority)) {
old_xisr = XISR(icp);
icp->xirr &= ~XISR_MASK; /* Clear XISR */
icp->pending_priority = 0xff;
qemu_irq_lower(icp->output);
if (icp->xirr_owner) {
ics_reject(icp->xirr_owner, old_xisr);
icp->xirr_owner = NULL;
}
}
} else {
if (!XISR(ss)) {
icp_resend(ss);
if (!XISR(icp)) {
icp_resend(icp);
}
}
}
void icp_set_mfrr(ICPState *ss, uint8_t mfrr)
void icp_set_mfrr(ICPState *icp, uint8_t mfrr)
{
ss->mfrr = mfrr;
if (mfrr < CPPR(ss)) {
icp_check_ipi(ss);
icp->mfrr = mfrr;
if (mfrr < CPPR(icp)) {
icp_check_ipi(icp);
}
}
uint32_t icp_accept(ICPState *ss)
uint32_t icp_accept(ICPState *icp)
{
uint32_t xirr = ss->xirr;
uint32_t xirr = icp->xirr;
qemu_irq_lower(ss->output);
ss->xirr = ss->pending_priority << 24;
ss->pending_priority = 0xff;
ss->xirr_owner = NULL;
qemu_irq_lower(icp->output);
icp->xirr = icp->pending_priority << 24;
icp->pending_priority = 0xff;
icp->xirr_owner = NULL;
trace_xics_icp_accept(xirr, ss->xirr);
trace_xics_icp_accept(xirr, icp->xirr);
return xirr;
}
uint32_t icp_ipoll(ICPState *ss, uint32_t *mfrr)
uint32_t icp_ipoll(ICPState *icp, uint32_t *mfrr)
{
if (mfrr) {
*mfrr = ss->mfrr;
*mfrr = icp->mfrr;
}
return ss->xirr;
return icp->xirr;
}
void icp_eoi(ICPState *ss, uint32_t xirr)
void icp_eoi(ICPState *icp, uint32_t xirr)
{
XICSFabric *xi = icp->xics;
XICSFabricClass *xic = XICS_FABRIC_GET_CLASS(xi);
ICSState *ics;
uint32_t irq;
/* Send EOI -> ICS */
ss->xirr = (ss->xirr & ~CPPR_MASK) | (xirr & CPPR_MASK);
trace_xics_icp_eoi(ss->cs->cpu_index, xirr, ss->xirr);
icp->xirr = (icp->xirr & ~CPPR_MASK) | (xirr & CPPR_MASK);
trace_xics_icp_eoi(icp->cs->cpu_index, xirr, icp->xirr);
irq = xirr & XISR_MASK;
QLIST_FOREACH(ics, &ss->xics->ics, list) {
if (ics_valid_irq(ics, irq)) {
ics_eoi(ics, irq);
}
ics = xic->ics_get(xi, irq);
if (ics) {
ics_eoi(ics, irq);
}
if (!XISR(ss)) {
icp_resend(ss);
if (!XISR(icp)) {
icp_resend(icp);
}
}
static void icp_irq(ICSState *ics, int server, int nr, uint8_t priority)
{
XICSState *xics = ics->xics;
ICPState *ss = xics->ss + server;
ICPState *icp = xics_icp_get(ics->xics, server);
trace_xics_icp_irq(server, nr, priority);
if ((priority >= CPPR(ss))
|| (XISR(ss) && (ss->pending_priority <= priority))) {
if ((priority >= CPPR(icp))
|| (XISR(icp) && (icp->pending_priority <= priority))) {
ics_reject(ics, nr);
} else {
if (XISR(ss) && ss->xirr_owner) {
ics_reject(ss->xirr_owner, XISR(ss));
ss->xirr_owner = NULL;
if (XISR(icp) && icp->xirr_owner) {
ics_reject(icp->xirr_owner, XISR(icp));
icp->xirr_owner = NULL;
}
ss->xirr = (ss->xirr & ~XISR_MASK) | (nr & XISR_MASK);
ss->xirr_owner = ics;
ss->pending_priority = priority;
trace_xics_icp_raise(ss->xirr, ss->pending_priority);
qemu_irq_raise(ss->output);
icp->xirr = (icp->xirr & ~XISR_MASK) | (nr & XISR_MASK);
icp->xirr_owner = ics;
icp->pending_priority = priority;
trace_xics_icp_raise(icp->xirr, icp->pending_priority);
qemu_irq_raise(icp->output);
}
}
static void icp_dispatch_pre_save(void *opaque)
{
ICPState *ss = opaque;
ICPStateClass *info = ICP_GET_CLASS(ss);
ICPState *icp = opaque;
ICPStateClass *info = ICP_GET_CLASS(icp);
if (info->pre_save) {
info->pre_save(ss);
info->pre_save(icp);
}
}
static int icp_dispatch_post_load(void *opaque, int version_id)
{
ICPState *ss = opaque;
ICPStateClass *info = ICP_GET_CLASS(ss);
ICPState *icp = opaque;
ICPStateClass *info = ICP_GET_CLASS(icp);
if (info->post_load) {
return info->post_load(ss, version_id);
return info->post_load(icp, version_id);
}
return 0;
@@ -485,12 +345,30 @@ static void icp_reset(DeviceState *dev)
qemu_set_irq(icp->output, 0);
}
static void icp_realize(DeviceState *dev, Error **errp)
{
ICPState *icp = ICP(dev);
Object *obj;
Error *err = NULL;
obj = object_property_get_link(OBJECT(dev), "xics", &err);
if (!obj) {
error_setg(errp, "%s: required link 'xics' not found: %s",
__func__, error_get_pretty(err));
return;
}
icp->xics = XICS_FABRIC(obj);
}
static void icp_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
dc->reset = icp_reset;
dc->vmsd = &vmstate_icp_server;
dc->realize = icp_realize;
}
static const TypeInfo icp_info = {
@@ -663,17 +541,6 @@ static void ics_simple_reset(DeviceState *dev)
}
}
static int ics_simple_post_load(ICSState *ics, int version_id)
{
int i;
for (i = 0; i < ics->xics->nr_servers; i++) {
icp_resend(&ics->xics->ss[i]);
}
return 0;
}
static void ics_simple_dispatch_pre_save(void *opaque)
{
ICSState *ics = opaque;
@@ -746,15 +613,20 @@ static void ics_simple_realize(DeviceState *dev, Error **errp)
ics->qirqs = qemu_allocate_irqs(ics_simple_set_irq, ics, ics->nr_irqs);
}
static Property ics_simple_properties[] = {
DEFINE_PROP_UINT32("nr-irqs", ICSState, nr_irqs, 0),
DEFINE_PROP_END_OF_LIST(),
};
static void ics_simple_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
ICSStateClass *isc = ICS_BASE_CLASS(klass);
dc->realize = ics_simple_realize;
isc->realize = ics_simple_realize;
dc->props = ics_simple_properties;
dc->vmsd = &vmstate_ics_simple;
dc->reset = ics_simple_reset;
isc->post_load = ics_simple_post_load;
isc->reject = ics_simple_reject;
isc->resend = ics_simple_resend;
isc->eoi = ics_simple_eoi;
@@ -769,32 +641,56 @@ static const TypeInfo ics_simple_info = {
.instance_init = ics_simple_initfn,
};
static void ics_base_realize(DeviceState *dev, Error **errp)
{
ICSStateClass *icsc = ICS_BASE_GET_CLASS(dev);
ICSState *ics = ICS_BASE(dev);
Object *obj;
Error *err = NULL;
obj = object_property_get_link(OBJECT(dev), "xics", &err);
if (!obj) {
error_setg(errp, "%s: required link 'xics' not found: %s",
__func__, error_get_pretty(err));
return;
}
ics->xics = XICS_FABRIC(obj);
if (icsc->realize) {
icsc->realize(dev, errp);
}
}
static void ics_base_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
dc->realize = ics_base_realize;
}
static const TypeInfo ics_base_info = {
.name = TYPE_ICS_BASE,
.parent = TYPE_DEVICE,
.abstract = true,
.instance_size = sizeof(ICSState),
.class_init = ics_base_class_init,
.class_size = sizeof(ICSStateClass),
};
static const TypeInfo xics_fabric_info = {
.name = TYPE_XICS_FABRIC,
.parent = TYPE_INTERFACE,
.class_size = sizeof(XICSFabricClass),
};
/*
* Exported functions
*/
ICSState *xics_find_source(XICSState *xics, int irq)
qemu_irq xics_get_qirq(XICSFabric *xi, int irq)
{
ICSState *ics;
QLIST_FOREACH(ics, &xics->ics, list) {
if (ics_valid_irq(ics, irq)) {
return ics;
}
}
return NULL;
}
qemu_irq xics_get_qirq(XICSState *xics, int irq)
{
ICSState *ics = xics_find_source(xics, irq);
XICSFabricClass *xic = XICS_FABRIC_GET_CLASS(xi);
ICSState *ics = xic->ics_get(xi, irq);
if (ics) {
return ics->qirqs[irq - ics->offset];
@@ -803,6 +699,13 @@ qemu_irq xics_get_qirq(XICSState *xics, int irq)
return NULL;
}
ICPState *xics_icp_get(XICSFabric *xi, int server)
{
XICSFabricClass *xic = XICS_FABRIC_GET_CLASS(xi);
return xic->icp_get(xi, server);
}
void ics_set_irq_type(ICSState *ics, int srcno, bool lsi)
{
assert(!(ics->irqs[srcno].flags & XICS_FLAGS_IRQ_MASK));
@@ -813,10 +716,10 @@ void ics_set_irq_type(ICSState *ics, int srcno, bool lsi)
static void xics_register_types(void)
{
type_register_static(&xics_common_info);
type_register_static(&ics_simple_info);
type_register_static(&ics_base_info);
type_register_static(&icp_info);
type_register_static(&xics_fabric_info);
}
type_init(xics_register_types)


@@ -40,16 +40,12 @@
#include <sys/ioctl.h>
typedef struct KVMXICSState {
XICSState parent_obj;
int kernel_xics_fd;
} KVMXICSState;
static int kernel_xics_fd = -1;
/*
* ICP-KVM
*/
static void icp_get_kvm_state(ICPState *ss)
static void icp_get_kvm_state(ICPState *icp)
{
uint64_t state;
struct kvm_one_reg reg = {
@@ -59,25 +55,25 @@ static void icp_get_kvm_state(ICPState *ss)
int ret;
/* ICP for this CPU thread is not in use, exiting */
if (!ss->cs) {
if (!icp->cs) {
return;
}
ret = kvm_vcpu_ioctl(ss->cs, KVM_GET_ONE_REG, &reg);
ret = kvm_vcpu_ioctl(icp->cs, KVM_GET_ONE_REG, &reg);
if (ret != 0) {
error_report("Unable to retrieve KVM interrupt controller state"
" for CPU %ld: %s", kvm_arch_vcpu_id(ss->cs), strerror(errno));
" for CPU %ld: %s", kvm_arch_vcpu_id(icp->cs), strerror(errno));
exit(1);
}
ss->xirr = state >> KVM_REG_PPC_ICP_XISR_SHIFT;
ss->mfrr = (state >> KVM_REG_PPC_ICP_MFRR_SHIFT)
icp->xirr = state >> KVM_REG_PPC_ICP_XISR_SHIFT;
icp->mfrr = (state >> KVM_REG_PPC_ICP_MFRR_SHIFT)
& KVM_REG_PPC_ICP_MFRR_MASK;
ss->pending_priority = (state >> KVM_REG_PPC_ICP_PPRI_SHIFT)
icp->pending_priority = (state >> KVM_REG_PPC_ICP_PPRI_SHIFT)
& KVM_REG_PPC_ICP_PPRI_MASK;
}
static int icp_set_kvm_state(ICPState *ss, int version_id)
static int icp_set_kvm_state(ICPState *icp, int version_id)
{
uint64_t state;
struct kvm_one_reg reg = {
@@ -87,18 +83,18 @@ static int icp_set_kvm_state(ICPState *ss, int version_id)
int ret;
/* ICP for this CPU thread is not in use, exiting */
if (!ss->cs) {
if (!icp->cs) {
return 0;
}
state = ((uint64_t)ss->xirr << KVM_REG_PPC_ICP_XISR_SHIFT)
| ((uint64_t)ss->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT)
| ((uint64_t)ss->pending_priority << KVM_REG_PPC_ICP_PPRI_SHIFT);
state = ((uint64_t)icp->xirr << KVM_REG_PPC_ICP_XISR_SHIFT)
| ((uint64_t)icp->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT)
| ((uint64_t)icp->pending_priority << KVM_REG_PPC_ICP_PPRI_SHIFT);
ret = kvm_vcpu_ioctl(ss->cs, KVM_SET_ONE_REG, &reg);
ret = kvm_vcpu_ioctl(icp->cs, KVM_SET_ONE_REG, &reg);
if (ret != 0) {
error_report("Unable to restore KVM interrupt controller state (0x%"
PRIx64 ") for CPU %ld: %s", state, kvm_arch_vcpu_id(ss->cs),
PRIx64 ") for CPU %ld: %s", state, kvm_arch_vcpu_id(icp->cs),
strerror(errno));
return ret;
}
@@ -122,6 +118,34 @@ static void icp_kvm_reset(DeviceState *dev)
icp_set_kvm_state(icp, 1);
}
static void icp_kvm_cpu_setup(ICPState *icp, PowerPCCPU *cpu)
{
CPUState *cs = CPU(cpu);
int ret;
if (kernel_xics_fd == -1) {
abort();
}
/*
* If we are reusing a parked vCPU fd corresponding to the CPU
* which was hot-removed earlier, we don't have to re-enable
* KVM_CAP_IRQ_XICS capability again.
*/
if (icp->cap_irq_xics_enabled) {
return;
}
ret = kvm_vcpu_enable_cap(cs, KVM_CAP_IRQ_XICS, 0, kernel_xics_fd,
kvm_arch_vcpu_id(cs));
if (ret < 0) {
error_report("Unable to connect CPU%ld to kernel XICS: %s",
kvm_arch_vcpu_id(cs), strerror(errno));
exit(1);
}
icp->cap_irq_xics_enabled = true;
}
static void icp_kvm_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
@@ -130,6 +154,7 @@ static void icp_kvm_class_init(ObjectClass *klass, void *data)
dc->reset = icp_kvm_reset;
icpc->pre_save = icp_get_kvm_state;
icpc->post_load = icp_set_kvm_state;
icpc->cpu_setup = icp_kvm_cpu_setup;
}
static const TypeInfo icp_kvm_info = {
@@ -145,7 +170,6 @@ static const TypeInfo icp_kvm_info = {
*/
static void ics_get_kvm_state(ICSState *ics)
{
KVMXICSState *xicskvm = XICS_SPAPR_KVM(ics->xics);
uint64_t state;
struct kvm_device_attr attr = {
.flags = 0,
@@ -160,7 +184,7 @@ static void ics_get_kvm_state(ICSState *ics)
attr.attr = i + ics->offset;
ret = ioctl(xicskvm->kernel_xics_fd, KVM_GET_DEVICE_ATTR, &attr);
ret = ioctl(kernel_xics_fd, KVM_GET_DEVICE_ATTR, &attr);
if (ret != 0) {
error_report("Unable to retrieve KVM interrupt controller state"
" for IRQ %d: %s", i + ics->offset, strerror(errno));
@@ -204,7 +228,6 @@ static void ics_get_kvm_state(ICSState *ics)
static int ics_set_kvm_state(ICSState *ics, int version_id)
{
KVMXICSState *xicskvm = XICS_SPAPR_KVM(ics->xics);
uint64_t state;
struct kvm_device_attr attr = {
.flags = 0,
@@ -238,7 +261,7 @@ static int ics_set_kvm_state(ICSState *ics, int version_id)
}
}
ret = ioctl(xicskvm->kernel_xics_fd, KVM_SET_DEVICE_ATTR, &attr);
ret = ioctl(kernel_xics_fd, KVM_SET_DEVICE_ATTR, &attr);
if (ret != 0) {
error_report("Unable to restore KVM interrupt controller state"
" for IRQs %d: %s", i + ics->offset, strerror(errno));
@@ -308,7 +331,7 @@ static void ics_kvm_class_init(ObjectClass *klass, void *data)
DeviceClass *dc = DEVICE_CLASS(klass);
ICSStateClass *icsc = ICS_BASE_CLASS(klass);
dc->realize = ics_kvm_realize;
icsc->realize = ics_kvm_realize;
dc->reset = ics_kvm_reset;
icsc->pre_save = ics_get_kvm_state;
icsc->post_load = ics_set_kvm_state;
@@ -324,57 +347,6 @@ static const TypeInfo ics_kvm_info = {
/*
* XICS-KVM
*/
static void xics_kvm_cpu_setup(XICSState *xics, PowerPCCPU *cpu)
{
CPUState *cs;
ICPState *ss;
KVMXICSState *xicskvm = XICS_SPAPR_KVM(xics);
int ret;
cs = CPU(cpu);
ss = &xics->ss[cs->cpu_index];
assert(cs->cpu_index < xics->nr_servers);
if (xicskvm->kernel_xics_fd == -1) {
abort();
}
/*
* If we are reusing a parked vCPU fd corresponding to the CPU
* which was hot-removed earlier, we don't have to re-enable
* KVM_CAP_IRQ_XICS capability again.
*/
if (ss->cap_irq_xics_enabled) {
return;
}
ret = kvm_vcpu_enable_cap(cs, KVM_CAP_IRQ_XICS, 0, xicskvm->kernel_xics_fd,
kvm_arch_vcpu_id(cs));
if (ret < 0) {
error_report("Unable to connect CPU%ld to kernel XICS: %s",
kvm_arch_vcpu_id(cs), strerror(errno));
exit(1);
}
ss->cap_irq_xics_enabled = true;
}
static void xics_kvm_set_nr_irqs(XICSState *xics, uint32_t nr_irqs,
Error **errp)
{
ICSState *ics = QLIST_FIRST(&xics->ics);
/* This needs to be deprecated ... */
xics->nr_irqs = nr_irqs;
if (ics) {
ics->nr_irqs = nr_irqs;
}
}
static void xics_kvm_set_nr_servers(XICSState *xics, uint32_t nr_servers,
Error **errp)
{
xics_set_nr_servers(xics, nr_servers, TYPE_KVM_ICP, errp);
}
static void rtas_dummy(PowerPCCPU *cpu, sPAPRMachineState *spapr,
uint32_t token,
@@ -385,13 +357,9 @@ static void rtas_dummy(PowerPCCPU *cpu, sPAPRMachineState *spapr,
__func__);
}
static void xics_kvm_realize(DeviceState *dev, Error **errp)
int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
{
KVMXICSState *xicskvm = XICS_SPAPR_KVM(dev);
XICSState *xics = XICS_COMMON(dev);
ICSState *ics;
int i, rc;
Error *error = NULL;
int rc;
struct kvm_create_device xics_create_device = {
.type = KVM_DEV_TYPE_XICS,
.flags = 0,
@@ -439,72 +407,24 @@ static void xics_kvm_realize(DeviceState *dev, Error **errp)
goto fail;
}
xicskvm->kernel_xics_fd = xics_create_device.fd;
QLIST_FOREACH(ics, &xics->ics, list) {
object_property_set_bool(OBJECT(ics), true, "realized", &error);
if (error) {
error_propagate(errp, error);
goto fail;
}
}
assert(xics->nr_servers);
for (i = 0; i < xics->nr_servers; i++) {
object_property_set_bool(OBJECT(&xics->ss[i]), true, "realized",
&error);
if (error) {
error_propagate(errp, error);
goto fail;
}
}
kernel_xics_fd = xics_create_device.fd;
kvm_kernel_irqchip = true;
kvm_msi_via_irqfd_allowed = true;
kvm_gsi_direct_mapping = true;
return;
return rc;
fail:
kvmppc_define_rtas_kernel_token(0, "ibm,set-xive");
kvmppc_define_rtas_kernel_token(0, "ibm,get-xive");
kvmppc_define_rtas_kernel_token(0, "ibm,int-on");
kvmppc_define_rtas_kernel_token(0, "ibm,int-off");
return -1;
}
static void xics_kvm_initfn(Object *obj)
{
XICSState *xics = XICS_COMMON(obj);
ICSState *ics;
ics = ICS_SIMPLE(object_new(TYPE_ICS_KVM));
object_property_add_child(obj, "ics", OBJECT(ics), NULL);
ics->xics = xics;
QLIST_INSERT_HEAD(&xics->ics, ics, list);
}
static void xics_kvm_class_init(ObjectClass *oc, void *data)
{
DeviceClass *dc = DEVICE_CLASS(oc);
XICSStateClass *xsc = XICS_COMMON_CLASS(oc);
dc->realize = xics_kvm_realize;
xsc->cpu_setup = xics_kvm_cpu_setup;
xsc->set_nr_irqs = xics_kvm_set_nr_irqs;
xsc->set_nr_servers = xics_kvm_set_nr_servers;
}
static const TypeInfo xics_spapr_kvm_info = {
.name = TYPE_XICS_SPAPR_KVM,
.parent = TYPE_XICS_COMMON,
.instance_size = sizeof(KVMXICSState),
.class_init = xics_kvm_class_init,
.instance_init = xics_kvm_initfn,
};
static void xics_kvm_register_types(void)
{
type_register_static(&xics_spapr_kvm_info);
type_register_static(&ics_kvm_info);
type_register_static(&icp_kvm_info);
}


@@ -44,7 +44,7 @@ static target_ulong h_cppr(PowerPCCPU *cpu, sPAPRMachineState *spapr,
target_ulong opcode, target_ulong *args)
{
CPUState *cs = CPU(cpu);
ICPState *icp = &spapr->xics->ss[cs->cpu_index];
ICPState *icp = xics_icp_get(XICS_FABRIC(spapr), cs->cpu_index);
target_ulong cppr = args[0];
icp_set_cppr(icp, cppr);
@@ -56,12 +56,13 @@ static target_ulong h_ipi(PowerPCCPU *cpu, sPAPRMachineState *spapr,
{
target_ulong server = xics_get_cpu_index_by_dt_id(args[0]);
target_ulong mfrr = args[1];
ICPState *icp = xics_icp_get(XICS_FABRIC(spapr), server);
if (server >= spapr->xics->nr_servers) {
if (!icp) {
return H_PARAMETER;
}
icp_set_mfrr(spapr->xics->ss + server, mfrr);
icp_set_mfrr(icp, mfrr);
return H_SUCCESS;
}
@@ -69,7 +70,7 @@ static target_ulong h_xirr(PowerPCCPU *cpu, sPAPRMachineState *spapr,
target_ulong opcode, target_ulong *args)
{
CPUState *cs = CPU(cpu);
ICPState *icp = &spapr->xics->ss[cs->cpu_index];
ICPState *icp = xics_icp_get(XICS_FABRIC(spapr), cs->cpu_index);
uint32_t xirr = icp_accept(icp);
args[0] = xirr;
@@ -80,7 +81,7 @@ static target_ulong h_xirr_x(PowerPCCPU *cpu, sPAPRMachineState *spapr,
target_ulong opcode, target_ulong *args)
{
CPUState *cs = CPU(cpu);
ICPState *icp = &spapr->xics->ss[cs->cpu_index];
ICPState *icp = xics_icp_get(XICS_FABRIC(spapr), cs->cpu_index);
uint32_t xirr = icp_accept(icp);
args[0] = xirr;
@@ -92,7 +93,7 @@ static target_ulong h_eoi(PowerPCCPU *cpu, sPAPRMachineState *spapr,
target_ulong opcode, target_ulong *args)
{
CPUState *cs = CPU(cpu);
ICPState *icp = &spapr->xics->ss[cs->cpu_index];
ICPState *icp = xics_icp_get(XICS_FABRIC(spapr), cs->cpu_index);
target_ulong xirr = args[0];
icp_eoi(icp, xirr);
@@ -103,7 +104,7 @@ static target_ulong h_ipoll(PowerPCCPU *cpu, sPAPRMachineState *spapr,
target_ulong opcode, target_ulong *args)
{
CPUState *cs = CPU(cpu);
ICPState *icp = &spapr->xics->ss[cs->cpu_index];
ICPState *icp = xics_icp_get(XICS_FABRIC(spapr), cs->cpu_index);
uint32_t mfrr;
uint32_t xirr = icp_ipoll(icp, &mfrr);
@@ -118,7 +119,7 @@ static void rtas_set_xive(PowerPCCPU *cpu, sPAPRMachineState *spapr,
uint32_t nargs, target_ulong args,
uint32_t nret, target_ulong rets)
{
ICSState *ics = QLIST_FIRST(&spapr->xics->ics);
ICSState *ics = spapr->ics;
uint32_t nr, srcno, server, priority;
if ((nargs != 3) || (nret != 1)) {
@@ -134,7 +135,7 @@ static void rtas_set_xive(PowerPCCPU *cpu, sPAPRMachineState *spapr,
server = xics_get_cpu_index_by_dt_id(rtas_ld(args, 1));
priority = rtas_ld(args, 2);
if (!ics_valid_irq(ics, nr) || (server >= ics->xics->nr_servers)
if (!ics_valid_irq(ics, nr) || !xics_icp_get(XICS_FABRIC(spapr), server)
|| (priority > 0xff)) {
rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
return;
@@ -151,7 +152,7 @@ static void rtas_get_xive(PowerPCCPU *cpu, sPAPRMachineState *spapr,
uint32_t nargs, target_ulong args,
uint32_t nret, target_ulong rets)
{
ICSState *ics = QLIST_FIRST(&spapr->xics->ics);
ICSState *ics = spapr->ics;
uint32_t nr, srcno;
if ((nargs != 1) || (nret != 3)) {
@@ -181,7 +182,7 @@ static void rtas_int_off(PowerPCCPU *cpu, sPAPRMachineState *spapr,
uint32_t nargs, target_ulong args,
uint32_t nret, target_ulong rets)
{
ICSState *ics = QLIST_FIRST(&spapr->xics->ics);
ICSState *ics = spapr->ics;
uint32_t nr, srcno;
if ((nargs != 1) || (nret != 1)) {
@@ -212,7 +213,7 @@ static void rtas_int_on(PowerPCCPU *cpu, sPAPRMachineState *spapr,
uint32_t nargs, target_ulong args,
uint32_t nret, target_ulong rets)
{
ICSState *ics = QLIST_FIRST(&spapr->xics->ics);
ICSState *ics = spapr->ics;
uint32_t nr, srcno;
if ((nargs != 1) || (nret != 1)) {
@@ -239,36 +240,8 @@ static void rtas_int_on(PowerPCCPU *cpu, sPAPRMachineState *spapr,
rtas_st(rets, 0, RTAS_OUT_SUCCESS);
}
static void xics_spapr_set_nr_irqs(XICSState *xics, uint32_t nr_irqs,
Error **errp)
int xics_spapr_init(sPAPRMachineState *spapr, Error **errp)
{
ICSState *ics = QLIST_FIRST(&xics->ics);
/* This needs to be deprecated ... */
xics->nr_irqs = nr_irqs;
if (ics) {
ics->nr_irqs = nr_irqs;
}
}
static void xics_spapr_set_nr_servers(XICSState *xics, uint32_t nr_servers,
Error **errp)
{
xics_set_nr_servers(xics, nr_servers, TYPE_ICP, errp);
}
static void xics_spapr_realize(DeviceState *dev, Error **errp)
{
XICSState *xics = XICS_SPAPR(dev);
ICSState *ics;
Error *error = NULL;
int i;
if (!xics->nr_servers) {
error_setg(errp, "Number of servers needs to be greater 0");
return;
}
/* Registration of global state belongs into realize */
spapr_rtas_register(RTAS_IBM_SET_XIVE, "ibm,set-xive", rtas_set_xive);
spapr_rtas_register(RTAS_IBM_GET_XIVE, "ibm,get-xive", rtas_get_xive);
@@ -281,55 +254,9 @@ static void xics_spapr_realize(DeviceState *dev, Error **errp)
spapr_register_hypercall(H_XIRR_X, h_xirr_x);
spapr_register_hypercall(H_EOI, h_eoi);
spapr_register_hypercall(H_IPOLL, h_ipoll);
QLIST_FOREACH(ics, &xics->ics, list) {
object_property_set_bool(OBJECT(ics), true, "realized", &error);
if (error) {
error_propagate(errp, error);
return;
}
}
for (i = 0; i < xics->nr_servers; i++) {
object_property_set_bool(OBJECT(&xics->ss[i]), true, "realized",
&error);
if (error) {
error_propagate(errp, error);
return;
}
}
return 0;
}
static void xics_spapr_initfn(Object *obj)
{
XICSState *xics = XICS_SPAPR(obj);
ICSState *ics;
ics = ICS_SIMPLE(object_new(TYPE_ICS_SIMPLE));
object_property_add_child(obj, "ics", OBJECT(ics), NULL);
ics->xics = xics;
QLIST_INSERT_HEAD(&xics->ics, ics, list);
}
static void xics_spapr_class_init(ObjectClass *oc, void *data)
{
DeviceClass *dc = DEVICE_CLASS(oc);
XICSStateClass *xsc = XICS_SPAPR_CLASS(oc);
dc->realize = xics_spapr_realize;
xsc->set_nr_irqs = xics_spapr_set_nr_irqs;
xsc->set_nr_servers = xics_spapr_set_nr_servers;
}
static const TypeInfo xics_spapr_info = {
.name = TYPE_XICS_SPAPR,
.parent = TYPE_XICS_COMMON,
.instance_size = sizeof(XICSState),
.class_size = sizeof(XICSStateClass),
.class_init = xics_spapr_class_init,
.instance_init = xics_spapr_initfn,
};
#define ICS_IRQ_FREE(ics, srcno) \
(!((ics)->irqs[(srcno)].flags & (XICS_FLAGS_IRQ_MASK)))
@@ -354,9 +281,8 @@ static int ics_find_free_block(ICSState *ics, int num, int alignnum)
return -1;
}
int xics_spapr_alloc(XICSState *xics, int irq_hint, bool lsi, Error **errp)
int spapr_ics_alloc(ICSState *ics, int irq_hint, bool lsi, Error **errp)
{
ICSState *ics = QLIST_FIRST(&xics->ics);
int irq;
if (!ics) {
@@ -387,10 +313,9 @@ int xics_spapr_alloc(XICSState *xics, int irq_hint, bool lsi, Error **errp)
* Allocate block of consecutive IRQs, and return the number of the first IRQ in
* the block. If align==true, aligns the first IRQ number to num.
*/
int xics_spapr_alloc_block(XICSState *xics, int num, bool lsi, bool align,
Error **errp)
int spapr_ics_alloc_block(ICSState *ics, int num, bool lsi,
bool align, Error **errp)
{
ICSState *ics = QLIST_FIRST(&xics->ics);
int i, first = -1;
if (!ics) {
@@ -440,20 +365,18 @@ static void ics_free(ICSState *ics, int srcno, int num)
}
}
void xics_spapr_free(XICSState *xics, int irq, int num)
void spapr_ics_free(ICSState *ics, int irq, int num)
{
ICSState *ics = xics_find_source(xics, irq);
if (ics) {
if (ics_valid_irq(ics, irq)) {
trace_xics_ics_free(0, irq, num);
ics_free(ics, irq - ics->offset, num);
}
}
void spapr_dt_xics(XICSState *xics, void *fdt, uint32_t phandle)
void spapr_dt_xics(int nr_servers, void *fdt, uint32_t phandle)
{
uint32_t interrupt_server_ranges_prop[] = {
0, cpu_to_be32(xics->nr_servers),
0, cpu_to_be32(nr_servers),
};
int node;
@@ -470,10 +393,3 @@ void spapr_dt_xics(XICSState *xics, void *fdt, uint32_t phandle)
_FDT(fdt_setprop_cell(fdt, node, "linux,phandle", phandle));
_FDT(fdt_setprop_cell(fdt, node, "phandle", phandle));
}
static void xics_spapr_register_types(void)
{
type_register_static(&xics_spapr_info);
}
type_init(xics_spapr_register_types)


@@ -141,9 +141,17 @@ static void rtas_nvram_store(PowerPCCPU *cpu, sPAPRMachineState *spapr,
static void spapr_nvram_realize(VIOsPAPRDevice *dev, Error **errp)
{
sPAPRNVRAM *nvram = VIO_SPAPR_NVRAM(dev);
int ret;
if (nvram->blk) {
nvram->size = blk_getlength(nvram->blk);
ret = blk_set_perm(nvram->blk,
BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE,
BLK_PERM_ALL, errp);
if (ret < 0) {
return;
}
} else {
nvram->size = DEFAULT_NVRAM_SIZE;
}


@@ -1530,6 +1530,34 @@ static const pci_class_desc pci_class_descriptions[] =
{ 0, NULL}
};
static void pci_for_each_device_under_bus_reverse(PCIBus *bus,
void (*fn)(PCIBus *b,
PCIDevice *d,
void *opaque),
void *opaque)
{
PCIDevice *d;
int devfn;
for (devfn = 0; devfn < ARRAY_SIZE(bus->devices); devfn++) {
d = bus->devices[ARRAY_SIZE(bus->devices) - 1 - devfn];
if (d) {
fn(bus, d, opaque);
}
}
}
void pci_for_each_device_reverse(PCIBus *bus, int bus_num,
void (*fn)(PCIBus *b, PCIDevice *d, void *opaque),
void *opaque)
{
bus = pci_find_bus_nr(bus, bus_num);
if (bus) {
pci_for_each_device_under_bus_reverse(bus, fn, opaque);
}
}
static void pci_for_each_device_under_bus(PCIBus *bus,
void (*fn)(PCIBus *b, PCIDevice *d,
void *opaque),


@@ -63,6 +63,7 @@
#include "qemu/error-report.h"
#include "trace.h"
#include "hw/nmi.h"
#include "hw/intc/intc.h"
#include "hw/compat.h"
#include "qemu/cutils.h"
@@ -95,37 +96,68 @@
#define HTAB_SIZE(spapr) (1ULL << ((spapr)->htab_shift))
static XICSState *try_create_xics(const char *type, int nr_servers,
int nr_irqs, Error **errp)
static int try_create_xics(sPAPRMachineState *spapr, const char *type_ics,
const char *type_icp, int nr_servers,
int nr_irqs, Error **errp)
{
Error *err = NULL;
DeviceState *dev;
XICSFabric *xi = XICS_FABRIC(spapr);
Error *err = NULL, *local_err = NULL;
ICSState *ics = NULL;
int i;
dev = qdev_create(NULL, type);
qdev_prop_set_uint32(dev, "nr_servers", nr_servers);
qdev_prop_set_uint32(dev, "nr_irqs", nr_irqs);
object_property_set_bool(OBJECT(dev), true, "realized", &err);
ics = ICS_SIMPLE(object_new(type_ics));
qdev_set_parent_bus(DEVICE(ics), sysbus_get_default());
object_property_add_child(OBJECT(spapr), "ics", OBJECT(ics), NULL);
object_property_set_int(OBJECT(ics), nr_irqs, "nr-irqs", &err);
object_property_add_const_link(OBJECT(ics), "xics", OBJECT(xi), NULL);
object_property_set_bool(OBJECT(ics), true, "realized", &local_err);
error_propagate(&err, local_err);
if (err) {
error_propagate(errp, err);
object_unparent(OBJECT(dev));
return NULL;
goto error;
}
return XICS_COMMON(dev);
spapr->icps = g_malloc0(nr_servers * sizeof(ICPState));
spapr->nr_servers = nr_servers;
for (i = 0; i < nr_servers; i++) {
ICPState *icp = &spapr->icps[i];
object_initialize(icp, sizeof(*icp), type_icp);
qdev_set_parent_bus(DEVICE(icp), sysbus_get_default());
object_property_add_child(OBJECT(spapr), "icp[*]", OBJECT(icp), NULL);
object_property_add_const_link(OBJECT(icp), "xics", OBJECT(xi), NULL);
object_property_set_bool(OBJECT(icp), true, "realized", &err);
if (err) {
goto error;
}
object_unref(OBJECT(icp));
}
spapr->ics = ics;
return 0;
error:
error_propagate(errp, err);
if (ics) {
object_unparent(OBJECT(ics));
}
return -1;
}
static XICSState *xics_system_init(MachineState *machine,
int nr_servers, int nr_irqs, Error **errp)
static int xics_system_init(MachineState *machine,
int nr_servers, int nr_irqs, Error **errp)
{
XICSState *xics = NULL;
int rc = -1;
if (kvm_enabled()) {
Error *err = NULL;
if (machine_kernel_irqchip_allowed(machine)) {
xics = try_create_xics(TYPE_XICS_SPAPR_KVM, nr_servers, nr_irqs,
&err);
if (machine_kernel_irqchip_allowed(machine) &&
!xics_kvm_init(SPAPR_MACHINE(machine), errp)) {
rc = try_create_xics(SPAPR_MACHINE(machine), TYPE_ICS_KVM,
TYPE_KVM_ICP, nr_servers, nr_irqs, &err);
}
if (machine_kernel_irqchip_required(machine) && !xics) {
if (machine_kernel_irqchip_required(machine) && rc < 0) {
error_reportf_err(err,
"kernel_irqchip requested but unavailable: ");
} else {
@@ -133,11 +165,13 @@ static XICSState *xics_system_init(MachineState *machine,
}
}
if (!xics) {
xics = try_create_xics(TYPE_XICS_SPAPR, nr_servers, nr_irqs, errp);
if (rc < 0) {
xics_spapr_init(SPAPR_MACHINE(machine), errp);
rc = try_create_xics(SPAPR_MACHINE(machine), TYPE_ICS_SIMPLE,
TYPE_ICP, nr_servers, nr_irqs, errp);
}
return xics;
return rc;
}
static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu,
@@ -924,7 +958,7 @@ static void *spapr_build_fdt(sPAPRMachineState *spapr,
_FDT(fdt_setprop_cell(fdt, 0, "#size-cells", 2));
/* /interrupt controller */
spapr_dt_xics(spapr->xics, fdt, PHANDLE_XICP);
spapr_dt_xics(spapr->nr_servers, fdt, PHANDLE_XICP);
ret = spapr_populate_memory(spapr, fdt);
if (ret < 0) {
@@ -1053,6 +1087,62 @@ static void close_htab_fd(sPAPRMachineState *spapr)
spapr->htab_fd = -1;
}
static hwaddr spapr_hpt_mask(PPCVirtualHypervisor *vhyp)
{
sPAPRMachineState *spapr = SPAPR_MACHINE(vhyp);
return HTAB_SIZE(spapr) / HASH_PTEG_SIZE_64 - 1;
}
static const ppc_hash_pte64_t *spapr_map_hptes(PPCVirtualHypervisor *vhyp,
hwaddr ptex, int n)
{
sPAPRMachineState *spapr = SPAPR_MACHINE(vhyp);
hwaddr pte_offset = ptex * HASH_PTE_SIZE_64;
if (!spapr->htab) {
/*
* HTAB is controlled by KVM. Fetch into temporary buffer
*/
ppc_hash_pte64_t *hptes = g_malloc(n * HASH_PTE_SIZE_64);
kvmppc_read_hptes(hptes, ptex, n);
return hptes;
}
/*
* HTAB is controlled by QEMU. Just point to the internally
* accessible PTEG.
*/
return (const ppc_hash_pte64_t *)(spapr->htab + pte_offset);
}
static void spapr_unmap_hptes(PPCVirtualHypervisor *vhyp,
const ppc_hash_pte64_t *hptes,
hwaddr ptex, int n)
{
sPAPRMachineState *spapr = SPAPR_MACHINE(vhyp);
if (!spapr->htab) {
g_free((void *)hptes);
}
/* Nothing to do for qemu managed HPT */
}
static void spapr_store_hpte(PPCVirtualHypervisor *vhyp, hwaddr ptex,
uint64_t pte0, uint64_t pte1)
{
sPAPRMachineState *spapr = SPAPR_MACHINE(vhyp);
hwaddr offset = ptex * HASH_PTE_SIZE_64;
if (!spapr->htab) {
kvmppc_write_hpte(ptex, pte0, pte1);
} else {
stq_p(spapr->htab + offset, pte0);
stq_p(spapr->htab + offset + HASH_PTE_SIZE_64 / 2, pte1);
}
}
static int spapr_hpt_shift_for_ramsize(uint64_t ramsize)
{
int shift;
@@ -1252,6 +1342,13 @@ static int spapr_post_load(void *opaque, int version_id)
sPAPRMachineState *spapr = (sPAPRMachineState *)opaque;
int err = 0;
if (!object_dynamic_cast(OBJECT(spapr->ics), TYPE_ICS_KVM)) {
int i;
for (i = 0; i < spapr->nr_servers; i++) {
icp_resend(&spapr->icps[i]);
}
}
/* In earlier versions, there was no separate qdev for the PAPR
* RTC, so the RTC offset was stored directly in sPAPREnvironment.
* So when migrating from those versions, poke the incoming offset
@@ -1902,9 +1999,8 @@ static void ppc_spapr_init(MachineState *machine)
load_limit = MIN(spapr->rma_size, RTAS_MAX_ADDR) - FW_OVERHEAD;
/* Set up Interrupt Controller before we create the VCPUs */
spapr->xics = xics_system_init(machine,
DIV_ROUND_UP(max_cpus * smt, smp_threads),
XICS_IRQS_SPAPR, &error_fatal);
xics_system_init(machine, DIV_ROUND_UP(max_cpus * smt, smp_threads),
XICS_IRQS_SPAPR, &error_fatal);
/* Set up containers for ibm,client-set-architecture negotiated options */
spapr->ov5 = spapr_ovec_new();
@@ -2872,6 +2968,40 @@ static void spapr_phb_placement(sPAPRMachineState *spapr, uint32_t index,
*mmio64 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM64_WIN_SIZE;
}
static ICSState *spapr_ics_get(XICSFabric *dev, int irq)
{
sPAPRMachineState *spapr = SPAPR_MACHINE(dev);
return ics_valid_irq(spapr->ics, irq) ? spapr->ics : NULL;
}
static void spapr_ics_resend(XICSFabric *dev)
{
sPAPRMachineState *spapr = SPAPR_MACHINE(dev);
ics_resend(spapr->ics);
}
static ICPState *spapr_icp_get(XICSFabric *xi, int server)
{
sPAPRMachineState *spapr = SPAPR_MACHINE(xi);
return (server < spapr->nr_servers) ? &spapr->icps[server] : NULL;
}
static void spapr_pic_print_info(InterruptStatsProvider *obj,
Monitor *mon)
{
sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
int i;
for (i = 0; i < spapr->nr_servers; i++) {
icp_pic_print_info(&spapr->icps[i], mon);
}
ics_pic_print_info(spapr->ics, mon);
}
static void spapr_machine_class_init(ObjectClass *oc, void *data)
{
MachineClass *mc = MACHINE_CLASS(oc);
@@ -2880,6 +3010,8 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data)
NMIClass *nc = NMI_CLASS(oc);
HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc);
PPCVirtualHypervisorClass *vhc = PPC_VIRTUAL_HYPERVISOR_CLASS(oc);
XICSFabricClass *xic = XICS_FABRIC_CLASS(oc);
InterruptStatsProviderClass *ispc = INTERRUPT_STATS_PROVIDER_CLASS(oc);
mc->desc = "pSeries Logical Partition (PAPR compliant)";
@@ -2891,7 +3023,7 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data)
mc->init = ppc_spapr_init;
mc->reset = ppc_spapr_reset;
mc->block_default_type = IF_SCSI;
mc->max_cpus = 255;
mc->max_cpus = 1024;
mc->no_parallel = 1;
mc->default_boot_order = "";
mc->default_ram_size = 512 * M_BYTE;
@@ -2913,6 +3045,14 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data)
nc->nmi_monitor_handler = spapr_nmi;
smc->phb_placement = spapr_phb_placement;
vhc->hypercall = emulate_spapr_hypercall;
vhc->hpt_mask = spapr_hpt_mask;
vhc->map_hptes = spapr_map_hptes;
vhc->unmap_hptes = spapr_unmap_hptes;
vhc->store_hpte = spapr_store_hpte;
xic->ics_get = spapr_ics_get;
xic->ics_resend = spapr_ics_resend;
xic->icp_get = spapr_icp_get;
ispc->print_info = spapr_pic_print_info;
}
static const TypeInfo spapr_machine_info = {
@@ -2929,6 +3069,8 @@ static const TypeInfo spapr_machine_info = {
{ TYPE_NMI },
{ TYPE_HOTPLUG_HANDLER },
{ TYPE_PPC_VIRTUAL_HYPERVISOR },
{ TYPE_XICS_FABRIC },
{ TYPE_INTERRUPT_STATS_PROVIDER },
{ }
},
};

View File

@@ -13,10 +13,12 @@
#include "hw/boards.h"
#include "qapi/error.h"
#include "sysemu/cpus.h"
#include "sysemu/kvm.h"
#include "target/ppc/kvm_ppc.h"
#include "hw/ppc/ppc.h"
#include "target/ppc/mmu-hash64.h"
#include "sysemu/numa.h"
#include "qemu/error-report.h"
static void spapr_cpu_reset(void *opaque)
{
@@ -34,15 +36,26 @@ static void spapr_cpu_reset(void *opaque)
env->spr[SPR_HIOR] = 0;
ppc_hash64_set_external_hpt(cpu, spapr->htab, spapr->htab_shift,
&error_fatal);
/*
* This is a hack for the benefit of KVM PR - it abuses the SDR1
* slot in kvm_sregs to communicate the userspace address of the
* HPT
*/
if (kvm_enabled()) {
env->spr[SPR_SDR1] = (target_ulong)(uintptr_t)spapr->htab
| (spapr->htab_shift - 18);
if (kvmppc_put_books_sregs(cpu) < 0) {
error_report("Unable to update SDR1 in KVM");
exit(1);
}
}
}
static void spapr_cpu_destroy(PowerPCCPU *cpu)
{
sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
xics_cpu_destroy(spapr->xics, cpu);
xics_cpu_destroy(XICS_FABRIC(spapr), cpu);
qemu_unregister_reset(spapr_cpu_reset, cpu);
}
@@ -57,8 +70,7 @@ static void spapr_cpu_init(sPAPRMachineState *spapr, PowerPCCPU *cpu,
cpu_ppc_tb_init(env, SPAPR_TIMEBASE_FREQ);
/* Enable PAPR mode in TCG or KVM */
cpu_ppc_set_vhyp(cpu, PPC_VIRTUAL_HYPERVISOR(spapr));
cpu_ppc_set_papr(cpu);
cpu_ppc_set_papr(cpu, PPC_VIRTUAL_HYPERVISOR(spapr));
if (cpu->max_compat) {
Error *local_err = NULL;
@@ -76,7 +88,7 @@ static void spapr_cpu_init(sPAPRMachineState *spapr, PowerPCCPU *cpu,
cs->numa_node = i;
}
xics_cpu_setup(spapr->xics, cpu);
xics_cpu_setup(XICS_FABRIC(spapr), cpu);
qemu_register_reset(spapr_cpu_reset, cpu);
spapr_cpu_reset(cpu);


@@ -481,7 +481,7 @@ static void spapr_powerdown_req(Notifier *n, void *opaque)
rtas_event_log_queue(RTAS_LOG_TYPE_EPOW, new_epow, true);
qemu_irq_pulse(xics_get_qirq(spapr->xics,
qemu_irq_pulse(xics_get_qirq(XICS_FABRIC(spapr),
rtas_event_log_to_irq(spapr,
RTAS_LOG_TYPE_EPOW)));
}
@@ -574,7 +574,7 @@ static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t hp_action,
rtas_event_log_queue(RTAS_LOG_TYPE_HOTPLUG, new_hp, true);
qemu_irq_pulse(xics_get_qirq(spapr->xics,
qemu_irq_pulse(xics_get_qirq(XICS_FABRIC(spapr),
rtas_event_log_to_irq(spapr,
RTAS_LOG_TYPE_HOTPLUG)));
}
@@ -695,7 +695,7 @@ static void check_exception(PowerPCCPU *cpu, sPAPRMachineState *spapr,
spapr_event_sources_get_source(spapr->event_sources, i);
g_assert(source->enabled);
qemu_irq_pulse(xics_get_qirq(spapr->xics, source->irq));
qemu_irq_pulse(xics_get_qirq(XICS_FABRIC(spapr), source->irq));
}
}
@@ -752,7 +752,7 @@ void spapr_events_init(sPAPRMachineState *spapr)
spapr->event_sources = spapr_event_sources_new();
spapr_event_sources_register(spapr->event_sources, EVENT_CLASS_EPOW,
xics_spapr_alloc(spapr->xics, 0, false,
spapr_ics_alloc(spapr->ics, 0, false,
&error_fatal));
/* NOTE: if machine supports modern/dedicated hotplug event source,
@@ -765,7 +765,7 @@ void spapr_events_init(sPAPRMachineState *spapr)
*/
if (spapr->use_hotplug_event_source) {
spapr_event_sources_register(spapr->event_sources, EVENT_CLASS_HOT_PLUG,
xics_spapr_alloc(spapr->xics, 0, false,
spapr_ics_alloc(spapr->ics, 0, false,
&error_fatal));
}


@@ -47,12 +47,12 @@ static bool has_spr(PowerPCCPU *cpu, int spr)
return cpu->env.spr_cb[spr].name != NULL;
}
static inline bool valid_pte_index(CPUPPCState *env, target_ulong pte_index)
static inline bool valid_ptex(PowerPCCPU *cpu, target_ulong ptex)
{
/*
* hash value/pteg group index is normalized by htab_mask
* hash value/pteg group index is normalized by HPT mask
*/
if (((pte_index & ~7ULL) / HPTES_PER_GROUP) & ~env->htab_mask) {
if (((ptex & ~7ULL) / HPTES_PER_GROUP) & ~ppc_hash64_hpt_mask(cpu)) {
return false;
}
return true;
@@ -77,15 +77,14 @@ static bool is_ram_address(sPAPRMachineState *spapr, hwaddr addr)
static target_ulong h_enter(PowerPCCPU *cpu, sPAPRMachineState *spapr,
target_ulong opcode, target_ulong *args)
{
CPUPPCState *env = &cpu->env;
target_ulong flags = args[0];
target_ulong pte_index = args[1];
target_ulong ptex = args[1];
target_ulong pteh = args[2];
target_ulong ptel = args[3];
unsigned apshift;
target_ulong raddr;
target_ulong index;
uint64_t token;
target_ulong slot;
const ppc_hash_pte64_t *hptes;
apshift = ppc_hash64_hpte_page_shift_noslb(cpu, pteh, ptel);
if (!apshift) {
@@ -116,36 +115,36 @@ static target_ulong h_enter(PowerPCCPU *cpu, sPAPRMachineState *spapr,
pteh &= ~0x60ULL;
if (!valid_pte_index(env, pte_index)) {
if (!valid_ptex(cpu, ptex)) {
return H_PARAMETER;
}
index = 0;
slot = ptex & 7ULL;
ptex = ptex & ~7ULL;
if (likely((flags & H_EXACT) == 0)) {
pte_index &= ~7ULL;
token = ppc_hash64_start_access(cpu, pte_index);
for (; index < 8; index++) {
if (!(ppc_hash64_load_hpte0(cpu, token, index) & HPTE64_V_VALID)) {
hptes = ppc_hash64_map_hptes(cpu, ptex, HPTES_PER_GROUP);
for (slot = 0; slot < 8; slot++) {
if (!(ppc_hash64_hpte0(cpu, hptes, slot) & HPTE64_V_VALID)) {
break;
}
}
ppc_hash64_stop_access(cpu, token);
if (index == 8) {
ppc_hash64_unmap_hptes(cpu, hptes, ptex, HPTES_PER_GROUP);
if (slot == 8) {
return H_PTEG_FULL;
}
} else {
token = ppc_hash64_start_access(cpu, pte_index);
if (ppc_hash64_load_hpte0(cpu, token, 0) & HPTE64_V_VALID) {
ppc_hash64_stop_access(cpu, token);
hptes = ppc_hash64_map_hptes(cpu, ptex + slot, 1);
if (ppc_hash64_hpte0(cpu, hptes, 0) & HPTE64_V_VALID) {
ppc_hash64_unmap_hptes(cpu, hptes, ptex + slot, 1);
return H_PTEG_FULL;
}
ppc_hash64_stop_access(cpu, token);
ppc_hash64_unmap_hptes(cpu, hptes, ptex, 1);
}
ppc_hash64_store_hpte(cpu, pte_index + index,
pteh | HPTE64_V_HPTE_DIRTY, ptel);
ppc_hash64_store_hpte(cpu, ptex + slot, pteh | HPTE64_V_HPTE_DIRTY, ptel);
args[0] = pte_index + index;
args[0] = ptex + slot;
return H_SUCCESS;
}
@@ -161,18 +160,17 @@ static RemoveResult remove_hpte(PowerPCCPU *cpu, target_ulong ptex,
target_ulong flags,
target_ulong *vp, target_ulong *rp)
{
CPUPPCState *env = &cpu->env;
uint64_t token;
const ppc_hash_pte64_t *hptes;
target_ulong v, r;
if (!valid_pte_index(env, ptex)) {
if (!valid_ptex(cpu, ptex)) {
return REMOVE_PARM;
}
token = ppc_hash64_start_access(cpu, ptex);
v = ppc_hash64_load_hpte0(cpu, token, 0);
r = ppc_hash64_load_hpte1(cpu, token, 0);
ppc_hash64_stop_access(cpu, token);
hptes = ppc_hash64_map_hptes(cpu, ptex, 1);
v = ppc_hash64_hpte0(cpu, hptes, 0);
r = ppc_hash64_hpte1(cpu, hptes, 0);
ppc_hash64_unmap_hptes(cpu, hptes, ptex, 1);
if ((v & HPTE64_V_VALID) == 0 ||
((flags & H_AVPN) && (v & ~0x7fULL) != avpn) ||
@@ -191,11 +189,11 @@ static target_ulong h_remove(PowerPCCPU *cpu, sPAPRMachineState *spapr,
{
CPUPPCState *env = &cpu->env;
target_ulong flags = args[0];
target_ulong pte_index = args[1];
target_ulong ptex = args[1];
target_ulong avpn = args[2];
RemoveResult ret;
ret = remove_hpte(cpu, pte_index, avpn, flags,
ret = remove_hpte(cpu, ptex, avpn, flags,
&args[0], &args[1]);
switch (ret) {
@@ -291,19 +289,19 @@ static target_ulong h_protect(PowerPCCPU *cpu, sPAPRMachineState *spapr,
{
CPUPPCState *env = &cpu->env;
target_ulong flags = args[0];
target_ulong pte_index = args[1];
target_ulong ptex = args[1];
target_ulong avpn = args[2];
uint64_t token;
const ppc_hash_pte64_t *hptes;
target_ulong v, r;
if (!valid_pte_index(env, pte_index)) {
if (!valid_ptex(cpu, ptex)) {
return H_PARAMETER;
}
token = ppc_hash64_start_access(cpu, pte_index);
v = ppc_hash64_load_hpte0(cpu, token, 0);
r = ppc_hash64_load_hpte1(cpu, token, 0);
ppc_hash64_stop_access(cpu, token);
hptes = ppc_hash64_map_hptes(cpu, ptex, 1);
v = ppc_hash64_hpte0(cpu, hptes, 0);
r = ppc_hash64_hpte1(cpu, hptes, 0);
ppc_hash64_unmap_hptes(cpu, hptes, ptex, 1);
if ((v & HPTE64_V_VALID) == 0 ||
((flags & H_AVPN) && (v & ~0x7fULL) != avpn)) {
@@ -315,36 +313,35 @@ static target_ulong h_protect(PowerPCCPU *cpu, sPAPRMachineState *spapr,
r |= (flags << 55) & HPTE64_R_PP0;
r |= (flags << 48) & HPTE64_R_KEY_HI;
r |= flags & (HPTE64_R_PP | HPTE64_R_N | HPTE64_R_KEY_LO);
ppc_hash64_store_hpte(cpu, pte_index,
ppc_hash64_store_hpte(cpu, ptex,
(v & ~HPTE64_V_VALID) | HPTE64_V_HPTE_DIRTY, 0);
ppc_hash64_tlb_flush_hpte(cpu, pte_index, v, r);
ppc_hash64_tlb_flush_hpte(cpu, ptex, v, r);
/* Flush the tlb */
check_tlb_flush(env, true);
/* Don't need a memory barrier, due to qemu's global lock */
ppc_hash64_store_hpte(cpu, pte_index, v | HPTE64_V_HPTE_DIRTY, r);
ppc_hash64_store_hpte(cpu, ptex, v | HPTE64_V_HPTE_DIRTY, r);
return H_SUCCESS;
}
static target_ulong h_read(PowerPCCPU *cpu, sPAPRMachineState *spapr,
target_ulong opcode, target_ulong *args)
{
CPUPPCState *env = &cpu->env;
target_ulong flags = args[0];
target_ulong pte_index = args[1];
target_ulong ptex = args[1];
uint8_t *hpte;
int i, ridx, n_entries = 1;
if (!valid_pte_index(env, pte_index)) {
if (!valid_ptex(cpu, ptex)) {
return H_PARAMETER;
}
if (flags & H_READ_4) {
/* Clear the two low order bits */
pte_index &= ~(3ULL);
ptex &= ~(3ULL);
n_entries = 4;
}
hpte = env->external_htab + (pte_index * HASH_PTE_SIZE_64);
hpte = spapr->htab + (ptex * HASH_PTE_SIZE_64);
for (i = 0, ridx = 0; i < n_entries; i++) {
args[ridx++] = ldq_p(hpte);


@@ -43,6 +43,7 @@
#include "hw/pci/pci_bridge.h"
#include "hw/pci/pci_bus.h"
#include "hw/pci/pci_ids.h"
#include "hw/ppc/spapr_drc.h"
#include "sysemu/device_tree.h"
#include "sysemu/kvm.h"
@@ -325,7 +326,7 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPRMachineState *spapr,
return;
}
xics_spapr_free(spapr->xics, msi->first_irq, msi->num);
spapr_ics_free(spapr->ics, msi->first_irq, msi->num);
if (msi_present(pdev)) {
spapr_msi_setmsg(pdev, 0, false, 0, 0);
}
@@ -363,7 +364,7 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPRMachineState *spapr,
}
/* Allocate MSIs */
irq = xics_spapr_alloc_block(spapr->xics, req_num, false,
irq = spapr_ics_alloc_block(spapr->ics, req_num, false,
ret_intr_type == RTAS_TYPE_MSI, &err);
if (err) {
error_reportf_err(err, "Can't allocate MSIs for device %x: ",
@@ -374,7 +375,7 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPRMachineState *spapr,
/* Release previous MSIs */
if (msi) {
xics_spapr_free(spapr->xics, msi->first_irq, msi->num);
spapr_ics_free(spapr->ics, msi->first_irq, msi->num);
g_hash_table_remove(phb->msi, &config_addr);
}
@@ -736,7 +737,7 @@ static void spapr_msi_write(void *opaque, hwaddr addr,
trace_spapr_pci_msi_write(addr, data, irq);
qemu_irq_pulse(xics_get_qirq(spapr->xics, irq));
qemu_irq_pulse(xics_get_qirq(XICS_FABRIC(spapr), irq));
}
static const MemoryRegionOps spapr_msi_ops = {
@@ -946,6 +947,274 @@ static void populate_resource_props(PCIDevice *d, ResourceProps *rp)
rp->assigned_len = assigned_idx * sizeof(ResourceFields);
}
typedef struct PCIClass PCIClass;
typedef struct PCISubClass PCISubClass;
typedef struct PCIIFace PCIIFace;
struct PCIIFace {
int iface;
const char *name;
};
struct PCISubClass {
int subclass;
const char *name;
const PCIIFace *iface;
};
struct PCIClass {
const char *name;
const PCISubClass *subc;
};
static const PCISubClass undef_subclass[] = {
{ PCI_CLASS_NOT_DEFINED_VGA, "display", NULL },
{ 0xFF, NULL, NULL },
};
static const PCISubClass mass_subclass[] = {
{ PCI_CLASS_STORAGE_SCSI, "scsi", NULL },
{ PCI_CLASS_STORAGE_IDE, "ide", NULL },
{ PCI_CLASS_STORAGE_FLOPPY, "fdc", NULL },
{ PCI_CLASS_STORAGE_IPI, "ipi", NULL },
{ PCI_CLASS_STORAGE_RAID, "raid", NULL },
{ PCI_CLASS_STORAGE_ATA, "ata", NULL },
{ PCI_CLASS_STORAGE_SATA, "sata", NULL },
{ PCI_CLASS_STORAGE_SAS, "sas", NULL },
{ 0xFF, NULL, NULL },
};
static const PCISubClass net_subclass[] = {
{ PCI_CLASS_NETWORK_ETHERNET, "ethernet", NULL },
{ PCI_CLASS_NETWORK_TOKEN_RING, "token-ring", NULL },
{ PCI_CLASS_NETWORK_FDDI, "fddi", NULL },
{ PCI_CLASS_NETWORK_ATM, "atm", NULL },
{ PCI_CLASS_NETWORK_ISDN, "isdn", NULL },
{ PCI_CLASS_NETWORK_WORLDFIP, "worldfip", NULL },
{ PCI_CLASS_NETWORK_PICMG214, "picmg", NULL },
{ 0xFF, NULL, NULL },
};
static const PCISubClass displ_subclass[] = {
{ PCI_CLASS_DISPLAY_VGA, "vga", NULL },
{ PCI_CLASS_DISPLAY_XGA, "xga", NULL },
{ PCI_CLASS_DISPLAY_3D, "3d-controller", NULL },
{ 0xFF, NULL, NULL },
};
static const PCISubClass media_subclass[] = {
{ PCI_CLASS_MULTIMEDIA_VIDEO, "video", NULL },
{ PCI_CLASS_MULTIMEDIA_AUDIO, "sound", NULL },
{ PCI_CLASS_MULTIMEDIA_PHONE, "telephony", NULL },
{ 0xFF, NULL, NULL },
};
static const PCISubClass mem_subclass[] = {
{ PCI_CLASS_MEMORY_RAM, "memory", NULL },
{ PCI_CLASS_MEMORY_FLASH, "flash", NULL },
{ 0xFF, NULL, NULL },
};
static const PCISubClass bridg_subclass[] = {
{ PCI_CLASS_BRIDGE_HOST, "host", NULL },
{ PCI_CLASS_BRIDGE_ISA, "isa", NULL },
{ PCI_CLASS_BRIDGE_EISA, "eisa", NULL },
{ PCI_CLASS_BRIDGE_MC, "mca", NULL },
{ PCI_CLASS_BRIDGE_PCI, "pci", NULL },
{ PCI_CLASS_BRIDGE_PCMCIA, "pcmcia", NULL },
{ PCI_CLASS_BRIDGE_NUBUS, "nubus", NULL },
{ PCI_CLASS_BRIDGE_CARDBUS, "cardbus", NULL },
{ PCI_CLASS_BRIDGE_RACEWAY, "raceway", NULL },
{ PCI_CLASS_BRIDGE_PCI_SEMITP, "semi-transparent-pci", NULL },
{ PCI_CLASS_BRIDGE_IB_PCI, "infiniband", NULL },
{ 0xFF, NULL, NULL },
};
static const PCISubClass comm_subclass[] = {
{ PCI_CLASS_COMMUNICATION_SERIAL, "serial", NULL },
{ PCI_CLASS_COMMUNICATION_PARALLEL, "parallel", NULL },
{ PCI_CLASS_COMMUNICATION_MULTISERIAL, "multiport-serial", NULL },
{ PCI_CLASS_COMMUNICATION_MODEM, "modem", NULL },
{ PCI_CLASS_COMMUNICATION_GPIB, "gpib", NULL },
{ PCI_CLASS_COMMUNICATION_SC, "smart-card", NULL },
{ 0xFF, NULL, NULL, },
};
static const PCIIFace pic_iface[] = {
{ PCI_CLASS_SYSTEM_PIC_IOAPIC, "io-apic" },
{ PCI_CLASS_SYSTEM_PIC_IOXAPIC, "io-xapic" },
{ 0xFF, NULL },
};
static const PCISubClass sys_subclass[] = {
{ PCI_CLASS_SYSTEM_PIC, "interrupt-controller", pic_iface },
{ PCI_CLASS_SYSTEM_DMA, "dma-controller", NULL },
{ PCI_CLASS_SYSTEM_TIMER, "timer", NULL },
{ PCI_CLASS_SYSTEM_RTC, "rtc", NULL },
{ PCI_CLASS_SYSTEM_PCI_HOTPLUG, "hot-plug-controller", NULL },
{ PCI_CLASS_SYSTEM_SDHCI, "sd-host-controller", NULL },
{ 0xFF, NULL, NULL },
};
static const PCISubClass inp_subclass[] = {
{ PCI_CLASS_INPUT_KEYBOARD, "keyboard", NULL },
{ PCI_CLASS_INPUT_PEN, "pen", NULL },
{ PCI_CLASS_INPUT_MOUSE, "mouse", NULL },
{ PCI_CLASS_INPUT_SCANNER, "scanner", NULL },
{ PCI_CLASS_INPUT_GAMEPORT, "gameport", NULL },
{ 0xFF, NULL, NULL },
};
static const PCISubClass dock_subclass[] = {
{ PCI_CLASS_DOCKING_GENERIC, "dock", NULL },
{ 0xFF, NULL, NULL },
};
static const PCISubClass cpu_subclass[] = {
{ PCI_CLASS_PROCESSOR_PENTIUM, "pentium", NULL },
{ PCI_CLASS_PROCESSOR_POWERPC, "powerpc", NULL },
{ PCI_CLASS_PROCESSOR_MIPS, "mips", NULL },
{ PCI_CLASS_PROCESSOR_CO, "co-processor", NULL },
{ 0xFF, NULL, NULL },
};
static const PCIIFace usb_iface[] = {
{ PCI_CLASS_SERIAL_USB_UHCI, "usb-uhci" },
{ PCI_CLASS_SERIAL_USB_OHCI, "usb-ohci", },
{ PCI_CLASS_SERIAL_USB_EHCI, "usb-ehci" },
{ PCI_CLASS_SERIAL_USB_XHCI, "usb-xhci" },
{ PCI_CLASS_SERIAL_USB_UNKNOWN, "usb-unknown" },
{ PCI_CLASS_SERIAL_USB_DEVICE, "usb-device" },
{ 0xFF, NULL },
};
static const PCISubClass ser_subclass[] = {
{ PCI_CLASS_SERIAL_FIREWIRE, "firewire", NULL },
{ PCI_CLASS_SERIAL_ACCESS, "access-bus", NULL },
{ PCI_CLASS_SERIAL_SSA, "ssa", NULL },
{ PCI_CLASS_SERIAL_USB, "usb", usb_iface },
{ PCI_CLASS_SERIAL_FIBER, "fibre-channel", NULL },
{ PCI_CLASS_SERIAL_SMBUS, "smb", NULL },
{ PCI_CLASS_SERIAL_IB, "infiniband", NULL },
{ PCI_CLASS_SERIAL_IPMI, "ipmi", NULL },
{ PCI_CLASS_SERIAL_SERCOS, "sercos", NULL },
{ PCI_CLASS_SERIAL_CANBUS, "canbus", NULL },
{ 0xFF, NULL, NULL },
};
static const PCISubClass wrl_subclass[] = {
{ PCI_CLASS_WIRELESS_IRDA, "irda", NULL },
{ PCI_CLASS_WIRELESS_CIR, "consumer-ir", NULL },
{ PCI_CLASS_WIRELESS_RF_CONTROLLER, "rf-controller", NULL },
{ PCI_CLASS_WIRELESS_BLUETOOTH, "bluetooth", NULL },
{ PCI_CLASS_WIRELESS_BROADBAND, "broadband", NULL },
{ 0xFF, NULL, NULL },
};
static const PCISubClass sat_subclass[] = {
{ PCI_CLASS_SATELLITE_TV, "satellite-tv", NULL },
{ PCI_CLASS_SATELLITE_AUDIO, "satellite-audio", NULL },
{ PCI_CLASS_SATELLITE_VOICE, "satellite-voice", NULL },
{ PCI_CLASS_SATELLITE_DATA, "satellite-data", NULL },
{ 0xFF, NULL, NULL },
};
static const PCISubClass crypt_subclass[] = {
{ PCI_CLASS_CRYPT_NETWORK, "network-encryption", NULL },
{ PCI_CLASS_CRYPT_ENTERTAINMENT,
"entertainment-encryption", NULL },
{ 0xFF, NULL, NULL },
};
static const PCISubClass spc_subclass[] = {
{ PCI_CLASS_SP_DPIO, "dpio", NULL },
{ PCI_CLASS_SP_PERF, "counter", NULL },
{ PCI_CLASS_SP_SYNCH, "measurement", NULL },
{ PCI_CLASS_SP_MANAGEMENT, "management-card", NULL },
{ 0xFF, NULL, NULL },
};
static const PCIClass pci_classes[] = {
{ "legacy-device", undef_subclass },
{ "mass-storage", mass_subclass },
{ "network", net_subclass },
{ "display", displ_subclass, },
{ "multimedia-device", media_subclass },
{ "memory-controller", mem_subclass },
{ "unknown-bridge", bridg_subclass },
{ "communication-controller", comm_subclass},
{ "system-peripheral", sys_subclass },
{ "input-controller", inp_subclass },
{ "docking-station", dock_subclass },
{ "cpu", cpu_subclass },
{ "serial-bus", ser_subclass },
{ "wireless-controller", wrl_subclass },
{ "intelligent-io", NULL },
{ "satellite-device", sat_subclass },
{ "encryption", crypt_subclass },
{ "data-processing-controller", spc_subclass },
};
static const char *pci_find_device_name(uint8_t class, uint8_t subclass,
uint8_t iface)
{
const PCIClass *pclass;
const PCISubClass *psubclass;
const PCIIFace *piface;
const char *name;
if (class >= ARRAY_SIZE(pci_classes)) {
return "pci";
}
pclass = pci_classes + class;
name = pclass->name;
if (pclass->subc == NULL) {
return name;
}
psubclass = pclass->subc;
while ((psubclass->subclass & 0xff) != 0xff) {
if ((psubclass->subclass & 0xff) == subclass) {
name = psubclass->name;
break;
}
psubclass++;
}
piface = psubclass->iface;
if (piface == NULL) {
return name;
}
while ((piface->iface & 0xff) != 0xff) {
if ((piface->iface & 0xff) == iface) {
name = piface->name;
break;
}
piface++;
}
return name;
}
static void pci_get_node_name(char *nodename, int len, PCIDevice *dev)
{
int slot = PCI_SLOT(dev->devfn);
int func = PCI_FUNC(dev->devfn);
uint32_t ccode = pci_default_read_config(dev, PCI_CLASS_PROG, 3);
const char *name;
name = pci_find_device_name((ccode >> 16) & 0xff, (ccode >> 8) & 0xff,
ccode & 0xff);
if (func != 0) {
snprintf(nodename, len, "%s@%x,%x", name, slot, func);
} else {
snprintf(nodename, len, "%s@%x", name, slot);
}
}
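/* Illustration (hypothetical values, not part of the patch): with these
 * helpers, an ethernet adapter at slot 5, function 0 (class code 0x020000)
 * is named "ethernet@5" instead of the previous generic "pci@5"; the same
 * adapter at function 1 would be "ethernet@5,1", and a class code outside
 * the pci_classes table falls back to plain "pci". */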
static uint32_t spapr_phb_get_pci_drc_index(sPAPRPHBState *phb,
PCIDevice *pdev);
@@ -957,6 +1226,7 @@ static int spapr_populate_pci_child_dt(PCIDevice *dev, void *fdt, int offset,
int pci_status, err;
char *buf = NULL;
uint32_t drc_index = spapr_phb_get_pci_drc_index(sphb, dev);
uint32_t ccode = pci_default_read_config(dev, PCI_CLASS_PROG, 3);
uint32_t max_msi, max_msix;
if (pci_default_read_config(dev, PCI_HEADER_TYPE, 1) ==
@@ -971,8 +1241,7 @@ static int spapr_populate_pci_child_dt(PCIDevice *dev, void *fdt, int offset,
pci_default_read_config(dev, PCI_DEVICE_ID, 2)));
_FDT(fdt_setprop_cell(fdt, offset, "revision-id",
pci_default_read_config(dev, PCI_REVISION_ID, 1)));
_FDT(fdt_setprop_cell(fdt, offset, "class-code",
pci_default_read_config(dev, PCI_CLASS_PROG, 3)));
_FDT(fdt_setprop_cell(fdt, offset, "class-code", ccode));
if (pci_default_read_config(dev, PCI_INTERRUPT_PIN, 1)) {
_FDT(fdt_setprop_cell(fdt, offset, "interrupts",
pci_default_read_config(dev, PCI_INTERRUPT_PIN, 1)));
@@ -1013,11 +1282,10 @@ static int spapr_populate_pci_child_dt(PCIDevice *dev, void *fdt, int offset,
_FDT(fdt_setprop(fdt, offset, "udf-supported", NULL, 0));
}
/* NOTE: this is normally generated by firmware via path/unit name,
* but in our case we must set it manually since it does not get
* processed by OF beforehand
*/
_FDT(fdt_setprop_string(fdt, offset, "name", "pci"));
_FDT(fdt_setprop_string(fdt, offset, "name",
pci_find_device_name((ccode >> 16) & 0xff,
(ccode >> 8) & 0xff,
ccode & 0xff)));
buf = spapr_phb_get_loc_code(sphb, dev);
if (!buf) {
error_report("Failed setting the ibm,loc-code");
@@ -1061,15 +1329,9 @@ static int spapr_create_pci_child_dt(sPAPRPHBState *phb, PCIDevice *dev,
void *fdt, int node_offset)
{
int offset, ret;
- int slot = PCI_SLOT(dev->devfn);
- int func = PCI_FUNC(dev->devfn);
char nodename[FDT_NAME_MAX];
- if (func != 0) {
- snprintf(nodename, FDT_NAME_MAX, "pci@%x,%x", slot, func);
- } else {
- snprintf(nodename, FDT_NAME_MAX, "pci@%x", slot);
- }
+ pci_get_node_name(nodename, FDT_NAME_MAX, dev);
offset = fdt_add_subnode(fdt, node_offset, nodename);
ret = spapr_populate_pci_child_dt(dev, fdt, offset, phb);
@@ -1485,7 +1747,7 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
uint32_t irq;
Error *local_err = NULL;
- irq = xics_spapr_alloc_block(spapr->xics, 1, true, false, &local_err);
+ irq = spapr_ics_alloc_block(spapr->ics, 1, true, false, &local_err);
if (local_err) {
error_propagate(errp, local_err);
error_prepend(errp, "can't allocate LSIs: ");
@@ -1782,9 +2044,9 @@ static void spapr_populate_pci_devices_dt(PCIBus *bus, PCIDevice *pdev,
s_fdt.fdt = p->fdt;
s_fdt.node_off = offset;
s_fdt.sphb = p->sphb;
- pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
- spapr_populate_pci_devices_dt,
- &s_fdt);
+ pci_for_each_device_reverse(sec_bus, pci_bus_num(sec_bus),
+ spapr_populate_pci_devices_dt,
+ &s_fdt);
}
static void spapr_phb_pci_enumerate_bridge(PCIBus *bus, PCIDevice *pdev,
@@ -1953,9 +2215,9 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
s_fdt.fdt = fdt;
s_fdt.node_off = bus_off;
s_fdt.sphb = phb;
- pci_for_each_device(bus, pci_bus_num(bus),
- spapr_populate_pci_devices_dt,
- &s_fdt);
+ pci_for_each_device_reverse(bus, pci_bus_num(bus),
+ spapr_populate_pci_devices_dt,
+ &s_fdt);
ret = spapr_drc_populate_dt(fdt, bus_off, OBJECT(phb),
SPAPR_DR_CONNECTOR_TYPE_PCI);


@@ -454,7 +454,7 @@ static void spapr_vio_busdev_realize(DeviceState *qdev, Error **errp)
dev->qdev.id = id;
}
- dev->irq = xics_spapr_alloc(spapr->xics, dev->irq, false, &local_err);
+ dev->irq = spapr_ics_alloc(spapr->ics, dev->irq, false, &local_err);
if (local_err) {
error_propagate(errp, local_err);
return;


@@ -20,6 +20,7 @@
#include "hw/s390x/virtio-ccw.h"
#include "hw/s390x/css.h"
#include "ipl.h"
#include "qemu/error-report.h"
#define KERN_IMAGE_START 0x010000UL
#define KERN_PARM_AREA 0x010480UL
@@ -209,6 +210,7 @@ static Property s390_ipl_properties[] = {
DEFINE_PROP_STRING("initrd", S390IPLState, initrd),
DEFINE_PROP_STRING("cmdline", S390IPLState, cmdline),
DEFINE_PROP_STRING("firmware", S390IPLState, firmware),
DEFINE_PROP_STRING("netboot_fw", S390IPLState, netboot_fw),
DEFINE_PROP_BOOL("enforce_bios", S390IPLState, enforce_bios, false),
DEFINE_PROP_BOOL("iplbext_migration", S390IPLState, iplbext_migration,
true),
@@ -226,6 +228,12 @@ static bool s390_gen_initial_iplb(S390IPLState *ipl)
TYPE_VIRTIO_CCW_DEVICE);
SCSIDevice *sd = (SCSIDevice *) object_dynamic_cast(OBJECT(dev_st),
TYPE_SCSI_DEVICE);
VirtIONet *vn = (VirtIONet *) object_dynamic_cast(OBJECT(dev_st),
TYPE_VIRTIO_NET);
if (vn) {
ipl->netboot = true;
}
if (virtio_ccw_dev) {
CcwDevice *ccw_dev = CCW_DEVICE(virtio_ccw_dev);
@@ -258,12 +266,86 @@ static bool s390_gen_initial_iplb(S390IPLState *ipl)
return false;
}
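/* Load the network boot firmware named by the "netboot_fw" property into
 * guest RAM: try it as an s390 ELF image first, then fall back to loading
 * it as a raw image at KERN_IMAGE_START.  Returns the image size, or a
 * negative value with *errp set on failure. */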
static int load_netboot_image(Error **errp)
{
S390IPLState *ipl = get_ipl_device();
char *netboot_filename;
MemoryRegion *sysmem = get_system_memory();
MemoryRegion *mr = NULL;
void *ram_ptr = NULL;
int img_size = -1;
mr = memory_region_find(sysmem, 0, 1).mr;
if (!mr) {
error_setg(errp, "Failed to find memory region at address 0");
return -1;
}
ram_ptr = memory_region_get_ram_ptr(mr);
if (!ram_ptr) {
error_setg(errp, "No RAM found");
goto unref_mr;
}
netboot_filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, ipl->netboot_fw);
if (netboot_filename == NULL) {
error_setg(errp, "Could not find network bootloader");
goto unref_mr;
}
img_size = load_elf_ram(netboot_filename, NULL, NULL, &ipl->start_addr,
NULL, NULL, 1, EM_S390, 0, 0, NULL, false);
if (img_size < 0) {
img_size = load_image_size(netboot_filename, ram_ptr, ram_size);
ipl->start_addr = KERN_IMAGE_START;
}
if (img_size < 0) {
error_setg(errp, "Failed to load network bootloader");
}
g_free(netboot_filename);
unref_mr:
memory_region_unref(mr);
return img_size;
}
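/* Walk every (cssid, ssid, schid) subchannel looking for the device number
 * named in a CCW-type IPL block, and report whether that device's control
 * unit model identifies it as a virtio-net device. */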
static bool is_virtio_net_device(IplParameterBlock *iplb)
{
uint8_t cssid;
uint8_t ssid;
uint16_t devno;
uint16_t schid;
SubchDev *sch = NULL;
if (iplb->pbt != S390_IPL_TYPE_CCW) {
return false;
}
devno = be16_to_cpu(iplb->ccw.devno);
ssid = iplb->ccw.ssid & 3;
for (schid = 0; schid < MAX_SCHID; schid++) {
for (cssid = 0; cssid < MAX_CSSID; cssid++) {
sch = css_find_subch(1, cssid, ssid, schid);
if (sch && sch->devno == devno) {
return sch->id.cu_model == VIRTIO_ID_NET;
}
}
}
return false;
}
void s390_ipl_update_diag308(IplParameterBlock *iplb)
{
S390IPLState *ipl = get_ipl_device();
ipl->iplb = *iplb;
ipl->iplb_valid = true;
ipl->netboot = is_virtio_net_device(iplb);
}
IplParameterBlock *s390_ipl_get_iplb(void)
@@ -287,6 +369,7 @@ void s390_reipl_request(void)
void s390_ipl_prepare_cpu(S390CPU *cpu)
{
S390IPLState *ipl = get_ipl_device();
Error *err = NULL;
cpu->env.psw.addr = ipl->start_addr;
cpu->env.psw.mask = IPL_PSW_MASK;
@@ -297,6 +380,13 @@ void s390_ipl_prepare_cpu(S390CPU *cpu)
ipl->iplb_valid = s390_gen_initial_iplb(ipl);
}
}
if (ipl->netboot) {
if (load_netboot_image(&err) < 0) {
error_report_err(err);
vm_stop(RUN_STATE_INTERNAL_ERROR);
}
ipl->iplb.ccw.netboot_start_addr = ipl->start_addr;
}
}
static void s390_ipl_reset(DeviceState *dev)


@@ -16,7 +16,8 @@
#include "cpu.h"
struct IplBlockCcw {
- uint8_t reserved0[85];
+ uint64_t netboot_start_addr;
+ uint8_t reserved0[77];
uint8_t ssid;
uint16_t devno;
uint8_t vm_flags;
@@ -100,12 +101,14 @@ struct S390IPLState {
IplParameterBlock iplb;
bool iplb_valid;
bool reipl_requested;
bool netboot;
/*< public >*/
char *kernel;
char *initrd;
char *cmdline;
char *firmware;
char *netboot_fw;
uint8_t cssid;
uint8_t ssid;
uint16_t devno;


@@ -116,7 +116,8 @@ static void ccw_init(MachineState *machine)
/* get a BUS */
css_bus = virtual_css_bus_init();
s390_init_ipl_dev(machine->kernel_filename, machine->kernel_cmdline,
machine->initrd_filename, "s390-ccw.img", true);
machine->initrd_filename, "s390-ccw.img",
"s390-netboot.img", true);
s390_flic_init();
dev = qdev_create(NULL, TYPE_S390_PCI_HOST_BRIDGE);


@@ -65,6 +65,7 @@ void s390_init_ipl_dev(const char *kernel_filename,
const char *kernel_cmdline,
const char *initrd_filename,
const char *firmware,
const char *netboot_fw,
bool enforce_bios)
{
Object *new = object_new(TYPE_S390_IPL);
@@ -78,6 +79,7 @@ void s390_init_ipl_dev(const char *kernel_filename,
}
qdev_prop_set_string(dev, "cmdline", kernel_cmdline);
qdev_prop_set_string(dev, "firmware", firmware);
qdev_prop_set_string(dev, "netboot_fw", netboot_fw);
qdev_prop_set_bit(dev, "enforce_bios", enforce_bios);
object_property_add_child(qdev_get_machine(), TYPE_S390_IPL,
new, NULL);


@@ -24,6 +24,7 @@ void s390_init_ipl_dev(const char *kernel_filename,
const char *kernel_cmdline,
const char *initrd_filename,
const char *firmware,
const char *netboot_fw,
bool enforce_bios);
void s390_create_virtio_net(BusState *bus, const char *name);
void s390_nmi(NMIState *n, int cpu_index, Error **errp);


@@ -2240,7 +2240,7 @@ static void scsi_disk_resize_cb(void *opaque)
}
}
- static void scsi_cd_change_media_cb(void *opaque, bool load)
+ static void scsi_cd_change_media_cb(void *opaque, bool load, Error **errp)
{
SCSIDiskState *s = opaque;
@@ -2328,7 +2328,13 @@ static void scsi_realize(SCSIDevice *dev, Error **errp)
return;
}
}
- blkconf_apply_backend_options(&dev->conf);
+ blkconf_apply_backend_options(&dev->conf,
+ blk_is_read_only(s->qdev.conf.blk),
+ dev->type == TYPE_DISK, &err);
+ if (err) {
+ error_propagate(errp, err);
+ return;
+ }
if (s->qdev.conf.discard_granularity == -1) {
s->qdev.conf.discard_granularity =
@@ -2380,7 +2386,7 @@ static void scsi_cd_realize(SCSIDevice *dev, Error **errp)
SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, dev);
if (!dev->conf.blk) {
- dev->conf.blk = blk_new();
+ dev->conf.blk = blk_new(0, BLK_PERM_ALL);
}
s->qdev.blocksize = 2048;


@@ -131,6 +131,33 @@ void sdbus_set_readonly(SDBus *sdbus, bool readonly)
}
}
void sdbus_reparent_card(SDBus *from, SDBus *to)
{
SDState *card = get_card(from);
SDCardClass *sc;
bool readonly;
/* We directly reparent the card object rather than implementing this
* as a hotpluggable connection because we don't want to expose SD cards
* to users as being hotpluggable, and we can get away with it in this
* limited use case. This could perhaps be implemented more cleanly in
* future by adding support to the hotplug infrastructure for "device
* can be hotplugged only via code, not by user".
*/
if (!card) {
return;
}
sc = SD_CARD_GET_CLASS(card);
readonly = sc->get_readonly(card);
sdbus_set_inserted(from, false);
qdev_set_parent_bus(DEVICE(card), &to->qbus);
sdbus_set_inserted(to, true);
sdbus_set_readonly(to, readonly);
}
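/* Hypothetical usage sketch (not part of this change): a board model with
 * two SD controllers whose pins can be re-muxed at runtime might move the
 * inserted card between them with something like
 *     sdbus_reparent_card(&old_controller->sdbus, &new_controller->sdbus);
 * which also carries the card's read-only state over to the destination
 * bus. */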
static const TypeInfo sd_bus_info = {
.name = TYPE_SD_BUS,
.parent = TYPE_BUS,


@@ -458,7 +458,7 @@ static bool sd_get_readonly(SDState *sd)
return sd->wp_switch;
}
- static void sd_cardchange(void *opaque, bool load)
+ static void sd_cardchange(void *opaque, bool load, Error **errp)
{
SDState *sd = opaque;
DeviceState *dev = DEVICE(sd);
@@ -1887,6 +1887,7 @@ static void sd_instance_finalize(Object *obj)
static void sd_realize(DeviceState *dev, Error **errp)
{
SDState *sd = SD_CARD(dev);
int ret;
if (sd->blk && blk_is_read_only(sd->blk)) {
error_setg(errp, "Cannot use read-only drive as SD card");
@@ -1894,6 +1895,11 @@ static void sd_realize(DeviceState *dev, Error **errp)
}
if (sd->blk) {
ret = blk_set_perm(sd->blk, BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE,
BLK_PERM_ALL, errp);
if (ret < 0) {
return;
}
blk_set_dev_ops(sd->blk, &sd_block_ops, sd);
}
}


@@ -1,5 +1,6 @@
common-obj-$(CONFIG_ARM_TIMER) += arm_timer.o
common-obj-$(CONFIG_ARM_MPTIMER) += arm_mptimer.o
common-obj-$(CONFIG_ARM_V7M) += armv7m_systick.o
common-obj-$(CONFIG_A9_GTIMER) += a9gtimer.o
common-obj-$(CONFIG_CADENCE) += cadence_ttc.o
common-obj-$(CONFIG_DS1338) += ds1338.o
Some files were not shown because too many files have changed in this diff.