MAINTAINERS: add self as virtio co-maintainer

This will help make sure I get Cc'd on patches. Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
q35: document gigabyte_align
2014-01-26 13:11:45 +02:00 · 2014-01-26 13:11:45 +02:00 · 2014-01-26 13:11:45 +02:00 · 2014-01-26 13:11:45 +02:00 · 2014-01-26 13:11:45 +02:00 · 2014-01-26 13:11:45 +02:00
146 changed files with 6037 additions and 1265 deletions
--- a/1
+++ b/1
@@ -610,6 +610,7 @@ F: hw/*/*vhost*

 virtio
 M: Anthony Liguori <aliguori@amazon.com>
+M: Michael S. Tsirkin <mst@redhat.com>
 S: Supported
 F: hw/*/virtio*

--- a/Makefile.objs
+++ b/Makefile.objs
@@ -43,7 +43,6 @@ libcacard-y += libcacard/vcardt.o
 ifeq ($(CONFIG_SOFTMMU),y)
 common-obj-y = $(block-obj-y) blockdev.o blockdev-nbd.o block/
 common-obj-y += net/
-common-obj-y += readline.o
 common-obj-y += qdev-monitor.o device-hotplug.o
 common-obj-$(CONFIG_WIN32) += os-win32.o
 common-obj-$(CONFIG_POSIX) += os-posix.o
--- a/block.c
+++ b/block.c
--- a/block/backup.c
+++ b/block/backup.c
@@ -181,8 +181,13 @@ static int coroutine_fn backup_before_write_notify(
        void *opaque)
 {
    BdrvTrackedRequest *req = opaque;
+    int64_t sector_num = req->offset >> BDRV_SECTOR_BITS;
+    int nb_sectors = req->bytes >> BDRV_SECTOR_BITS;

-    return backup_do_cow(req->bs, req->sector_num, req->nb_sectors, NULL);
+    assert((req->offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert((req->bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+
+    return backup_do_cow(req->bs, sector_num, nb_sectors, NULL);
 }

 static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -186,6 +186,14 @@ static const char *event_names[BLKDBG_EVENT_MAX] = {

    [BLKDBG_FLUSH_TO_OS]                    = "flush_to_os",
    [BLKDBG_FLUSH_TO_DISK]                  = "flush_to_disk",
+
+    [BLKDBG_PWRITEV_RMW_HEAD]               = "pwritev_rmw.head",
+    [BLKDBG_PWRITEV_RMW_AFTER_HEAD]         = "pwritev_rmw.after_head",
+    [BLKDBG_PWRITEV_RMW_TAIL]               = "pwritev_rmw.tail",
+    [BLKDBG_PWRITEV_RMW_AFTER_TAIL]         = "pwritev_rmw.after_tail",
+    [BLKDBG_PWRITEV]                        = "pwritev",
+    [BLKDBG_PWRITEV_ZERO]                   = "pwritev_zero",
+    [BLKDBG_PWRITEV_DONE]                   = "pwritev_done",
 };

 static int get_event_by_name(const char *name, BlkDebugEvent *event)
@@ -271,19 +279,33 @@ static void remove_rule(BlkdebugRule *rule)
    g_free(rule);
 }

-static int read_config(BDRVBlkdebugState *s, const char *filename)
+static int read_config(BDRVBlkdebugState *s, const char *filename,
+                       QDict *options, Error **errp)
 {
-    FILE *f;
+    FILE *f = NULL;
    int ret;
    struct add_rule_data d;
+    Error *local_err = NULL;

-    f = fopen(filename, "r");
-    if (f == NULL) {
-        return -errno;
+    if (filename) {
+        f = fopen(filename, "r");
+        if (f == NULL) {
+            error_setg_errno(errp, errno, "Could not read blkdebug config file");
+            return -errno;
+        }
+
+        ret = qemu_config_parse(f, config_groups, filename);
+        if (ret < 0) {
+            error_setg(errp, "Could not parse blkdebug config file");
+            ret = -EINVAL;
+            goto fail;
+        }
    }

-    ret = qemu_config_parse(f, config_groups, filename);
-    if (ret < 0) {
+    qemu_config_parse_qdict(options, config_groups, &local_err);
+    if (error_is_set(&local_err)) {
+        error_propagate(errp, local_err);
+        ret = -EINVAL;
        goto fail;
    }

@@ -298,7 +320,9 @@ static int read_config(BDRVBlkdebugState *s, const char *filename)
 fail:
    qemu_opts_reset(&inject_error_opts);
    qemu_opts_reset(&set_state_opts);
-    fclose(f);
+    if (f) {
+        fclose(f);
+    }
    return ret;
 }

@@ -310,7 +334,9 @@ static void blkdebug_parse_filename(const char *filename, QDict *options,

    /* Parse the blkdebug: prefix */
    if (!strstart(filename, "blkdebug:", &filename)) {
-        error_setg(errp, "File name string must start with 'blkdebug:'");
+        /* There was no prefix; therefore, all options have to be already
+           present in the QDict (except for the filename) */
+        qdict_put(options, "x-image", qstring_from_str(filename));
        return;
    }

@@ -346,6 +372,11 @@ static QemuOptsList runtime_opts = {
            .type = QEMU_OPT_STRING,
            .help = "[internal use only, will be removed]",
        },
+        {
+            .name = "align",
+            .type = QEMU_OPT_SIZE,
+            .help = "Required alignment in bytes",
+        },
        { /* end of list */ }
    },
 };
@@ -356,7 +387,8 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
    BDRVBlkdebugState *s = bs->opaque;
    QemuOpts *opts;
    Error *local_err = NULL;
-    const char *filename, *config;
+    const char *config;
+    uint64_t align;
    int ret;

    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
@@ -367,30 +399,31 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
        goto fail;
    }

-    /* Read rules from config file */
+    /* Read rules from config file or command line options */
    config = qemu_opt_get(opts, "config");
-    if (config) {
-        ret = read_config(s, config);
-        if (ret < 0) {
-            error_setg_errno(errp, -ret, "Could not read blkdebug config file");
-            goto fail;
-        }
+    ret = read_config(s, config, options, errp);
+    if (ret) {
+        goto fail;
    }

    /* Set initial state */
    s->state = 1;

    /* Open the backing file */
-    filename = qemu_opt_get(opts, "x-image");
-    if (filename == NULL) {
-        error_setg(errp, "Could not retrieve image file name");
-        ret = -EINVAL;
+    ret = bdrv_open_image(&bs->file, qemu_opt_get(opts, "x-image"), options, "image",
+                          flags, true, false, &local_err);
+    if (ret < 0) {
+        error_propagate(errp, local_err);
        goto fail;
    }

-    ret = bdrv_file_open(&bs->file, filename, NULL, flags, &local_err);
-    if (ret < 0) {
-        error_propagate(errp, local_err);
+    /* Set request alignment */
+    align = qemu_opt_get_size(opts, "align", bs->request_alignment);
+    if (align > 0 && align < INT_MAX && !(align & (align - 1))) {
+        bs->request_alignment = align;
+    } else {
+        error_setg(errp, "Invalid alignment");
+        ret = -EINVAL;
        goto fail;
    }

--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -78,7 +78,9 @@ static void blkverify_parse_filename(const char *filename, QDict *options,

    /* Parse the blkverify: prefix */
    if (!strstart(filename, "blkverify:", &filename)) {
-        error_setg(errp, "File name string must start with 'blkverify:'");
+        /* There was no prefix; therefore, all options have to be already
+           present in the QDict (except for the filename) */
+        qdict_put(options, "x-image", qstring_from_str(filename));
        return;
    }

@@ -122,7 +124,6 @@ static int blkverify_open(BlockDriverState *bs, QDict *options, int flags,
    BDRVBlkverifyState *s = bs->opaque;
    QemuOpts *opts;
    Error *local_err = NULL;
-    const char *filename, *raw;
    int ret;

    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
@@ -133,33 +134,19 @@ static int blkverify_open(BlockDriverState *bs, QDict *options, int flags,
        goto fail;
    }

-    /* Parse the raw image filename */
-    raw = qemu_opt_get(opts, "x-raw");
-    if (raw == NULL) {
-        error_setg(errp, "Could not retrieve raw image filename");
-        ret = -EINVAL;
-        goto fail;
-    }
-
-    ret = bdrv_file_open(&bs->file, raw, NULL, flags, &local_err);
+    /* Open the raw file */
+    ret = bdrv_open_image(&bs->file, qemu_opt_get(opts, "x-raw"), options,
+                          "raw", flags, true, false, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto fail;
    }

    /* Open the test file */
-    filename = qemu_opt_get(opts, "x-image");
-    if (filename == NULL) {
-        error_setg(errp, "Could not retrieve test image filename");
-        ret = -EINVAL;
-        goto fail;
-    }
-
-    s->test_file = bdrv_new("");
-    ret = bdrv_open(s->test_file, filename, NULL, flags, NULL, &local_err);
+    ret = bdrv_open_image(&s->test_file, qemu_opt_get(opts, "x-image"), options,
+                          "test", flags, false, false, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
-        bdrv_unref(s->test_file);
        s->test_file = NULL;
        goto fail;
    }
@@ -417,7 +404,7 @@ static BlockDriver bdrv_blkverify = {
    .bdrv_aio_writev        = blkverify_aio_writev,
    .bdrv_aio_flush         = blkverify_aio_flush,

-    .bdrv_check_ext_snapshot = bdrv_check_ext_snapshot_forbidden,
+    .authorizations         = { true, false },
 };

 static void bdrv_blkverify_init(void)
--- a/block/cow.c
+++ b/block/cow.c
@@ -351,7 +351,8 @@ static int cow_create(const char *filename, QEMUOptionParameter *options,
        return ret;
    }

-    ret = bdrv_file_open(&cow_bs, filename, NULL, BDRV_O_RDWR, &local_err);
+    ret = bdrv_file_open(&cow_bs, filename, NULL, NULL, BDRV_O_RDWR,
+                         &local_err);
    if (ret < 0) {
        qerror_report_err(local_err);
        error_free(local_err);
--- a/block/curl.c
+++ b/block/curl.c
@@ -34,6 +34,11 @@
 #define DPRINTF(fmt, ...) do { } while (0)
 #endif

+#if LIBCURL_VERSION_NUM >= 0x071000
+/* The multi interface timer callback was introduced in 7.16.0 */
+#define NEED_CURL_TIMER_CALLBACK
+#endif
+
 #define PROTOCOLS (CURLPROTO_HTTP | CURLPROTO_HTTPS | \
                   CURLPROTO_FTP | CURLPROTO_FTPS | \
                   CURLPROTO_TFTP)
@@ -77,6 +82,7 @@ typedef struct CURLState

 typedef struct BDRVCURLState {
    CURLM *multi;
+    QEMUTimer timer;
    size_t len;
    CURLState states[CURL_NUM_STATES];
    char *url;
@@ -87,6 +93,23 @@ typedef struct BDRVCURLState {
 static void curl_clean_state(CURLState *s);
 static void curl_multi_do(void *arg);

+#ifdef NEED_CURL_TIMER_CALLBACK
+static int curl_timer_cb(CURLM *multi, long timeout_ms, void *opaque)
+{
+    BDRVCURLState *s = opaque;
+
+    DPRINTF("CURL: timer callback timeout_ms %ld\n", timeout_ms);
+    if (timeout_ms == -1) {
+        timer_del(&s->timer);
+    } else {
+        int64_t timeout_ns = (int64_t)timeout_ms * 1000 * 1000;
+        timer_mod(&s->timer,
+                  qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ns);
+    }
+    return 0;
+}
+#endif
+
 static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action,
                        void *s, void *sp)
 {
@@ -209,20 +232,10 @@ static int curl_find_buf(BDRVCURLState *s, size_t start, size_t len,
    return FIND_RET_NONE;
 }

-static void curl_multi_do(void *arg)
+static void curl_multi_read(BDRVCURLState *s)
 {
-    BDRVCURLState *s = (BDRVCURLState *)arg;
-    int running;
-    int r;
    int msgs_in_queue;

-    if (!s->multi)
-        return;
-
-    do {
-        r = curl_multi_socket_all(s->multi, &running);
-    } while(r == CURLM_CALL_MULTI_PERFORM);
-
    /* Try to find done transfers, so we can free the easy
     * handle again. */
    do {
@@ -266,6 +279,41 @@ static void curl_multi_do(void *arg)
    } while(msgs_in_queue);
 }

+static void curl_multi_do(void *arg)
+{
+    BDRVCURLState *s = (BDRVCURLState *)arg;
+    int running;
+    int r;
+
+    if (!s->multi) {
+        return;
+    }
+
+    do {
+        r = curl_multi_socket_all(s->multi, &running);
+    } while(r == CURLM_CALL_MULTI_PERFORM);
+
+    curl_multi_read(s);
+}
+
+static void curl_multi_timeout_do(void *arg)
+{
+#ifdef NEED_CURL_TIMER_CALLBACK
+    BDRVCURLState *s = (BDRVCURLState *)arg;
+    int running;
+
+    if (!s->multi) {
+        return;
+    }
+
+    curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running);
+
+    curl_multi_read(s);
+#else
+    abort();
+#endif
+}
+
 static CURLState *curl_init_state(BDRVCURLState *s)
 {
    CURLState *state = NULL;
@@ -473,12 +521,20 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags,
    curl_easy_cleanup(state->curl);
    state->curl = NULL;

+    aio_timer_init(bdrv_get_aio_context(bs), &s->timer,
+                   QEMU_CLOCK_REALTIME, SCALE_NS,
+                   curl_multi_timeout_do, s);
+
    // Now we know the file exists and its size, so let's
    // initialize the multi interface!

    s->multi = curl_multi_init();
    curl_multi_setopt(s->multi, CURLMOPT_SOCKETDATA, s);
    curl_multi_setopt(s->multi, CURLMOPT_SOCKETFUNCTION, curl_sock_cb);
+#ifdef NEED_CURL_TIMER_CALLBACK
+    curl_multi_setopt(s->multi, CURLMOPT_TIMERDATA, s);
+    curl_multi_setopt(s->multi, CURLMOPT_TIMERFUNCTION, curl_timer_cb);
+#endif
    curl_multi_do(s);

    qemu_opts_del(opts);
@@ -597,6 +653,9 @@ static void curl_close(BlockDriverState *bs)
    }
    if (s->multi)
        curl_multi_cleanup(s->multi);
+
+    timer_del(&s->timer);
+
    g_free(s->url);
 }

--- a/block/gluster.c
+++ b/block/gluster.c
@@ -21,19 +21,15 @@
 #include "qemu/uri.h"

 typedef struct GlusterAIOCB {
-    BlockDriverAIOCB common;
    int64_t size;
    int ret;
-    bool *finished;
    QEMUBH *bh;
+    Coroutine *coroutine;
 } GlusterAIOCB;

 typedef struct BDRVGlusterState {
    struct glfs *glfs;
-    int fds[2];
    struct glfs_fd *fd;
-    int event_reader_pos;
-    GlusterAIOCB *event_acb;
 } BDRVGlusterState;

 #define GLUSTER_FD_READ  0
@@ -231,46 +227,32 @@ out:
    return NULL;
 }

-static void qemu_gluster_complete_aio(GlusterAIOCB *acb, BDRVGlusterState *s)
+static void qemu_gluster_complete_aio(void *opaque)
 {
-    int ret;
-    bool *finished = acb->finished;
-    BlockDriverCompletionFunc *cb = acb->common.cb;
-    void *opaque = acb->common.opaque;
+    GlusterAIOCB *acb = (GlusterAIOCB *)opaque;

-    if (!acb->ret || acb->ret == acb->size) {
-        ret = 0; /* Success */
-    } else if (acb->ret < 0) {
-        ret = acb->ret; /* Read/Write failed */
-    } else {
-        ret = -EIO; /* Partial read/write - fail it */
-    }
-
-    qemu_aio_release(acb);
-    cb(opaque, ret);
-    if (finished) {
-        *finished = true;
-    }
+    qemu_bh_delete(acb->bh);
+    acb->bh = NULL;
+    qemu_coroutine_enter(acb->coroutine, NULL);
 }

-static void qemu_gluster_aio_event_reader(void *opaque)
+/*
+ * AIO callback routine called from GlusterFS thread.
+ */
+static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
 {
-    BDRVGlusterState *s = opaque;
-    ssize_t ret;
+    GlusterAIOCB *acb = (GlusterAIOCB *)arg;

-    do {
-        char *p = (char *)&s->event_acb;
+    if (!ret || ret == acb->size) {
+        acb->ret = 0; /* Success */
+    } else if (ret < 0) {
+        acb->ret = ret; /* Read/Write failed */
+    } else {
+        acb->ret = -EIO; /* Partial read/write - fail it */
+    }

-        ret = read(s->fds[GLUSTER_FD_READ], p + s->event_reader_pos,
-                   sizeof(s->event_acb) - s->event_reader_pos);
-        if (ret > 0) {
-            s->event_reader_pos += ret;
-            if (s->event_reader_pos == sizeof(s->event_acb)) {
-                s->event_reader_pos = 0;
-                qemu_gluster_complete_aio(s->event_acb, s);
-            }
-        }
-    } while (ret < 0 && errno == EINTR);
+    acb->bh = qemu_bh_new(qemu_gluster_complete_aio, acb);
+    qemu_bh_schedule(acb->bh);
 }

 /* TODO Convert to fine grained options */
@@ -309,7 +291,6 @@ static int qemu_gluster_open(BlockDriverState *bs,  QDict *options,

    filename = qemu_opt_get(opts, "filename");

-
    s->glfs = qemu_gluster_init(gconf, filename);
    if (!s->glfs) {
        ret = -errno;
@@ -329,18 +310,8 @@ static int qemu_gluster_open(BlockDriverState *bs,  QDict *options,
    s->fd = glfs_open(s->glfs, gconf->image, open_flags);
    if (!s->fd) {
        ret = -errno;
-        goto out;
    }

-    ret = qemu_pipe(s->fds);
-    if (ret < 0) {
-        ret = -errno;
-        goto out;
-    }
-    fcntl(s->fds[GLUSTER_FD_READ], F_SETFL, O_NONBLOCK);
-    qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ],
-        qemu_gluster_aio_event_reader, NULL, s);
-
 out:
    qemu_opts_del(opts);
    qemu_gluster_gconf_free(gconf);
@@ -356,12 +327,65 @@ out:
    return ret;
 }

+#ifdef CONFIG_GLUSTERFS_ZEROFILL
+static coroutine_fn int qemu_gluster_co_write_zeroes(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
+{
+    int ret;
+    GlusterAIOCB *acb = g_slice_new(GlusterAIOCB);
+    BDRVGlusterState *s = bs->opaque;
+    off_t size = nb_sectors * BDRV_SECTOR_SIZE;
+    off_t offset = sector_num * BDRV_SECTOR_SIZE;
+
+    acb->size = size;
+    acb->ret = 0;
+    acb->coroutine = qemu_coroutine_self();
+
+    ret = glfs_zerofill_async(s->fd, offset, size, &gluster_finish_aiocb, acb);
+    if (ret < 0) {
+        ret = -errno;
+        goto out;
+    }
+
+    qemu_coroutine_yield();
+    ret = acb->ret;
+
+out:
+    g_slice_free(GlusterAIOCB, acb);
+    return ret;
+}
+
+static inline bool gluster_supports_zerofill(void)
+{
+    return 1;
+}
+
+static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset,
+        int64_t size)
+{
+    return glfs_zerofill(fd, offset, size);
+}
+
+#else
+static inline bool gluster_supports_zerofill(void)
+{
+    return 0;
+}
+
+static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset,
+        int64_t size)
+{
+    return 0;
+}
+#endif
+
 static int qemu_gluster_create(const char *filename,
        QEMUOptionParameter *options, Error **errp)
 {
    struct glfs *glfs;
    struct glfs_fd *fd;
    int ret = 0;
+    int prealloc = 0;
    int64_t total_size = 0;
    GlusterConf *gconf = g_malloc0(sizeof(GlusterConf));

@@ -374,6 +398,19 @@ static int qemu_gluster_create(const char *filename,
    while (options && options->name) {
        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
            total_size = options->value.n / BDRV_SECTOR_SIZE;
+        } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) {
+            if (!options->value.s || !strcmp(options->value.s, "off")) {
+                prealloc = 0;
+            } else if (!strcmp(options->value.s, "full") &&
+                    gluster_supports_zerofill()) {
+                prealloc = 1;
+            } else {
+                error_setg(errp, "Invalid preallocation mode: '%s'"
+                    " or GlusterFS doesn't support zerofill API",
+                           options->value.s);
+                ret = -EINVAL;
+                goto out;
+            }
        }
        options++;
    }
@@ -383,9 +420,15 @@ static int qemu_gluster_create(const char *filename,
    if (!fd) {
        ret = -errno;
    } else {
-        if (glfs_ftruncate(fd, total_size * BDRV_SECTOR_SIZE) != 0) {
+        if (!glfs_ftruncate(fd, total_size * BDRV_SECTOR_SIZE)) {
+            if (prealloc && qemu_gluster_zerofill(fd, 0,
+                    total_size * BDRV_SECTOR_SIZE)) {
+                ret = -errno;
+            }
+        } else {
            ret = -errno;
        }
+
        if (glfs_close(fd) != 0) {
            ret = -errno;
        }
@@ -398,58 +441,18 @@ out:
    return ret;
 }

-static void qemu_gluster_aio_cancel(BlockDriverAIOCB *blockacb)
-{
-    GlusterAIOCB *acb = (GlusterAIOCB *)blockacb;
-    bool finished = false;
-
-    acb->finished = &finished;
-    while (!finished) {
-        qemu_aio_wait();
-    }
-}
-
-static const AIOCBInfo gluster_aiocb_info = {
-    .aiocb_size = sizeof(GlusterAIOCB),
-    .cancel = qemu_gluster_aio_cancel,
-};
-
-static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
-{
-    GlusterAIOCB *acb = (GlusterAIOCB *)arg;
-    BlockDriverState *bs = acb->common.bs;
-    BDRVGlusterState *s = bs->opaque;
-    int retval;
-
-    acb->ret = ret;
-    retval = qemu_write_full(s->fds[GLUSTER_FD_WRITE], &acb, sizeof(acb));
-    if (retval != sizeof(acb)) {
-        /*
-         * Gluster AIO callback thread failed to notify the waiting
-         * QEMU thread about IO completion.
-         */
-        error_report("Gluster AIO completion failed: %s", strerror(errno));
-        abort();
-    }
-}
-
-static BlockDriverAIOCB *qemu_gluster_aio_rw(BlockDriverState *bs,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque, int write)
+static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int write)
 {
    int ret;
-    GlusterAIOCB *acb;
+    GlusterAIOCB *acb = g_slice_new(GlusterAIOCB);
    BDRVGlusterState *s = bs->opaque;
-    size_t size;
-    off_t offset;
+    size_t size = nb_sectors * BDRV_SECTOR_SIZE;
+    off_t offset = sector_num * BDRV_SECTOR_SIZE;

-    offset = sector_num * BDRV_SECTOR_SIZE;
-    size = nb_sectors * BDRV_SECTOR_SIZE;
-
-    acb = qemu_aio_get(&gluster_aiocb_info, bs, cb, opaque);
    acb->size = size;
    acb->ret = 0;
-    acb->finished = NULL;
+    acb->coroutine = qemu_coroutine_self();

    if (write) {
        ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0,
@@ -460,13 +463,16 @@ static BlockDriverAIOCB *qemu_gluster_aio_rw(BlockDriverState *bs,
    }

    if (ret < 0) {
+        ret = -errno;
        goto out;
    }
-    return &acb->common;
+
+    qemu_coroutine_yield();
+    ret = acb->ret;

 out:
-    qemu_aio_release(acb);
-    return NULL;
+    g_slice_free(GlusterAIOCB, acb);
+    return ret;
 }

 static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset)
@@ -482,71 +488,68 @@ static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset)
    return 0;
 }

-static BlockDriverAIOCB *qemu_gluster_aio_readv(BlockDriverState *bs,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+static coroutine_fn int qemu_gluster_co_readv(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
 {
-    return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
+    return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 0);
 }

-static BlockDriverAIOCB *qemu_gluster_aio_writev(BlockDriverState *bs,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+static coroutine_fn int qemu_gluster_co_writev(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
 {
-    return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
+    return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 1);
 }

-static BlockDriverAIOCB *qemu_gluster_aio_flush(BlockDriverState *bs,
-        BlockDriverCompletionFunc *cb, void *opaque)
+static coroutine_fn int qemu_gluster_co_flush_to_disk(BlockDriverState *bs)
 {
    int ret;
-    GlusterAIOCB *acb;
+    GlusterAIOCB *acb = g_slice_new(GlusterAIOCB);
    BDRVGlusterState *s = bs->opaque;

-    acb = qemu_aio_get(&gluster_aiocb_info, bs, cb, opaque);
    acb->size = 0;
    acb->ret = 0;
-    acb->finished = NULL;
+    acb->coroutine = qemu_coroutine_self();

    ret = glfs_fsync_async(s->fd, &gluster_finish_aiocb, acb);
    if (ret < 0) {
+        ret = -errno;
        goto out;
    }
-    return &acb->common;
+
+    qemu_coroutine_yield();
+    ret = acb->ret;

 out:
-    qemu_aio_release(acb);
-    return NULL;
+    g_slice_free(GlusterAIOCB, acb);
+    return ret;
 }

 #ifdef CONFIG_GLUSTERFS_DISCARD
-static BlockDriverAIOCB *qemu_gluster_aio_discard(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, BlockDriverCompletionFunc *cb,
-        void *opaque)
+static coroutine_fn int qemu_gluster_co_discard(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors)
 {
    int ret;
-    GlusterAIOCB *acb;
+    GlusterAIOCB *acb = g_slice_new(GlusterAIOCB);
    BDRVGlusterState *s = bs->opaque;
-    size_t size;
-    off_t offset;
+    size_t size = nb_sectors * BDRV_SECTOR_SIZE;
+    off_t offset = sector_num * BDRV_SECTOR_SIZE;

-    offset = sector_num * BDRV_SECTOR_SIZE;
-    size = nb_sectors * BDRV_SECTOR_SIZE;
-
-    acb = qemu_aio_get(&gluster_aiocb_info, bs, cb, opaque);
    acb->size = 0;
    acb->ret = 0;
-    acb->finished = NULL;
+    acb->coroutine = qemu_coroutine_self();

    ret = glfs_discard_async(s->fd, offset, size, &gluster_finish_aiocb, acb);
    if (ret < 0) {
+        ret = -errno;
        goto out;
    }
-    return &acb->common;
+
+    qemu_coroutine_yield();
+    ret = acb->ret;

 out:
-    qemu_aio_release(acb);
-    return NULL;
+    g_slice_free(GlusterAIOCB, acb);
+    return ret;
 }
 #endif

@@ -581,10 +584,6 @@ static void qemu_gluster_close(BlockDriverState *bs)
 {
    BDRVGlusterState *s = bs->opaque;

-    close(s->fds[GLUSTER_FD_READ]);
-    close(s->fds[GLUSTER_FD_WRITE]);
-    qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ], NULL, NULL, NULL);
-
    if (s->fd) {
        glfs_close(s->fd);
        s->fd = NULL;
@@ -604,6 +603,11 @@ static QEMUOptionParameter qemu_gluster_create_options[] = {
        .type = OPT_SIZE,
        .help = "Virtual disk size"
    },
+    {
+        .name = BLOCK_OPT_PREALLOC,
+        .type = OPT_STRING,
+        .help = "Preallocation mode (allowed values: off, full)"
+    },
    { NULL }
 };

@@ -618,12 +622,15 @@ static BlockDriver bdrv_gluster = {
    .bdrv_getlength               = qemu_gluster_getlength,
    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
    .bdrv_truncate                = qemu_gluster_truncate,
-    .bdrv_aio_readv               = qemu_gluster_aio_readv,
-    .bdrv_aio_writev              = qemu_gluster_aio_writev,
-    .bdrv_aio_flush               = qemu_gluster_aio_flush,
+    .bdrv_co_readv                = qemu_gluster_co_readv,
+    .bdrv_co_writev               = qemu_gluster_co_writev,
+    .bdrv_co_flush_to_disk        = qemu_gluster_co_flush_to_disk,
    .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
 #ifdef CONFIG_GLUSTERFS_DISCARD
-    .bdrv_aio_discard             = qemu_gluster_aio_discard,
+    .bdrv_co_discard              = qemu_gluster_co_discard,
+#endif
+#ifdef CONFIG_GLUSTERFS_ZEROFILL
+    .bdrv_co_write_zeroes         = qemu_gluster_co_write_zeroes,
 #endif
    .create_options               = qemu_gluster_create_options,
 };
@@ -639,12 +646,15 @@ static BlockDriver bdrv_gluster_tcp = {
    .bdrv_getlength               = qemu_gluster_getlength,
    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
    .bdrv_truncate                = qemu_gluster_truncate,
-    .bdrv_aio_readv               = qemu_gluster_aio_readv,
-    .bdrv_aio_writev              = qemu_gluster_aio_writev,
-    .bdrv_aio_flush               = qemu_gluster_aio_flush,
+    .bdrv_co_readv                = qemu_gluster_co_readv,
+    .bdrv_co_writev               = qemu_gluster_co_writev,
+    .bdrv_co_flush_to_disk        = qemu_gluster_co_flush_to_disk,
    .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
 #ifdef CONFIG_GLUSTERFS_DISCARD
-    .bdrv_aio_discard             = qemu_gluster_aio_discard,
+    .bdrv_co_discard              = qemu_gluster_co_discard,
+#endif
+#ifdef CONFIG_GLUSTERFS_ZEROFILL
+    .bdrv_co_write_zeroes         = qemu_gluster_co_write_zeroes,
 #endif
    .create_options               = qemu_gluster_create_options,
 };
@@ -660,12 +670,15 @@ static BlockDriver bdrv_gluster_unix = {
    .bdrv_getlength               = qemu_gluster_getlength,
    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
    .bdrv_truncate                = qemu_gluster_truncate,
-    .bdrv_aio_readv               = qemu_gluster_aio_readv,
-    .bdrv_aio_writev              = qemu_gluster_aio_writev,
-    .bdrv_aio_flush               = qemu_gluster_aio_flush,
+    .bdrv_co_readv                = qemu_gluster_co_readv,
+    .bdrv_co_writev               = qemu_gluster_co_writev,
+    .bdrv_co_flush_to_disk        = qemu_gluster_co_flush_to_disk,
    .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
 #ifdef CONFIG_GLUSTERFS_DISCARD
-    .bdrv_aio_discard             = qemu_gluster_aio_discard,
+    .bdrv_co_discard              = qemu_gluster_co_discard,
+#endif
+#ifdef CONFIG_GLUSTERFS_ZEROFILL
+    .bdrv_co_write_zeroes         = qemu_gluster_co_write_zeroes,
 #endif
    .create_options               = qemu_gluster_create_options,
 };
@@ -681,12 +694,15 @@ static BlockDriver bdrv_gluster_rdma = {
    .bdrv_getlength               = qemu_gluster_getlength,
    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
    .bdrv_truncate                = qemu_gluster_truncate,
-    .bdrv_aio_readv               = qemu_gluster_aio_readv,
-    .bdrv_aio_writev              = qemu_gluster_aio_writev,
-    .bdrv_aio_flush               = qemu_gluster_aio_flush,
+    .bdrv_co_readv                = qemu_gluster_co_readv,
+    .bdrv_co_writev               = qemu_gluster_co_writev,
+    .bdrv_co_flush_to_disk        = qemu_gluster_co_flush_to_disk,
    .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
 #ifdef CONFIG_GLUSTERFS_DISCARD
-    .bdrv_aio_discard             = qemu_gluster_aio_discard,
+    .bdrv_co_discard              = qemu_gluster_co_discard,
+#endif
+#ifdef CONFIG_GLUSTERFS_ZEROFILL
+    .bdrv_co_write_zeroes         = qemu_gluster_co_write_zeroes,
 #endif
    .create_options               = qemu_gluster_create_options,
 };
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -308,7 +308,7 @@ retry:
                                    iscsi_co_generic_cb, &iTask);
    if (iTask.task == NULL) {
        g_free(buf);
-        return -EIO;
+        return -ENOMEM;
    }
 #if defined(LIBISCSI_FEATURE_IOVECTOR)
    scsi_task_set_iov_out(iTask.task, (struct scsi_iovec *) iov->iov,
@@ -376,7 +376,7 @@ retry:
        break;
    }
    if (iTask.task == NULL) {
-        return -EIO;
+        return -ENOMEM;
    }
 #if defined(LIBISCSI_FEATURE_IOVECTOR)
    scsi_task_set_iov_in(iTask.task, (struct scsi_iovec *) iov->iov, iov->niov);
@@ -419,7 +419,7 @@ static int coroutine_fn iscsi_co_flush(BlockDriverState *bs)
 retry:
    if (iscsi_synchronizecache10_task(iscsilun->iscsi, iscsilun->lun, 0, 0, 0,
                                      0, iscsi_co_generic_cb, &iTask) == NULL) {
-        return -EIO;
+        return -ENOMEM;
    }

    while (!iTask.complete) {
@@ -669,7 +669,7 @@ retry:
                                  sector_qemu2lun(sector_num, iscsilun),
                                  8 + 16, iscsi_co_generic_cb,
                                  &iTask) == NULL) {
-        ret = -EIO;
+        ret = -ENOMEM;
        goto out;
    }

@@ -753,7 +753,7 @@ coroutine_fn iscsi_co_discard(BlockDriverState *bs, int64_t sector_num,
 retry:
    if (iscsi_unmap_task(iscsilun->iscsi, iscsilun->lun, 0, 0, &list, 1,
                     iscsi_co_generic_cb, &iTask) == NULL) {
-        return -EIO;
+        return -ENOMEM;
    }

    while (!iTask.complete) {
@@ -822,7 +822,7 @@ retry:
                               iscsilun->zeroblock, iscsilun->block_size,
                               nb_blocks, 0, !!(flags & BDRV_REQ_MAY_UNMAP),
                               0, 0, iscsi_co_generic_cb, &iTask) == NULL) {
-        return -EIO;
+        return -ENOMEM;
    }

    while (!iTask.complete) {
@@ -1217,6 +1217,7 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
        goto out;
    }
    bs->total_sectors = sector_lun2qemu(iscsilun->num_blocks, iscsilun);
+    bs->request_alignment = iscsilun->block_size;

    /* Medium changer or tape. We dont have any emulation for this so this must
     * be sg ioctl compatible. We force it to be sg, otherwise qemu will try
@@ -1265,23 +1266,6 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
               sizeof(struct scsi_inquiry_block_limits));
        scsi_free_scsi_task(task);
        task = NULL;
-
-        if (iscsilun->bl.max_unmap < 0xffffffff) {
-            bs->bl.max_discard = sector_lun2qemu(iscsilun->bl.max_unmap,
-                                                 iscsilun);
-        }
-        bs->bl.discard_alignment = sector_lun2qemu(iscsilun->bl.opt_unmap_gran,
-                                                   iscsilun);
-
-        if (iscsilun->bl.max_ws_len < 0xffffffff) {
-            bs->bl.max_write_zeroes = sector_lun2qemu(iscsilun->bl.max_ws_len,
-                                                      iscsilun);
-        }
-        bs->bl.write_zeroes_alignment = sector_lun2qemu(iscsilun->bl.opt_unmap_gran,
-                                                        iscsilun);
-
-        bs->bl.opt_transfer_length = sector_lun2qemu(iscsilun->bl.opt_xfer_len,
-                                                     iscsilun);
    }

 #if defined(LIBISCSI_FEATURE_NOP_COUNTER)
@@ -1326,6 +1310,41 @@ static void iscsi_close(BlockDriverState *bs)
    memset(iscsilun, 0, sizeof(IscsiLun));
 }

+static int iscsi_refresh_limits(BlockDriverState *bs)
+{
+    IscsiLun *iscsilun = bs->opaque;
+
+    /* We don't actually refresh here, but just return data queried in
+     * iscsi_open(): iscsi targets don't change their limits. */
+    if (iscsilun->lbp.lbpu || iscsilun->lbp.lbpws) {
+        if (iscsilun->bl.max_unmap < 0xffffffff) {
+            bs->bl.max_discard = sector_lun2qemu(iscsilun->bl.max_unmap,
+                                                 iscsilun);
+        }
+        bs->bl.discard_alignment = sector_lun2qemu(iscsilun->bl.opt_unmap_gran,
+                                                   iscsilun);
+
+        if (iscsilun->bl.max_ws_len < 0xffffffff) {
+            bs->bl.max_write_zeroes = sector_lun2qemu(iscsilun->bl.max_ws_len,
+                                                      iscsilun);
+        }
+        bs->bl.write_zeroes_alignment = sector_lun2qemu(iscsilun->bl.opt_unmap_gran,
+                                                        iscsilun);
+
+        bs->bl.opt_transfer_length = sector_lun2qemu(iscsilun->bl.opt_xfer_len,
+                                                     iscsilun);
+    }
+    return 0;
+}
+
+/* We have nothing to do for iSCSI reopen, stub just returns
+ * success */
+static int iscsi_reopen_prepare(BDRVReopenState *state,
+                                BlockReopenQueue *queue, Error **errp)
+{
+    return 0;
+}
+
 static int iscsi_truncate(BlockDriverState *bs, int64_t offset)
 {
    IscsiLun *iscsilun = bs->opaque;
@@ -1434,10 +1453,12 @@ static BlockDriver bdrv_iscsi = {
    .bdrv_close      = iscsi_close,
    .bdrv_create     = iscsi_create,
    .create_options  = iscsi_create_options,
+    .bdrv_reopen_prepare  = iscsi_reopen_prepare,

    .bdrv_getlength  = iscsi_getlength,
    .bdrv_get_info   = iscsi_get_info,
    .bdrv_truncate   = iscsi_truncate,
+    .bdrv_refresh_limits = iscsi_refresh_limits,

 #if defined(LIBISCSI_FEATURE_IOVECTOR)
    .bdrv_co_get_block_status = iscsi_co_get_block_status,
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -96,6 +96,7 @@ static void mirror_iteration_done(MirrorOp *op, int ret)
        bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
    }

+    qemu_iovec_destroy(&op->qiov);
    g_slice_free(MirrorOp, op);
    qemu_coroutine_enter(s->common.co, NULL);
 }
@@ -630,11 +631,49 @@ void commit_active_start(BlockDriverState *bs, BlockDriverState *base,
                         BlockDriverCompletionFunc *cb,
                         void *opaque, Error **errp)
 {
+    int64_t length, base_length;
+    int orig_base_flags;
+
+    orig_base_flags = bdrv_get_flags(base);
+
    if (bdrv_reopen(base, bs->open_flags, errp)) {
        return;
    }
+
+    length = bdrv_getlength(bs);
+    if (length < 0) {
+        error_setg(errp, "Unable to determine length of %s", bs->filename);
+        goto error_restore_flags;
+    }
+
+    base_length = bdrv_getlength(base);
+    if (base_length < 0) {
+        error_setg(errp, "Unable to determine length of %s", base->filename);
+        goto error_restore_flags;
+    }
+
+    if (length > base_length) {
+        if (bdrv_truncate(base, length) < 0) {
+            error_setg(errp, "Top image %s is larger than base image %s, and "
+                             "resize of base image failed",
+                             bs->filename, base->filename);
+            goto error_restore_flags;
+        }
+    }
+
    bdrv_ref(base);
    mirror_start_job(bs, base, speed, 0, 0,
                     on_error, on_error, cb, opaque, errp,
                     &commit_active_job_driver, false, base);
+    if (error_is_set(errp)) {
+        goto error_restore_flags;
+    }
+
+    return;
+
+error_restore_flags:
+    /* ignore error and errp for bdrv_reopen, because we want to propagate
+     * the original error */
+    bdrv_reopen(base, orig_base_flags, NULL);
+    return;
 }
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -29,6 +29,60 @@
 #include "qapi/qmp-output-visitor.h"
 #include "qapi/qmp/types.h"

+BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs)
+{
+    BlockDeviceInfo *info = g_malloc0(sizeof(*info));
+
+    info->file                   = g_strdup(bs->filename);
+    info->ro                     = bs->read_only;
+    info->drv                    = g_strdup(bs->drv->format_name);
+    info->encrypted              = bs->encrypted;
+    info->encryption_key_missing = bdrv_key_required(bs);
+
+    if (bs->node_name[0]) {
+        info->has_node_name = true;
+        info->node_name = g_strdup(bs->node_name);
+    }
+
+    if (bs->backing_file[0]) {
+        info->has_backing_file = true;
+        info->backing_file = g_strdup(bs->backing_file);
+    }
+
+    info->backing_file_depth = bdrv_get_backing_file_depth(bs);
+
+    if (bs->io_limits_enabled) {
+        ThrottleConfig cfg;
+        throttle_get_config(&bs->throttle_state, &cfg);
+        info->bps     = cfg.buckets[THROTTLE_BPS_TOTAL].avg;
+        info->bps_rd  = cfg.buckets[THROTTLE_BPS_READ].avg;
+        info->bps_wr  = cfg.buckets[THROTTLE_BPS_WRITE].avg;
+
+        info->iops    = cfg.buckets[THROTTLE_OPS_TOTAL].avg;
+        info->iops_rd = cfg.buckets[THROTTLE_OPS_READ].avg;
+        info->iops_wr = cfg.buckets[THROTTLE_OPS_WRITE].avg;
+
+        info->has_bps_max     = cfg.buckets[THROTTLE_BPS_TOTAL].max;
+        info->bps_max         = cfg.buckets[THROTTLE_BPS_TOTAL].max;
+        info->has_bps_rd_max  = cfg.buckets[THROTTLE_BPS_READ].max;
+        info->bps_rd_max      = cfg.buckets[THROTTLE_BPS_READ].max;
+        info->has_bps_wr_max  = cfg.buckets[THROTTLE_BPS_WRITE].max;
+        info->bps_wr_max      = cfg.buckets[THROTTLE_BPS_WRITE].max;
+
+        info->has_iops_max    = cfg.buckets[THROTTLE_OPS_TOTAL].max;
+        info->iops_max        = cfg.buckets[THROTTLE_OPS_TOTAL].max;
+        info->has_iops_rd_max = cfg.buckets[THROTTLE_OPS_READ].max;
+        info->iops_rd_max     = cfg.buckets[THROTTLE_OPS_READ].max;
+        info->has_iops_wr_max = cfg.buckets[THROTTLE_OPS_WRITE].max;
+        info->iops_wr_max     = cfg.buckets[THROTTLE_OPS_WRITE].max;
+
+        info->has_iops_size = cfg.op_size;
+        info->iops_size = cfg.op_size;
+    }
+
+    return info;
+}
+
 /*
 * Returns 0 on success, with *p_list either set to describe snapshot
 * information, or NULL because there are no snapshots.  Returns -errno on
@@ -211,60 +265,7 @@ void bdrv_query_info(BlockDriverState *bs,

    if (bs->drv) {
        info->has_inserted = true;
-        info->inserted = g_malloc0(sizeof(*info->inserted));
-        info->inserted->file = g_strdup(bs->filename);
-        info->inserted->ro = bs->read_only;
-        info->inserted->drv = g_strdup(bs->drv->format_name);
-        info->inserted->encrypted = bs->encrypted;
-        info->inserted->encryption_key_missing = bdrv_key_required(bs);
-
-        if (bs->backing_file[0]) {
-            info->inserted->has_backing_file = true;
-            info->inserted->backing_file = g_strdup(bs->backing_file);
-        }
-
-        info->inserted->backing_file_depth = bdrv_get_backing_file_depth(bs);
-
-        if (bs->io_limits_enabled) {
-            ThrottleConfig cfg;
-            throttle_get_config(&bs->throttle_state, &cfg);
-            info->inserted->bps     = cfg.buckets[THROTTLE_BPS_TOTAL].avg;
-            info->inserted->bps_rd  = cfg.buckets[THROTTLE_BPS_READ].avg;
-            info->inserted->bps_wr  = cfg.buckets[THROTTLE_BPS_WRITE].avg;
-
-            info->inserted->iops    = cfg.buckets[THROTTLE_OPS_TOTAL].avg;
-            info->inserted->iops_rd = cfg.buckets[THROTTLE_OPS_READ].avg;
-            info->inserted->iops_wr = cfg.buckets[THROTTLE_OPS_WRITE].avg;
-
-            info->inserted->has_bps_max     =
-                cfg.buckets[THROTTLE_BPS_TOTAL].max;
-            info->inserted->bps_max         =
-                cfg.buckets[THROTTLE_BPS_TOTAL].max;
-            info->inserted->has_bps_rd_max  =
-                cfg.buckets[THROTTLE_BPS_READ].max;
-            info->inserted->bps_rd_max      =
-                cfg.buckets[THROTTLE_BPS_READ].max;
-            info->inserted->has_bps_wr_max  =
-                cfg.buckets[THROTTLE_BPS_WRITE].max;
-            info->inserted->bps_wr_max      =
-                cfg.buckets[THROTTLE_BPS_WRITE].max;
-
-            info->inserted->has_iops_max    =
-                cfg.buckets[THROTTLE_OPS_TOTAL].max;
-            info->inserted->iops_max        =
-                cfg.buckets[THROTTLE_OPS_TOTAL].max;
-            info->inserted->has_iops_rd_max =
-                cfg.buckets[THROTTLE_OPS_READ].max;
-            info->inserted->iops_rd_max     =
-                cfg.buckets[THROTTLE_OPS_READ].max;
-            info->inserted->has_iops_wr_max =
-                cfg.buckets[THROTTLE_OPS_WRITE].max;
-            info->inserted->iops_wr_max     =
-                cfg.buckets[THROTTLE_OPS_WRITE].max;
-
-            info->inserted->has_iops_size = cfg.op_size;
-            info->inserted->iops_size = cfg.op_size;
-        }
+        info->inserted = bdrv_block_device_info(bs);

        bs0 = bs;
        p_image_info = &info->inserted->image;
@@ -318,6 +319,11 @@ BlockStats *bdrv_query_stats(const BlockDriverState *bs)
        s->parent = bdrv_query_stats(bs->file);
    }

+    if (bs->backing_hd) {
+        s->has_backing = true;
+        s->backing = bdrv_query_stats(bs->backing_hd);
+    }
+
    return s;
 }

--- a/block/qcow.c
+++ b/block/qcow.c
@@ -691,7 +691,8 @@ static int qcow_create(const char *filename, QEMUOptionParameter *options,
        return ret;
    }

-    ret = bdrv_file_open(&qcow_bs, filename, NULL, BDRV_O_RDWR, &local_err);
+    ret = bdrv_file_open(&qcow_bs, filename, NULL, NULL, BDRV_O_RDWR,
+                         &local_err);
    if (ret < 0) {
        qerror_report_err(local_err);
        error_free(local_err);
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -718,7 +718,6 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
    }

    qemu_opts_del(opts);
-    bs->bl.write_zeroes_alignment = s->cluster_sectors;

    if (s->use_lazy_refcounts && s->qcow_version < 3) {
        error_setg(errp, "Lazy refcounts require a qcow2 image with at least "
@@ -751,6 +750,15 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
    return ret;
 }

+static int qcow2_refresh_limits(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+
+    bs->bl.write_zeroes_alignment = s->cluster_sectors;
+
+    return 0;
+}
+
 static int qcow2_set_key(BlockDriverState *bs, const char *key)
 {
    BDRVQcowState *s = bs->opaque;
@@ -1483,7 +1491,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
        return ret;
    }

-    ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR, &local_err);
+    ret = bdrv_file_open(&bs, filename, NULL, NULL, BDRV_O_RDWR, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        return ret;
@@ -2268,6 +2276,7 @@ static BlockDriver bdrv_qcow2 = {

    .bdrv_change_backing_file   = qcow2_change_backing_file,

+    .bdrv_refresh_limits        = qcow2_refresh_limits,
    .bdrv_invalidate_cache      = qcow2_invalidate_cache,

    .create_options = qcow2_create_options,
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -340,11 +340,11 @@ typedef enum QCow2MetadataOverlap {
 #define QCOW2_OL_ALL \
    (QCOW2_OL_CACHED | QCOW2_OL_INACTIVE_L2)

-#define L1E_OFFSET_MASK 0x00ffffffffffff00ULL
-#define L2E_OFFSET_MASK 0x00ffffffffffff00ULL
+#define L1E_OFFSET_MASK 0x00fffffffffffe00ULL
+#define L2E_OFFSET_MASK 0x00fffffffffffe00ULL
 #define L2E_COMPRESSED_OFFSET_SIZE_MASK 0x3fffffffffffffffULL

-#define REFT_OFFSET_MASK 0xffffffffffffff00ULL
+#define REFT_OFFSET_MASK 0xfffffffffffffe00ULL

 static inline int64_t start_of_cluster(BDRVQcowState *s, int64_t offset)
 {
--- a/block/qed.c
+++ b/block/qed.c
@@ -495,7 +495,6 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
        }
    }

-    bs->bl.write_zeroes_alignment = s->header.cluster_size >> BDRV_SECTOR_BITS;
    s->need_check_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                            qed_need_check_timer_cb, s);

@@ -507,6 +506,15 @@ out:
    return ret;
 }

+static int bdrv_qed_refresh_limits(BlockDriverState *bs)
+{
+    BDRVQEDState *s = bs->opaque;
+
+    bs->bl.write_zeroes_alignment = s->header.cluster_size >> BDRV_SECTOR_BITS;
+
+    return 0;
+}
+
 /* We have nothing to do for QED reopen, stubs just return
 * success */
 static int bdrv_qed_reopen_prepare(BDRVReopenState *state,
@@ -563,8 +571,8 @@ static int qed_create(const char *filename, uint32_t cluster_size,
        return ret;
    }

-    ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR | BDRV_O_CACHE_WB,
-                         &local_err);
+    ret = bdrv_file_open(&bs, filename, NULL, NULL,
+                         BDRV_O_RDWR | BDRV_O_CACHE_WB, &local_err);
    if (ret < 0) {
        qerror_report_err(local_err);
        error_free(local_err);
@@ -1616,6 +1624,7 @@ static BlockDriver bdrv_qed = {
    .bdrv_truncate            = bdrv_qed_truncate,
    .bdrv_getlength           = bdrv_qed_getlength,
    .bdrv_get_info            = bdrv_qed_get_info,
+    .bdrv_refresh_limits      = bdrv_qed_refresh_limits,
    .bdrv_change_backing_file = bdrv_qed_change_backing_file,
    .bdrv_invalidate_cache    = bdrv_qed_invalidate_cache,
    .bdrv_check               = bdrv_qed_check,
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -127,6 +127,8 @@ typedef struct BDRVRawState {
    int fd;
    int type;
    int open_flags;
+    size_t buf_align;
+
 #if defined(__linux__)
    /* linux floppy specific */
    int64_t fd_open_time;
@@ -213,6 +215,76 @@ static int raw_normalize_devicepath(const char **filename)
 }
 #endif

+static void raw_probe_alignment(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+    char *buf;
+    unsigned int sector_size;
+
+    /* For /dev/sg devices the alignment is not really used.
+       With buffered I/O, we don't have any restrictions. */
+    if (bs->sg || !(s->open_flags & O_DIRECT)) {
+        bs->request_alignment = 1;
+        s->buf_align = 1;
+        return;
+    }
+
+    /* Try a few ioctls to get the right size */
+    bs->request_alignment = 0;
+    s->buf_align = 0;
+
+#ifdef BLKSSZGET
+    if (ioctl(s->fd, BLKSSZGET, &sector_size) >= 0) {
+        bs->request_alignment = sector_size;
+    }
+#endif
+#ifdef DKIOCGETBLOCKSIZE
+    if (ioctl(s->fd, DKIOCGETBLOCKSIZE, &sector_size) >= 0) {
+        bs->request_alignment = sector_size;
+    }
+#endif
+#ifdef DIOCGSECTORSIZE
+    if (ioctl(s->fd, DIOCGSECTORSIZE, &sector_size) >= 0) {
+        bs->request_alignment = sector_size;
+    }
+#endif
+#ifdef CONFIG_XFS
+    if (s->is_xfs) {
+        struct dioattr da;
+        if (xfsctl(NULL, s->fd, XFS_IOC_DIOINFO, &da) >= 0) {
+            bs->request_alignment = da.d_miniosz;
+            /* The kernel returns wrong information for d_mem */
+            /* s->buf_align = da.d_mem; */
+        }
+    }
+#endif
+
+    /* If we could not get the sizes so far, we can only guess them */
+    if (!s->buf_align) {
+        size_t align;
+        buf = qemu_memalign(MAX_BLOCKSIZE, 2 * MAX_BLOCKSIZE);
+        for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) {
+            if (pread(s->fd, buf + align, MAX_BLOCKSIZE, 0) >= 0) {
+                s->buf_align = align;
+                break;
+            }
+        }
+        qemu_vfree(buf);
+    }
+
+    if (!bs->request_alignment) {
+        size_t align;
+        buf = qemu_memalign(s->buf_align, MAX_BLOCKSIZE);
+        for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) {
+            if (pread(s->fd, buf, align, 0) >= 0) {
+                bs->request_alignment = align;
+                break;
+            }
+        }
+        qemu_vfree(buf);
+    }
+}
+
 static void raw_parse_flags(int bdrv_flags, int *open_flags)
 {
    assert(open_flags != NULL);
@@ -463,7 +535,6 @@ static int raw_reopen_prepare(BDRVReopenState *state,
    return ret;
 }

-
 static void raw_reopen_commit(BDRVReopenState *state)
 {
    BDRVRawReopenState *raw_s = state->opaque;
@@ -499,23 +570,15 @@ static void raw_reopen_abort(BDRVReopenState *state)
    state->opaque = NULL;
 }

+static int raw_refresh_limits(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;

-/* XXX: use host sector size if necessary with:
-#ifdef DIOCGSECTORSIZE
-        {
-            unsigned int sectorsize = 512;
-            if (!ioctl(fd, DIOCGSECTORSIZE, &sectorsize) &&
-                sectorsize > bufsize)
-                bufsize = sectorsize;
-        }
-#endif
-#ifdef CONFIG_COCOA
-        uint32_t blockSize = 512;
-        if ( !ioctl( fd, DKIOCGETBLOCKSIZE, &blockSize ) && blockSize > bufsize) {
-            bufsize = blockSize;
-        }
-#endif
-*/
+    raw_probe_alignment(bs);
+    bs->bl.opt_mem_alignment = s->buf_align;
+
+    return 0;
+}

 static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb)
 {
@@ -1363,6 +1426,7 @@ static BlockDriver bdrv_file = {
    .bdrv_aio_writev = raw_aio_writev,
    .bdrv_aio_flush = raw_aio_flush,
    .bdrv_aio_discard = raw_aio_discard,
+    .bdrv_refresh_limits = raw_refresh_limits,

    .bdrv_truncate = raw_truncate,
    .bdrv_getlength = raw_getlength,
@@ -1740,6 +1804,7 @@ static BlockDriver bdrv_host_device = {
    .bdrv_aio_writev	= raw_aio_writev,
    .bdrv_aio_flush	= raw_aio_flush,
    .bdrv_aio_discard   = hdev_aio_discard,
+    .bdrv_refresh_limits = raw_refresh_limits,

    .bdrv_truncate      = raw_truncate,
    .bdrv_getlength	= raw_getlength,
@@ -1871,6 +1936,7 @@ static BlockDriver bdrv_host_floppy = {
    .bdrv_aio_readv     = raw_aio_readv,
    .bdrv_aio_writev    = raw_aio_writev,
    .bdrv_aio_flush	= raw_aio_flush,
+    .bdrv_refresh_limits = raw_refresh_limits,

    .bdrv_truncate      = raw_truncate,
    .bdrv_getlength      = raw_getlength,
@@ -1981,6 +2047,7 @@ static BlockDriver bdrv_host_cdrom = {
    .bdrv_aio_readv     = raw_aio_readv,
    .bdrv_aio_writev    = raw_aio_writev,
    .bdrv_aio_flush	= raw_aio_flush,
+    .bdrv_refresh_limits = raw_refresh_limits,

    .bdrv_truncate      = raw_truncate,
    .bdrv_getlength      = raw_getlength,
@@ -2110,6 +2177,7 @@ static BlockDriver bdrv_host_cdrom = {
    .bdrv_aio_readv     = raw_aio_readv,
    .bdrv_aio_writev    = raw_aio_writev,
    .bdrv_aio_flush	= raw_aio_flush,
+    .bdrv_refresh_limits = raw_refresh_limits,

    .bdrv_truncate      = raw_truncate,
    .bdrv_getlength      = raw_getlength,
--- a/block/raw-win32.c
+++ b/block/raw-win32.c
@@ -202,6 +202,35 @@ static int set_sparse(int fd)
 				 NULL, 0, NULL, 0, &returned, NULL);
 }

+static void raw_probe_alignment(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+    DWORD sectorsPerCluster, freeClusters, totalClusters, count;
+    DISK_GEOMETRY_EX dg;
+    BOOL status;
+
+    if (s->type == FTYPE_CD) {
+        bs->request_alignment = 2048;
+        return;
+    }
+    if (s->type == FTYPE_HARDDISK) {
+        status = DeviceIoControl(s->hfile, IOCTL_DISK_GET_DRIVE_GEOMETRY_EX,
+                                 NULL, 0, &dg, sizeof(dg), &count, NULL);
+        if (status != 0) {
+            bs->request_alignment = dg.Geometry.BytesPerSector;
+            return;
+        }
+        /* try GetDiskFreeSpace too */
+    }
+
+    if (s->drive_path[0]) {
+        GetDiskFreeSpace(s->drive_path, &sectorsPerCluster,
+                         &dg.Geometry.BytesPerSector,
+                         &freeClusters, &totalClusters);
+        bs->request_alignment = dg.Geometry.BytesPerSector;
+    }
+}
+
 static void raw_parse_flags(int flags, int *access_flags, DWORD *overlapped)
 {
    assert(access_flags != NULL);
@@ -269,6 +298,17 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
        }
    }

+    if (filename[0] && filename[1] == ':') {
+        snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", filename[0]);
+    } else if (filename[0] == '\\' && filename[1] == '\\') {
+        s->drive_path[0] = 0;
+    } else {
+        /* Relative path.  */
+        char buf[MAX_PATH];
+        GetCurrentDirectory(MAX_PATH, buf);
+        snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", buf[0]);
+    }
+
    s->hfile = CreateFile(filename, access_flags,
                          FILE_SHARE_READ, NULL,
                          OPEN_EXISTING, overlapped, NULL);
@@ -293,6 +333,7 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
        s->aio = aio;
    }

+    raw_probe_alignment(bs);
    ret = 0;
 fail:
    qemu_opts_del(opts);
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -95,18 +95,13 @@ typedef struct RADOSCB {
 #define RBD_FD_WRITE 1

 typedef struct BDRVRBDState {
-    int fds[2];
    rados_t cluster;
    rados_ioctx_t io_ctx;
    rbd_image_t image;
    char name[RBD_MAX_IMAGE_NAME_SIZE];
    char *snap;
-    int event_reader_pos;
-    RADOSCB *event_rcb;
 } BDRVRBDState;

-static void rbd_aio_bh_cb(void *opaque);
-
 static int qemu_rbd_next_tok(char *dst, int dst_len,
                             char *src, char delim,
                             const char *name,
@@ -369,9 +364,8 @@ static int qemu_rbd_create(const char *filename, QEMUOptionParameter *options,
 }

 /*
- * This aio completion is being called from qemu_rbd_aio_event_reader()
- * and runs in qemu context. It schedules a bh, but just in case the aio
- * was not cancelled before.
+ * This aio completion is being called from rbd_finish_bh() and runs in qemu
+ * BH context.
 */
 static void qemu_rbd_complete_aio(RADOSCB *rcb)
 {
@@ -401,36 +395,19 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
            acb->ret = r;
        }
    }
-    /* Note that acb->bh can be NULL in case where the aio was cancelled */
-    acb->bh = qemu_bh_new(rbd_aio_bh_cb, acb);
-    qemu_bh_schedule(acb->bh);
+
    g_free(rcb);
-}

-/*
- * aio fd read handler. It runs in the qemu context and calls the
- * completion handling of completed rados aio operations.
- */
-static void qemu_rbd_aio_event_reader(void *opaque)
-{
-    BDRVRBDState *s = opaque;
+    if (acb->cmd == RBD_AIO_READ) {
+        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
+    }
+    qemu_vfree(acb->bounce);
+    acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
+    acb->status = 0;

-    ssize_t ret;
-
-    do {
-        char *p = (char *)&s->event_rcb;
-
-        /* now read the rcb pointer that was sent from a non qemu thread */
-        ret = read(s->fds[RBD_FD_READ], p + s->event_reader_pos,
-                   sizeof(s->event_rcb) - s->event_reader_pos);
-        if (ret > 0) {
-            s->event_reader_pos += ret;
-            if (s->event_reader_pos == sizeof(s->event_rcb)) {
-                s->event_reader_pos = 0;
-                qemu_rbd_complete_aio(s->event_rcb);
-            }
-        }
-    } while (ret < 0 && errno == EINTR);
+    if (!acb->cancelled) {
+        qemu_aio_release(acb);
+    }
 }

 /* TODO Convert to fine grained options */
@@ -538,23 +515,9 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,

    bs->read_only = (s->snap != NULL);

-    s->event_reader_pos = 0;
-    r = qemu_pipe(s->fds);
-    if (r < 0) {
-        error_report("error opening eventfd");
-        goto failed;
-    }
-    fcntl(s->fds[0], F_SETFL, O_NONBLOCK);
-    fcntl(s->fds[1], F_SETFL, O_NONBLOCK);
-    qemu_aio_set_fd_handler(s->fds[RBD_FD_READ], qemu_rbd_aio_event_reader,
-                            NULL, s);
-
-
    qemu_opts_del(opts);
    return 0;

-failed:
-    rbd_close(s->image);
 failed_open:
    rados_ioctx_destroy(s->io_ctx);
 failed_shutdown:
@@ -569,10 +532,6 @@ static void qemu_rbd_close(BlockDriverState *bs)
 {
    BDRVRBDState *s = bs->opaque;

-    close(s->fds[0]);
-    close(s->fds[1]);
-    qemu_aio_set_fd_handler(s->fds[RBD_FD_READ], NULL, NULL, NULL);
-
    rbd_close(s->image);
    rados_ioctx_destroy(s->io_ctx);
    g_free(s->snap);
@@ -600,34 +559,11 @@ static const AIOCBInfo rbd_aiocb_info = {
    .cancel = qemu_rbd_aio_cancel,
 };

-static int qemu_rbd_send_pipe(BDRVRBDState *s, RADOSCB *rcb)
+static void rbd_finish_bh(void *opaque)
 {
-    int ret = 0;
-    while (1) {
-        fd_set wfd;
-        int fd = s->fds[RBD_FD_WRITE];
-
-        /* send the op pointer to the qemu thread that is responsible
-           for the aio/op completion. Must do it in a qemu thread context */
-        ret = write(fd, (void *)&rcb, sizeof(rcb));
-        if (ret >= 0) {
-            break;
-        }
-        if (errno == EINTR) {
-            continue;
-        }
-        if (errno != EAGAIN) {
-            break;
-        }
-
-        FD_ZERO(&wfd);
-        FD_SET(fd, &wfd);
-        do {
-            ret = select(fd + 1, NULL, &wfd, NULL, NULL);
-        } while (ret < 0 && errno == EINTR);
-    }
-
-    return ret;
+    RADOSCB *rcb = opaque;
+    qemu_bh_delete(rcb->acb->bh);
+    qemu_rbd_complete_aio(rcb);
 }

 /*
@@ -635,40 +571,18 @@ static int qemu_rbd_send_pipe(BDRVRBDState *s, RADOSCB *rcb)
 *
 * Note: this function is being called from a non qemu thread so
 * we need to be careful about what we do here. Generally we only
- * write to the block notification pipe, and do the rest of the
- * io completion handling from qemu_rbd_aio_event_reader() which
- * runs in a qemu context.
+ * schedule a BH, and do the rest of the io completion handling
+ * from rbd_finish_bh() which runs in a qemu context.
 */
 static void rbd_finish_aiocb(rbd_completion_t c, RADOSCB *rcb)
 {
-    int ret;
+    RBDAIOCB *acb = rcb->acb;
+
    rcb->ret = rbd_aio_get_return_value(c);
    rbd_aio_release(c);
-    ret = qemu_rbd_send_pipe(rcb->s, rcb);
-    if (ret < 0) {
-        error_report("failed writing to acb->s->fds");
-        g_free(rcb);
-    }
-}

-/* Callback when all queued rbd_aio requests are complete */
-
-static void rbd_aio_bh_cb(void *opaque)
-{
-    RBDAIOCB *acb = opaque;
-
-    if (acb->cmd == RBD_AIO_READ) {
-        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
-    }
-    qemu_vfree(acb->bounce);
-    acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
-    qemu_bh_delete(acb->bh);
-    acb->bh = NULL;
-    acb->status = 0;
-
-    if (!acb->cancelled) {
-        qemu_aio_release(acb);
-    }
+    acb->bh = qemu_bh_new(rbd_finish_bh, rcb);
+    qemu_bh_schedule(acb->bh);
 }

 static int rbd_aio_discard_wrapper(rbd_image_t image,
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -161,7 +161,7 @@ typedef struct SheepdogVdiReq {
    uint32_t id;
    uint32_t data_length;
    uint64_t vdi_size;
-    uint32_t vdi_id;
+    uint32_t base_vdi_id;
    uint8_t copies;
    uint8_t copy_policy;
    uint8_t reserved[2];
@@ -1493,7 +1493,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot)

    memset(&hdr, 0, sizeof(hdr));
    hdr.opcode = SD_OP_NEW_VDI;
-    hdr.vdi_id = s->inode.vdi_id;
+    hdr.base_vdi_id = s->inode.vdi_id;

    wlen = SD_MAX_VDI_LEN;

@@ -1534,7 +1534,7 @@ static int sd_prealloc(const char *filename)
    Error *local_err = NULL;
    int ret;

-    ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR, &local_err);
+    ret = bdrv_file_open(&bs, filename, NULL, NULL, BDRV_O_RDWR, &local_err);
    if (ret < 0) {
        qerror_report_err(local_err);
        error_free(local_err);
@@ -1684,7 +1684,7 @@ static int sd_create(const char *filename, QEMUOptionParameter *options,

    if (backing_file) {
        BlockDriverState *bs;
-        BDRVSheepdogState *s;
+        BDRVSheepdogState *base;
        BlockDriver *drv;

        /* Currently, only Sheepdog backing image is supported. */
@@ -1695,22 +1695,22 @@ static int sd_create(const char *filename, QEMUOptionParameter *options,
            goto out;
        }

-        ret = bdrv_file_open(&bs, backing_file, NULL, 0, &local_err);
+        ret = bdrv_file_open(&bs, backing_file, NULL, NULL, 0, &local_err);
        if (ret < 0) {
            qerror_report_err(local_err);
            error_free(local_err);
            goto out;
        }

-        s = bs->opaque;
+        base = bs->opaque;

-        if (!is_snapshot(&s->inode)) {
+        if (!is_snapshot(&base->inode)) {
            error_report("cannot clone from a non snapshot vdi");
            bdrv_unref(bs);
            ret = -EINVAL;
            goto out;
        }
-
+        s->inode.vdi_id = base->inode.vdi_id;
        bdrv_unref(bs);
    }

@@ -1743,7 +1743,7 @@ static void sd_close(BlockDriverState *bs)
    memset(&hdr, 0, sizeof(hdr));

    hdr.opcode = SD_OP_RELEASE_VDI;
-    hdr.vdi_id = s->inode.vdi_id;
+    hdr.base_vdi_id = s->inode.vdi_id;
    wlen = strlen(s->name) + 1;
    hdr.data_length = wlen;
    hdr.flags = SD_FLAG_CMD_WRITE;
@@ -1846,7 +1846,7 @@ static bool sd_delete(BDRVSheepdogState *s)
    unsigned int wlen = SD_MAX_VDI_LEN, rlen = 0;
    SheepdogVdiReq hdr = {
        .opcode = SD_OP_DEL_VDI,
-        .vdi_id = s->inode.vdi_id,
+        .base_vdi_id = s->inode.vdi_id,
        .data_length = wlen,
        .flags = SD_FLAG_CMD_WRITE,
    };
@@ -2442,11 +2442,12 @@ sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
 {
    BDRVSheepdogState *s = bs->opaque;
    SheepdogInode *inode = &s->inode;
-    unsigned long start = sector_num * BDRV_SECTOR_SIZE / SD_DATA_OBJ_SIZE,
+    uint64_t offset = sector_num * BDRV_SECTOR_SIZE;
+    unsigned long start = offset / SD_DATA_OBJ_SIZE,
                  end = DIV_ROUND_UP((sector_num + nb_sectors) *
                                     BDRV_SECTOR_SIZE, SD_DATA_OBJ_SIZE);
    unsigned long idx;
-    int64_t ret = BDRV_BLOCK_DATA;
+    int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset;

    for (idx = start; idx < end; idx++) {
        if (inode->data_vdi_id[idx] == 0) {
--- a/block/stream.c
+++ b/block/stream.c
@@ -75,6 +75,8 @@ static void close_unused_images(BlockDriverState *top, BlockDriverState *base,
        unused->backing_hd = NULL;
        bdrv_unref(unused);
    }
+
+    bdrv_refresh_limits(top);
 }

 static void coroutine_fn stream_run(void *opaque)
--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -1797,7 +1797,7 @@ static int vhdx_create(const char *filename, QEMUOptionParameter *options,
        goto exit;
    }

-    ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR, &local_err);
+    ret = bdrv_file_open(&bs, filename, NULL, NULL, BDRV_O_RDWR, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto exit;
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -428,10 +428,6 @@ static int vmdk_add_extent(BlockDriverState *bs,
    extent->l2_size = l2_size;
    extent->cluster_sectors = flat ? sectors : cluster_sectors;

-    if (!flat) {
-        bs->bl.write_zeroes_alignment =
-            MAX(bs->bl.write_zeroes_alignment, cluster_sectors);
-    }
    if (s->num_extents > 1) {
        extent->end_sector = (*(extent - 1)).end_sector + extent->sectors;
    } else {
@@ -640,6 +636,13 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
    if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) {
        l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9;
    }
+    if (bdrv_getlength(file) <
+            le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE) {
+        error_report("File truncated, expecting at least %lld bytes",
+                le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE);
+        return -EINVAL;
+    }
+
    ret = vmdk_add_extent(bs, file, false,
                          le64_to_cpu(header.capacity),
                          le64_to_cpu(header.gd_offset) << 9,
@@ -654,6 +657,10 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
    }
    extent->compressed =
        le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
+    if (extent->compressed) {
+        g_free(s->create_type);
+        s->create_type = g_strdup("streamOptimized");
+    }
    extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER;
    extent->version = le32_to_cpu(header.version);
    extent->has_zero_grain = le32_to_cpu(header.flags) & VMDK4_FLAG_ZERO_GRAIN;
@@ -769,8 +776,8 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,

        path_combine(extent_path, sizeof(extent_path),
                desc_file_path, fname);
-        ret = bdrv_file_open(&extent_file, extent_path, NULL, bs->open_flags,
-                             errp);
+        ret = bdrv_file_open(&extent_file, extent_path, NULL, NULL,
+                             bs->open_flags, errp);
        if (ret) {
            return ret;
        }
@@ -891,6 +898,23 @@ fail:
    return ret;
 }

+
+static int vmdk_refresh_limits(BlockDriverState *bs)
+{
+    BDRVVmdkState *s = bs->opaque;
+    int i;
+
+    for (i = 0; i < s->num_extents; i++) {
+        if (!s->extents[i].flat) {
+            bs->bl.write_zeroes_alignment =
+                MAX(bs->bl.write_zeroes_alignment,
+                    s->extents[i].cluster_sectors);
+        }
+    }
+
+    return 0;
+}
+
 static int get_whole_cluster(BlockDriverState *bs,
                VmdkExtent *extent,
                uint64_t cluster_offset,
@@ -1325,8 +1349,8 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
 {
    BDRVVmdkState *s = bs->opaque;
    VmdkExtent *extent = NULL;
-    int n, ret;
-    int64_t index_in_cluster;
+    int ret;
+    int64_t index_in_cluster, n;
    uint64_t extent_begin_sector, extent_relative_sector_num;
    uint64_t cluster_offset;
    VmdkMetaData m_data;
@@ -1469,7 +1493,7 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
        goto exit;
    }

-    ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR, &local_err);
+    ret = bdrv_file_open(&bs, filename, NULL, NULL, BDRV_O_RDWR, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto exit;
@@ -1807,7 +1831,7 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options,
            goto exit;
        }
    }
-    ret = bdrv_file_open(&new_bs, filename, NULL, BDRV_O_RDWR, &local_err);
+    ret = bdrv_file_open(&new_bs, filename, NULL, NULL, BDRV_O_RDWR, &local_err);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not write description");
        goto exit;
@@ -2002,6 +2026,7 @@ static BlockDriver bdrv_vmdk = {
    .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size,
    .bdrv_has_zero_init           = vmdk_has_zero_init,
    .bdrv_get_specific_info       = vmdk_get_specific_info,
+    .bdrv_refresh_limits          = vmdk_refresh_limits,

    .create_options               = vmdk_create_options,
 };
--- a/blockdev.c
+++ b/blockdev.c
@@ -307,12 +307,11 @@ static bool check_throttle_config(ThrottleConfig *cfg, Error **errp)
 typedef enum { MEDIA_DISK, MEDIA_CDROM } DriveMediaType;

 /* Takes the ownership of bs_opts */
-static DriveInfo *blockdev_init(QDict *bs_opts,
+static DriveInfo *blockdev_init(const char *file, QDict *bs_opts,
                                BlockInterfaceType type,
                                Error **errp)
 {
    const char *buf;
-    const char *file = NULL;
    const char *serial;
    int ro = 0;
    int bdrv_flags = 0;
@@ -354,7 +353,6 @@ static DriveInfo *blockdev_init(QDict *bs_opts,
    ro = qemu_opt_get_bool(opts, "read-only", 0);
    copy_on_read = qemu_opt_get_bool(opts, "copy-on-read", false);

-    file = qemu_opt_get(opts, "file");
    serial = qemu_opt_get(opts, "serial");

    if ((buf = qemu_opt_get(opts, "discard")) != NULL) {
@@ -599,6 +597,10 @@ QemuOptsList qemu_legacy_drive_opts = {
            .name = "addr",
            .type = QEMU_OPT_STRING,
            .help = "pci address (virtio only)",
+        },{
+            .name = "file",
+            .type = QEMU_OPT_STRING,
+            .help = "file name",
        },

        /* Options that are passed on, but have special semantics with -drive */
@@ -629,6 +631,7 @@ DriveInfo *drive_init(QemuOpts *all_opts, BlockInterfaceType block_default_type)
    const char *devaddr;
    bool read_only = false;
    bool copy_on_read;
+    const char *filename;
    Error *local_err = NULL;

    /* Change legacy command line options into QMP ones */
@@ -867,8 +870,10 @@ DriveInfo *drive_init(QemuOpts *all_opts, BlockInterfaceType block_default_type)
        }
    }

+    filename = qemu_opt_get(legacy_opts, "file");
+
    /* Actual block device init: Functionality shared with blockdev-add */
-    dinfo = blockdev_init(bs_opts, type, &local_err);
+    dinfo = blockdev_init(filename, bs_opts, type, &local_err);
    if (dinfo == NULL) {
        if (error_is_set(&local_err)) {
            qerror_report_err(local_err);
@@ -942,14 +947,22 @@ static void blockdev_do_action(int kind, void *data, Error **errp)
    qmp_transaction(&list, errp);
 }

-void qmp_blockdev_snapshot_sync(const char *device, const char *snapshot_file,
+void qmp_blockdev_snapshot_sync(bool has_device, const char *device,
+                                bool has_node_name, const char *node_name,
+                                const char *snapshot_file,
+                                bool has_snapshot_node_name,
+                                const char *snapshot_node_name,
                                bool has_format, const char *format,
-                                bool has_mode, enum NewImageMode mode,
-                                Error **errp)
+                                bool has_mode, NewImageMode mode, Error **errp)
 {
    BlockdevSnapshot snapshot = {
+        .has_device = has_device,
        .device = (char *) device,
+        .has_node_name = has_node_name,
+        .node_name = (char *) node_name,
        .snapshot_file = (char *) snapshot_file,
+        .has_snapshot_node_name = has_snapshot_node_name,
+        .snapshot_node_name = (char *) snapshot_node_name,
        .has_format = has_format,
        .format = (char *) format,
        .has_mode = has_mode,
@@ -1187,8 +1200,14 @@ static void external_snapshot_prepare(BlkTransactionState *common,
 {
    BlockDriver *drv;
    int flags, ret;
+    QDict *options = NULL;
    Error *local_err = NULL;
+    bool has_device = false;
    const char *device;
+    bool has_node_name = false;
+    const char *node_name;
+    bool has_snapshot_node_name = false;
+    const char *snapshot_node_name;
    const char *new_image_file;
    const char *format = "qcow2";
    enum NewImageMode mode = NEW_IMAGE_MODE_ABSOLUTE_PATHS;
@@ -1199,7 +1218,14 @@ static void external_snapshot_prepare(BlkTransactionState *common,
    /* get parameters */
    g_assert(action->kind == TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT_SYNC);

+    has_device = action->blockdev_snapshot_sync->has_device;
    device = action->blockdev_snapshot_sync->device;
+    has_node_name = action->blockdev_snapshot_sync->has_node_name;
+    node_name = action->blockdev_snapshot_sync->node_name;
+    has_snapshot_node_name =
+        action->blockdev_snapshot_sync->has_snapshot_node_name;
+    snapshot_node_name = action->blockdev_snapshot_sync->snapshot_node_name;
+
    new_image_file = action->blockdev_snapshot_sync->snapshot_file;
    if (action->blockdev_snapshot_sync->has_format) {
        format = action->blockdev_snapshot_sync->format;
@@ -1215,9 +1241,21 @@ static void external_snapshot_prepare(BlkTransactionState *common,
        return;
    }

-    state->old_bs = bdrv_find(device);
-    if (!state->old_bs) {
-        error_set(errp, QERR_DEVICE_NOT_FOUND, device);
+    state->old_bs = bdrv_lookup_bs(has_device ? device : NULL,
+                                   has_node_name ? node_name : NULL,
+                                   &local_err);
+    if (error_is_set(&local_err)) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    if (has_node_name && !has_snapshot_node_name) {
+        error_setg(errp, "New snapshot node name missing");
+        return;
+    }
+
+    if (has_snapshot_node_name && bdrv_find_node(snapshot_node_name)) {
+        error_setg(errp, "New snapshot node name already existing");
        return;
    }

@@ -1238,7 +1276,7 @@ static void external_snapshot_prepare(BlkTransactionState *common,
        }
    }

-    if (bdrv_check_ext_snapshot(state->old_bs) != EXT_SNAPSHOT_ALLOWED) {
+    if (!bdrv_is_first_non_filter(state->old_bs)) {
        error_set(errp, QERR_FEATURE_DISABLED, "snapshot");
        return;
    }
@@ -1257,15 +1295,23 @@ static void external_snapshot_prepare(BlkTransactionState *common,
        }
    }

+    if (has_snapshot_node_name) {
+        options = qdict_new();
+        qdict_put(options, "node-name",
+                  qstring_from_str(snapshot_node_name));
+    }
+
    /* We will manually add the backing_hd field to the bs later */
    state->new_bs = bdrv_new("");
    /* TODO Inherit bs->options or only take explicit options with an
     * extended QMP command? */
-    ret = bdrv_open(state->new_bs, new_image_file, NULL,
+    ret = bdrv_open(state->new_bs, new_image_file, options,
                    flags | BDRV_O_NO_BACKING, drv, &local_err);
    if (ret != 0) {
        error_propagate(errp, local_err);
    }
+
+    QDECREF(options);
 }

 static void external_snapshot_commit(BlkTransactionState *common)
@@ -1476,14 +1522,19 @@ void qmp_eject(const char *device, bool has_force, bool force, Error **errp)
    eject_device(bs, force, errp);
 }

-void qmp_block_passwd(const char *device, const char *password, Error **errp)
+void qmp_block_passwd(bool has_device, const char *device,
+                      bool has_node_name, const char *node_name,
+                      const char *password, Error **errp)
 {
+    Error *local_err = NULL;
    BlockDriverState *bs;
    int err;

-    bs = bdrv_find(device);
-    if (!bs) {
-        error_set(errp, QERR_DEVICE_NOT_FOUND, device);
+    bs = bdrv_lookup_bs(has_device ? device : NULL,
+                        has_node_name ? node_name : NULL,
+                        &local_err);
+    if (error_is_set(&local_err)) {
+        error_propagate(errp, local_err);
        return;
    }

@@ -1673,14 +1724,24 @@ int do_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data)
    return 0;
 }

-void qmp_block_resize(const char *device, int64_t size, Error **errp)
+void qmp_block_resize(bool has_device, const char *device,
+                      bool has_node_name, const char *node_name,
+                      int64_t size, Error **errp)
 {
+    Error *local_err = NULL;
    BlockDriverState *bs;
    int ret;

-    bs = bdrv_find(device);
-    if (!bs) {
-        error_set(errp, QERR_DEVICE_NOT_FOUND, device);
+    bs = bdrv_lookup_bs(has_device ? device : NULL,
+                        has_node_name ? node_name : NULL,
+                        &local_err);
+    if (error_is_set(&local_err)) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    if (!bdrv_is_first_non_filter(bs)) {
+        error_set(errp, QERR_FEATURE_DISABLED, "resize");
        return;
    }

@@ -1947,6 +2008,11 @@ void qmp_drive_backup(const char *device, const char *target,
    }
 }

+BlockDeviceInfoList *qmp_query_named_block_nodes(Error **errp)
+{
+    return bdrv_named_nodes_list();
+}
+
 #define DEFAULT_MIRROR_BUF_SIZE   (10 << 20)

 void qmp_drive_mirror(const char *device, const char *target,
@@ -2210,7 +2276,7 @@ void qmp_blockdev_add(BlockdevOptions *options, Error **errp)

    qdict_flatten(qdict);

-    blockdev_init(qdict, IF_NONE, &local_err);
+    blockdev_init(NULL, qdict, IF_NONE, &local_err);
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
        goto fail;
@@ -2250,10 +2316,6 @@ QemuOptsList qemu_common_drive_opts = {
            .name = "snapshot",
            .type = QEMU_OPT_BOOL,
            .help = "enable/disable snapshot mode",
-        },{
-            .name = "file",
-            .type = QEMU_OPT_STRING,
-            .help = "disk image",
        },{
            .name = "discard",
            .type = QEMU_OPT_STRING,
--- a/12
+++ b/12
@@ -256,6 +256,7 @@ coroutine_pool=""
 seccomp=""
 glusterfs=""
 glusterfs_discard="no"
+glusterfs_zerofill="no"
 virtio_blk_data_plane=""
 gtk=""
 gtkabi="2.0"
@@ -2701,6 +2702,9 @@ if test "$glusterfs" != "no" ; then
    if $pkg_config --atleast-version=5 glusterfs-api; then
      glusterfs_discard="yes"
    fi
+    if $pkg_config --atleast-version=6 glusterfs-api; then
+      glusterfs_zerofill="yes"
+    fi
  else
    if test "$glusterfs" = "yes" ; then
      feature_not_found "GlusterFS backend support"
@@ -4229,6 +4233,10 @@ if test "$glusterfs_discard" = "yes" ; then
  echo "CONFIG_GLUSTERFS_DISCARD=y" >> $config_host_mak
 fi

+if test "$glusterfs_zerofill" = "yes" ; then
+  echo "CONFIG_GLUSTERFS_ZEROFILL=y" >> $config_host_mak
+fi
+
 if test "$libssh2" = "yes" ; then
  echo "CONFIG_LIBSSH2=y" >> $config_host_mak
 fi
@@ -4766,6 +4774,10 @@ for bios_file in \
 do
    FILES="$FILES pc-bios/`basename $bios_file`"
 done
+for test_file in `find $source_path/tests/acpi-test-data -type f`
+do
+    FILES="$FILES tests/acpi-test-data`echo $test_file | sed -e 's/.*acpi-test-data//'`"
+done
 mkdir -p $DIRS
 for f in $FILES ; do
    if [ -e "$source_path/$f" ] && [ "$source_path" != `pwd` ]; then
--- a/docs/specs/acpi_cpu_hotplug.txt
+++ b/docs/specs/acpi_cpu_hotplug.txt
@@ -10,7 +10,9 @@ ACPI GPE block (IO ports 0xafe0-0xafe3, byte access):
 Generic ACPI GPE block. Bit 2 (GPE.2) used to notify CPU
 hot-add/remove event to ACPI BIOS, via SCI interrupt.

-CPU present bitmap (IO port 0xaf00-0xaf1f, 1-byte access):
+CPU present bitmap for:
+  ICH9-LPC (IO port 0x0cd8-0xcf7, 1-byte access)
+  PIIX-PM  (IO port 0xaf00-0xaf1f, 1-byte access)
 ---------------------------------------------------------------
 One bit per CPU. Bit position reflects corresponding CPU APIC ID.
 Read-only.
--- a/exec.c
+++ b/exec.c
@@ -1070,7 +1070,7 @@ static void *file_ram_alloc(RAMBlock *block,
        }

        /* MAP_POPULATE silently ignores failures */
-        for (i = 0; i < (memory/hpagesize)-1; i++) {
+        for (i = 0; i < (memory/hpagesize); i++) {
            memset(area + (hpagesize*i), 0, 1);
        }

--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -35,6 +35,11 @@ STEXI
@item commit
@findex commit
 Commit changes to the disk images (if -snapshot is used) or backing files.
+If the backing file is smaller than the snapshot, then the backing file will be
+resized to be the same size as the snapshot.  If the snapshot is smaller than
+the backing file, the backing file will not be truncated.  If you want the
+backing file to match the size of the smaller snapshot, you can safely truncate
+it yourself once the commit operation successfully completes.
 ETEXI

    {
--- a/hmp.c
+++ b/hmp.c
@@ -871,7 +871,7 @@ void hmp_block_passwd(Monitor *mon, const QDict *qdict)
    const char *password = qdict_get_str(qdict, "password");
    Error *errp = NULL;

-    qmp_block_passwd(device, password, &errp);
+    qmp_block_passwd(true, device, false, NULL, password, &errp);
    hmp_handle_error(mon, &errp);
 }

@@ -893,7 +893,7 @@ void hmp_block_resize(Monitor *mon, const QDict *qdict)
    int64_t size = qdict_get_int(qdict, "size");
    Error *errp = NULL;

-    qmp_block_resize(device, size, &errp);
+    qmp_block_resize(true, device, false, NULL, size, &errp);
    hmp_handle_error(mon, &errp);
 }

@@ -972,7 +972,9 @@ void hmp_snapshot_blkdev(Monitor *mon, const QDict *qdict)
    }

    mode = reuse ? NEW_IMAGE_MODE_EXISTING : NEW_IMAGE_MODE_ABSOLUTE_PATHS;
-    qmp_blockdev_snapshot_sync(device, filename, !!format, format,
+    qmp_blockdev_snapshot_sync(true, device, false, NULL,
+                               filename, false, NULL,
+                               !!format, format,
                               true, mode, &errp);
    hmp_handle_error(mon, &errp);
 }
@@ -1092,11 +1094,11 @@ void hmp_eject(Monitor *mon, const QDict *qdict)
    hmp_handle_error(mon, &err);
 }

-static void hmp_change_read_arg(Monitor *mon, const char *password,
-                                void *opaque)
+static void hmp_change_read_arg(void *opaque, const char *password,
+                                void *readline_opaque)
 {
    qmp_change_vnc_password(password, NULL);
-    monitor_read_command(mon, 1);
+    monitor_read_command(opaque, 1);
 }

 void hmp_change(Monitor *mon, const QDict *qdict)
--- a/hw/acpi/Makefile.objs
+++ b/hw/acpi/Makefile.objs
@@ -1,2 +1 @@
-common-obj-$(CONFIG_ACPI) += core.o piix4.o ich9.o
-
+common-obj-$(CONFIG_ACPI) += core.o piix4.o ich9.o pcihp.o cpu_hotplug.o
--- a/hw/acpi/cpu_hotplug.c
+++ b/hw/acpi/cpu_hotplug.c
@@ -0,0 +1,64 @@
+/*
+ * QEMU ACPI hotplug utilities
+ *
+ * Copyright (C) 2013 Red Hat Inc
+ *
+ * Authors:
+ *   Igor Mammedov <imammedo@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#include "hw/hw.h"
+#include "hw/acpi/cpu_hotplug.h"
+
+static uint64_t cpu_status_read(void *opaque, hwaddr addr, unsigned int size)
+{
+    AcpiCpuHotplug *cpus = opaque;
+    uint64_t val = cpus->sts[addr];
+
+    return val;
+}
+
+static void cpu_status_write(void *opaque, hwaddr addr, uint64_t data,
+                             unsigned int size)
+{
+    /* TODO: implement VCPU removal on guest signal that CPU can be removed */
+}
+
+static const MemoryRegionOps AcpiCpuHotplug_ops = {
+    .read = cpu_status_read,
+    .write = cpu_status_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+    .valid = {
+        .min_access_size = 1,
+        .max_access_size = 1,
+    },
+};
+
+void AcpiCpuHotplug_add(ACPIGPE *gpe, AcpiCpuHotplug *g, CPUState *cpu)
+{
+    CPUClass *k = CPU_GET_CLASS(cpu);
+    int64_t cpu_id;
+
+    *gpe->sts = *gpe->sts | ACPI_CPU_HOTPLUG_STATUS;
+    cpu_id = k->get_arch_id(CPU(cpu));
+    g->sts[cpu_id / 8] |= (1 << (cpu_id % 8));
+}
+
+void AcpiCpuHotplug_init(MemoryRegion *parent, Object *owner,
+                         AcpiCpuHotplug *gpe_cpu, uint16_t base)
+{
+    CPUState *cpu;
+
+    CPU_FOREACH(cpu) {
+        CPUClass *cc = CPU_GET_CLASS(cpu);
+        int64_t id = cc->get_arch_id(cpu);
+
+        g_assert((id / 8) < ACPI_GPE_PROC_LEN);
+        gpe_cpu->sts[id / 8] |= (1 << (id % 8));
+    }
+    memory_region_init_io(&gpe_cpu->io, owner, &AcpiCpuHotplug_ops,
+                          gpe_cpu, "acpi-cpu-hotplug", ACPI_GPE_PROC_LEN);
+    memory_region_add_subregion(parent, base, &gpe_cpu->io);
+}
--- a/hw/acpi/ich9.c
+++ b/hw/acpi/ich9.c
@@ -185,6 +185,15 @@ static void pm_powerdown_req(Notifier *n, void *opaque)
    acpi_pm1_evt_power_down(&pm->acpi_regs);
 }

+static void ich9_cpu_added_req(Notifier *n, void *opaque)
+{
+    ICH9LPCPMRegs *pm = container_of(n, ICH9LPCPMRegs, cpu_added_notifier);
+
+    assert(pm != NULL);
+    AcpiCpuHotplug_add(&pm->acpi_regs.gpe, &pm->gpe_cpu, CPU(opaque));
+    acpi_update_sci(&pm->acpi_regs, pm->irq);
+}
+
 void ich9_pm_init(PCIDevice *lpc_pci, ICH9LPCPMRegs *pm,
                  qemu_irq sci_irq)
 {
@@ -210,6 +219,11 @@ void ich9_pm_init(PCIDevice *lpc_pci, ICH9LPCPMRegs *pm,
    qemu_register_reset(pm_reset, pm);
    pm->powerdown_notifier.notify = pm_powerdown_req;
    qemu_register_powerdown_notifier(&pm->powerdown_notifier);
+
+    AcpiCpuHotplug_init(pci_address_space_io(lpc_pci), OBJECT(lpc_pci),
+                        &pm->gpe_cpu, ICH9_CPU_HOTPLUG_IO_BASE);
+    pm->cpu_added_notifier.notify = ich9_cpu_added_req;
+    qemu_register_cpu_added_notifier(&pm->cpu_added_notifier);
 }

 static void ich9_pm_get_gpe0_blk(Object *obj, Visitor *v,
--- a/hw/acpi/pcihp.c
+++ b/hw/acpi/pcihp.c
@@ -0,0 +1,316 @@
+/*
+ * QEMU<->ACPI BIOS PCI hotplug interface
+ *
+ * QEMU supports PCI hotplug via ACPI. This module
+ * implements the interface between QEMU and the ACPI BIOS.
+ * Interface specification - see docs/specs/acpi_pci_hotplug.txt
+ *
+ * Copyright (c) 2013, Red Hat Inc, Michael S. Tsirkin (mst@redhat.com)
+ * Copyright (c) 2006 Fabrice Bellard
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "hw/acpi/pcihp.h"
+
+#include "hw/hw.h"
+#include "hw/i386/pc.h"
+#include "hw/pci/pci.h"
+#include "hw/acpi/acpi.h"
+#include "sysemu/sysemu.h"
+#include "qemu/range.h"
+#include "exec/ioport.h"
+#include "exec/address-spaces.h"
+#include "hw/pci/pci_bus.h"
+#include "qom/qom-qobject.h"
+#include "qapi/qmp/qint.h"
+
+//#define DEBUG
+
+#ifdef DEBUG
+# define ACPI_PCIHP_DPRINTF(format, ...)     printf(format, ## __VA_ARGS__)
+#else
+# define ACPI_PCIHP_DPRINTF(format, ...)     do { } while (0)
+#endif
+
+#define PCI_HOTPLUG_ADDR 0xae00
+#define PCI_HOTPLUG_SIZE 0x0014
+#define PCI_UP_BASE 0xae00
+#define PCI_DOWN_BASE 0xae04
+#define PCI_EJ_BASE 0xae08
+#define PCI_RMV_BASE 0xae0c
+#define PCI_SEL_BASE 0xae10
+
+typedef struct AcpiPciHpFind {
+    int bsel;
+    PCIBus *bus;
+} AcpiPciHpFind;
+
+static int acpi_pcihp_get_bsel(PCIBus *bus)
+{
+    QObject *o = object_property_get_qobject(OBJECT(bus),
+                                             ACPI_PCIHP_PROP_BSEL, NULL);
+    int64_t bsel = -1;
+    if (o) {
+        bsel = qint_get_int(qobject_to_qint(o));
+    }
+    if (bsel < 0) {
+        return -1;
+    }
+    return bsel;
+}
+
+static void acpi_pcihp_test_hotplug_bus(PCIBus *bus, void *opaque)
+{
+    AcpiPciHpFind *find = opaque;
+    if (find->bsel == acpi_pcihp_get_bsel(bus)) {
+        find->bus = bus;
+    }
+}
+
+static PCIBus *acpi_pcihp_find_hotplug_bus(AcpiPciHpState *s, int bsel)
+{
+    AcpiPciHpFind find = { .bsel = bsel, .bus = NULL };
+
+    if (bsel < 0) {
+        return NULL;
+    }
+
+    pci_for_each_bus(s->root, acpi_pcihp_test_hotplug_bus, &find);
+
+    /* Make bsel 0 eject root bus if bsel property is not set,
+     * for compatibility with non acpi setups.
+     * TODO: really needed?
+     */
+    if (!bsel && !find.bus) {
+        find.bus = s->root;
+    }
+    return find.bus;
+}
+
+static bool acpi_pcihp_pc_no_hotplug(AcpiPciHpState *s, PCIDevice *dev)
+{
+    PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(dev);
+    /*
+     * ACPI doesn't allow hotplug of bridge devices.  Don't allow
+     * hot-unplug of bridge devices unless they were added by hotplug
+     * (and so, not described by acpi).
+     */
+    return (pc->is_bridge && !dev->qdev.hotplugged) || pc->no_hotplug;
+}
+
+static void acpi_pcihp_eject_slot(AcpiPciHpState *s, unsigned bsel, unsigned slots)
+{
+    BusChild *kid, *next;
+    int slot = ffs(slots) - 1;
+    bool slot_free = true;
+    PCIBus *bus = acpi_pcihp_find_hotplug_bus(s, bsel);
+
+    if (!bus) {
+        return;
+    }
+
+    /* Mark request as complete */
+    s->acpi_pcihp_pci_status[bsel].down &= ~(1U << slot);
+
+    QTAILQ_FOREACH_SAFE(kid, &bus->qbus.children, sibling, next) {
+        DeviceState *qdev = kid->child;
+        PCIDevice *dev = PCI_DEVICE(qdev);
+        if (PCI_SLOT(dev->devfn) == slot) {
+            if (acpi_pcihp_pc_no_hotplug(s, dev)) {
+                slot_free = false;
+            } else {
+                object_unparent(OBJECT(qdev));
+            }
+        }
+    }
+    if (slot_free) {
+        s->acpi_pcihp_pci_status[bsel].device_present &= ~(1U << slot);
+    }
+}
+
+static void acpi_pcihp_update_hotplug_bus(AcpiPciHpState *s, int bsel)
+{
+    BusChild *kid, *next;
+    PCIBus *bus = acpi_pcihp_find_hotplug_bus(s, bsel);
+
+    /* Execute any pending removes during reset */
+    while (s->acpi_pcihp_pci_status[bsel].down) {
+        acpi_pcihp_eject_slot(s, bsel, s->acpi_pcihp_pci_status[bsel].down);
+    }
+
+    s->acpi_pcihp_pci_status[bsel].hotplug_enable = ~0;
+    s->acpi_pcihp_pci_status[bsel].device_present = 0;
+
+    if (!bus) {
+        return;
+    }
+    QTAILQ_FOREACH_SAFE(kid, &bus->qbus.children, sibling, next) {
+        DeviceState *qdev = kid->child;
+        PCIDevice *pdev = PCI_DEVICE(qdev);
+        int slot = PCI_SLOT(pdev->devfn);
+
+        if (acpi_pcihp_pc_no_hotplug(s, pdev)) {
+            s->acpi_pcihp_pci_status[bsel].hotplug_enable &= ~(1U << slot);
+        }
+
+        s->acpi_pcihp_pci_status[bsel].device_present |= (1U << slot);
+    }
+}
+
+static void acpi_pcihp_update(AcpiPciHpState *s)
+{
+    int i;
+
+    for (i = 0; i < ACPI_PCIHP_MAX_HOTPLUG_BUS; ++i) {
+        acpi_pcihp_update_hotplug_bus(s, i);
+    }
+}
+
+void acpi_pcihp_reset(AcpiPciHpState *s)
+{
+    acpi_pcihp_update(s);
+}
+
+static void enable_device(AcpiPciHpState *s, unsigned bsel, int slot)
+{
+    s->acpi_pcihp_pci_status[bsel].device_present |= (1U << slot);
+}
+
+static void disable_device(AcpiPciHpState *s, unsigned bsel, int slot)
+{
+    s->acpi_pcihp_pci_status[bsel].down |= (1U << slot);
+}
+
+int acpi_pcihp_device_hotplug(AcpiPciHpState *s, PCIDevice *dev,
+                              PCIHotplugState state)
+{
+    int slot = PCI_SLOT(dev->devfn);
+    int bsel = acpi_pcihp_get_bsel(dev->bus);
+    if (bsel < 0) {
+        return -1;
+    }
+
+    /* Don't send event when device is enabled during qemu machine creation:
+     * it is present on boot, no hotplug event is necessary. We do send an
+     * event when the device is disabled later. */
+    if (state == PCI_COLDPLUG_ENABLED) {
+        s->acpi_pcihp_pci_status[bsel].device_present |= (1U << slot);
+        return 0;
+    }
+
+    if (state == PCI_HOTPLUG_ENABLED) {
+        enable_device(s, bsel, slot);
+    } else {
+        disable_device(s, bsel, slot);
+    }
+
+    return 0;
+}
+
+static uint64_t pci_read(void *opaque, hwaddr addr, unsigned int size)
+{
+    AcpiPciHpState *s = opaque;
+    uint32_t val = 0;
+    int bsel = s->hotplug_select;
+
+    if (bsel < 0 || bsel > ACPI_PCIHP_MAX_HOTPLUG_BUS) {
+        return 0;
+    }
+
+    switch (addr) {
+    case PCI_UP_BASE - PCI_HOTPLUG_ADDR:
+        /* Manufacture an "up" value to cause a device check on any hotplug
+         * slot with a device.  Extra device checks are harmless. */
+        val = s->acpi_pcihp_pci_status[bsel].device_present &
+            s->acpi_pcihp_pci_status[bsel].hotplug_enable;
+        ACPI_PCIHP_DPRINTF("pci_up_read %" PRIu32 "\n", val);
+        break;
+    case PCI_DOWN_BASE - PCI_HOTPLUG_ADDR:
+        val = s->acpi_pcihp_pci_status[bsel].down;
+        ACPI_PCIHP_DPRINTF("pci_down_read %" PRIu32 "\n", val);
+        break;
+    case PCI_EJ_BASE - PCI_HOTPLUG_ADDR:
+        /* No feature defined yet */
+        ACPI_PCIHP_DPRINTF("pci_features_read %" PRIu32 "\n", val);
+        break;
+    case PCI_RMV_BASE - PCI_HOTPLUG_ADDR:
+        val = s->acpi_pcihp_pci_status[bsel].hotplug_enable;
+        ACPI_PCIHP_DPRINTF("pci_rmv_read %" PRIu32 "\n", val);
+        break;
+    case PCI_SEL_BASE - PCI_HOTPLUG_ADDR:
+        val = s->hotplug_select;
+        ACPI_PCIHP_DPRINTF("pci_sel_read %" PRIu32 "\n", val);
+    default:
+        break;
+    }
+
+    return val;
+}
+
+static void pci_write(void *opaque, hwaddr addr, uint64_t data,
+                      unsigned int size)
+{
+    AcpiPciHpState *s = opaque;
+    switch (addr) {
+    case PCI_EJ_BASE - PCI_HOTPLUG_ADDR:
+        if (s->hotplug_select >= ACPI_PCIHP_MAX_HOTPLUG_BUS) {
+            break;
+        }
+        acpi_pcihp_eject_slot(s, s->hotplug_select, data);
+        ACPI_PCIHP_DPRINTF("pciej write %" HWADDR_PRIx " <== %" PRIu64 "\n",
+                      addr, data);
+        break;
+    case PCI_SEL_BASE - PCI_HOTPLUG_ADDR:
+        s->hotplug_select = data;
+        ACPI_PCIHP_DPRINTF("pcisel write %" HWADDR_PRIx " <== %" PRIu64 "\n",
+                      addr, data);
+    default:
+        break;
+    }
+}
+
+static const MemoryRegionOps acpi_pcihp_io_ops = {
+    .read = pci_read,
+    .write = pci_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+    .valid = {
+        .min_access_size = 4,
+        .max_access_size = 4,
+    },
+};
+
+void acpi_pcihp_init(AcpiPciHpState *s, PCIBus *root_bus,
+                     MemoryRegion *address_space_io)
+{
+    s->root= root_bus;
+    memory_region_init_io(&s->io, NULL, &acpi_pcihp_io_ops, s,
+                          "acpi-pci-hotplug",
+                          PCI_HOTPLUG_SIZE);
+    memory_region_add_subregion(address_space_io, PCI_HOTPLUG_ADDR, &s->io);
+}
+
+const VMStateDescription vmstate_acpi_pcihp_pci_status = {
+    .name = "acpi_pcihp_pci_status",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .minimum_version_id_old = 1,
+    .fields      = (VMStateField []) {
+        VMSTATE_UINT32(up, AcpiPciHpPciStatus),
+        VMSTATE_UINT32(down, AcpiPciHpPciStatus),
+        VMSTATE_END_OF_LIST()
+    }
+};
--- a/hw/acpi/piix4.c
+++ b/hw/acpi/piix4.c
@@ -30,6 +30,8 @@
 #include "hw/nvram/fw_cfg.h"
 #include "exec/address-spaces.h"
 #include "hw/acpi/piix4.h"
+#include "hw/acpi/pcihp.h"
+#include "hw/acpi/cpu_hotplug.h"

 //#define DEBUG

@@ -49,21 +51,13 @@
 #define PCI_EJ_BASE 0xae08
 #define PCI_RMV_BASE 0xae0c

-#define PIIX4_PROC_BASE 0xaf00
-#define PIIX4_PROC_LEN 32
-
 #define PIIX4_PCI_HOTPLUG_STATUS 2
-#define PIIX4_CPU_HOTPLUG_STATUS 4

 struct pci_status {
    uint32_t up; /* deprecated, maintained for migration compatibility */
    uint32_t down;
 };

-typedef struct CPUStatus {
-    uint8_t sts[PIIX4_PROC_LEN];
-} CPUStatus;
-
 typedef struct PIIX4PMState {
    /*< private >*/
    PCIDevice parent_obj;
@@ -73,8 +67,6 @@ typedef struct PIIX4PMState {
    uint32_t io_base;

    MemoryRegion io_gpe;
-    MemoryRegion io_pci;
-    MemoryRegion io_cpu;
    ACPIREGS ar;

    APMState apm;
@@ -88,16 +80,21 @@ typedef struct PIIX4PMState {
    Notifier machine_ready;
    Notifier powerdown_notifier;

-    /* for pci hotplug */
+    /* for legacy pci hotplug (compatible with qemu 1.6 and older) */
+    MemoryRegion io_pci;
    struct pci_status pci0_status;
    uint32_t pci0_hotplug_enable;
    uint32_t pci0_slot_device_present;

+    /* for new pci hotplug (with PCI2PCI bridge support) */
+    AcpiPciHpState acpi_pci_hotplug;
+    bool use_acpi_pci_hotplug;
+
    uint8_t disable_s3;
    uint8_t disable_s4;
    uint8_t s4_val;

-    CPUStatus gpe_cpu;
+    AcpiCpuHotplug gpe_cpu;
    Notifier cpu_added_notifier;
 } PIIX4PMState;

@@ -263,6 +260,18 @@ static int acpi_load_old(QEMUFile *f, void *opaque, int version_id)
    return ret;
 }

+static bool vmstate_test_use_acpi_pci_hotplug(void *opaque, int version_id)
+{
+    PIIX4PMState *s = opaque;
+    return s->use_acpi_pci_hotplug;
+}
+
+static bool vmstate_test_no_use_acpi_pci_hotplug(void *opaque, int version_id)
+{
+    PIIX4PMState *s = opaque;
+    return !s->use_acpi_pci_hotplug;
+}
+
 /* qemu-kvm 1.2 uses version 3 but advertised as 2
 * To support incoming qemu-kvm 1.2 migration, change version_id
 * and minimum_version_id to 2 below (which breaks migration from
@@ -285,8 +294,12 @@ static const VMStateDescription vmstate_acpi = {
        VMSTATE_TIMER(ar.tmr.timer, PIIX4PMState),
        VMSTATE_INT64(ar.tmr.overflow_time, PIIX4PMState),
        VMSTATE_STRUCT(ar.gpe, PIIX4PMState, 2, vmstate_gpe, ACPIGPE),
-        VMSTATE_STRUCT(pci0_status, PIIX4PMState, 2, vmstate_pci_status,
-                       struct pci_status),
+        VMSTATE_STRUCT_TEST(pci0_status, PIIX4PMState,
+                            vmstate_test_no_use_acpi_pci_hotplug,
+                            2, vmstate_pci_status,
+                            struct pci_status),
+        VMSTATE_PCI_HOTPLUG(acpi_pci_hotplug, PIIX4PMState,
+                            vmstate_test_use_acpi_pci_hotplug),
        VMSTATE_END_OF_LIST()
    }
 };
@@ -364,7 +377,11 @@ static void piix4_reset(void *opaque)
        pci_conf[0x5B] = 0x02;
    }
    pm_io_space_update(s);
-    piix4_update_hotplug(s);
+    if (s->use_acpi_pci_hotplug) {
+        acpi_pcihp_reset(&s->acpi_pci_hotplug);
+    } else {
+        piix4_update_hotplug(s);
+    }
 }

 static void piix4_pm_powerdown_req(Notifier *n, void *opaque)
@@ -375,6 +392,26 @@ static void piix4_pm_powerdown_req(Notifier *n, void *opaque)
    acpi_pm1_evt_power_down(&s->ar);
 }

+static int piix4_acpi_pci_hotplug(DeviceState *qdev, PCIDevice *dev,
+                                  PCIHotplugState state)
+{
+    PIIX4PMState *s = PIIX4_PM(qdev);
+    int ret = acpi_pcihp_device_hotplug(&s->acpi_pci_hotplug, dev, state);
+    if (ret < 0) {
+        return ret;
+    }
+    s->ar.gpe.sts[0] |= PIIX4_PCI_HOTPLUG_STATUS;
+
+    acpi_update_sci(&s->ar, s->irq);
+    return 0;
+}
+
+static void piix4_update_bus_hotplug(PCIBus *bus, void *opaque)
+{
+    PIIX4PMState *s = opaque;
+    pci_bus_hotplug(bus, piix4_acpi_pci_hotplug, DEVICE(s));
+}
+
 static void piix4_pm_machine_ready(Notifier *n, void *opaque)
 {
    PIIX4PMState *s = container_of(n, PIIX4PMState, machine_ready);
@@ -388,6 +425,10 @@ static void piix4_pm_machine_ready(Notifier *n, void *opaque)
    pci_conf[0x63] = 0x60;
    pci_conf[0x67] = (memory_region_present(io_as, 0x3f8) ? 0x08 : 0) |
        (memory_region_present(io_as, 0x2f8) ? 0x90 : 0);
+
+    if (s->use_acpi_pci_hotplug) {
+        pci_for_each_bus(d->bus, piix4_update_bus_hotplug, s);
+    }
 }

 static void piix4_pm_add_propeties(PIIX4PMState *s)
@@ -509,6 +550,8 @@ static Property piix4_pm_properties[] = {
    DEFINE_PROP_UINT8(ACPI_PM_PROP_S3_DISABLED, PIIX4PMState, disable_s3, 0),
    DEFINE_PROP_UINT8(ACPI_PM_PROP_S4_DISABLED, PIIX4PMState, disable_s4, 0),
    DEFINE_PROP_UINT8(ACPI_PM_PROP_S4_VAL, PIIX4PMState, s4_val, 2),
+    DEFINE_PROP_BOOL("acpi-pci-hotplug-with-bridge-support", PIIX4PMState,
+                     use_acpi_pci_hotplug, true),
    DEFINE_PROP_END_OF_LIST(),
 };

@@ -632,61 +675,13 @@ static const MemoryRegionOps piix4_pci_ops = {
    },
 };

-static uint64_t cpu_status_read(void *opaque, hwaddr addr, unsigned int size)
-{
-    PIIX4PMState *s = opaque;
-    CPUStatus *cpus = &s->gpe_cpu;
-    uint64_t val = cpus->sts[addr];
-
-    return val;
-}
-
-static void cpu_status_write(void *opaque, hwaddr addr, uint64_t data,
-                             unsigned int size)
-{
-    /* TODO: implement VCPU removal on guest signal that CPU can be removed */
-}
-
-static const MemoryRegionOps cpu_hotplug_ops = {
-    .read = cpu_status_read,
-    .write = cpu_status_write,
-    .endianness = DEVICE_LITTLE_ENDIAN,
-    .valid = {
-        .min_access_size = 1,
-        .max_access_size = 1,
-    },
-};
-
-typedef enum {
-    PLUG,
-    UNPLUG,
-} HotplugEventType;
-
-static void piix4_cpu_hotplug_req(PIIX4PMState *s, CPUState *cpu,
-                                  HotplugEventType action)
-{
-    CPUStatus *g = &s->gpe_cpu;
-    ACPIGPE *gpe = &s->ar.gpe;
-    CPUClass *k = CPU_GET_CLASS(cpu);
-    int64_t cpu_id;
-
-    assert(s != NULL);
-
-    *gpe->sts = *gpe->sts | PIIX4_CPU_HOTPLUG_STATUS;
-    cpu_id = k->get_arch_id(CPU(cpu));
-    if (action == PLUG) {
-        g->sts[cpu_id / 8] |= (1 << (cpu_id % 8));
-    } else {
-        g->sts[cpu_id / 8] &= ~(1 << (cpu_id % 8));
-    }
-    acpi_update_sci(&s->ar, s->irq);
-}
-
 static void piix4_cpu_added_req(Notifier *n, void *opaque)
 {
    PIIX4PMState *s = container_of(n, PIIX4PMState, cpu_added_notifier);

-    piix4_cpu_hotplug_req(s, CPU(opaque), PLUG);
+    assert(s != NULL);
+    AcpiCpuHotplug_add(&s->ar.gpe, &s->gpe_cpu, CPU(opaque));
+    acpi_update_sci(&s->ar, s->irq);
 }

 static int piix4_device_hotplug(DeviceState *qdev, PCIDevice *dev,
@@ -695,28 +690,22 @@ static int piix4_device_hotplug(DeviceState *qdev, PCIDevice *dev,
 static void piix4_acpi_system_hot_add_init(MemoryRegion *parent,
                                           PCIBus *bus, PIIX4PMState *s)
 {
-    CPUState *cpu;
-
    memory_region_init_io(&s->io_gpe, OBJECT(s), &piix4_gpe_ops, s,
                          "acpi-gpe0", GPE_LEN);
    memory_region_add_subregion(parent, GPE_BASE, &s->io_gpe);

-    memory_region_init_io(&s->io_pci, OBJECT(s), &piix4_pci_ops, s,
-                          "acpi-pci-hotplug", PCI_HOTPLUG_SIZE);
-    memory_region_add_subregion(parent, PCI_HOTPLUG_ADDR,
-                                &s->io_pci);
-    pci_bus_hotplug(bus, piix4_device_hotplug, DEVICE(s));
-
-    CPU_FOREACH(cpu) {
-        CPUClass *cc = CPU_GET_CLASS(cpu);
-        int64_t id = cc->get_arch_id(cpu);
-
-        g_assert((id / 8) < PIIX4_PROC_LEN);
-        s->gpe_cpu.sts[id / 8] |= (1 << (id % 8));
+    if (s->use_acpi_pci_hotplug) {
+        acpi_pcihp_init(&s->acpi_pci_hotplug, bus, parent);
+    } else {
+        memory_region_init_io(&s->io_pci, OBJECT(s), &piix4_pci_ops, s,
+                              "acpi-pci-hotplug", PCI_HOTPLUG_SIZE);
+        memory_region_add_subregion(parent, PCI_HOTPLUG_ADDR,
+                                    &s->io_pci);
+        pci_bus_hotplug(bus, piix4_device_hotplug, DEVICE(s));
    }
-    memory_region_init_io(&s->io_cpu, OBJECT(s), &cpu_hotplug_ops, s,
-                          "acpi-cpu-hotplug", PIIX4_PROC_LEN);
-    memory_region_add_subregion(parent, PIIX4_PROC_BASE, &s->io_cpu);
+
+    AcpiCpuHotplug_init(parent, OBJECT(s), &s->gpe_cpu,
+                        PIIX4_CPU_HOTPLUG_IO_BASE);
    s->cpu_added_notifier.notify = piix4_cpu_added_req;
    qemu_register_cpu_added_notifier(&s->cpu_added_notifier);
 }
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -731,7 +731,7 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
    register_savevm(dev, "virtio-blk", virtio_blk_id++, 2,
                    virtio_blk_save, virtio_blk_load, s);
    bdrv_set_dev_ops(s->bs, &virtio_block_ops, s);
-    bdrv_set_buffer_alignment(s->bs, s->conf->logical_block_size);
+    bdrv_set_guest_block_size(s->bs, s->conf->logical_block_size);

    bdrv_iostatus_enable(s->bs);

--- a/hw/i386/Makefile.objs
+++ b/hw/i386/Makefile.objs
@@ -17,7 +17,7 @@ iasl-option=$(shell if test -z "`$(1) $(2) 2>&1 > /dev/null`" \
 ifdef IASL
 #IASL Present. Generate hex files from .dsl
 hw/i386/%.hex: $(SRC_PATH)/hw/i386/%.dsl $(SRC_PATH)/scripts/acpi_extract_preprocess.py $(SRC_PATH)/scripts/acpi_extract.py
-	$(call quiet-command, cpp -P $< -o $*.dsl.i.orig, "  CPP $(TARGET_DIR)$*.dsl.i.orig")
+	$(call quiet-command, cpp -P $(QEMU_DGFLAGS) $(QEMU_INCLUDES) $< -o $*.dsl.i.orig, "  CPP $(TARGET_DIR)$*.dsl.i.orig")
 	$(call quiet-command, $(PYTHON) $(SRC_PATH)/scripts/acpi_extract_preprocess.py $*.dsl.i.orig > $*.dsl.i, "  ACPI_PREPROCESS $(TARGET_DIR)$*.dsl.i")
 	$(call quiet-command, $(IASL) $(call iasl-option,$(IASL),-Pn,) -vs -l -tc -p $* $*.dsl.i $(if $(V), , > /dev/null) 2>&1 ,"  IASL $(TARGET_DIR)$*.dsl.i")
 	$(call quiet-command, $(PYTHON) $(SRC_PATH)/scripts/acpi_extract.py $*.lst > $*.off, "  ACPI_EXTRACT $(TARGET_DIR)$*.off")
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -36,9 +36,11 @@
 #include "hw/nvram/fw_cfg.h"
 #include "bios-linker-loader.h"
 #include "hw/loader.h"
+#include "hw/isa/isa.h"

 /* Supported chipsets: */
 #include "hw/acpi/piix4.h"
+#include "hw/acpi/pcihp.h"
 #include "hw/i386/ich9.h"
 #include "hw/pci/pci_bus.h"
 #include "hw/pci-host/q35.h"
@@ -78,8 +80,15 @@ typedef struct AcpiMiscInfo {
    uint16_t pvpanic_port;
 } AcpiMiscInfo;

+typedef struct AcpiBuildPciBusHotplugState {
+    GArray *device_table;
+    GArray *notify_table;
+    struct AcpiBuildPciBusHotplugState *parent;
+} AcpiBuildPciBusHotplugState;
+
 static void acpi_get_dsdt(AcpiMiscInfo *info)
 {
+    uint16_t *applesmc_sta;
    Object *piix = piix4_pm_find();
    Object *lpc = ich9_lpc_find();
    assert(!!piix != !!lpc);
@@ -87,11 +96,17 @@ static void acpi_get_dsdt(AcpiMiscInfo *info)
    if (piix) {
        info->dsdt_code = AcpiDsdtAmlCode;
        info->dsdt_size = sizeof AcpiDsdtAmlCode;
+        applesmc_sta = piix_dsdt_applesmc_sta;
    }
    if (lpc) {
        info->dsdt_code = Q35AcpiDsdtAmlCode;
        info->dsdt_size = sizeof Q35AcpiDsdtAmlCode;
+        applesmc_sta = q35_dsdt_applesmc_sta;
    }
+
+    /* Patch in appropriate value for AppleSMC _STA */
+    *(uint8_t *)(info->dsdt_code + *applesmc_sta) =
+        applesmc_find() ? 0x0b : 0x00;
 }

 static
@@ -171,38 +186,6 @@ static void acpi_get_pm_info(AcpiPmInfo *pm)
                                               NULL);
 }

-static void acpi_get_hotplug_info(AcpiMiscInfo *misc)
-{
-    int i;
-    PCIBus *bus = find_i440fx();
-
-    if (!bus) {
-        /* Only PIIX supports ACPI hotplug */
-        memset(misc->slot_hotplug_enable, 0, sizeof misc->slot_hotplug_enable);
-        return;
-    }
-
-    memset(misc->slot_hotplug_enable, 0xff,
-           DIV_ROUND_UP(PCI_SLOT_MAX, BITS_PER_BYTE));
-
-    for (i = 0; i < ARRAY_SIZE(bus->devices); ++i) {
-        PCIDeviceClass *pc;
-        PCIDevice *pdev = bus->devices[i];
-
-        if (!pdev) {
-            continue;
-        }
-
-        pc = PCI_DEVICE_GET_CLASS(pdev);
-
-        if (pc->no_hotplug) {
-            int slot = PCI_SLOT(i);
-
-            clear_bit(slot, misc->slot_hotplug_enable);
-        }
-    }
-}
-
 static void acpi_get_misc_info(AcpiMiscInfo *info)
 {
    info->has_hpet = hpet_find();
@@ -368,6 +351,12 @@ static void build_package(GArray *package, uint8_t op, unsigned min_bytes)
    build_prepend_byte(package, op);
 }

+static void build_extop_package(GArray *package, uint8_t op)
+{
+    build_package(package, op, 1);
+    build_prepend_byte(package, 0x5B); /* ExtOpPrefix */
+}
+
 static void build_append_value(GArray *table, uint32_t value, int size)
 {
    uint8_t prefix;
@@ -394,8 +383,44 @@ static void build_append_value(GArray *table, uint32_t value, int size)
    }
 }

-static void build_append_notify_target(GArray *method, GArray *target_name,
-                                       uint32_t value, int size)
+static void build_append_int(GArray *table, uint32_t value)
+{
+    if (value == 0x00) {
+        build_append_byte(table, 0x00); /* ZeroOp */
+    } else if (value == 0x01) {
+        build_append_byte(table, 0x01); /* OneOp */
+    } else if (value <= 0xFF) {
+        build_append_value(table, value, 1);
+    } else if (value <= 0xFFFFF) {
+        build_append_value(table, value, 2);
+    } else {
+        build_append_value(table, value, 4);
+    }
+}
+
+static GArray *build_alloc_method(const char *name, uint8_t arg_count)
+{
+    GArray *method = build_alloc_array();
+
+    build_append_nameseg(method, "%s", name);
+    build_append_byte(method, arg_count); /* MethodFlags: ArgCount */
+
+    return method;
+}
+
+static void build_append_and_cleanup_method(GArray *device, GArray *method)
+{
+    uint8_t op = 0x14; /* MethodOp */
+
+    build_package(method, op, 0);
+
+    build_append_array(device, method);
+    build_free_array(method);
+}
+
+static void build_append_notify_target_ifequal(GArray *method,
+                                               GArray *target_name,
+                                               uint32_t value, int size)
 {
    GArray *notify = build_alloc_array();
    uint8_t op = 0xA0; /* IfOp */
@@ -415,6 +440,7 @@ static void build_append_notify_target(GArray *method, GArray *target_name,
    build_free_array(notify);
 }

+/* End here */
 #define ACPI_PORT_SMI_CMD           0x00b2 /* TODO: this is APM_CNT_IOPORT */

 static inline void *acpi_data_push(GArray *table_data, unsigned size)
@@ -624,44 +650,236 @@ static inline char acpi_get_hex(uint32_t val)
 #include "hw/i386/ssdt-pcihp.hex"

 static void
-build_append_notify(GArray *device, const char *name,
-                    const char *format, int skip, int count)
+build_append_notify_method(GArray *device, const char *name,
+                           const char *format, int count)
 {
    int i;
-    GArray *method = build_alloc_array();
-    uint8_t op = 0x14; /* MethodOp */
+    GArray *method = build_alloc_method(name, 2);

-    build_append_nameseg(method, "%s", name);
-    build_append_byte(method, 0x02); /* MethodFlags: ArgCount */
-    for (i = skip; i < count; i++) {
+    for (i = 0; i < count; i++) {
        GArray *target = build_alloc_array();
        build_append_nameseg(target, format, i);
        assert(i < 256); /* Fits in 1 byte */
-        build_append_notify_target(method, target, i, 1);
+        build_append_notify_target_ifequal(method, target, i, 1);
        build_free_array(target);
    }
-    build_package(method, op, 2);

-    build_append_array(device, method);
-    build_free_array(method);
+    build_append_and_cleanup_method(device, method);
 }

-static void patch_pcihp(int slot, uint8_t *ssdt_ptr, uint32_t eject)
+static void patch_pcihp(int slot, uint8_t *ssdt_ptr)
 {
-    ssdt_ptr[ACPI_PCIHP_OFFSET_HEX] = acpi_get_hex(slot >> 4);
-    ssdt_ptr[ACPI_PCIHP_OFFSET_HEX + 1] = acpi_get_hex(slot);
+    unsigned devfn = PCI_DEVFN(slot, 0);
+
+    ssdt_ptr[ACPI_PCIHP_OFFSET_HEX] = acpi_get_hex(devfn >> 4);
+    ssdt_ptr[ACPI_PCIHP_OFFSET_HEX + 1] = acpi_get_hex(devfn);
    ssdt_ptr[ACPI_PCIHP_OFFSET_ID] = slot;
    ssdt_ptr[ACPI_PCIHP_OFFSET_ADR + 2] = slot;
+}

-    /* Runtime patching of ACPI_EJ0: to disable hotplug for a slot,
-     * replace the method name: _EJ0 by ACPI_EJ0_.
-     */
-    /* Sanity check */
-    assert(!memcmp(ssdt_ptr + ACPI_PCIHP_OFFSET_EJ0, "_EJ0", 4));
+/* Assign BSEL property to all buses.  In the future, this can be changed
+ * to only assign to buses that support hotplug.
+ */
+static void *acpi_set_bsel(PCIBus *bus, void *opaque)
+{
+    unsigned *bsel_alloc = opaque;
+    unsigned *bus_bsel;

-    if (!eject) {
-        memcpy(ssdt_ptr + ACPI_PCIHP_OFFSET_EJ0, "EJ0_", 4);
+    if (bus->qbus.allow_hotplug) {
+        bus_bsel = g_malloc(sizeof *bus_bsel);
+
+        *bus_bsel = (*bsel_alloc)++;
+        object_property_add_uint32_ptr(OBJECT(bus), ACPI_PCIHP_PROP_BSEL,
+                                       bus_bsel, NULL);
    }
+
+    return bsel_alloc;
+}
+
+static void acpi_set_pci_info(void)
+{
+    PCIBus *bus = find_i440fx(); /* TODO: Q35 support */
+    unsigned bsel_alloc = 0;
+
+    if (bus) {
+        /* Scan all PCI buses. Set property to enable acpi based hotplug. */
+        pci_for_each_bus_depth_first(bus, acpi_set_bsel, NULL, &bsel_alloc);
+    }
+}
+
+static void build_pci_bus_state_init(AcpiBuildPciBusHotplugState *state,
+                                     AcpiBuildPciBusHotplugState *parent)
+{
+    state->parent = parent;
+    state->device_table = build_alloc_array();
+    state->notify_table = build_alloc_array();
+}
+
+static void build_pci_bus_state_cleanup(AcpiBuildPciBusHotplugState *state)
+{
+    build_free_array(state->device_table);
+    build_free_array(state->notify_table);
+}
+
+static void *build_pci_bus_begin(PCIBus *bus, void *parent_state)
+{
+    AcpiBuildPciBusHotplugState *parent = parent_state;
+    AcpiBuildPciBusHotplugState *child = g_malloc(sizeof *child);
+
+    build_pci_bus_state_init(child, parent);
+
+    return child;
+}
+
+static void build_pci_bus_end(PCIBus *bus, void *bus_state)
+{
+    AcpiBuildPciBusHotplugState *child = bus_state;
+    AcpiBuildPciBusHotplugState *parent = child->parent;
+    GArray *bus_table = build_alloc_array();
+    DECLARE_BITMAP(slot_hotplug_enable, PCI_SLOT_MAX);
+    uint8_t op;
+    int i;
+    QObject *bsel;
+    GArray *method;
+    bool bus_hotplug_support = false;
+
+    if (bus->parent_dev) {
+        op = 0x82; /* DeviceOp */
+        build_append_nameseg(bus_table, "S%.02X_",
+                             bus->parent_dev->devfn);
+        build_append_byte(bus_table, 0x08); /* NameOp */
+        build_append_nameseg(bus_table, "_SUN");
+        build_append_value(bus_table, PCI_SLOT(bus->parent_dev->devfn), 1);
+        build_append_byte(bus_table, 0x08); /* NameOp */
+        build_append_nameseg(bus_table, "_ADR");
+        build_append_value(bus_table, (PCI_SLOT(bus->parent_dev->devfn) << 16) |
+                           PCI_FUNC(bus->parent_dev->devfn), 4);
+    } else {
+        op = 0x10; /* ScopeOp */;
+        build_append_nameseg(bus_table, "PCI0");
+    }
+
+    bsel = object_property_get_qobject(OBJECT(bus), ACPI_PCIHP_PROP_BSEL, NULL);
+    if (bsel) {
+        build_append_byte(bus_table, 0x08); /* NameOp */
+        build_append_nameseg(bus_table, "BSEL");
+        build_append_int(bus_table, qint_get_int(qobject_to_qint(bsel)));
+
+        memset(slot_hotplug_enable, 0xff, sizeof slot_hotplug_enable);
+
+        for (i = 0; i < ARRAY_SIZE(bus->devices); ++i) {
+            PCIDeviceClass *pc;
+            PCIDevice *pdev = bus->devices[i];
+
+            if (!pdev) {
+                continue;
+            }
+
+            pc = PCI_DEVICE_GET_CLASS(pdev);
+
+            if (pc->no_hotplug || pc->is_bridge) {
+                int slot = PCI_SLOT(i);
+
+                clear_bit(slot, slot_hotplug_enable);
+            }
+        }
+
+        /* Append Device object for each slot which supports eject */
+        for (i = 0; i < PCI_SLOT_MAX; i++) {
+            bool can_eject = test_bit(i, slot_hotplug_enable);
+            if (can_eject) {
+                void *pcihp = acpi_data_push(bus_table,
+                                             ACPI_PCIHP_SIZEOF);
+                memcpy(pcihp, ACPI_PCIHP_AML, ACPI_PCIHP_SIZEOF);
+                patch_pcihp(i, pcihp);
+                bus_hotplug_support = true;
+            }
+        }
+
+        method = build_alloc_method("DVNT", 2);
+
+        for (i = 0; i < PCI_SLOT_MAX; i++) {
+            GArray *notify;
+            uint8_t op;
+
+            if (!test_bit(i, slot_hotplug_enable)) {
+                continue;
+            }
+
+            notify = build_alloc_array();
+            op = 0xA0; /* IfOp */
+
+            build_append_byte(notify, 0x7B); /* AndOp */
+            build_append_byte(notify, 0x68); /* Arg0Op */
+            build_append_int(notify, 0x1 << i);
+            build_append_byte(notify, 0x00); /* NullName */
+            build_append_byte(notify, 0x86); /* NotifyOp */
+            build_append_nameseg(notify, "S%.02X_", PCI_DEVFN(i, 0));
+            build_append_byte(notify, 0x69); /* Arg1Op */
+
+            /* Pack it up */
+            build_package(notify, op, 0);
+
+            build_append_array(method, notify);
+
+            build_free_array(notify);
+        }
+
+        build_append_and_cleanup_method(bus_table, method);
+    }
+
+    /* Append PCNT method to notify about events on local and child buses.
+     * Add unconditionally for root since DSDT expects it.
+     */
+    if (bus_hotplug_support || child->notify_table->len || !bus->parent_dev) {
+        method = build_alloc_method("PCNT", 0);
+
+        /* If bus supports hotplug select it and notify about local events */
+        if (bsel) {
+            build_append_byte(method, 0x70); /* StoreOp */
+            build_append_int(method, qint_get_int(qobject_to_qint(bsel)));
+            build_append_nameseg(method, "BNUM");
+            build_append_nameseg(method, "DVNT");
+            build_append_nameseg(method, "PCIU");
+            build_append_int(method, 1); /* Device Check */
+            build_append_nameseg(method, "DVNT");
+            build_append_nameseg(method, "PCID");
+            build_append_int(method, 3); /* Eject Request */
+        }
+
+        /* Notify about child bus events in any case */
+        build_append_array(method, child->notify_table);
+
+        build_append_and_cleanup_method(bus_table, method);
+
+        /* Append description of child buses */
+        build_append_array(bus_table, child->device_table);
+
+        /* Pack it up */
+        if (bus->parent_dev) {
+            build_extop_package(bus_table, op);
+        } else {
+            build_package(bus_table, op, 0);
+        }
+
+        /* Append our bus description to parent table */
+        build_append_array(parent->device_table, bus_table);
+
+        /* Also tell parent how to notify us, invoking PCNT method.
+         * At the moment this is not needed for root as we have a single root.
+         */
+        if (bus->parent_dev) {
+            build_append_byte(parent->notify_table, '^'); /* ParentPrefixChar */
+            build_append_byte(parent->notify_table, 0x2E); /* DualNamePrefix */
+            build_append_nameseg(parent->notify_table, "S%.02X_",
+                                 bus->parent_dev->devfn);
+            build_append_nameseg(parent->notify_table, "PCNT");
+        }
+    }
+
+    build_free_array(bus_table);
+    build_pci_bus_state_cleanup(child);
+    g_free(child);
 }

 static void patch_pci_windows(PcPciInfo *pci, uint8_t *start, unsigned size)
@@ -733,7 +951,7 @@ build_ssdt(GArray *table_data, GArray *linker,
         *   Method(NTFY, 2) {If (LEqual(Arg0, 0x00)) {Notify(CP00, Arg1)} ...}
         */
        /* Arg0 = Processor ID = APIC ID */
-        build_append_notify(sb_scope, "NTFY", "CP%0.02X", 0, acpi_cpus);
+        build_append_notify_method(sb_scope, "NTFY", "CP%0.02X", acpi_cpus);

        /* build "Name(CPON, Package() { One, One, ..., Zero, Zero, ... })" */
        build_append_byte(sb_scope, 0x08); /* NameOp */
@@ -755,24 +973,19 @@ build_ssdt(GArray *table_data, GArray *linker,
        }

        {
-            GArray *pci0 = build_alloc_array();
-            uint8_t op = 0x10; /* ScopeOp */;
+            AcpiBuildPciBusHotplugState hotplug_state;
+            PCIBus *bus = find_i440fx(); /* TODO: Q35 support */

-            build_append_nameseg(pci0, "PCI0");
+            build_pci_bus_state_init(&hotplug_state, NULL);

-            /* build Device object for each slot */
-            for (i = 1; i < PCI_SLOT_MAX; i++) {
-                bool eject = test_bit(i, misc->slot_hotplug_enable);
-                void *pcihp = acpi_data_push(pci0, ACPI_PCIHP_SIZEOF);
-
-                memcpy(pcihp, ACPI_PCIHP_AML, ACPI_PCIHP_SIZEOF);
-                patch_pcihp(i, pcihp, eject);
+            if (bus) {
+                /* Scan all PCI buses. Generate tables to support hotplug. */
+                pci_for_each_bus_depth_first(bus, build_pci_bus_begin,
+                                             build_pci_bus_end, &hotplug_state);
            }

-            build_append_notify(pci0, "PCNT", "S%0.02X_", 1, PCI_SLOT_MAX);
-            build_package(pci0, op, 3);
-            build_append_array(sb_scope, pci0);
-            build_free_array(pci0);
+            build_append_array(sb_scope, hotplug_state.device_table);
+            build_pci_bus_state_cleanup(&hotplug_state);
        }

        build_package(sb_scope, op, 3);
@@ -867,16 +1080,16 @@ build_srat(GArray *table_data, GArray *linker,
        next_base = mem_base + mem_len;

        /* Cut out the ACPI_PCI hole */
-        if (mem_base <= guest_info->ram_size &&
-            next_base > guest_info->ram_size) {
-            mem_len -= next_base - guest_info->ram_size;
+        if (mem_base <= guest_info->ram_size_below_4g &&
+            next_base > guest_info->ram_size_below_4g) {
+            mem_len -= next_base - guest_info->ram_size_below_4g;
            if (mem_len > 0) {
                numamem = acpi_data_push(table_data, sizeof *numamem);
                acpi_build_srat_memory(numamem, mem_base, mem_len, i-1, 1);
            }
            mem_base = 1ULL << 32;
-            mem_len = next_base - guest_info->ram_size;
-            next_base += (1ULL << 32) - guest_info->ram_size;
+            mem_len = next_base - guest_info->ram_size_below_4g;
+            next_base += (1ULL << 32) - guest_info->ram_size_below_4g;
        }
        numamem = acpi_data_push(table_data, sizeof *numamem);
        acpi_build_srat_memory(numamem, mem_base, mem_len, i - 1, 1);
@@ -1055,7 +1268,6 @@ void acpi_build(PcGuestInfo *guest_info, AcpiBuildTables *tables)
    acpi_get_cpu_info(&cpu);
    acpi_get_pm_info(&pm);
    acpi_get_dsdt(&misc);
-    acpi_get_hotplug_info(&misc);
    acpi_get_misc_info(&misc);
    acpi_get_pci_info(&pci);

@@ -1200,6 +1412,8 @@ void acpi_setup(PcGuestInfo *guest_info)

    build_state->guest_info = guest_info;

+    acpi_set_pci_info();
+
    acpi_build_tables_init(&tables);
    acpi_build(build_state->guest_info, &tables);

--- a/hw/i386/acpi-dsdt-cpu-hotplug.dsl
+++ b/hw/i386/acpi-dsdt-cpu-hotplug.dsl
@@ -16,6 +16,7 @@
 /****************************************************************
 * CPU hotplug
 ****************************************************************/
+#define CPU_HOTPLUG_RESOURCE_DEVICE PRES

 Scope(\_SB) {
    /* Objects filled in by run-time generated SSDT */
@@ -52,7 +53,8 @@ Scope(\_SB) {
        Sleep(200)
    }

-    OperationRegion(PRST, SystemIO, 0xaf00, 32)
+#define CPU_STATUS_LEN ACPI_GPE_PROC_LEN
+    OperationRegion(PRST, SystemIO, CPU_STATUS_BASE, CPU_STATUS_LEN)
    Field(PRST, ByteAcc, NoLock, Preserve) {
        PRS, 256
    }
@@ -89,4 +91,14 @@ Scope(\_SB) {
            Increment(Local0)
        }
    }
+
+    Device(CPU_HOTPLUG_RESOURCE_DEVICE) {
+        Name(_HID, "ACPI0004")
+
+        Name(_CRS, ResourceTemplate() {
+            IO(Decode16, CPU_STATUS_BASE, CPU_STATUS_BASE, 0, CPU_STATUS_LEN)
+        })
+
+        Name(_STA, 0xB) /* present, functioning, decoding, not shown in UI */
+    }
 }
--- a/hw/i386/acpi-dsdt-isa.dsl
+++ b/hw/i386/acpi-dsdt-isa.dsl
@@ -16,6 +16,17 @@
 /* Common legacy ISA style devices. */
 Scope(\_SB.PCI0.ISA) {

+    Device (SMC) {
+        Name(_HID, EisaId("APP0001"))
+        /* _STA will be patched to 0x0B if AppleSMC is present */
+        ACPI_EXTRACT_NAME_BYTE_CONST DSDT_APPLESMC_STA
+        Name(_STA, 0xF0)
+        Name(_CRS, ResourceTemplate () {
+            IO (Decode16, 0x0300, 0x0300, 0x01, 0x20)
+            IRQNoFlags() { 6 }
+        })
+    }
+
    Device(RTC) {
        Name(_HID, EisaId("PNP0B00"))
        Name(_CRS, ResourceTemplate() {
--- a/hw/i386/acpi-dsdt-pci-crs.dsl
+++ b/hw/i386/acpi-dsdt-pci-crs.dsl
@@ -30,20 +30,7 @@ Scope(\_SB.PCI0) {
            0x01,               // Address Alignment
            0x08,               // Address Length
            )
-        WordIO(ResourceProducer, MinFixed, MaxFixed, PosDecode, EntireRange,
-            0x0000,             // Address Space Granularity
-            0x0000,             // Address Range Minimum
-            0x0CF7,             // Address Range Maximum
-            0x0000,             // Address Translation Offset
-            0x0CF8,             // Address Length
-            ,, , TypeStatic)
-        WordIO(ResourceProducer, MinFixed, MaxFixed, PosDecode, EntireRange,
-            0x0000,             // Address Space Granularity
-            0x0D00,             // Address Range Minimum
-            0xFFFF,             // Address Range Maximum
-            0x0000,             // Address Translation Offset
-            0xF300,             // Address Length
-            ,, , TypeStatic)
+        BOARD_SPECIFIC_PCI_RESOURSES
        DWordMemory(ResourceProducer, PosDecode, MinFixed, MaxFixed, Cacheable, ReadWrite,
            0x00000000,         // Address Space Granularity
            0x000A0000,         // Address Range Minimum
--- a/hw/i386/acpi-dsdt.dsl
+++ b/hw/i386/acpi-dsdt.dsl
@@ -35,6 +35,45 @@ DefinitionBlock (
 /****************************************************************
 * PCI Bus definition
 ****************************************************************/
+#define BOARD_SPECIFIC_PCI_RESOURSES \
+     WordIO(ResourceProducer, MinFixed, MaxFixed, PosDecode, EntireRange, \
+         0x0000, \
+         0x0000, \
+         0x0CF7, \
+         0x0000, \
+         0x0CF8, \
+         ,, , TypeStatic) \
+     WordIO(ResourceProducer, MinFixed, MaxFixed, PosDecode, EntireRange, \
+         0x0000, \
+         0x0D00, \
+         0xADFF, \
+         0x0000, \
+         0xA100, \
+         ,, , TypeStatic) \
+     /* 0xae00-0xae0e hole for PCI hotplug, hw/acpi/piix4.c:PCI_HOTPLUG_ADDR */ \
+     WordIO(ResourceProducer, MinFixed, MaxFixed, PosDecode, EntireRange, \
+         0x0000, \
+         0xAE0F, \
+         0xAEFF, \
+         0x0000, \
+         0x00F1, \
+         ,, , TypeStatic) \
+     /* 0xaf00-0xaf1f hole for CPU hotplug, hw/acpi/piix4.c:PIIX4_PROC_BASE */ \
+     WordIO(ResourceProducer, MinFixed, MaxFixed, PosDecode, EntireRange, \
+         0x0000, \
+         0xAF20, \
+         0xAFDF, \
+         0x0000, \
+         0x00C0, \
+         ,, , TypeStatic) \
+     /* 0xafe0-0xafe3 hole for ACPI.GPE0, hw/acpi/piix4.c:GPE_BASE */ \
+     WordIO(ResourceProducer, MinFixed, MaxFixed, PosDecode, EntireRange, \
+         0x0000, \
+         0xAFE4, \
+         0xFFFF, \
+         0x0000, \
+         0x501C, \
+         ,, , TypeStatic)

    Scope(\_SB) {
        Device(PCI0) {
@@ -114,6 +153,7 @@ DefinitionBlock (
        }
    }

+#define DSDT_APPLESMC_STA piix_dsdt_applesmc_sta
 #include "acpi-dsdt-isa.dsl"


@@ -133,32 +173,28 @@ DefinitionBlock (
            B0EJ, 32,
        }

+        OperationRegion(BNMR, SystemIO, 0xae10, 0x04)
+        Field(BNMR, DWordAcc, NoLock, WriteAsZeros) {
+            BNUM, 32,
+        }
+
+        /* Lock to protect access to fields above. */
+        Mutex(BLCK, 0)
+
        /* Methods called by bulk generated PCI devices below */

        /* Methods called by hotplug devices */
-        Method(PCEJ, 1, NotSerialized) {
+        Method(PCEJ, 2, NotSerialized) {
            // _EJ0 method - eject callback
-            Store(ShiftLeft(1, Arg0), B0EJ)
+            Acquire(BLCK, 0xFFFF)
+            Store(Arg0, BNUM)
+            Store(ShiftLeft(1, Arg1), B0EJ)
+            Release(BLCK)
            Return (0x0)
        }

        /* Hotplug notification method supplied by SSDT */
        External(\_SB.PCI0.PCNT, MethodObj)
-
-        /* PCI hotplug notify method */
-        Method(PCNF, 0) {
-            // Local0 = iterator
-            Store(Zero, Local0)
-            While (LLess(Local0, 31)) {
-                Increment(Local0)
-                If (And(PCIU, ShiftLeft(1, Local0))) {
-                    PCNT(Local0, 1)
-                }
-                If (And(PCID, ShiftLeft(1, Local0))) {
-                    PCNT(Local0, 3)
-                }
-            }
-        }
    }


@@ -293,6 +329,8 @@ DefinitionBlock (
        }
    }

+#include "hw/acpi/cpu_hotplug_defs.h"
+#define CPU_STATUS_BASE PIIX4_CPU_HOTPLUG_IO_BASE
 #include "acpi-dsdt-cpu-hotplug.dsl"


@@ -307,7 +345,9 @@ DefinitionBlock (
        }
        Method(_E01) {
            // PCI hotplug event
-            \_SB.PCI0.PCNF()
+            Acquire(\_SB.PCI0.BLCK, 0xFFFF)
+            \_SB.PCI0.PCNT()
+            Release(\_SB.PCI0.BLCK)
        }
        Method(_E02) {
            // CPU hotplug event
--- a/hw/i386/acpi-dsdt.hex.generated
+++ b/hw/i386/acpi-dsdt.hex.generated
@@ -3,12 +3,12 @@ static unsigned char AcpiDsdtAmlCode[] = {
 0x53,
 0x44,
 0x54,
-0x37,
+0x87,
 0x11,
 0x0,
 0x0,
 0x1,
-0xd8,
+0xb8,
 0x42,
 0x58,
 0x50,
@@ -860,8 +860,8 @@ static unsigned char AcpiDsdtAmlCode[] = {
 0x4e,
 0x1,
 0x10,
-0x4c,
-0x1b,
+0x4b,
+0x1e,
 0x2f,
 0x3,
 0x5f,
@@ -879,6 +879,53 @@ static unsigned char AcpiDsdtAmlCode[] = {
 0x5b,
 0x82,
 0x2d,
+0x53,
+0x4d,
+0x43,
+0x5f,
+0x8,
+0x5f,
+0x48,
+0x49,
+0x44,
+0xc,
+0x6,
+0x10,
+0x0,
+0x1,
+0x8,
+0x5f,
+0x53,
+0x54,
+0x41,
+0xb,
+0x0,
+0xff,
+0x8,
+0x5f,
+0x43,
+0x52,
+0x53,
+0x11,
+0x10,
+0xa,
+0xd,
+0x47,
+0x1,
+0x0,
+0x3,
+0x0,
+0x3,
+0x1,
+0x20,
+0x22,
+0x40,
+0x0,
+0x79,
+0x0,
+0x5b,
+0x82,
+0x2d,
 0x52,
 0x54,
 0x43,
@@ -1305,7 +1352,7 @@ static unsigned char AcpiDsdtAmlCode[] = {
 0x79,
 0x0,
 0x10,
-0x4b,
+0x48,
 0x8,
 0x2e,
 0x5f,
@@ -1371,79 +1418,76 @@ static unsigned char AcpiDsdtAmlCode[] = {
 0x45,
 0x4a,
 0x20,
+0x5b,
+0x80,
+0x42,
+0x4e,
+0x4d,
+0x52,
+0x1,
+0xb,
+0x10,
+0xae,
+0xa,
+0x4,
+0x5b,
+0x81,
+0xb,
+0x42,
+0x4e,
+0x4d,
+0x52,
+0x43,
+0x42,
+0x4e,
+0x55,
+0x4d,
+0x20,
+0x5b,
+0x1,
+0x42,
+0x4c,
+0x43,
+0x4b,
+0x0,
 0x14,
-0x11,
+0x25,
 0x50,
 0x43,
 0x45,
 0x4a,
-0x1,
+0x2,
+0x5b,
+0x23,
+0x42,
+0x4c,
+0x43,
+0x4b,
+0xff,
+0xff,
+0x70,
+0x68,
+0x42,
+0x4e,
+0x55,
+0x4d,
 0x70,
 0x79,
 0x1,
-0x68,
+0x69,
 0x0,
 0x42,
 0x30,
 0x45,
 0x4a,
+0x5b,
+0x27,
+0x42,
+0x4c,
+0x43,
+0x4b,
 0xa4,
 0x0,
-0x14,
-0x36,
-0x50,
-0x43,
-0x4e,
-0x46,
-0x0,
-0x70,
-0x0,
-0x60,
-0xa2,
-0x2c,
-0x95,
-0x60,
-0xa,
-0x1f,
-0x75,
-0x60,
-0xa0,
-0x11,
-0x7b,
-0x50,
-0x43,
-0x49,
-0x55,
-0x79,
-0x1,
-0x60,
-0x0,
-0x0,
-0x50,
-0x43,
-0x4e,
-0x54,
-0x60,
-0x1,
-0xa0,
-0x12,
-0x7b,
-0x50,
-0x43,
-0x49,
-0x44,
-0x79,
-0x1,
-0x60,
-0x0,
-0x0,
-0x50,
-0x43,
-0x4e,
-0x54,
-0x60,
-0xa,
-0x3,
 0x10,
 0x4a,
 0xa0,
@@ -4248,8 +4292,8 @@ static unsigned char AcpiDsdtAmlCode[] = {
 0x75,
 0x60,
 0x10,
-0x4e,
-0x9,
+0x42,
+0xc,
 0x5f,
 0x47,
 0x50,
@@ -4277,12 +4321,31 @@ static unsigned char AcpiDsdtAmlCode[] = {
 0x30,
 0x0,
 0x14,
-0x15,
+0x39,
 0x5f,
 0x45,
 0x30,
 0x31,
 0x0,
+0x5b,
+0x23,
+0x5c,
+0x2f,
+0x3,
+0x5f,
+0x53,
+0x42,
+0x5f,
+0x50,
+0x43,
+0x49,
+0x30,
+0x42,
+0x4c,
+0x43,
+0x4b,
+0xff,
+0xff,
 0x5c,
 0x2f,
 0x3,
@@ -4297,7 +4360,24 @@ static unsigned char AcpiDsdtAmlCode[] = {
 0x50,
 0x43,
 0x4e,
-0x46,
+0x54,
+0x5b,
+0x27,
+0x5c,
+0x2f,
+0x3,
+0x5f,
+0x53,
+0x42,
+0x5f,
+0x50,
+0x43,
+0x49,
+0x30,
+0x42,
+0x4c,
+0x43,
+0x4b,
 0x14,
 0x10,
 0x5f,
@@ -4407,3 +4487,6 @@ static unsigned char AcpiDsdtAmlCode[] = {
 0x46,
 0x0
 };
+static unsigned short piix_dsdt_applesmc_sta[] = {
+0x384
+};
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1072,6 +1072,7 @@ PcGuestInfo *pc_guest_info_init(ram_addr_t below_4g_mem_size,
    PcGuestInfo *guest_info = &guest_info_state->info;
    int i, j;

+    guest_info->ram_size_below_4g = below_4g_mem_size;
    guest_info->ram_size = below_4g_mem_size + above_4g_mem_size;
    guest_info->apic_id_limit = pc_apic_id_limit(max_cpus);
    guest_info->apic_xrupt_override = kvm_allows_irq0_override();
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -393,6 +393,10 @@ static QEMUMachine pc_i440fx_machine_v1_7 = {
    PC_I440FX_1_7_MACHINE_OPTIONS,
    .name = "pc-i440fx-1.7",
    .init = pc_init_pci_1_7,
+    .compat_props = (GlobalProperty[]) {
+        PC_COMPAT_1_7,
+        { /* end of list */ }
+    },
 };

 #define PC_I440FX_1_6_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -51,6 +51,11 @@
 static bool has_pci_info;
 static bool has_acpi_build = true;
 static bool smbios_type1_defaults = true;
+/* Make sure that guest addresses aligned at 1Gbyte boundaries get mapped to
+ * host addresses aligned at 1Gbyte boundaries.  This way we can use 1GByte
+ * pages in the host.
+ */
+static bool gigabyte_align = true;

 /* PC hardware initialisation */
 static void pc_q35_init(QEMUMachineInitArgs *args)
@@ -92,9 +97,19 @@ static void pc_q35_init(QEMUMachineInitArgs *args)

    kvmclock_create();

+    /* Check whether RAM fits below 4G (leaving 1/2 GByte for IO memory
+     * and 256 Mbytes for PCI Express Enhanced Configuration Access Mapping
+     * also known as MMCFG).
+     * If it doesn't, we need to split it in chunks below and above 4G.
+     * In any case, try to make sure that guest addresses aligned at
+     * 1G boundaries get mapped to host addresses aligned at 1G boundaries.
+     * For old machine types, use whatever split we used historically to avoid
+     * breaking migration.
+     */
    if (args->ram_size >= 0xb0000000) {
-        above_4g_mem_size = args->ram_size - 0xb0000000;
-        below_4g_mem_size = 0xb0000000;
+        ram_addr_t lowmem = gigabyte_align ? 0x80000000 : 0xb0000000;
+        above_4g_mem_size = args->ram_size - lowmem;
+        below_4g_mem_size = lowmem;
    } else {
        above_4g_mem_size = 0;
        below_4g_mem_size = args->ram_size;
@@ -228,6 +243,7 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
 static void pc_compat_1_7(QEMUMachineInitArgs *args)
 {
    smbios_type1_defaults = false;
+    gigabyte_align = false;
 }

 static void pc_compat_1_6(QEMUMachineInitArgs *args)
--- a/hw/i386/q35-acpi-dsdt.dsl
+++ b/hw/i386/q35-acpi-dsdt.dsl
@@ -48,6 +48,22 @@ DefinitionBlock (
 /****************************************************************
 * PCI Bus definition
 ****************************************************************/
+#define BOARD_SPECIFIC_PCI_RESOURSES \
+     WordIO(ResourceProducer, MinFixed, MaxFixed, PosDecode, EntireRange, \
+         0x0000, \
+         0x0000, \
+         0x0CD7, \
+         0x0000, \
+         0x0CD8, \
+         ,, , TypeStatic) \
+     /* 0xcd8-0xcf7 hole for CPU hotplug, hw/acpi/ich9.c:ICH9_PROC_BASE */ \
+     WordIO(ResourceProducer, MinFixed, MaxFixed, PosDecode, EntireRange, \
+         0x0000, \
+         0x0D00, \
+         0xFFFF, \
+         0x0000, \
+         0xF300, \
+         ,, , TypeStatic)

    Scope(\_SB) {
        Device(PCI0) {
@@ -171,6 +187,7 @@ DefinitionBlock (
        }
    }

+#define DSDT_APPLESMC_STA q35_dsdt_applesmc_sta
 #include "acpi-dsdt-isa.dsl"


@@ -404,6 +421,8 @@ DefinitionBlock (
        define_gsi_link(GSIH, 0, 0x17)
    }

+#include "hw/acpi/cpu_hotplug_defs.h"
+#define CPU_STATUS_BASE ICH9_CPU_HOTPLUG_IO_BASE
 #include "acpi-dsdt-cpu-hotplug.dsl"


--- a/hw/i386/q35-acpi-dsdt.hex.generated
+++ b/hw/i386/q35-acpi-dsdt.hex.generated
@@ -3,12 +3,12 @@ static unsigned char Q35AcpiDsdtAmlCode[] = {
 0x53,
 0x44,
 0x54,
-0xb0,
+0xdf,
 0x1c,
 0x0,
 0x0,
 0x1,
-0xfe,
+0xff,
 0x42,
 0x58,
 0x50,
@@ -1033,8 +1033,8 @@ static unsigned char Q35AcpiDsdtAmlCode[] = {
 0x4e,
 0x1,
 0x10,
-0x4c,
-0x1b,
+0x4b,
+0x1e,
 0x2f,
 0x3,
 0x5f,
@@ -1052,6 +1052,53 @@ static unsigned char Q35AcpiDsdtAmlCode[] = {
 0x5b,
 0x82,
 0x2d,
+0x53,
+0x4d,
+0x43,
+0x5f,
+0x8,
+0x5f,
+0x48,
+0x49,
+0x44,
+0xc,
+0x6,
+0x10,
+0x0,
+0x1,
+0x8,
+0x5f,
+0x53,
+0x54,
+0x41,
+0xb,
+0x0,
+0xff,
+0x8,
+0x5f,
+0x43,
+0x52,
+0x53,
+0x11,
+0x10,
+0xa,
+0xd,
+0x47,
+0x1,
+0x0,
+0x3,
+0x0,
+0x3,
+0x1,
+0x20,
+0x22,
+0x40,
+0x0,
+0x79,
+0x0,
+0x5b,
+0x82,
+0x2d,
 0x52,
 0x54,
 0x43,
@@ -7229,12 +7276,19 @@ static unsigned char Q35AcpiDsdtAmlCode[] = {
 0x30,
 0x0,
 0x14,
-0x10,
+0x6,
 0x5f,
 0x4c,
 0x30,
 0x31,
 0x0,
+0x14,
+0x10,
+0x5f,
+0x45,
+0x30,
+0x32,
+0x0,
 0x5c,
 0x2e,
 0x5f,
@@ -7250,13 +7304,6 @@ static unsigned char Q35AcpiDsdtAmlCode[] = {
 0x5f,
 0x4c,
 0x30,
-0x32,
-0x0,
-0x14,
-0x6,
-0x5f,
-0x4c,
-0x30,
 0x33,
 0x0,
 0x14,
@@ -7344,3 +7391,6 @@ static unsigned char Q35AcpiDsdtAmlCode[] = {
 0x46,
 0x0
 };
+static unsigned short q35_dsdt_applesmc_sta[] = {
+0x431
+};
--- a/hw/i386/ssdt-pcihp.dsl
+++ b/hw/i386/ssdt-pcihp.dsl
@@ -25,6 +25,7 @@ DefinitionBlock ("ssdt-pcihp.aml", "SSDT", 0x01, "BXPC", "BXSSDTPCIHP", 0x1)
    /* Objects supplied by DSDT */
    External(\_SB.PCI0, DeviceObj)
    External(\_SB.PCI0.PCEJ, MethodObj)
+    External(BSEL, IntObj)

    Scope(\_SB.PCI0) {

@@ -33,19 +34,17 @@ DefinitionBlock ("ssdt-pcihp.aml", "SSDT", 0x01, "BXPC", "BXSSDTPCIHP", 0x1)
        ACPI_EXTRACT_DEVICE_END ssdt_pcihp_end
        ACPI_EXTRACT_DEVICE_STRING ssdt_pcihp_name

-        // Method _EJ0 can be patched by BIOS to EJ0_
-        // at runtime, if the slot is detected to not support hotplug.
-        // Extract the offset of the address dword and the
-        // _EJ0 name to allow this patching.
+        // Extract the offsets of the device name, address dword and the slot
+        // name byte - we fill them in for each device.
        Device(SAA) {
            ACPI_EXTRACT_NAME_BYTE_CONST ssdt_pcihp_id
            Name(_SUN, 0xAA)
            ACPI_EXTRACT_NAME_DWORD_CONST ssdt_pcihp_adr
            Name(_ADR, 0xAA0000)
-            ACPI_EXTRACT_METHOD_STRING ssdt_pcihp_ej0
            Method(_EJ0, 1) {
-                Return (PCEJ(_SUN))
+                PCEJ(BSEL, _SUN)
            }
        }
+
    }
 }
--- a/hw/i386/ssdt-pcihp.hex.generated
+++ b/hw/i386/ssdt-pcihp.hex.generated
@@ -5,19 +5,19 @@ static unsigned char ssdt_pcihp_adr[] = {
 0x44
 };
 static unsigned char ssdt_pcihp_end[] = {
-0x58
+0x5b
 };
 static unsigned char ssdp_pcihp_aml[] = {
 0x53,
 0x53,
 0x44,
 0x54,
-0x58,
+0x5b,
 0x0,
 0x0,
 0x0,
 0x1,
-0x76,
+0xe8,
 0x42,
 0x58,
 0x50,
@@ -45,7 +45,7 @@ static unsigned char ssdp_pcihp_aml[] = {
 0x13,
 0x20,
 0x10,
-0x33,
+0x36,
 0x5c,
 0x2e,
 0x5f,
@@ -58,7 +58,7 @@ static unsigned char ssdp_pcihp_aml[] = {
 0x30,
 0x5b,
 0x82,
-0x26,
+0x29,
 0x53,
 0x41,
 0x41,
@@ -81,17 +81,20 @@ static unsigned char ssdp_pcihp_aml[] = {
 0xaa,
 0x0,
 0x14,
-0xf,
+0x12,
 0x5f,
 0x45,
 0x4a,
 0x30,
 0x1,
-0xa4,
 0x50,
 0x43,
 0x45,
 0x4a,
+0x42,
+0x53,
+0x45,
+0x4c,
 0x5f,
 0x53,
 0x55,
@@ -103,6 +106,3 @@ static unsigned char ssdt_pcihp_start[] = {
 static unsigned char ssdt_pcihp_id[] = {
 0x3d
 };
-static unsigned char ssdt_pcihp_ej0[] = {
-0x4a
-};
--- a/hw/i386/ssdt-proc.hex.generated
+++ b/hw/i386/ssdt-proc.hex.generated
@@ -11,7 +11,7 @@ static unsigned char ssdp_proc_aml[] = {
 0x0,
 0x0,
 0x1,
-0xb8,
+0x78,
 0x42,
 0x58,
 0x50,
@@ -47,8 +47,8 @@ static unsigned char ssdp_proc_aml[] = {
 0x41,
 0x41,
 0xaa,
-0x10,
-0xb0,
+0x0,
+0x0,
 0x0,
 0x0,
 0x0,
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -2103,7 +2103,7 @@ int ide_init_drive(IDEState *s, BlockDriverState *bs, IDEDriveKind kind,
    s->smart_selftest_count = 0;
    if (kind == IDE_CD) {
        bdrv_set_dev_ops(bs, &ide_cd_block_ops, s);
-        bdrv_set_buffer_alignment(bs, 2048);
+        bdrv_set_guest_block_size(bs, 2048);
    } else {
        if (!bdrv_is_inserted(s->bs)) {
            error_report("Device needs media, but drive is empty");
--- a/hw/misc/applesmc.c
+++ b/hw/misc/applesmc.c
@@ -66,7 +66,6 @@ struct AppleSMCData {
    QLIST_ENTRY(AppleSMCData) node;
 };

-#define TYPE_APPLE_SMC "isa-applesmc"
 #define APPLE_SMC(obj) OBJECT_CHECK(AppleSMCState, (obj), TYPE_APPLE_SMC)

 typedef struct AppleSMCState AppleSMCState;
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -321,7 +321,7 @@ void vhost_net_ack_features(struct vhost_net *net, unsigned features)

 bool vhost_net_virtqueue_pending(VHostNetState *net, int idx)
 {
-    return -ENOSYS;
+    return false;
 }

 void vhost_net_virtqueue_mask(VHostNetState *net, VirtIODevice *dev,
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -793,6 +793,15 @@ static void pci_config_free(PCIDevice *pci_dev)
    g_free(pci_dev->used);
 }

+static void do_pci_unregister_device(PCIDevice *pci_dev)
+{
+    pci_dev->bus->devices[pci_dev->devfn] = NULL;
+    pci_config_free(pci_dev);
+
+    address_space_destroy(&pci_dev->bus_master_as);
+    memory_region_destroy(&pci_dev->bus_master_enable_region);
+}
+
 /* -1 for devfn means auto assign */
 static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus,
                                         const char *name, int devfn)
@@ -858,7 +867,7 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus,
        pci_init_mask_bridge(pci_dev);
    }
    if (pci_init_multifunction(bus, pci_dev)) {
-        pci_config_free(pci_dev);
+        do_pci_unregister_device(pci_dev);
        return NULL;
    }

@@ -873,15 +882,6 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus,
    return pci_dev;
 }

-static void do_pci_unregister_device(PCIDevice *pci_dev)
-{
-    pci_dev->bus->devices[pci_dev->devfn] = NULL;
-    pci_config_free(pci_dev);
-
-    address_space_destroy(&pci_dev->bus_master_as);
-    memory_region_destroy(&pci_dev->bus_master_enable_region);
-}
-
 static void pci_unregister_io_regions(PCIDevice *pci_dev)
 {
    PCIIORegion *r;
@@ -1704,6 +1704,34 @@ static PCIBus *pci_find_bus_nr(PCIBus *bus, int bus_num)
    return NULL;
 }

+void pci_for_each_bus_depth_first(PCIBus *bus,
+                                  void *(*begin)(PCIBus *bus, void *parent_state),
+                                  void (*end)(PCIBus *bus, void *state),
+                                  void *parent_state)
+{
+    PCIBus *sec;
+    void *state;
+
+    if (!bus) {
+        return;
+    }
+
+    if (begin) {
+        state = begin(bus, parent_state);
+    } else {
+        state = parent_state;
+    }
+
+    QLIST_FOREACH(sec, &bus->child, sibling) {
+        pci_for_each_bus_depth_first(sec, begin, end, state);
+    }
+
+    if (end) {
+        end(bus, state);
+    }
+}
+
+
 PCIDevice *pci_find_device(PCIBus *bus, int bus_num, uint8_t devfn)
 {
    bus = pci_find_bus_nr(bus, bus_num);
--- a/hw/scsi/scsi-bus.c
+++ b/hw/scsi/scsi-bus.c
@@ -469,6 +469,8 @@ static int32_t scsi_target_send_command(SCSIRequest *req, uint8_t *buf)
            r->req.dev->sense_is_ua = false;
        }
        break;
+    case TEST_UNIT_READY:
+        break;
    default:
        scsi_req_build_sense(req, SENSE_CODE(LUN_NOT_SUPPORTED));
        scsi_req_complete(req, CHECK_CONDITION);
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -2254,7 +2254,7 @@ static int scsi_initfn(SCSIDevice *dev)
    } else {
        bdrv_set_dev_ops(s->qdev.conf.bs, &scsi_disk_block_ops, s);
    }
-    bdrv_set_buffer_alignment(s->qdev.conf.bs, s->qdev.blocksize);
+    bdrv_set_guest_block_size(s->qdev.conf.bs, s->qdev.blocksize);

    bdrv_iostatus_enable(s->qdev.conf.bs);
    add_boot_device_path(s->qdev.conf.bootindex, &dev->qdev, NULL);
@@ -2306,6 +2306,7 @@ static const SCSIReqOps scsi_disk_emulate_reqops = {
    .send_command = scsi_disk_emulate_command,
    .read_data    = scsi_disk_emulate_read_data,
    .write_data   = scsi_disk_emulate_write_data,
+    .cancel_io    = scsi_cancel_io,
    .get_buf      = scsi_get_buf,
 };

--- a/hw/scsi/scsi-generic.c
+++ b/hw/scsi/scsi-generic.c
@@ -210,7 +210,7 @@ static void scsi_read_complete(void * opaque, int ret)
            s->blocksize = ldl_be_p(&r->buf[8]);
            s->max_lba = ldq_be_p(&r->buf[0]);
        }
-        bdrv_set_buffer_alignment(s->conf.bs, s->blocksize);
+        bdrv_set_guest_block_size(s->conf.bs, s->blocksize);

        scsi_req_data(&r->req, len);
        if (!r->req.io_canceled) {
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -306,6 +306,10 @@ static void virtio_scsi_command_complete(SCSIRequest *r, uint32_t status,
    VirtIOSCSIReq *req = r->hba_private;
    uint32_t sense_len;

+    if (r->io_canceled) {
+        return;
+    }
+
    req->resp.cmd->response = VIRTIO_SCSI_S_OK;
    req->resp.cmd->status = status;
    if (req->resp.cmd->status == GOOD) {
@@ -516,7 +520,7 @@ static void virtio_scsi_push_event(VirtIOSCSI *s, SCSIDevice *dev,
    evt->event = event;
    evt->reason = reason;
    if (!dev) {
-        assert(event == VIRTIO_SCSI_T_NO_EVENT);
+        assert(event == VIRTIO_SCSI_T_EVENTS_MISSED);
    } else {
        evt->lun[0] = 1;
        evt->lun[1] = dev->id;
--- a/hw/usb/Makefile.objs
+++ b/hw/usb/Makefile.objs
@@ -1,5 +1,5 @@
 # usb subsystem core
-common-obj-y += core.o combined-packet.o bus.o desc.o
+common-obj-y += core.o combined-packet.o bus.o desc.o desc-msos.o
 common-obj-y += libhw.o

 # usb host adapters
--- a/hw/usb/bus.c
+++ b/hw/usb/bus.c
@@ -16,6 +16,8 @@ static Property usb_props[] = {
    DEFINE_PROP_STRING("serial", USBDevice, serial),
    DEFINE_PROP_BIT("full-path", USBDevice, flags,
                    USB_DEV_FLAG_FULL_PATH, true),
+    DEFINE_PROP_BIT("msos-desc", USBDevice, flags,
+                    USB_DEV_FLAG_MSOS_DESC_ENABLE, true),
    DEFINE_PROP_END_OF_LIST()
 };

--- a/hw/usb/desc-msos.c
+++ b/hw/usb/desc-msos.c
@@ -0,0 +1,234 @@
+#include "hw/usb.h"
+#include "hw/usb/desc.h"
+
+/*
+ * Microsoft OS Descriptors
+ *
+ * Windows tries to fetch some special descriptors with informations
+ * specifically for windows.  Presence is indicated using a special
+ * string @ index 0xee.  There are two kinds of descriptors:
+ *
+ * compatid descriptor
+ *   Used to bind drivers, if usb class isn't specific enougth.
+ *   Used for PTP/MTP for example (both share the same usb class).
+ *
+ * properties descriptor
+ *   Does carry registry entries.  They show up in
+ *   HLM\SYSTEM\CurrentControlSet\Enum\USB\<devid>\<serial>\Device Parameters
+ *
+ * Note that Windows caches the stuff it got in the registry, so when
+ * playing with this you have to delete registry subtrees to make
+ * windows query the device again:
+ *   HLM\SYSTEM\CurrentControlSet\Control\usbflags
+ *   HLM\SYSTEM\CurrentControlSet\Enum\USB
+ * Windows will complain it can't delete entries on the second one.
+ * It has deleted everything it had permissions too, which is enouth
+ * as this includes "Device Parameters".
+ *
+ * http://msdn.microsoft.com/en-us/library/windows/hardware/ff537430.aspx
+ *
+ */
+
+/* ------------------------------------------------------------------ */
+
+typedef struct msos_compat_hdr {
+    uint32_t dwLength;
+    uint8_t  bcdVersion_lo;
+    uint8_t  bcdVersion_hi;
+    uint8_t  wIndex_lo;
+    uint8_t  wIndex_hi;
+    uint8_t  bCount;
+    uint8_t  reserved[7];
+} QEMU_PACKED msos_compat_hdr;
+
+typedef struct msos_compat_func {
+    uint8_t  bFirstInterfaceNumber;
+    uint8_t  reserved_1;
+    uint8_t  compatibleId[8];
+    uint8_t  subCompatibleId[8];
+    uint8_t  reserved_2[6];
+} QEMU_PACKED msos_compat_func;
+
+static int usb_desc_msos_compat(const USBDesc *desc, uint8_t *dest)
+{
+    msos_compat_hdr *hdr = (void *)dest;
+    msos_compat_func *func;
+    int length = sizeof(*hdr);
+    int count = 0;
+
+    func = (void *)(dest + length);
+    func->bFirstInterfaceNumber = 0;
+    func->reserved_1 = 0x01;
+    length += sizeof(*func);
+    count++;
+
+    hdr->dwLength      = cpu_to_le32(length);
+    hdr->bcdVersion_lo = 0x00;
+    hdr->bcdVersion_hi = 0x01;
+    hdr->wIndex_lo     = 0x04;
+    hdr->wIndex_hi     = 0x00;
+    hdr->bCount        = count;
+    return length;
+}
+
+/* ------------------------------------------------------------------ */
+
+typedef struct msos_prop_hdr {
+    uint32_t dwLength;
+    uint8_t  bcdVersion_lo;
+    uint8_t  bcdVersion_hi;
+    uint8_t  wIndex_lo;
+    uint8_t  wIndex_hi;
+    uint8_t  wCount_lo;
+    uint8_t  wCount_hi;
+} QEMU_PACKED msos_prop_hdr;
+
+typedef struct msos_prop {
+    uint32_t dwLength;
+    uint32_t dwPropertyDataType;
+    uint8_t  dwPropertyNameLength_lo;
+    uint8_t  dwPropertyNameLength_hi;
+    uint8_t  bPropertyName[];
+} QEMU_PACKED msos_prop;
+
+typedef struct msos_prop_data {
+    uint32_t dwPropertyDataLength;
+    uint8_t  bPropertyData[];
+} QEMU_PACKED msos_prop_data;
+
+typedef enum msos_prop_type {
+    MSOS_REG_SZ        = 1,
+    MSOS_REG_EXPAND_SZ = 2,
+    MSOS_REG_BINARY    = 3,
+    MSOS_REG_DWORD_LE  = 4,
+    MSOS_REG_DWORD_BE  = 5,
+    MSOS_REG_LINK      = 6,
+    MSOS_REG_MULTI_SZ  = 7,
+} msos_prop_type;
+
+static int usb_desc_msos_prop_name(struct msos_prop *prop,
+                                   const wchar_t *name)
+{
+    int length = wcslen(name) + 1;
+    int i;
+
+    prop->dwPropertyNameLength_lo = usb_lo(length*2);
+    prop->dwPropertyNameLength_hi = usb_hi(length*2);
+    for (i = 0; i < length; i++) {
+        prop->bPropertyName[i*2]   = usb_lo(name[i]);
+        prop->bPropertyName[i*2+1] = usb_hi(name[i]);
+    }
+    return length*2;
+}
+
+static int usb_desc_msos_prop_str(uint8_t *dest, msos_prop_type type,
+                                  const wchar_t *name, const wchar_t *value)
+{
+    struct msos_prop *prop = (void *)dest;
+    struct msos_prop_data *data;
+    int length = sizeof(*prop);
+    int i, vlen = wcslen(value) + 1;
+
+    prop->dwPropertyDataType = cpu_to_le32(type);
+    length += usb_desc_msos_prop_name(prop, name);
+    data = (void *)(dest + length);
+
+    data->dwPropertyDataLength = cpu_to_le32(vlen*2);
+    length += sizeof(*prop);
+
+    for (i = 0; i < vlen; i++) {
+        data->bPropertyData[i*2]   = usb_lo(value[i]);
+        data->bPropertyData[i*2+1] = usb_hi(value[i]);
+    }
+    length += vlen*2;
+
+    prop->dwLength = cpu_to_le32(length);
+    return length;
+}
+
+static int usb_desc_msos_prop_dword(uint8_t *dest, const wchar_t *name,
+                                    uint32_t value)
+{
+    struct msos_prop *prop = (void *)dest;
+    struct msos_prop_data *data;
+    int length = sizeof(*prop);
+
+    prop->dwPropertyDataType = cpu_to_le32(MSOS_REG_DWORD_LE);
+    length += usb_desc_msos_prop_name(prop, name);
+    data = (void *)(dest + length);
+
+    data->dwPropertyDataLength = cpu_to_le32(4);
+    data->bPropertyData[0] = (value)       & 0xff;
+    data->bPropertyData[1] = (value >>  8) & 0xff;
+    data->bPropertyData[2] = (value >> 16) & 0xff;
+    data->bPropertyData[3] = (value >> 24) & 0xff;
+    length += sizeof(*prop) + 4;
+
+    prop->dwLength = cpu_to_le32(length);
+    return length;
+}
+
+static int usb_desc_msos_prop(const USBDesc *desc, uint8_t *dest)
+{
+    msos_prop_hdr *hdr = (void *)dest;
+    int length = sizeof(*hdr);
+    int count = 0;
+
+    if (desc->msos->Label) {
+        /*
+         * Given as example in the specs.  Havn't figured yet where
+         * this label shows up in the windows gui.
+         */
+        length += usb_desc_msos_prop_str(dest+length, MSOS_REG_SZ,
+                                         L"Label", desc->msos->Label);
+        count++;
+    }
+
+    if (desc->msos->SelectiveSuspendEnabled) {
+        /*
+         * Signaling remote wakeup capability in the standard usb
+         * descriptors isn't enouth to make windows actually use it.
+         * This is the "Yes, we really mean it" registy entry to flip
+         * the switch in the windows drivers.
+         */
+        length += usb_desc_msos_prop_dword(dest+length,
+                                           L"SelectiveSuspendEnabled", 1);
+        count++;
+    }
+
+    hdr->dwLength      = cpu_to_le32(length);
+    hdr->bcdVersion_lo = 0x00;
+    hdr->bcdVersion_hi = 0x01;
+    hdr->wIndex_lo     = 0x05;
+    hdr->wIndex_hi     = 0x00;
+    hdr->wCount_lo     = usb_lo(count);
+    hdr->wCount_hi     = usb_hi(count);
+    return length;
+}
+
+/* ------------------------------------------------------------------ */
+
+int usb_desc_msos(const USBDesc *desc,  USBPacket *p,
+                  int index, uint8_t *dest, size_t len)
+{
+    void *buf = g_malloc0(4096);
+    int length = 0;
+
+    switch (index) {
+    case 0x0004:
+        length = usb_desc_msos_compat(desc, buf);
+        break;
+    case 0x0005:
+        length = usb_desc_msos_prop(desc, buf);
+        break;
+    }
+
+    if (length > len) {
+        length = len;
+    }
+    memcpy(dest, buf, length);
+    free(buf);
+
+    p->actual_length = length;
+    return 0;
+}
--- a/hw/usb/desc.c
+++ b/hw/usb/desc.c
@@ -7,7 +7,7 @@
 /* ------------------------------------------------------------------ */

 int usb_desc_device(const USBDescID *id, const USBDescDevice *dev,
-                    uint8_t *dest, size_t len)
+                    bool msos, uint8_t *dest, size_t len)
 {
    uint8_t bLength = 0x12;
    USBDescriptor *d = (void *)dest;
@@ -19,8 +19,18 @@ int usb_desc_device(const USBDescID *id, const USBDescDevice *dev,
    d->bLength                     = bLength;
    d->bDescriptorType             = USB_DT_DEVICE;

-    d->u.device.bcdUSB_lo          = usb_lo(dev->bcdUSB);
-    d->u.device.bcdUSB_hi          = usb_hi(dev->bcdUSB);
+    if (msos && dev->bcdUSB < 0x0200) {
+        /*
+         * Version 2.0+ required for microsoft os descriptors to work.
+         * Done this way so msos-desc compat property will handle both
+         * the version and the new descriptors being present.
+         */
+        d->u.device.bcdUSB_lo          = usb_lo(0x0200);
+        d->u.device.bcdUSB_hi          = usb_hi(0x0200);
+    } else {
+        d->u.device.bcdUSB_lo          = usb_lo(dev->bcdUSB);
+        d->u.device.bcdUSB_hi          = usb_hi(dev->bcdUSB);
+    }
    d->u.device.bDeviceClass       = dev->bDeviceClass;
    d->u.device.bDeviceSubClass    = dev->bDeviceSubClass;
    d->u.device.bDeviceProtocol    = dev->bDeviceProtocol;
@@ -499,6 +509,10 @@ void usb_desc_init(USBDevice *dev)
    if (desc->super) {
        dev->speedmask |= USB_SPEED_MASK_SUPER;
    }
+    if (desc->msos && (dev->flags & (1 << USB_DEV_FLAG_MSOS_DESC_ENABLE))) {
+        dev->flags |= (1 << USB_DEV_FLAG_MSOS_DESC_IN_USE);
+        usb_desc_set_string(dev, 0xee, "MSFT100Q");
+    }
    usb_desc_setdefaults(dev);
 }

@@ -626,6 +640,7 @@ int usb_desc_string(USBDevice *dev, int index, uint8_t *dest, size_t len)
 int usb_desc_get_descriptor(USBDevice *dev, USBPacket *p,
                            int value, uint8_t *dest, size_t len)
 {
+    bool msos = (dev->flags & (1 << USB_DEV_FLAG_MSOS_DESC_IN_USE));
    const USBDesc *desc = usb_device_get_usb_desc(dev);
    const USBDescDevice *other_dev;
    uint8_t buf[256];
@@ -646,7 +661,7 @@ int usb_desc_get_descriptor(USBDevice *dev, USBPacket *p,

    switch(type) {
    case USB_DT_DEVICE:
-        ret = usb_desc_device(&desc->id, dev->device, buf, sizeof(buf));
+        ret = usb_desc_device(&desc->id, dev->device, msos, buf, sizeof(buf));
        trace_usb_desc_device(dev->addr, len, ret);
        break;
    case USB_DT_CONFIG:
@@ -703,6 +718,7 @@ int usb_desc_get_descriptor(USBDevice *dev, USBPacket *p,
 int usb_desc_handle_control(USBDevice *dev, USBPacket *p,
        int request, int value, int index, int length, uint8_t *data)
 {
+    bool msos = (dev->flags & (1 << USB_DEV_FLAG_MSOS_DESC_IN_USE));
    const USBDesc *desc = usb_device_get_usb_desc(dev);
    int ret = -1;

@@ -782,6 +798,19 @@ int usb_desc_handle_control(USBDevice *dev, USBPacket *p,
        trace_usb_set_interface(dev->addr, index, value, ret);
        break;

+    case VendorDeviceRequest | 'Q':
+        if (msos) {
+            ret = usb_desc_msos(desc, p, index, data, length);
+            trace_usb_desc_msos(dev->addr, index, length, ret);
+        }
+        break;
+    case VendorInterfaceRequest | 'Q':
+        if (msos) {
+            ret = usb_desc_msos(desc, p, index, data, length);
+            trace_usb_desc_msos(dev->addr, index, length, ret);
+        }
+        break;
+
    }
    return ret;
 }
--- a/hw/usb/desc.h
+++ b/hw/usb/desc.h
@@ -2,6 +2,7 @@
 #define QEMU_HW_USB_DESC_H

 #include <inttypes.h>
+#include <wchar.h>

 /* binary representation */
 typedef struct USBDescriptor {
@@ -182,6 +183,11 @@ struct USBDescOther {
    const uint8_t             *data;
 };

+struct USBDescMSOS {
+    const wchar_t             *Label;
+    bool                      SelectiveSuspendEnabled;
+};
+
 typedef const char *USBDescStrings[256];

 struct USBDesc {
@@ -190,6 +196,7 @@ struct USBDesc {
    const USBDescDevice       *high;
    const USBDescDevice       *super;
    const char* const         *str;
+    const USBDescMSOS         *msos;
 };

 #define USB_DESC_FLAG_SUPER (1 << 1)
@@ -207,7 +214,7 @@ static inline uint8_t usb_hi(uint16_t val)

 /* generate usb packages from structs */
 int usb_desc_device(const USBDescID *id, const USBDescDevice *dev,
-                    uint8_t *dest, size_t len);
+                    bool msos, uint8_t *dest, size_t len);
 int usb_desc_device_qualifier(const USBDescDevice *dev,
                              uint8_t *dest, size_t len);
 int usb_desc_config(const USBDescConfig *conf, int flags,
@@ -219,6 +226,8 @@ int usb_desc_iface(const USBDescIface *iface, int flags,
 int usb_desc_endpoint(const USBDescEndpoint *ep, int flags,
                      uint8_t *dest, size_t len);
 int usb_desc_other(const USBDescOther *desc, uint8_t *dest, size_t len);
+int usb_desc_msos(const USBDesc *desc, USBPacket *p,
+                  int index, uint8_t *dest, size_t len);

 /* control message emulation helpers */
 void usb_desc_init(USBDevice *dev);
--- a/hw/usb/dev-hid.c
+++ b/hw/usb/dev-hid.c
@@ -261,6 +261,10 @@ static const USBDescDevice desc_device_keyboard = {
    },
 };

+static const USBDescMSOS desc_msos_suspend = {
+    .SelectiveSuspendEnabled = true,
+};
+
 static const USBDesc desc_mouse = {
    .id = {
        .idVendor          = 0x0627,
@@ -272,6 +276,7 @@ static const USBDesc desc_mouse = {
    },
    .full = &desc_device_mouse,
    .str  = desc_strings,
+    .msos = &desc_msos_suspend,
 };

 static const USBDesc desc_tablet = {
@@ -285,6 +290,7 @@ static const USBDesc desc_tablet = {
    },
    .full = &desc_device_tablet,
    .str  = desc_strings,
+    .msos = &desc_msos_suspend,
 };

 static const USBDesc desc_tablet2 = {
@@ -299,6 +305,7 @@ static const USBDesc desc_tablet2 = {
    .full = &desc_device_tablet,
    .high = &desc_device_tablet2,
    .str  = desc_strings,
+    .msos = &desc_msos_suspend,
 };

 static const USBDesc desc_keyboard = {
@@ -312,6 +319,7 @@ static const USBDesc desc_keyboard = {
    },
    .full = &desc_device_keyboard,
    .str  = desc_strings,
+    .msos = &desc_msos_suspend,
 };

 static const uint8_t qemu_mouse_hid_report_descriptor[] = {
--- a/hw/virtio/dataplane/vring.c
+++ b/hw/virtio/dataplane/vring.c
@@ -376,7 +376,7 @@ int vring_pop(VirtIODevice *vdev, Vring *vring,
        barrier();

        if (desc.flags & VRING_DESC_F_INDIRECT) {
-            int ret = get_indirect(vring, elem, &desc);
+            ret = get_indirect(vring, elem, &desc);
            if (ret < 0) {
                goto out;
            }
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -184,7 +184,11 @@ void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top);
 int bdrv_parse_cache_flags(const char *mode, int *flags);
 int bdrv_parse_discard_flags(const char *mode, int *flags);
 int bdrv_file_open(BlockDriverState **pbs, const char *filename,
-                   QDict *options, int flags, Error **errp);
+                   const char *reference, QDict *options, int flags,
+                   Error **errp);
+int bdrv_open_image(BlockDriverState **pbs, const char *filename,
+                    QDict *options, const char *bdref_key, int flags,
+                    bool force_raw, bool allow_none, Error **errp);
 int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp);
 int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
              int flags, BlockDriver *drv, Error **errp);
@@ -220,7 +224,6 @@ BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, int64_t sector_num
                                        int nb_sectors, BdrvRequestFlags flags,
                                        BlockDriverCompletionFunc *cb, void *opaque);
 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags);
-int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov);
 int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count);
 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
@@ -249,6 +252,7 @@ int bdrv_truncate(BlockDriverState *bs, int64_t offset);
 int64_t bdrv_getlength(BlockDriverState *bs);
 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs);
 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr);
+int bdrv_refresh_limits(BlockDriverState *bs);
 int bdrv_commit(BlockDriverState *bs);
 int bdrv_commit_all(void);
 int bdrv_change_backing_file(BlockDriverState *bs,
@@ -283,16 +287,16 @@ int bdrv_amend_options(BlockDriverState *bs_new, QEMUOptionParameter *options);
 /* external snapshots */

 typedef enum {
-    EXT_SNAPSHOT_ALLOWED,
-    EXT_SNAPSHOT_FORBIDDEN,
-} ExtSnapshotPerm;
+    BS_IS_A_FILTER,
+    BS_FILTER_PASS_DOWN,
+    BS_AUTHORIZATION_COUNT,
+} BsAuthorization;

-/* return EXT_SNAPSHOT_ALLOWED if external snapshot is allowed
- * return EXT_SNAPSHOT_FORBIDDEN if external snapshot is forbidden
- */
-ExtSnapshotPerm bdrv_check_ext_snapshot(BlockDriverState *bs);
-/* helper used to forbid external snapshots like in blkverify */
-ExtSnapshotPerm bdrv_check_ext_snapshot_forbidden(BlockDriverState *bs);
+bool bdrv_generic_is_first_non_filter(BlockDriverState *bs,
+                                      BlockDriverState *candidate);
+bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
+                                      BlockDriverState *candidate);
+bool bdrv_is_first_non_filter(BlockDriverState *candidate);

 /* async block I/O */
 typedef void BlockDriverDirtyHandler(BlockDriverState *bs, int64_t sector,
@@ -374,6 +378,11 @@ void bdrv_lock_medium(BlockDriverState *bs, bool locked);
 void bdrv_eject(BlockDriverState *bs, bool eject_flag);
 const char *bdrv_get_format_name(BlockDriverState *bs);
 BlockDriverState *bdrv_find(const char *name);
+BlockDriverState *bdrv_find_node(const char *node_name);
+BlockDeviceInfoList *bdrv_named_nodes_list(void);
+BlockDriverState *bdrv_lookup_bs(const char *device,
+                                 const char *node_name,
+                                 Error **errp);
 BlockDriverState *bdrv_next(BlockDriverState *bs);
 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs),
                  void *opaque);
@@ -418,7 +427,10 @@ void bdrv_img_create(const char *filename, const char *fmt,
                     char *options, uint64_t img_size, int flags,
                     Error **errp, bool quiet);

-void bdrv_set_buffer_alignment(BlockDriverState *bs, int align);
+/* Returns the alignment in bytes that is required so that no bounce buffer
+ * is required throughout the stack */
+size_t bdrv_opt_mem_align(BlockDriverState *bs);
+void bdrv_set_guest_block_size(BlockDriverState *bs, int align);
 void *qemu_blockalign(BlockDriverState *bs, size_t size);
 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov);

@@ -515,6 +527,14 @@ typedef enum {
    BLKDBG_FLUSH_TO_OS,
    BLKDBG_FLUSH_TO_DISK,

+    BLKDBG_PWRITEV_RMW_HEAD,
+    BLKDBG_PWRITEV_RMW_AFTER_HEAD,
+    BLKDBG_PWRITEV_RMW_TAIL,
+    BLKDBG_PWRITEV_RMW_AFTER_TAIL,
+    BLKDBG_PWRITEV,
+    BLKDBG_PWRITEV_ZERO,
+    BLKDBG_PWRITEV_DONE,
+
    BLKDBG_EVENT_MAX,
 } BlkDebugEvent;

--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -57,22 +57,35 @@

 typedef struct BdrvTrackedRequest {
    BlockDriverState *bs;
-    int64_t sector_num;
-    int nb_sectors;
+    int64_t offset;
+    unsigned int bytes;
    bool is_write;
+
+    bool serialising;
+    int64_t overlap_offset;
+    unsigned int overlap_bytes;
+
    QLIST_ENTRY(BdrvTrackedRequest) list;
    Coroutine *co; /* owner, used for deadlock detection */
    CoQueue wait_queue; /* coroutines blocked on this request */
+
+    struct BdrvTrackedRequest *waiting_for;
 } BdrvTrackedRequest;

 struct BlockDriver {
    const char *format_name;
    int instance_size;

-    /* if not defined external snapshots are allowed
-     * future block filters will query their children to build the response
+    /* this table of boolean contains authorizations for the block operations */
+    bool authorizations[BS_AUTHORIZATION_COUNT];
+    /* for snapshots complex block filter like Quorum can implement the
+     * following recursive callback instead of BS_IS_A_FILTER.
+     * It's purpose is to recurse on the filter children while calling
+     * bdrv_recurse_is_first_non_filter on them.
+     * For a sample implementation look in the future Quorum block filter.
     */
-    ExtSnapshotPerm (*bdrv_check_ext_snapshot)(BlockDriverState *bs);
+    bool (*bdrv_recurse_is_first_non_filter)(BlockDriverState *bs,
+                                             BlockDriverState *candidate);

    int (*bdrv_probe)(const uint8_t *buf, int buf_size, const char *filename);
    int (*bdrv_probe_device)(const char *filename);
@@ -226,6 +239,8 @@ struct BlockDriver {
    int (*bdrv_debug_resume)(BlockDriverState *bs, const char *tag);
    bool (*bdrv_debug_is_suspended)(BlockDriverState *bs, const char *tag);

+    int (*bdrv_refresh_limits)(BlockDriverState *bs);
+
    /*
     * Returns 1 if newly created images are guaranteed to contain only
     * zeros, 0 otherwise.
@@ -250,6 +265,9 @@ typedef struct BlockLimits {

    /* optimal transfer length in sectors */
    int opt_transfer_length;
+
+    /* memory alignment so that no bounce buffer is needed */
+    size_t opt_mem_alignment;
 } BlockLimits;

 /*
@@ -291,8 +309,8 @@ struct BlockDriverState {
    /* Callback before write request is processed */
    NotifierWithReturnList before_write_notifiers;

-    /* number of in-flight copy-on-read requests */
-    unsigned int copy_on_read_in_flight;
+    /* number of in-flight serialising requests */
+    unsigned int serialising_in_flight;

    /* I/O throttling */
    ThrottleState throttle_state;
@@ -314,8 +332,11 @@ struct BlockDriverState {
    /* Whether produces zeros when read beyond eof */
    bool zero_beyond_eof;

-    /* the memory alignment required for the buffers handled by this driver */
-    int buffer_alignment;
+    /* Alignment requirement for offset/length of I/O requests */
+    unsigned int request_alignment;
+
+    /* the block size for which the guest device expects atomicity */
+    int guest_block_size;

    /* do we need to tell the quest if we have a volatile write cache? */
    int enable_write_cache;
@@ -325,11 +346,18 @@ struct BlockDriverState {
    BlockdevOnError on_read_error, on_write_error;
    bool iostatus_enabled;
    BlockDeviceIoStatus iostatus;
+
+    /* the following member gives a name to every node on the bs graph. */
+    char node_name[32];
+    /* element of the list of named nodes building the graph */
+    QTAILQ_ENTRY(BlockDriverState) node_list;
+    /* Device name is the name associated with the "drive" the guest sees */
    char device_name[32];
+    /* element of the list of "drives" the guest sees */
+    QTAILQ_ENTRY(BlockDriverState) device_list;
    QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps;
    int refcnt;
    int in_use; /* users other than guest access, eg. block migration */
-    QTAILQ_ENTRY(BlockDriverState) list;

    QLIST_HEAD(, BdrvTrackedRequest) tracked_requests;

--- a/include/block/qapi.h
+++ b/include/block/qapi.h
@@ -29,6 +29,7 @@
 #include "block/block.h"
 #include "block/snapshot.h"

+BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs);
 int bdrv_query_snapshot_info_list(BlockDriverState *bs,
                                  SnapshotInfoList **p_list,
                                  Error **errp);
--- a/include/hw/acpi/cpu_hotplug.h
+++ b/include/hw/acpi/cpu_hotplug.h
@@ -0,0 +1,27 @@
+/*
+ * QEMU ACPI hotplug utilities
+ *
+ * Copyright (C) 2013 Red Hat Inc
+ *
+ * Authors:
+ *   Igor Mammedov <imammedo@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#ifndef ACPI_HOTPLUG_H
+#define ACPI_HOTPLUG_H
+
+#include "hw/acpi/acpi.h"
+#include "hw/acpi/cpu_hotplug_defs.h"
+
+typedef struct AcpiCpuHotplug {
+    MemoryRegion io;
+    uint8_t sts[ACPI_GPE_PROC_LEN];
+} AcpiCpuHotplug;
+
+void AcpiCpuHotplug_add(ACPIGPE *gpe, AcpiCpuHotplug *g, CPUState *cpu);
+
+void AcpiCpuHotplug_init(MemoryRegion *parent, Object *owner,
+                         AcpiCpuHotplug *gpe_cpu, uint16_t base);
+#endif
--- a/include/hw/acpi/cpu_hotplug_defs.h
+++ b/include/hw/acpi/cpu_hotplug_defs.h
@@ -0,0 +1,24 @@
+/*
+ * QEMU ACPI hotplug utilities shared defines
+ *
+ * Copyright (C) 2013 Red Hat Inc
+ *
+ * Authors:
+ *   Igor Mammedov <imammedo@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#ifndef ACPI_HOTPLUG_DEFS_H
+#define ACPI_HOTPLUG_DEFS_H
+
+/*
+ * ONLY DEFINEs are permited in this file since it's shared
+ * between C and ASL code.
+ */
+#define ACPI_CPU_HOTPLUG_STATUS 4
+#define ACPI_GPE_PROC_LEN 32
+#define ICH9_CPU_HOTPLUG_IO_BASE 0x0CD8
+#define PIIX4_CPU_HOTPLUG_IO_BASE 0xaf00
+
+#endif
--- a/include/hw/acpi/ich9.h
+++ b/include/hw/acpi/ich9.h
@@ -22,6 +22,7 @@
 #define HW_ACPI_ICH9_H

 #include "hw/acpi/acpi.h"
+#include "hw/acpi/cpu_hotplug.h"

 typedef struct ICH9LPCPMRegs {
    /*
@@ -42,6 +43,9 @@ typedef struct ICH9LPCPMRegs {

    uint32_t pm_io_base;
    Notifier powerdown_notifier;
+
+    AcpiCpuHotplug gpe_cpu;
+    Notifier cpu_added_notifier;
 } ICH9LPCPMRegs;

 void ich9_pm_init(PCIDevice *lpc_pci, ICH9LPCPMRegs *pm,
--- a/include/hw/acpi/pcihp.h
+++ b/include/hw/acpi/pcihp.h
@@ -0,0 +1,72 @@
+/*
+ * QEMU<->ACPI BIOS PCI hotplug interface
+ *
+ * QEMU supports PCI hotplug via ACPI. This module
+ * implements the interface between QEMU and the ACPI BIOS.
+ * Interface specification - see docs/specs/acpi_pci_hotplug.txt
+ *
+ * Copyright (c) 2013, Red Hat Inc, Michael S. Tsirkin (mst@redhat.com)
+ * Copyright (c) 2006 Fabrice Bellard
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#ifndef HW_ACPI_PCIHP_H
+#define HW_ACPI_PCIHP_H
+
+#include <inttypes.h>
+#include <qemu/typedefs.h>
+#include "hw/pci/pci.h" /* for PCIHotplugState */
+
+typedef struct AcpiPciHpPciStatus {
+    uint32_t up; /* deprecated, maintained for migration compatibility */
+    uint32_t down;
+    uint32_t hotplug_enable;
+    uint32_t device_present;
+} AcpiPciHpPciStatus;
+
+#define ACPI_PCIHP_PROP_BSEL "acpi-pcihp-bsel"
+#define ACPI_PCIHP_MAX_HOTPLUG_BUS 256
+
+typedef struct AcpiPciHpState {
+    AcpiPciHpPciStatus acpi_pcihp_pci_status[ACPI_PCIHP_MAX_HOTPLUG_BUS];
+    uint32_t hotplug_select;
+    PCIBus *root;
+    MemoryRegion io;
+} AcpiPciHpState;
+
+void acpi_pcihp_init(AcpiPciHpState *, PCIBus *root,
+                     MemoryRegion *address_space_io);
+
+/* Invoke on device hotplug */
+int acpi_pcihp_device_hotplug(AcpiPciHpState *, PCIDevice *,
+                              PCIHotplugState state);
+
+/* Called on reset */
+void acpi_pcihp_reset(AcpiPciHpState *s);
+
+extern const VMStateDescription vmstate_acpi_pcihp_pci_status;
+
+#define VMSTATE_PCI_HOTPLUG(pcihp, state, test_pcihp) \
+        VMSTATE_UINT32_TEST(pcihp.hotplug_select, state, \
+                            test_pcihp), \
+        VMSTATE_STRUCT_ARRAY_TEST(pcihp.acpi_pcihp_pci_status, state, \
+                                  ACPI_PCIHP_MAX_HOTPLUG_BUS, \
+                                  test_pcihp, 1, \
+                                  vmstate_acpi_pcihp_pci_status, \
+                                  AcpiPciHpPciStatus)
+
+#endif
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -35,7 +35,7 @@ typedef struct PcPciInfo {
 struct PcGuestInfo {
    bool has_pci_info;
    bool isapc_ram_fw;
-    hwaddr ram_size;
+    hwaddr ram_size, ram_size_below_4g;
    unsigned apic_id_limit;
    bool apic_xrupt_override;
    uint64_t numa_nodes;
@@ -241,6 +241,7 @@ uint16_t pvpanic_port(void);
 int e820_add_entry(uint64_t, uint64_t, uint32_t);

 #define PC_Q35_COMPAT_1_7 \
+        PC_COMPAT_1_7, \
        {\
            .driver   = "hpet",\
            .property = HPET_INTCAP,\
@@ -259,7 +260,20 @@ int e820_add_entry(uint64_t, uint64_t, uint32_t);
        PC_COMPAT_1_4, \
        PC_Q35_COMPAT_1_5

+#define PC_COMPAT_1_7 \
+        {\
+            .driver   = TYPE_USB_DEVICE,\
+            .property = "msos-desc",\
+            .value    = "no",\
+        },\
+        {\
+            .driver   = "PIIX4_PM",\
+            .property = "acpi-pci-hotplug-with-bridge-support",\
+            .value    = "off",\
+        }
+
 #define PC_COMPAT_1_6 \
+        PC_COMPAT_1_7, \
        {\
            .driver   = "e1000",\
            .property = "mitigation",\
--- a/include/hw/isa/isa.h
+++ b/include/hw/isa/isa.h
@@ -20,6 +20,13 @@
 #define TYPE_ISA_BUS "ISA"
 #define ISA_BUS(obj) OBJECT_CHECK(ISABus, (obj), TYPE_ISA_BUS)

+#define TYPE_APPLE_SMC "isa-applesmc"
+
+static inline bool applesmc_find(void)
+{
+    return object_resolve_path_type("", TYPE_APPLE_SMC, NULL);
+}
+
 typedef struct ISADeviceClass {
    DeviceClass parent_class;
 } ISADeviceClass;
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -387,6 +387,20 @@ int pci_bus_num(PCIBus *s);
 void pci_for_each_device(PCIBus *bus, int bus_num,
                         void (*fn)(PCIBus *bus, PCIDevice *d, void *opaque),
                         void *opaque);
+void pci_for_each_bus_depth_first(PCIBus *bus,
+                                  void *(*begin)(PCIBus *bus, void *parent_state),
+                                  void (*end)(PCIBus *bus, void *state),
+                                  void *parent_state);
+
+/* Use this wrapper when specific scan order is not required. */
+static inline
+void pci_for_each_bus(PCIBus *bus,
+                      void (*fn)(PCIBus *bus, void *opaque),
+                      void *opaque)
+{
+    pci_for_each_bus_depth_first(bus, NULL, fn, opaque);
+}
+
 PCIBus *pci_find_primary_bus(void);
 PCIBus *pci_device_root_bus(const PCIDevice *d);
 const char *pci_root_bus_path(PCIDevice *dev);
--- a/include/hw/usb.h
+++ b/include/hw/usb.h
@@ -182,6 +182,7 @@ typedef struct USBDescIface USBDescIface;
 typedef struct USBDescEndpoint USBDescEndpoint;
 typedef struct USBDescOther USBDescOther;
 typedef struct USBDescString USBDescString;
+typedef struct USBDescMSOS USBDescMSOS;

 struct USBDescString {
    uint8_t index;
@@ -208,6 +209,8 @@ struct USBEndpoint {
 enum USBDeviceFlags {
    USB_DEV_FLAG_FULL_PATH,
    USB_DEV_FLAG_IS_HOST,
+    USB_DEV_FLAG_MSOS_DESC_ENABLE,
+    USB_DEV_FLAG_MSOS_DESC_IN_USE,
 };

 /* definition of a USB device */
--- a/include/monitor/monitor.h
+++ b/include/monitor/monitor.h
@@ -5,7 +5,7 @@
 #include "qapi/qmp/qerror.h"
 #include "qapi/qmp/qdict.h"
 #include "block/block.h"
-#include "monitor/readline.h"
+#include "qemu/readline.h"

 extern Monitor *cur_mon;
 extern Monitor *default_mon;
--- a/include/qapi/qmp/qdict.h
+++ b/include/qapi/qmp/qdict.h
@@ -68,5 +68,6 @@ QDict *qdict_clone_shallow(const QDict *src);
 void qdict_flatten(QDict *qdict);

 void qdict_extract_subqdict(QDict *src, QDict **dst, const char *start);
+void qdict_array_split(QDict *src, QList **dst);

 #endif /* QDICT_H */
--- a/include/qemu-io.h
+++ b/include/qemu-io.h
@@ -42,5 +42,8 @@ bool qemuio_command(BlockDriverState *bs, const char *cmd);

 void qemuio_add_command(const cmdinfo_t *ci);
 int qemuio_command_usage(const cmdinfo_t *ci);
+void qemuio_complete_command(const char *input,
+                             void (*fn)(const char *cmd, void *opaque),
+                             void *opaque);

 #endif /* QEMU_IO_H */
--- a/include/qemu/config-file.h
+++ b/include/qemu/config-file.h
@@ -4,6 +4,7 @@
 #include <stdio.h>
 #include "qemu/option.h"
 #include "qapi/error.h"
+#include "qapi/qmp/qdict.h"

 QemuOptsList *qemu_find_opts(const char *group);
 QemuOptsList *qemu_find_opts_err(const char *group, Error **errp);
@@ -18,6 +19,11 @@ int qemu_config_parse(FILE *fp, QemuOptsList **lists, const char *fname);

 int qemu_read_config_file(const char *filename);

+/* Parse QDict options as a replacement for a config file (allowing multiple
+   enumerated (0..(n-1)) configuration "sections") */
+void qemu_config_parse_qdict(QDict *options, QemuOptsList **lists,
+                             Error **errp);
+
 /* Read default QEMU config files
 */
 int qemu_read_default_config_files(bool userconfig);
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -240,4 +240,6 @@ static inline void qemu_init_auxval(char **envp) { }
 void qemu_init_auxval(char **envp);
 #endif

+void qemu_set_tty_echo(int fd, bool echo);
+
 #endif
--- a/include/monitor/readline.h
+++ b/include/monitor/readline.h
@@ -1,14 +1,15 @@
 #ifndef READLINE_H
 #define READLINE_H

-#include "qemu-common.h"
-
 #define READLINE_CMD_BUF_SIZE 4095
 #define READLINE_MAX_CMDS 64
 #define READLINE_MAX_COMPLETIONS 256

-typedef void ReadLineFunc(Monitor *mon, const char *str, void *opaque);
-typedef void ReadLineCompletionFunc(Monitor *mon,
+typedef void ReadLinePrintfFunc(void *opaque, const char *fmt, ...);
+typedef void ReadLineFlushFunc(void *opaque);
+typedef void ReadLineFunc(void *opaque, const char *str,
+                          void *readline_opaque);
+typedef void ReadLineCompletionFunc(void *opaque,
                                    const char *cmdline);

 typedef struct ReadLineState {
@@ -35,7 +36,10 @@ typedef struct ReadLineState {
    void *readline_opaque;
    int read_password;
    char prompt[256];
-    Monitor *mon;
+
+    ReadLinePrintfFunc *printf_func;
+    ReadLineFlushFunc *flush_func;
+    void *opaque;
 } ReadLineState;

 void readline_add_completion(ReadLineState *rs, const char *str);
@@ -46,11 +50,13 @@ const char *readline_get_history(ReadLineState *rs, unsigned int index);
 void readline_handle_byte(ReadLineState *rs, int ch);

 void readline_start(ReadLineState *rs, const char *prompt, int read_password,
-                    ReadLineFunc *readline_func, void *opaque);
+                    ReadLineFunc *readline_func, void *readline_opaque);
 void readline_restart(ReadLineState *rs);
 void readline_show_prompt(ReadLineState *rs);

-ReadLineState *readline_init(Monitor *mon,
+ReadLineState *readline_init(ReadLinePrintfFunc *printf_func,
+                             ReadLineFlushFunc *flush_func,
+                             void *opaque,
                             ReadLineCompletionFunc *completion_finder);

 #endif /* !READLINE_H */
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -499,7 +499,7 @@ int kvm_check_extension(KVMState *s, unsigned int extension)
    return ret;
 }

-static int kvm_set_ioeventfd_mmio(int fd, uint32_t addr, uint32_t val,
+static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val,
                                  bool assign, uint32_t size, bool datamatch)
 {
    int ret;
@@ -1422,16 +1422,22 @@ int kvm_init(void)
        nc++;
    }

-    s->vmfd = kvm_ioctl(s, KVM_CREATE_VM, 0);
-    if (s->vmfd < 0) {
+    do {
+        ret = kvm_ioctl(s, KVM_CREATE_VM, 0);
+    } while (ret == -EINTR);
+
+    if (ret < 0) {
+        fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -s->vmfd,
+                strerror(-ret));
+
 #ifdef TARGET_S390X
        fprintf(stderr, "Please add the 'switch_amode' kernel parameter to "
                        "your host kernel command line\n");
 #endif
-        ret = s->vmfd;
        goto err;
    }

+    s->vmfd = ret;
    missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
    if (!missing_cap) {
        missing_cap =
--- a/monitor.c
+++ b/monitor.c
@@ -37,7 +37,7 @@
 #include "ui/qemu-spice.h"
 #include "sysemu/sysemu.h"
 #include "monitor/monitor.h"
-#include "monitor/readline.h"
+#include "qemu/readline.h"
 #include "ui/console.h"
 #include "sysemu/blockdev.h"
 #include "audio/audio.h"
@@ -217,8 +217,8 @@ static const mon_cmd_t qmp_cmds[];
 Monitor *cur_mon;
 Monitor *default_mon;

-static void monitor_command_cb(Monitor *mon, const char *cmdline,
-                               void *opaque);
+static void monitor_command_cb(void *opaque, const char *cmdline,
+                               void *readline_opaque);

 static inline int qmp_cmd_mode(const Monitor *mon)
 {
@@ -4338,9 +4338,10 @@ static void monitor_find_completion_by_table(Monitor *mon,
    }
 }

-static void monitor_find_completion(Monitor *mon,
+static void monitor_find_completion(void *opaque,
                                    const char *cmdline)
 {
+    Monitor *mon = opaque;
    char *args[MAX_ARGS];
    int nb_args, len;

@@ -4751,8 +4752,11 @@ static void monitor_read(void *opaque, const uint8_t *buf, int size)
    cur_mon = old_mon;
 }

-static void monitor_command_cb(Monitor *mon, const char *cmdline, void *opaque)
+static void monitor_command_cb(void *opaque, const char *cmdline,
+                               void *readline_opaque)
 {
+    Monitor *mon = opaque;
+
    monitor_suspend(mon);
    handle_user_command(mon, cmdline);
    monitor_resume(mon);
@@ -4881,6 +4885,22 @@ static void sortcmdlist(void)
 * End:
 */

+/* These functions just adapt the readline interface in a typesafe way.  We
+ * could cast function pointers but that discards compiler checks.
+ */
+static void monitor_readline_printf(void *opaque, const char *fmt, ...)
+{
+    va_list ap;
+    va_start(ap, fmt);
+    monitor_vprintf(opaque, fmt, ap);
+    va_end(ap);
+}
+
+static void monitor_readline_flush(void *opaque)
+{
+    monitor_flush(opaque);
+}
+
 void monitor_init(CharDriverState *chr, int flags)
 {
    static int is_first_init = 1;
@@ -4898,7 +4918,10 @@ void monitor_init(CharDriverState *chr, int flags)
    mon->chr = chr;
    mon->flags = flags;
    if (flags & MONITOR_USE_READLINE) {
-        mon->rs = readline_init(mon, monitor_find_completion);
+        mon->rs = readline_init(monitor_readline_printf,
+                                monitor_readline_flush,
+                                mon,
+                                monitor_find_completion);
        monitor_read_command(mon, 0);
    }

@@ -4920,9 +4943,11 @@ void monitor_init(CharDriverState *chr, int flags)
        default_mon = mon;
 }

-static void bdrv_password_cb(Monitor *mon, const char *password, void *opaque)
+static void bdrv_password_cb(void *opaque, const char *password,
+                             void *readline_opaque)
 {
-    BlockDriverState *bs = opaque;
+    Monitor *mon = opaque;
+    BlockDriverState *bs = readline_opaque;
    int ret = 0;

    if (bdrv_set_key(bs, password) != 0) {
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -810,6 +810,8 @@
 #
 # @file: the filename of the backing device
 #
+# @node-name: #optional the name of the block driver node (Since 2.0)
+#
 # @ro: true if the backing device was open read-only
 #
 # @drv: the name of the block format used to open the backing device. As of
@@ -857,10 +859,9 @@
 #
 # Since: 0.14.0
 #
-# Notes: This interface is only found in @BlockInfo.
 ##
 { 'type': 'BlockDeviceInfo',
-  'data': { 'file': 'str', 'ro': 'bool', 'drv': 'str',
+  'data': { 'file': 'str', '*node-name': 'str', 'ro': 'bool', 'drv': 'str',
            '*backing_file': 'str', 'backing_file_depth': 'int',
            'encrypted': 'bool', 'encryption_key_missing': 'bool',
            'bps': 'int', 'bps_rd': 'int', 'bps_wr': 'int',
@@ -1022,15 +1023,17 @@
 #
 # @stats:  A @BlockDeviceStats for the device.
 #
-# @parent: #optional This may point to the backing block device if this is a
-#          a virtual block device.  If it's a backing block, this will point
-#          to the backing file is one is present.
+# @parent: #optional This describes the file block device if it has one.
+#
+# @backing: #optional This describes the backing block device if it has one.
+#           (Since 2.0)
 #
 # Since: 0.14.0
 ##
 { 'type': 'BlockStats',
  'data': {'*device': 'str', 'stats': 'BlockDeviceStats',
-           '*parent': 'BlockStats'} }
+           '*parent': 'BlockStats',
+           '*backing': 'BlockStats'} }

 ##
 # @query-blockstats:
@@ -1675,7 +1678,11 @@
 # determine which ones are encrypted, set the passwords with this command, and
 # then start the guest with the @cont command.
 #
-# @device:   the name of the device to set the password on
+# Either @device or @node-name must be set but not both.
+#
+# @device: #optional the name of the block backend device to set the password on
+#
+# @node-name: #optional graph node name to set the password on (Since 2.0)
 #
 # @password: the password to use for the device
 #
@@ -1689,7 +1696,8 @@
 #
 # Since: 0.14.0
 ##
-{ 'command': 'block_passwd', 'data': {'device': 'str', 'password': 'str'} }
+{ 'command': 'block_passwd', 'data': {'*device': 'str',
+                                      '*node-name': 'str', 'password': 'str'} }

 ##
 # @balloon:
@@ -1716,7 +1724,11 @@
 #
 # Resize a block image while a guest is running.
 #
-# @device:  the name of the device to get the image resized
+# Either @device or @node-name must be set but not both.
+#
+# @device: #optional the name of the device to get the image resized
+#
+# @node-name: #optional graph node name to get the image resized (Since 2.0)
 #
 # @size:  new image size in bytes
 #
@@ -1725,7 +1737,9 @@
 #
 # Since: 0.14.0
 ##
-{ 'command': 'block_resize', 'data': { 'device': 'str', 'size': 'int' }}
+{ 'command': 'block_resize', 'data': { '*device': 'str',
+                                       '*node-name': 'str',
+                                       'size': 'int' }}

 ##
 # @NewImageMode
@@ -1747,18 +1761,25 @@
 ##
 # @BlockdevSnapshot
 #
-# @device:  the name of the device to generate the snapshot from.
+# Either @device or @node-name must be set but not both.
+#
+# @device: #optional the name of the device to generate the snapshot from.
+#
+# @node-name: #optional graph node name to generate the snapshot from (Since 2.0)
 #
 # @snapshot-file: the target of the new image. A new file will be created.
 #
+# @snapshot-node-name: #optional the graph node name of the new image (Since 2.0)
+#
 # @format: #optional the format of the snapshot image, default is 'qcow2'.
 #
 # @mode: #optional whether and how QEMU should create a new image, default is
 #        'absolute-paths'.
 ##
 { 'type': 'BlockdevSnapshot',
-  'data': { 'device': 'str', 'snapshot-file': 'str', '*format': 'str',
-            '*mode': 'NewImageMode' } }
+  'data': { '*device': 'str', '*node-name': 'str',
+            'snapshot-file': 'str', '*snapshot-node-name': 'str',
+            '*format': 'str', '*mode': 'NewImageMode' } }

 ##
 # @BlockdevSnapshotInternal
@@ -1973,6 +1994,13 @@
 #                    user needs to complete the job with the block-job-complete
 #                    command after getting the ready event. (Since 2.0)
 #
+#                    If the base image is smaller than top, then the base image
+#                    will be resized to be the same size as top.  If top is
+#                    smaller than the base image, the base will not be
+#                    truncated.  If you want the base image size to match the
+#                    size of the smaller top, you can safely truncate it
+#                    yourself once the commit operation successfully completes.
+#
 #
 # @speed:  #optional the maximum speed, in bytes per second
 #
@@ -2008,6 +2036,17 @@
 ##
 { 'command': 'drive-backup', 'data': 'DriveBackup' }

+##
+# @query-named-block-nodes
+#
+# Get the named block driver list
+#
+# Returns: the list of BlockDeviceInfo
+#
+# Since 2.0
+##
+{ 'command': 'query-named-block-nodes', 'returns': [ 'BlockDeviceInfo' ] }
+
 ##
 # @drive-mirror
 #
@@ -4090,6 +4129,7 @@
 # @id:          #optional id by which the new block device can be referred to.
 #               This is a required option on the top level of blockdev-add, and
 #               currently not allowed on any other level.
+# @node-name:   #optional the name of a block driver state node (Since 2.0)
 # @discard:     #optional discard-related options (default: ignore)
 # @cache:       #optional cache-related options
 # @aio:         #optional AIO backend (default: threads)
@@ -4105,6 +4145,7 @@
 { 'type': 'BlockdevOptionsBase',
  'data': { 'driver': 'str',
            '*id': 'str',
+            '*node-name': 'str',
            '*discard': 'BlockdevDiscardOptions',
            '*cache': 'BlockdevCacheOptions',
            '*aio': 'BlockdevAioOptions',
@@ -4200,6 +4241,116 @@
            '*pass-discard-snapshot': 'bool',
            '*pass-discard-other': 'bool' } }

+##
+# @BlkdebugEvent
+#
+# Trigger events supported by blkdebug.
+##
+{ 'enum': 'BlkdebugEvent',
+  'data': [ 'l1_update', 'l1_grow.alloc_table', 'l1_grow.write_table',
+            'l1_grow.activate_table', 'l2_load', 'l2_update',
+            'l2_update_compressed', 'l2_alloc.cow_read', 'l2_alloc.write',
+            'read_aio', 'read_backing_aio', 'read_compressed', 'write_aio',
+            'write_compressed', 'vmstate_load', 'vmstate_save', 'cow_read',
+            'cow_write', 'reftable_load', 'reftable_grow', 'reftable_update',
+            'refblock_load', 'refblock_update', 'refblock_update_part',
+            'refblock_alloc', 'refblock_alloc.hookup', 'refblock_alloc.write',
+            'refblock_alloc.write_blocks', 'refblock_alloc.write_table',
+            'refblock_alloc.switch_table', 'cluster_alloc',
+            'cluster_alloc_bytes', 'cluster_free', 'flush_to_os',
+            'flush_to_disk' ] }
+
+##
+# @BlkdebugInjectErrorOptions
+#
+# Describes a single error injection for blkdebug.
+#
+# @event:       trigger event
+#
+# @state:       #optional the state identifier blkdebug needs to be in to
+#               actually trigger the event; defaults to "any"
+#
+# @errno:       #optional error identifier (errno) to be returned; defaults to
+#               EIO
+#
+# @sector:      #optional specifies the sector index which has to be affected
+#               in order to actually trigger the event; defaults to "any
+#               sector"
+#
+# @once:        #optional disables further events after this one has been
+#               triggered; defaults to false
+#
+# @immediately: #optional fail immediately; defaults to false
+#
+# Since: 2.0
+##
+{ 'type': 'BlkdebugInjectErrorOptions',
+  'data': { 'event': 'BlkdebugEvent',
+            '*state': 'int',
+            '*errno': 'int',
+            '*sector': 'int',
+            '*once': 'bool',
+            '*immediately': 'bool' } }
+
+##
+# @BlkdebugSetStateOptions
+#
+# Describes a single state-change event for blkdebug.
+#
+# @event:       trigger event
+#
+# @state:       #optional the current state identifier blkdebug needs to be in;
+#               defaults to "any"
+#
+# @new_state:   the state identifier blkdebug is supposed to assume if
+#               this event is triggered
+#
+# Since: 2.0
+##
+{ 'type': 'BlkdebugSetStateOptions',
+  'data': { 'event': 'BlkdebugEvent',
+            '*state': 'int',
+            'new_state': 'int' } }
+
+##
+# @BlockdevOptionsBlkdebug
+#
+# Driver specific block device options for blkdebug.
+#
+# @image:           underlying raw block device (or image file)
+#
+# @config:          #optional filename of the configuration file
+#
+# @align:           #optional required alignment for requests in bytes
+#
+# @inject-error:    #optional array of error injection descriptions
+#
+# @set-state:       #optional array of state-change descriptions
+#
+# Since: 2.0
+##
+{ 'type': 'BlockdevOptionsBlkdebug',
+  'data': { 'image': 'BlockdevRef',
+            '*config': 'str',
+            '*align': 'int',
+            '*inject-error': ['BlkdebugInjectErrorOptions'],
+            '*set-state': ['BlkdebugSetStateOptions'] } }
+
+##
+# @BlockdevOptionsBlkverify
+#
+# Driver specific block device options for blkverify.
+#
+# @test:    block device to be tested
+#
+# @raw:     raw image used for verification
+#
+# Since: 2.0
+##
+{ 'type': 'BlockdevOptionsBlkverify',
+  'data': { 'test': 'BlockdevRef',
+            'raw': 'BlockdevRef' } }
+
 ##
 # @BlockdevOptions
 #
@@ -4224,10 +4375,8 @@
 # TODO sheepdog: Wait for structured options
 # TODO ssh: Should take InetSocketAddress for 'host'?
      'vvfat':      'BlockdevOptionsVVFAT',
-
-# TODO blkdebug: Wait for structured options
-# TODO blkverify: Wait for structured options
-
+      'blkdebug':   'BlockdevOptionsBlkdebug',
+      'blkverify':  'BlockdevOptionsBlkverify',
      'bochs':      'BlockdevOptionsGenericFormat',
      'cloop':      'BlockdevOptionsGenericFormat',
      'cow':        'BlockdevOptionsGenericCOWFormat',
--- a/qemu-doc.texi
+++ b/qemu-doc.texi
@@ -536,11 +536,11 @@ support of multiple VM snapshots.
 Supported options:
@table @code
@item compat
-Determines the qcow2 version to use. @code{compat=0.10} uses the traditional
-image format that can be read by any QEMU since 0.10 (this is the default).
+Determines the qcow2 version to use. @code{compat=0.10} uses the
+traditional image format that can be read by any QEMU since 0.10.
@code{compat=1.1} enables image format extensions that only QEMU 1.1 and
-newer understand. Amongst others, this includes zero clusters, which allow
-efficient copy-on-read for sparse images.
+newer understand (this is the default). Amongst others, this includes
+zero clusters, which allow efficient copy-on-read for sparse images.

@item backing_file
 File name of a base image (see @option{create} subcommand)
--- a/qemu-img.texi
+++ b/qemu-img.texi
@@ -57,7 +57,9 @@ indicates that target image must be compressed (qcow format only)
@item -h
 with or without a command shows help and lists the supported formats
@item -p
-display progress bar (convert and rebase commands only)
+display progress bar (compare, convert and rebase commands only).
+If the @var{-p} option is not used for a command that supports it, the
+progress is reported when the process receives a @code{SIGUSR1} signal.
@item -q
 Quiet mode - do not print any output (except errors). There's no progress bar
 in case both @var{-q} and @var{-p} options are used.
@@ -140,7 +142,12 @@ it doesn't need to be specified separately in this case.

@item commit [-f @var{fmt}] [-t @var{cache}] @var{filename}

-Commit the changes recorded in @var{filename} in its base image.
+Commit the changes recorded in @var{filename} in its base image or backing file.
+If the backing file is smaller than the snapshot, then the backing file will be
+resized to be the same size as the snapshot.  If the snapshot is smaller than
+the backing file, the backing file will not be truncated.  If you want the
+backing file to match the size of the smaller snapshot, you can safely truncate
+it yourself once the commit operation successfully completes.

@item compare [-f @var{fmt}] [-F @var{fmt}] [-p] [-s] [-q] @var{filename1} @var{filename2}

@@ -391,11 +398,11 @@ support of multiple VM snapshots.
 Supported options:
@table @code
@item compat
-Determines the qcow2 version to use. @code{compat=0.10} uses the traditional
-image format that can be read by any QEMU since 0.10 (this is the default).
+Determines the qcow2 version to use. @code{compat=0.10} uses the
+traditional image format that can be read by any QEMU since 0.10.
@code{compat=1.1} enables image format extensions that only QEMU 1.1 and
-newer understand. Amongst others, this includes zero clusters, which allow
-efficient copy-on-read for sparse images.
+newer understand (this is the default). Amongst others, this includes zero
+clusters, which allow efficient copy-on-read for sparse images.

@item backing_file
 File name of a base image (see @option{create} subcommand)
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -12,6 +12,7 @@
 #include "block/block_int.h"
 #include "block/qapi.h"
 #include "qemu/main-loop.h"
+#include "qemu/timer.h"

 #define CMD_NOFILE_OK   0x01

@@ -94,6 +95,21 @@ static const cmdinfo_t *find_command(const char *cmd)
    return NULL;
 }

+/* Invoke fn() for commands with a matching prefix */
+void qemuio_complete_command(const char *input,
+                             void (*fn)(const char *cmd, void *opaque),
+                             void *opaque)
+{
+    cmdinfo_t *ct;
+    size_t input_len = strlen(input);
+
+    for (ct = cmdtab; ct < &cmdtab[ncmds]; ct++) {
+        if (strncmp(input, ct->name, input_len) == 0) {
+            fn(ct->name, opaque);
+        }
+    }
+}
+
 static char **breakline(char *input, int *count)
 {
    int c = 0;
@@ -2038,6 +2054,46 @@ static const cmdinfo_t abort_cmd = {
       .oneline        = "simulate a program crash using abort(3)",
 };

+static void sleep_cb(void *opaque)
+{
+    bool *expired = opaque;
+    *expired = true;
+}
+
+static int sleep_f(BlockDriverState *bs, int argc, char **argv)
+{
+    char *endptr;
+    long ms;
+    struct QEMUTimer *timer;
+    bool expired = false;
+
+    ms = strtol(argv[1], &endptr, 0);
+    if (ms < 0 || *endptr != '\0') {
+        printf("%s is not a valid number\n", argv[1]);
+        return 0;
+    }
+
+    timer = timer_new_ns(QEMU_CLOCK_HOST, sleep_cb, &expired);
+    timer_mod(timer, qemu_clock_get_ns(QEMU_CLOCK_HOST) + SCALE_MS * ms);
+
+    while (!expired) {
+        main_loop_wait(false);
+    }
+
+    timer_free(timer);
+
+    return 0;
+}
+
+static const cmdinfo_t sleep_cmd = {
+       .name           = "sleep",
+       .argmin         = 1,
+       .argmax         = 1,
+       .cfunc          = sleep_f,
+       .flags          = CMD_NOFILE_OK,
+       .oneline        = "waits for the given value in milliseconds",
+};
+
 static void help_oneline(const char *cmd, const cmdinfo_t *ct)
 {
    if (cmd) {
@@ -2151,4 +2207,5 @@ static void __attribute((constructor)) init_qemuio_commands(void)
    qemuio_add_command(&resume_cmd);
    qemuio_add_command(&wait_break_cmd);
    qemuio_add_command(&abort_cmd);
+    qemuio_add_command(&sleep_cmd);
 }
--- a/qemu-io.c
+++ b/qemu-io.c
@@ -18,6 +18,7 @@
 #include "qemu/main-loop.h"
 #include "qemu/option.h"
 #include "qemu/config-file.h"
+#include "qemu/readline.h"
 #include "block/block_int.h"
 #include "trace/control.h"

@@ -32,6 +33,8 @@ extern int qemuio_misalign;
 static int ncmdline;
 static char **cmdline;

+static ReadLineState *readline_state;
+
 static int close_f(BlockDriverState *bs, int argc, char **argv)
 {
    bdrv_unref(bs);
@@ -56,7 +59,7 @@ static int openfile(char *name, int flags, int growable, QDict *opts)
    }

    if (growable) {
-        if (bdrv_file_open(&qemuio_bs, name, opts, flags, &local_err)) {
+        if (bdrv_file_open(&qemuio_bs, name, NULL, opts, flags, &local_err)) {
            fprintf(stderr, "%s: can't open device %s: %s\n", progname, name,
                    error_get_pretty(local_err));
            error_free(local_err);
@@ -160,11 +163,13 @@ static int open_f(BlockDriverState *bs, int argc, char **argv)
        flags |= BDRV_O_RDWR;
    }

-    if (optind != argc - 1) {
+    if (optind == argc - 1) {
+        return openfile(argv[optind], flags, growable, opts);
+    } else if (optind == argc) {
+        return openfile(NULL, flags, growable, opts);
+    } else {
        return qemuio_command_usage(&open_cmd);
    }
-
-    return openfile(argv[optind], flags, growable, opts);
 }

 static int quit_f(BlockDriverState *bs, int argc, char **argv)
@@ -203,14 +208,6 @@ static void usage(const char *name)
    name);
 }

-
-#if defined(ENABLE_READLINE)
-# include <readline/history.h>
-# include <readline/readline.h>
-#elif defined(ENABLE_EDITLINE)
-# include <histedit.h>
-#endif
-
 static char *get_prompt(void)
 {
    static char prompt[FILENAME_MAX + 2 /*"> "*/ + 1 /*"\0"*/ ];
@@ -222,52 +219,53 @@ static char *get_prompt(void)
    return prompt;
 }

-#if defined(ENABLE_READLINE)
-static char *fetchline(void)
+static void readline_printf_func(void *opaque, const char *fmt, ...)
 {
-    char *line = readline(get_prompt());
-    if (line && *line) {
-        add_history(line);
+    va_list ap;
+    va_start(ap, fmt);
+    vprintf(fmt, ap);
+    va_end(ap);
+}
+
+static void readline_flush_func(void *opaque)
+{
+    fflush(stdout);
+}
+
+static void readline_func(void *opaque, const char *str, void *readline_opaque)
+{
+    char **line = readline_opaque;
+    *line = g_strdup(str);
+}
+
+static void completion_match(const char *cmd, void *opaque)
+{
+    readline_add_completion(readline_state, cmd);
+}
+
+static void readline_completion_func(void *opaque, const char *str)
+{
+    readline_set_completion_index(readline_state, strlen(str));
+    qemuio_complete_command(str, completion_match, NULL);
+}
+
+static char *fetchline_readline(void)
+{
+    char *line = NULL;
+
+    readline_start(readline_state, get_prompt(), 0, readline_func, &line);
+    while (!line) {
+        int ch = getchar();
+        if (ch == EOF) {
+            break;
+        }
+        readline_handle_byte(readline_state, ch);
    }
    return line;
 }
-#elif defined(ENABLE_EDITLINE)
-static char *el_get_prompt(EditLine *e)
-{
-    return get_prompt();
-}

-static char *fetchline(void)
-{
-    static EditLine *el;
-    static History *hist;
-    HistEvent hevent;
-    char *line;
-    int count;
-
-    if (!el) {
-        hist = history_init();
-        history(hist, &hevent, H_SETSIZE, 100);
-        el = el_init(progname, stdin, stdout, stderr);
-        el_source(el, NULL);
-        el_set(el, EL_SIGNAL, 1);
-        el_set(el, EL_PROMPT, el_get_prompt);
-        el_set(el, EL_HIST, history, (const char *)hist);
-    }
-    line = strdup(el_gets(el, &count));
-    if (line) {
-        if (count > 0) {
-            line[count-1] = '\0';
-        }
-        if (*line) {
-            history(hist, &hevent, H_ENTER, line);
-        }
-    }
-    return line;
-}
-#else
-# define MAXREADLINESZ 1024
-static char *fetchline(void)
+#define MAXREADLINESZ 1024
+static char *fetchline_fgets(void)
 {
    char *p, *line = g_malloc(MAXREADLINESZ);

@@ -283,7 +281,15 @@ static char *fetchline(void)

    return line;
 }
-#endif
+
+static char *fetchline(void)
+{
+    if (readline_state) {
+        return fetchline_readline();
+    } else {
+        return fetchline_fgets();
+    }
+}

 static void prep_fetchline(void *opaque)
 {
@@ -339,6 +345,11 @@ static void add_user_command(char *optarg)
    cmdline[ncmdline-1] = optarg;
 }

+static void reenable_tty_echo(void)
+{
+    qemu_set_tty_echo(STDIN_FILENO, true);
+}
+
 int main(int argc, char **argv)
 {
    int readonly = 0;
@@ -435,6 +446,15 @@ int main(int argc, char **argv)
    qemuio_add_command(&open_cmd);
    qemuio_add_command(&close_cmd);

+    if (isatty(STDIN_FILENO)) {
+        readline_state = readline_init(readline_printf_func,
+                                       readline_flush_func,
+                                       NULL,
+                                       readline_completion_func);
+        qemu_set_tty_echo(STDIN_FILENO, false);
+        atexit(reenable_tty_echo);
+    }
+
    /* open the device */
    if (!readonly) {
        flags |= BDRV_O_RDWR;
@@ -453,5 +473,6 @@ int main(int argc, char **argv)
    if (qemuio_bs) {
        bdrv_unref(qemuio_bs);
    }
+    g_free(readline_state);
    return 0;
 }
--- a/qemu-seccomp.c
+++ b/qemu-seccomp.c
@@ -220,7 +220,12 @@ static const struct QemuSeccompSyscall seccomp_whitelist[] = {
    { SCMP_SYS(io_cancel), 241 },
    { SCMP_SYS(io_setup), 241 },
    { SCMP_SYS(io_destroy), 241 },
-    { SCMP_SYS(arch_prctl), 240 }
+    { SCMP_SYS(arch_prctl), 240 },
+    { SCMP_SYS(mkdir), 240 },
+    { SCMP_SYS(fchmod), 240 },
+    { SCMP_SYS(shmget), 240 },
+    { SCMP_SYS(shmat), 240 },
+    { SCMP_SYS(shmdt), 240 }
 };

 int seccomp_start(void)
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -931,7 +931,7 @@ EQMP

    {
        .name       = "block_resize",
-        .args_type  = "device:B,size:o",
+        .args_type  = "device:s?,node-name:s?,size:o",
        .mhandler.cmd_new = qmp_marshal_input_block_resize,
    },

@@ -944,6 +944,7 @@ Resize a block image while a guest is running.
 Arguments:

 - "device": the device's ID, must be unique (json-string)
+- "node-name": the node name in the block driver state graph (json-string)
 - "size": new size

 Example:
@@ -965,6 +966,45 @@ EQMP
        .mhandler.cmd_new = qmp_marshal_input_block_commit,
    },

+SQMP
+block-commit
+------------
+
+Live commit of data from overlay image nodes into backing nodes - i.e., writes
+data between 'top' and 'base' into 'base'.
+
+Arguments:
+
+- "device": The device's ID, must be unique (json-string)
+- "base": The file name of the backing image to write data into.
+          If not specified, this is the deepest backing image
+          (json-string, optional)
+- "top":  The file name of the backing image within the image chain,
+          which contains the topmost data to be committed down.
+
+          If top == base, that is an error.
+          If top == active, the job will not be completed by itself,
+          user needs to complete the job with the block-job-complete
+          command after getting the ready event. (Since 2.0)
+
+          If the base image is smaller than top, then the base image
+          will be resized to be the same size as top.  If top is
+          smaller than the base image, the base will not be
+          truncated.  If you want the base image size to match the
+          size of the smaller top, you can safely truncate it
+          yourself once the commit operation successfully completes.
+          (json-string)
+- "speed":  the maximum speed, in bytes per second (json-int, optional)
+
+
+Example:
+
+-> { "execute": "block-commit", "arguments": { "device": "virtio0",
+                                              "top": "/tmp/snap1.qcow2" } }
+<- { "return": {} }
+
+EQMP
+
    {
        .name       = "drive-backup",
        .args_type  = "sync:s,device:B,target:s,speed:i?,mode:s?,format:s?,"
@@ -1088,7 +1128,9 @@ actions array:
    - "data": a dictionary.  The contents depend on the value
      of "type".  When "type" is "blockdev-snapshot-sync":
      - "device": device name to snapshot (json-string)
+      - "node-name": graph node name to snapshot (json-string)
      - "snapshot-file": name of new image file (json-string)
+      - "snapshot-node-name": graph node name of the new snapshot (json-string)
      - "format": format of new image (json-string, optional)
      - "mode": whether and how QEMU should create the snapshot file
        (NewImageMode, optional, default "absolute-paths")
@@ -1103,6 +1145,11 @@ Example:
         { 'type': 'blockdev-snapshot-sync', 'data' : { "device": "ide-hd0",
                                         "snapshot-file": "/some/place/my-image",
                                         "format": "qcow2" } },
+         { 'type': 'blockdev-snapshot-sync', 'data' : { "node-name": "myfile",
+                                         "snapshot-file": "/some/place/my-image2",
+                                         "snapshot-node-name": "node3432",
+                                         "mode": "existing",
+                                         "format": "qcow2" } },
         { 'type': 'blockdev-snapshot-sync', 'data' : { "device": "ide-hd1",
                                         "snapshot-file": "/some/place/my-image2",
                                         "mode": "existing",
@@ -1116,7 +1163,7 @@ EQMP

    {
        .name       = "blockdev-snapshot-sync",
-        .args_type  = "device:B,snapshot-file:s,format:s?,mode:s?",
+        .args_type  = "device:s?,node-name:s?,snapshot-file:s,snapshot-node-name:s?,format:s?,mode:s?",
        .mhandler.cmd_new = qmp_marshal_input_blockdev_snapshot_sync,
    },

@@ -1133,7 +1180,9 @@ snapshot image, default is qcow2.
 Arguments:

 - "device": device name to snapshot (json-string)
+- "node-name": graph node name to snapshot (json-string)
 - "snapshot-file": name of new image file (json-string)
+- "snapshot-node-name": graph node name of the new snapshot (json-string)
 - "mode": whether and how QEMU should create the snapshot file
  (NewImageMode, optional, default "absolute-paths")
 - "format": format of new image (json-string, optional)
@@ -1503,7 +1552,7 @@ EQMP

    {
        .name       = "block_passwd",
-        .args_type  = "device:B,password:s",
+        .args_type  = "device:s?,node-name:s?,password:s",
        .mhandler.cmd_new = qmp_marshal_input_block_passwd,
    },

@@ -1516,6 +1565,7 @@ Set the password of encrypted block devices.
 Arguments:

 - "device": device name (json-string)
+- "node-name": name in the block driver state graph (json-string)
 - "password": password (json-string)

 Example:
@@ -3345,4 +3395,65 @@ Example (2):

 <- { "return": {} }

+EQMP
+
+    {
+        .name       = "query-named-block-nodes",
+        .args_type  = "",
+        .mhandler.cmd_new = qmp_marshal_input_query_named_block_nodes,
+    },
+
+SQMP
+@query-named-block-nodes
+------------------------
+
+Return a list of BlockDeviceInfo for all the named block driver nodes
+
+Example:
+
+-> { "execute": "query-named-block-nodes" }
+<- { "return": [ { "ro":false,
+                   "drv":"qcow2",
+                   "encrypted":false,
+                   "file":"disks/test.qcow2",
+                   "node-name": "my-node",
+                   "backing_file_depth":1,
+                   "bps":1000000,
+                   "bps_rd":0,
+                   "bps_wr":0,
+                   "iops":1000000,
+                   "iops_rd":0,
+                   "iops_wr":0,
+                   "bps_max": 8000000,
+                   "bps_rd_max": 0,
+                   "bps_wr_max": 0,
+                   "iops_max": 0,
+                   "iops_rd_max": 0,
+                   "iops_wr_max": 0,
+                   "iops_size": 0,
+                   "image":{
+                      "filename":"disks/test.qcow2",
+                      "format":"qcow2",
+                      "virtual-size":2048000,
+                      "backing_file":"base.qcow2",
+                      "full-backing-filename":"disks/base.qcow2",
+                      "backing-filename-format:"qcow2",
+                      "snapshots":[
+                         {
+                            "id": "1",
+                            "name": "snapshot1",
+                            "vm-state-size": 0,
+                            "date-sec": 10000200,
+                            "date-nsec": 12,
+                            "vm-clock-sec": 206,
+                            "vm-clock-nsec": 30
+                         }
+                      ],
+                      "backing-image":{
+                          "filename":"disks/base.qcow2",
+                          "format":"qcow2",
+                          "virtual-size":2048000
+                      }
+                   } } ] }
+
 EQMP
--- a/qobject/qdict.c
+++ b/qobject/qdict.c
@@ -477,7 +477,43 @@ static void qdict_destroy_obj(QObject *obj)
    g_free(qdict);
 }

-static void qdict_do_flatten(QDict *qdict, QDict *target, const char *prefix)
+static void qdict_flatten_qdict(QDict *qdict, QDict *target,
+                                const char *prefix);
+
+static void qdict_flatten_qlist(QList *qlist, QDict *target, const char *prefix)
+{
+    QObject *value;
+    const QListEntry *entry;
+    char *new_key;
+    int i;
+
+    /* This function is never called with prefix == NULL, i.e., it is always
+     * called from within qdict_flatten_q(list|dict)(). Therefore, it does not
+     * need to remove list entries during the iteration (the whole list will be
+     * deleted eventually anyway from qdict_flatten_qdict()). */
+    assert(prefix);
+
+    entry = qlist_first(qlist);
+
+    for (i = 0; entry; entry = qlist_next(entry), i++) {
+        value = qlist_entry_obj(entry);
+        new_key = g_strdup_printf("%s.%i", prefix, i);
+
+        if (qobject_type(value) == QTYPE_QDICT) {
+            qdict_flatten_qdict(qobject_to_qdict(value), target, new_key);
+        } else if (qobject_type(value) == QTYPE_QLIST) {
+            qdict_flatten_qlist(qobject_to_qlist(value), target, new_key);
+        } else {
+            /* All other types are moved to the target unchanged. */
+            qobject_incref(value);
+            qdict_put_obj(target, new_key, value);
+        }
+
+        g_free(new_key);
+    }
+}
+
+static void qdict_flatten_qdict(QDict *qdict, QDict *target, const char *prefix)
 {
    QObject *value;
    const QDictEntry *entry, *next;
@@ -500,8 +536,12 @@ static void qdict_do_flatten(QDict *qdict, QDict *target, const char *prefix)
        if (qobject_type(value) == QTYPE_QDICT) {
            /* Entries of QDicts are processed recursively, the QDict object
             * itself disappears. */
-            qdict_do_flatten(qobject_to_qdict(value), target,
-                             new_key ? new_key : entry->key);
+            qdict_flatten_qdict(qobject_to_qdict(value), target,
+                                new_key ? new_key : entry->key);
+            delete = true;
+        } else if (qobject_type(value) == QTYPE_QLIST) {
+            qdict_flatten_qlist(qobject_to_qlist(value), target,
+                                new_key ? new_key : entry->key);
            delete = true;
        } else if (prefix) {
            /* All other objects are moved to the target unchanged. */
@@ -526,12 +566,14 @@ static void qdict_do_flatten(QDict *qdict, QDict *target, const char *prefix)

 /**
 * qdict_flatten(): For each nested QDict with key x, all fields with key y
- * are moved to this QDict and their key is renamed to "x.y". This operation
- * is applied recursively for nested QDicts.
+ * are moved to this QDict and their key is renamed to "x.y". For each nested
+ * QList with key x, the field at index y is moved to this QDict with the key
+ * "x.y" (i.e., the reverse of what qdict_array_split() does).
+ * This operation is applied recursively for nested QDicts and QLists.
 */
 void qdict_flatten(QDict *qdict)
 {
-    qdict_do_flatten(qdict, qdict, NULL);
+    qdict_flatten_qdict(qdict, qdict, NULL);
 }

 /* extract all the src QDict entries starting by start into dst */
@@ -554,3 +596,40 @@ void qdict_extract_subqdict(QDict *src, QDict **dst, const char *start)
        entry = next;
    }
 }
+
+/**
+ * qdict_array_split(): This function moves array-like elements of a QDict into
+ * a new QList of QDicts. Every entry in the original QDict with a key prefixed
+ * "%u.", where %u designates an unsigned integer starting at 0 and
+ * incrementally counting up, will be moved to a new QDict at index %u in the
+ * output QList with the key prefix removed. The function terminates when there
+ * is no entry in the QDict with a prefix directly (incrementally) following the
+ * last one.
+ * Example: {"0.a": 42, "0.b": 23, "1.x": 0, "3.y": 1, "o.o": 7}
+ *      (or {"1.x": 0, "3.y": 1, "0.a": 42, "o.o": 7, "0.b": 23})
+ *       => [{"a": 42, "b": 23}, {"x": 0}]
+ *      and {"3.y": 1, "o.o": 7} (remainder of the old QDict)
+ */
+void qdict_array_split(QDict *src, QList **dst)
+{
+    unsigned i;
+
+    *dst = qlist_new();
+
+    for (i = 0; i < UINT_MAX; i++) {
+        QDict *subqdict;
+        char prefix[32];
+        size_t snprintf_ret;
+
+        snprintf_ret = snprintf(prefix, 32, "%u.", i);
+        assert(snprintf_ret < 32);
+
+        qdict_extract_subqdict(src, &subqdict, prefix);
+        if (!qdict_size(subqdict)) {
+            QDECREF(subqdict);
+            break;
+        }
+
+        qlist_append_obj(*dst, QOBJECT(subqdict));
+    }
+}
--- a/scripts/create_config
+++ b/scripts/create_config
@@ -26,6 +26,10 @@ case $line in
    # save for the next definitions
    prefix=${line#*=}
    ;;
+ IASL=*) # iasl executable
+    value=${line#*=}
+    echo "#define CONFIG_IASL $value"
+    ;;
 CONFIG_AUDIO_DRIVERS=*)
    drivers=${line#*=}
    echo "#define CONFIG_AUDIO_DRIVERS \\"
--- a/scripts/dump-guest-memory.py
+++ b/scripts/dump-guest-memory.py
@@ -0,0 +1,339 @@
+# This python script adds a new gdb command, "dump-guest-memory". It
+# should be loaded with "source dump-guest-memory.py" at the (gdb)
+# prompt.
+#
+# Copyright (C) 2013, Red Hat, Inc.
+#
+# Authors:
+#   Laszlo Ersek <lersek@redhat.com>
+#
+# This work is licensed under the terms of the GNU GPL, version 2 or later. See
+# the COPYING file in the top-level directory.
+#
+# The leading docstring doesn't have idiomatic Python formatting. It is
+# printed by gdb's "help" command (the first line is printed in the
+# "help data" summary), and it should match how other help texts look in
+# gdb.
+
+import struct
+
+class DumpGuestMemory(gdb.Command):
+    """Extract guest vmcore from qemu process coredump.
+
+The sole argument is FILE, identifying the target file to write the
+guest vmcore to.
+
+This GDB command reimplements the dump-guest-memory QMP command in
+python, using the representation of guest memory as captured in the qemu
+coredump. The qemu process that has been dumped must have had the
+command line option "-machine dump-guest-core=on".
+
+For simplicity, the "paging", "begin" and "end" parameters of the QMP
+command are not supported -- no attempt is made to get the guest's
+internal paging structures (ie. paging=false is hard-wired), and guest
+memory is always fully dumped.
+
+Only x86_64 guests are supported.
+
+The CORE/NT_PRSTATUS and QEMU notes (that is, the VCPUs' statuses) are
+not written to the vmcore. Preparing these would require context that is
+only present in the KVM host kernel module when the guest is alive. A
+fake ELF note is written instead, only to keep the ELF parser of "crash"
+happy.
+
+Dependent on how busted the qemu process was at the time of the
+coredump, this command might produce unpredictable results. If qemu
+deliberately called abort(), or it was dumped in response to a signal at
+a halfway fortunate point, then its coredump should be in reasonable
+shape and this command should mostly work."""
+
+    TARGET_PAGE_SIZE = 0x1000
+    TARGET_PAGE_MASK = 0xFFFFFFFFFFFFF000
+
+    # Various ELF constants
+    EM_X86_64   = 62        # AMD x86-64 target machine
+    ELFDATA2LSB = 1         # little endian
+    ELFCLASS64  = 2
+    ELFMAG      = "\x7FELF"
+    EV_CURRENT  = 1
+    ET_CORE     = 4
+    PT_LOAD     = 1
+    PT_NOTE     = 4
+
+    # Special value for e_phnum. This indicates that the real number of
+    # program headers is too large to fit into e_phnum. Instead the real
+    # value is in the field sh_info of section 0.
+    PN_XNUM = 0xFFFF
+
+    # Format strings for packing and header size calculation.
+    ELF64_EHDR = ("4s" # e_ident/magic
+                  "B"  # e_ident/class
+                  "B"  # e_ident/data
+                  "B"  # e_ident/version
+                  "B"  # e_ident/osabi
+                  "8s" # e_ident/pad
+                  "H"  # e_type
+                  "H"  # e_machine
+                  "I"  # e_version
+                  "Q"  # e_entry
+                  "Q"  # e_phoff
+                  "Q"  # e_shoff
+                  "I"  # e_flags
+                  "H"  # e_ehsize
+                  "H"  # e_phentsize
+                  "H"  # e_phnum
+                  "H"  # e_shentsize
+                  "H"  # e_shnum
+                  "H"  # e_shstrndx
+                 )
+    ELF64_PHDR = ("I"  # p_type
+                  "I"  # p_flags
+                  "Q"  # p_offset
+                  "Q"  # p_vaddr
+                  "Q"  # p_paddr
+                  "Q"  # p_filesz
+                  "Q"  # p_memsz
+                  "Q"  # p_align
+                 )
+
+    def __init__(self):
+        super(DumpGuestMemory, self).__init__("dump-guest-memory",
+                                              gdb.COMMAND_DATA,
+                                              gdb.COMPLETE_FILENAME)
+        self.uintptr_t     = gdb.lookup_type("uintptr_t")
+        self.elf64_ehdr_le = struct.Struct("<%s" % self.ELF64_EHDR)
+        self.elf64_phdr_le = struct.Struct("<%s" % self.ELF64_PHDR)
+
+    def int128_get64(self, val):
+        assert (val["hi"] == 0)
+        return val["lo"]
+
+    def qtailq_foreach(self, head, field_str):
+        var_p = head["tqh_first"]
+        while (var_p != 0):
+            var = var_p.dereference()
+            yield var
+            var_p = var[field_str]["tqe_next"]
+
+    def qemu_get_ram_block(self, ram_addr):
+        ram_blocks = gdb.parse_and_eval("ram_list.blocks")
+        for block in self.qtailq_foreach(ram_blocks, "next"):
+            if (ram_addr - block["offset"] < block["length"]):
+                return block
+        raise gdb.GdbError("Bad ram offset %x" % ram_addr)
+
+    def qemu_get_ram_ptr(self, ram_addr):
+        block = self.qemu_get_ram_block(ram_addr)
+        return block["host"] + (ram_addr - block["offset"])
+
+    def memory_region_get_ram_ptr(self, mr):
+        if (mr["alias"] != 0):
+            return (self.memory_region_get_ram_ptr(mr["alias"].dereference()) +
+                    mr["alias_offset"])
+        return self.qemu_get_ram_ptr(mr["ram_addr"] & self.TARGET_PAGE_MASK)
+
+    def guest_phys_blocks_init(self):
+        self.guest_phys_blocks = []
+
+    def guest_phys_blocks_append(self):
+        print "guest RAM blocks:"
+        print ("target_start     target_end       host_addr        message "
+               "count")
+        print ("---------------- ---------------- ---------------- ------- "
+               "-----")
+
+        current_map_p = gdb.parse_and_eval("address_space_memory.current_map")
+        current_map = current_map_p.dereference()
+        for cur in range(current_map["nr"]):
+            flat_range   = (current_map["ranges"] + cur).dereference()
+            mr           = flat_range["mr"].dereference()
+
+            # we only care about RAM
+            if (not mr["ram"]):
+                continue
+
+            section_size = self.int128_get64(flat_range["addr"]["size"])
+            target_start = self.int128_get64(flat_range["addr"]["start"])
+            target_end   = target_start + section_size
+            host_addr    = (self.memory_region_get_ram_ptr(mr) +
+                            flat_range["offset_in_region"])
+            predecessor = None
+
+            # find continuity in guest physical address space
+            if (len(self.guest_phys_blocks) > 0):
+                predecessor = self.guest_phys_blocks[-1]
+                predecessor_size = (predecessor["target_end"] -
+                                    predecessor["target_start"])
+
+                # the memory API guarantees monotonically increasing
+                # traversal
+                assert (predecessor["target_end"] <= target_start)
+
+                # we want continuity in both guest-physical and
+                # host-virtual memory
+                if (predecessor["target_end"] < target_start or
+                    predecessor["host_addr"] + predecessor_size != host_addr):
+                    predecessor = None
+
+            if (predecessor is None):
+                # isolated mapping, add it to the list
+                self.guest_phys_blocks.append({"target_start": target_start,
+                                               "target_end"  : target_end,
+                                               "host_addr"   : host_addr})
+                message = "added"
+            else:
+                # expand predecessor until @target_end; predecessor's
+                # start doesn't change
+                predecessor["target_end"] = target_end
+                message = "joined"
+
+            print ("%016x %016x %016x %-7s %5u" %
+                   (target_start, target_end, host_addr.cast(self.uintptr_t),
+                    message, len(self.guest_phys_blocks)))
+
+    def cpu_get_dump_info(self):
+        # We can't synchronize the registers with KVM post-mortem, and
+        # the bits in (first_x86_cpu->env.hflags) seem to be stale; they
+        # may not reflect long mode for example. Hence just assume the
+        # most common values. This also means that instruction pointer
+        # etc. will be bogus in the dump, but at least the RAM contents
+        # should be valid.
+        self.dump_info = {"d_machine": self.EM_X86_64,
+                          "d_endian" : self.ELFDATA2LSB,
+                          "d_class"  : self.ELFCLASS64}
+
+    def encode_elf64_ehdr_le(self):
+        return self.elf64_ehdr_le.pack(
+                                 self.ELFMAG,                 # e_ident/magic
+                                 self.dump_info["d_class"],   # e_ident/class
+                                 self.dump_info["d_endian"],  # e_ident/data
+                                 self.EV_CURRENT,             # e_ident/version
+                                 0,                           # e_ident/osabi
+                                 "",                          # e_ident/pad
+                                 self.ET_CORE,                # e_type
+                                 self.dump_info["d_machine"], # e_machine
+                                 self.EV_CURRENT,             # e_version
+                                 0,                           # e_entry
+                                 self.elf64_ehdr_le.size,     # e_phoff
+                                 0,                           # e_shoff
+                                 0,                           # e_flags
+                                 self.elf64_ehdr_le.size,     # e_ehsize
+                                 self.elf64_phdr_le.size,     # e_phentsize
+                                 self.phdr_num,               # e_phnum
+                                 0,                           # e_shentsize
+                                 0,                           # e_shnum
+                                 0                            # e_shstrndx
+                                )
+
+    def encode_elf64_note_le(self):
+        return self.elf64_phdr_le.pack(self.PT_NOTE,         # p_type
+                                       0,                    # p_flags
+                                       (self.memory_offset -
+                                        len(self.note)),     # p_offset
+                                       0,                    # p_vaddr
+                                       0,                    # p_paddr
+                                       len(self.note),       # p_filesz
+                                       len(self.note),       # p_memsz
+                                       0                     # p_align
+                                      )
+
+    def encode_elf64_load_le(self, offset, start_hwaddr, range_size):
+        return self.elf64_phdr_le.pack(self.PT_LOAD, # p_type
+                                       0,            # p_flags
+                                       offset,       # p_offset
+                                       0,            # p_vaddr
+                                       start_hwaddr, # p_paddr
+                                       range_size,   # p_filesz
+                                       range_size,   # p_memsz
+                                       0             # p_align
+                                      )
+
+    def note_init(self, name, desc, type):
+        # name must include a trailing NUL
+        namesz = (len(name) + 1 + 3) / 4 * 4
+        descsz = (len(desc)     + 3) / 4 * 4
+        fmt = ("<"   # little endian
+               "I"   # n_namesz
+               "I"   # n_descsz
+               "I"   # n_type
+               "%us" # name
+               "%us" # desc
+               % (namesz, descsz))
+        self.note = struct.pack(fmt,
+                                len(name) + 1, len(desc), type, name, desc)
+
+    def dump_init(self):
+        self.guest_phys_blocks_init()
+        self.guest_phys_blocks_append()
+        self.cpu_get_dump_info()
+        # we have no way to retrieve the VCPU status from KVM
+        # post-mortem
+        self.note_init("NONE", "EMPTY", 0)
+
+        # Account for PT_NOTE.
+        self.phdr_num = 1
+
+        # We should never reach PN_XNUM for paging=false dumps: there's
+        # just a handful of discontiguous ranges after merging.
+        self.phdr_num += len(self.guest_phys_blocks)
+        assert (self.phdr_num < self.PN_XNUM)
+
+        # Calculate the ELF file offset where the memory dump commences:
+        #
+        #   ELF header
+        #   PT_NOTE
+        #   PT_LOAD: 1
+        #   PT_LOAD: 2
+        #   ...
+        #   PT_LOAD: len(self.guest_phys_blocks)
+        #   ELF note
+        #   memory dump
+        self.memory_offset = (self.elf64_ehdr_le.size +
+                              self.elf64_phdr_le.size * self.phdr_num +
+                              len(self.note))
+
+    def dump_begin(self, vmcore):
+        vmcore.write(self.encode_elf64_ehdr_le())
+        vmcore.write(self.encode_elf64_note_le())
+        running = self.memory_offset
+        for block in self.guest_phys_blocks:
+            range_size = block["target_end"] - block["target_start"]
+            vmcore.write(self.encode_elf64_load_le(running,
+                                                   block["target_start"],
+                                                   range_size))
+            running += range_size
+        vmcore.write(self.note)
+
+    def dump_iterate(self, vmcore):
+        qemu_core = gdb.inferiors()[0]
+        for block in self.guest_phys_blocks:
+            cur  = block["host_addr"]
+            left = block["target_end"] - block["target_start"]
+            print ("dumping range at %016x for length %016x" %
+                   (cur.cast(self.uintptr_t), left))
+            while (left > 0):
+                chunk_size = min(self.TARGET_PAGE_SIZE, left)
+                chunk = qemu_core.read_memory(cur, chunk_size)
+                vmcore.write(chunk)
+                cur  += chunk_size
+                left -= chunk_size
+
+    def create_vmcore(self, filename):
+        vmcore = open(filename, "wb")
+        self.dump_begin(vmcore)
+        self.dump_iterate(vmcore)
+        vmcore.close()
+
+    def invoke(self, args, from_tty):
+        # Unwittingly pressing the Enter key after the command should
+        # not dump the same multi-gig coredump to the same file.
+        self.dont_repeat()
+
+        argv = gdb.string_to_argv(args)
+        if (len(argv) != 1):
+            raise gdb.GdbError("usage: dump-guest-memory FILE")
+
+        self.dump_init()
+        self.create_vmcore(argv[0])
+
+DumpGuestMemory()
--- a/scripts/qapi.py
+++ b/scripts/qapi.py
@@ -247,7 +247,7 @@ def c_var(name, protect=True):
                     'and', 'and_eq', 'bitand', 'bitor', 'compl', 'not',
                     'not_eq', 'or', 'or_eq', 'xor', 'xor_eq'])
    # namespace pollution:
-    polluted_words = set(['unix'])
+    polluted_words = set(['unix', 'errno'])
    if protect and (name in c89_words | c99_words | c11_words | gcc_words | cpp_words | polluted_words):
        return "q_" + name
    return name.replace('-', '_').lstrip("*")
--- a/target-i386/cpu.c
+++ b/target-i386/cpu.c
@@ -336,6 +336,10 @@ typedef struct ExtSaveArea {
 static const ExtSaveArea ext_save_areas[] = {
    [2] = { .feature = FEAT_1_ECX, .bits = CPUID_EXT_AVX,
            .offset = 0x240, .size = 0x100 },
+    [3] = { .feature = FEAT_7_0_EBX, .bits = CPUID_7_0_EBX_MPX,
+            .offset = 0x3c0, .size = 0x40  },
+    [4] = { .feature = FEAT_7_0_EBX, .bits = CPUID_7_0_EBX_MPX,
+            .offset = 0x400, .size = 0x10  },
 };

 const char *get_register_name_32(unsigned int reg)
@@ -2461,6 +2465,9 @@ static void x86_cpu_reset(CPUState *s)
    cpu_breakpoint_remove_all(env, BP_CPU);
    cpu_watchpoint_remove_all(env, BP_CPU);

+    env->tsc_adjust = 0;
+    env->tsc = 0;
+
 #if !defined(CONFIG_USER_ONLY)
    /* We hard-wire the BSP to the first CPU. */
    if (s->cpu_index == 0) {
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -380,9 +380,14 @@

 #define MSR_VM_HSAVE_PA                 0xc0010117

-#define XSTATE_FP                       1
-#define XSTATE_SSE                      2
-#define XSTATE_YMM                      4
+#define MSR_IA32_BNDCFGS                0x00000d90
+
+#define XSTATE_FP                       (1ULL << 0)
+#define XSTATE_SSE                      (1ULL << 1)
+#define XSTATE_YMM                      (1ULL << 2)
+#define XSTATE_BNDREGS                  (1ULL << 3)
+#define XSTATE_BNDCSR                   (1ULL << 4)
+

 /* CPUID feature words */
 typedef enum FeatureWord {
@@ -545,6 +550,7 @@ typedef uint32_t FeatureWordArray[FEATURE_WORDS];
 #define CPUID_7_0_EBX_ERMS     (1 << 9)
 #define CPUID_7_0_EBX_INVPCID  (1 << 10)
 #define CPUID_7_0_EBX_RTM      (1 << 11)
+#define CPUID_7_0_EBX_MPX      (1 << 14)
 #define CPUID_7_0_EBX_RDSEED   (1 << 18)
 #define CPUID_7_0_EBX_ADX      (1 << 19)
 #define CPUID_7_0_EBX_SMAP     (1 << 20)
@@ -695,6 +701,16 @@ typedef union {
    uint64_t q;
 } MMXReg;

+typedef struct BNDReg {
+    uint64_t lb;
+    uint64_t ub;
+} BNDReg;
+
+typedef struct BNDCSReg {
+    uint64_t cfgu;
+    uint64_t sts;
+} BNDCSReg;
+
 #ifdef HOST_WORDS_BIGENDIAN
 #define XMM_B(n) _b[15 - (n)]
 #define XMM_W(n) _w[7 - (n)]
@@ -908,6 +924,9 @@ typedef struct CPUX86State {

    uint64_t xstate_bv;
    XMMReg ymmh_regs[CPU_NB_REGS];
+    BNDReg bnd_regs[4];
+    BNDCSReg bndcs_regs;
+    uint64_t msr_bndcfgs;

    uint64_t xcr0;

--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -69,6 +69,7 @@ static bool has_msr_feature_control;
 static bool has_msr_async_pf_en;
 static bool has_msr_pv_eoi_en;
 static bool has_msr_misc_enable;
+static bool has_msr_bndcfgs;
 static bool has_msr_kvm_steal_time;
 static int lm_capable_kernel;

@@ -772,6 +773,10 @@ static int kvm_get_supported_msrs(KVMState *s)
                    has_msr_misc_enable = true;
                    continue;
                }
+                if (kvm_msr_list->indices[i] == MSR_IA32_BNDCFGS) {
+                    has_msr_bndcfgs = true;
+                    continue;
+                }
            }
        }

@@ -975,6 +980,8 @@ static int kvm_put_fpu(X86CPU *cpu)
 #define XSAVE_XMM_SPACE   40
 #define XSAVE_XSTATE_BV   128
 #define XSAVE_YMMH_SPACE  144
+#define XSAVE_BNDREGS     240
+#define XSAVE_BNDCSR      256

 static int kvm_put_xsave(X86CPU *cpu)
 {
@@ -1007,6 +1014,10 @@ static int kvm_put_xsave(X86CPU *cpu)
    *(uint64_t *)&xsave->region[XSAVE_XSTATE_BV] = env->xstate_bv;
    memcpy(&xsave->region[XSAVE_YMMH_SPACE], env->ymmh_regs,
            sizeof env->ymmh_regs);
+    memcpy(&xsave->region[XSAVE_BNDREGS], env->bnd_regs,
+            sizeof env->bnd_regs);
+    memcpy(&xsave->region[XSAVE_BNDCSR], &env->bndcs_regs,
+            sizeof(env->bndcs_regs));
    r = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XSAVE, xsave);
    return r;
 }
@@ -1104,6 +1115,25 @@ static int kvm_put_tscdeadline_msr(X86CPU *cpu)
    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, &msr_data);
 }

+/*
+ * Provide a separate write service for the feature control MSR in order to
+ * kick the VCPU out of VMXON or even guest mode on reset. This has to be done
+ * before writing any other state because forcibly leaving nested mode
+ * invalidates the VCPU state.
+ */
+static int kvm_put_msr_feature_control(X86CPU *cpu)
+{
+    struct {
+        struct kvm_msrs info;
+        struct kvm_msr_entry entry;
+    } msr_data;
+
+    kvm_msr_entry_set(&msr_data.entry, MSR_IA32_FEATURE_CONTROL,
+                      cpu->env.msr_ia32_feature_control);
+    msr_data.info.nmsrs = 1;
+    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, &msr_data);
+}
+
 static int kvm_put_msrs(X86CPU *cpu, int level)
 {
    CPUX86State *env = &cpu->env;
@@ -1131,6 +1161,9 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
        kvm_msr_entry_set(&msrs[n++], MSR_IA32_MISC_ENABLE,
                          env->msr_ia32_misc_enable);
    }
+    if (has_msr_bndcfgs) {
+        kvm_msr_entry_set(&msrs[n++], MSR_IA32_BNDCFGS, env->msr_bndcfgs);
+    }
 #ifdef TARGET_X86_64
    if (lm_capable_kernel) {
        kvm_msr_entry_set(&msrs[n++], MSR_CSTAR, env->cstar);
@@ -1139,22 +1172,12 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
        kvm_msr_entry_set(&msrs[n++], MSR_LSTAR, env->lstar);
    }
 #endif
-    if (level == KVM_PUT_FULL_STATE) {
-        /*
-         * KVM is yet unable to synchronize TSC values of multiple VCPUs on
-         * writeback. Until this is fixed, we only write the offset to SMP
-         * guests after migration, desynchronizing the VCPUs, but avoiding
-         * huge jump-backs that would occur without any writeback at all.
-         */
-        if (smp_cpus == 1 || env->tsc != 0) {
-            kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSC, env->tsc);
-        }
-    }
    /*
     * The following MSRs have side effects on the guest or are too heavy
     * for normal writeback. Limit them to reset or full state updates.
     */
    if (level >= KVM_PUT_RESET_STATE) {
+        kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSC, env->tsc);
        kvm_msr_entry_set(&msrs[n++], MSR_KVM_SYSTEM_TIME,
                          env->system_time_msr);
        kvm_msr_entry_set(&msrs[n++], MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
@@ -1204,10 +1227,9 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
        if (cpu->hyperv_vapic) {
            kvm_msr_entry_set(&msrs[n++], HV_X64_MSR_APIC_ASSIST_PAGE, 0);
        }
-        if (has_msr_feature_control) {
-            kvm_msr_entry_set(&msrs[n++], MSR_IA32_FEATURE_CONTROL,
-                              env->msr_ia32_feature_control);
-        }
+
+        /* Note: MSR_IA32_FEATURE_CONTROL is written separately, see
+         *       kvm_put_msr_feature_control. */
    }
    if (env->mcg_cap) {
        int i;
@@ -1289,6 +1311,10 @@ static int kvm_get_xsave(X86CPU *cpu)
    env->xstate_bv = *(uint64_t *)&xsave->region[XSAVE_XSTATE_BV];
    memcpy(env->ymmh_regs, &xsave->region[XSAVE_YMMH_SPACE],
            sizeof env->ymmh_regs);
+    memcpy(env->bnd_regs, &xsave->region[XSAVE_BNDREGS],
+            sizeof env->bnd_regs);
+    memcpy(&env->bndcs_regs, &xsave->region[XSAVE_BNDCSR],
+            sizeof(env->bndcs_regs));
    return 0;
 }

@@ -1435,6 +1461,9 @@ static int kvm_get_msrs(X86CPU *cpu)
    if (has_msr_feature_control) {
        msrs[n++].index = MSR_IA32_FEATURE_CONTROL;
    }
+    if (has_msr_bndcfgs) {
+        msrs[n++].index = MSR_IA32_BNDCFGS;
+    }

    if (!env->tsc_valid) {
        msrs[n++].index = MSR_IA32_TSC;
@@ -1550,6 +1579,9 @@ static int kvm_get_msrs(X86CPU *cpu)
        case MSR_IA32_FEATURE_CONTROL:
            env->msr_ia32_feature_control = msrs[i].data;
            break;
+        case MSR_IA32_BNDCFGS:
+            env->msr_bndcfgs = msrs[i].data;
+            break;
        default:
            if (msrs[i].index >= MSR_MC0_CTL &&
                msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) {
@@ -1799,6 +1831,13 @@ int kvm_arch_put_registers(CPUState *cpu, int level)

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

+    if (level >= KVM_PUT_RESET_STATE && has_msr_feature_control) {
+        ret = kvm_put_msr_feature_control(x86_cpu);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
    ret = kvm_getput_regs(x86_cpu, 1);
    if (ret < 0) {
        return ret;
--- a/target-i386/machine.c
+++ b/target-i386/machine.c
@@ -63,6 +63,21 @@ static const VMStateDescription vmstate_ymmh_reg = {
 #define VMSTATE_YMMH_REGS_VARS(_field, _state, _n, _v)                         \
    VMSTATE_STRUCT_ARRAY(_field, _state, _n, _v, vmstate_ymmh_reg, XMMReg)

+static const VMStateDescription vmstate_bnd_regs = {
+    .name = "bnd_regs",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .minimum_version_id_old = 1,
+    .fields      = (VMStateField[]) {
+        VMSTATE_UINT64(lb, BNDReg),
+        VMSTATE_UINT64(ub, BNDReg),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+#define VMSTATE_BND_REGS(_field, _state, _n)          \
+    VMSTATE_STRUCT_ARRAY(_field, _state, _n, 0, vmstate_bnd_regs, BNDReg)
+
 static const VMStateDescription vmstate_mtrr_var = {
    .name = "mtrr_var",
    .version_id = 1,
@@ -506,6 +521,39 @@ static const VMStateDescription vmstate_msr_architectural_pmu = {
    }
 };

+static bool mpx_needed(void *opaque)
+{
+    X86CPU *cpu = opaque;
+    CPUX86State *env = &cpu->env;
+    unsigned int i;
+
+    for (i = 0; i < 4; i++) {
+        if (env->bnd_regs[i].lb || env->bnd_regs[i].ub) {
+            return true;
+        }
+    }
+
+    if (env->bndcs_regs.cfgu || env->bndcs_regs.sts) {
+        return true;
+    }
+
+    return !!env->msr_bndcfgs;
+}
+
+static const VMStateDescription vmstate_mpx = {
+    .name = "cpu/mpx",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .minimum_version_id_old = 1,
+    .fields      = (VMStateField[]) {
+        VMSTATE_BND_REGS(env.bnd_regs, X86CPU, 4),
+        VMSTATE_UINT64(env.bndcs_regs.cfgu, X86CPU),
+        VMSTATE_UINT64(env.bndcs_regs.sts, X86CPU),
+        VMSTATE_UINT64(env.msr_bndcfgs, X86CPU),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 const VMStateDescription vmstate_x86_cpu = {
    .name = "cpu",
    .version_id = 12,
@@ -637,6 +685,9 @@ const VMStateDescription vmstate_x86_cpu = {
        }, {
            .vmsd = &vmstate_msr_architectural_pmu,
            .needed = pmu_enable_needed,
+        } , {
+            .vmsd = &vmstate_mpx,
+            .needed = mpx_needed,
        } , {
            /* empty */
        }
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -151,7 +151,7 @@ tests/check-qdict$(EXESUF): tests/check-qdict.o libqemuutil.a
 tests/check-qlist$(EXESUF): tests/check-qlist.o libqemuutil.a
 tests/check-qfloat$(EXESUF): tests/check-qfloat.o libqemuutil.a
 tests/check-qjson$(EXESUF): tests/check-qjson.o libqemuutil.a libqemustub.a
-tests/check-qom-interface$(EXESUF): tests/check-qom-interface.o $(qom-core-obj) libqemuutil.a
+tests/check-qom-interface$(EXESUF): tests/check-qom-interface.o $(qom-core-obj) libqemuutil.a libqemustub.a
 tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(block-obj-y) libqemuutil.a libqemustub.a
 tests/test-aio$(EXESUF): tests/test-aio.o $(block-obj-y) libqemuutil.a libqemustub.a
 tests/test-throttle$(EXESUF): tests/test-throttle.o $(block-obj-y) libqemuutil.a libqemustub.a
--- a/Show More
+++ b/Show More