target-i386: Cleanup 'foo=val' feature handling

Features family, model, stepping, level, hv_spinlocks are treated similarly when passed from command line, so it's not necessary to handle each of them individually. Collapse them to one catch-all branch which will treat any not explicitly handled feature in format 'foo=val'. Any unknown feature will be rejected by property setter so there is no need to check for unknown feature in cpu_x86_parse_featurestr(), therefore it's replaced by above mentioned catch-all handler. Signed-off-by: Igor Mammedov <imammedo@redhat.com> Signed-off-by: Andreas Färber <afaerber@suse.de>
target-i386: Cleanup 'foo' feature handling
2013-12-24 13:47:25 +01:00 · 2013-12-24 13:47:25 +01:00 · 2013-12-24 13:47:13 +01:00 · 2013-12-24 12:30:46 +01:00 · 2013-12-24 12:30:46 +01:00 · 2013-12-24 12:30:46 +01:00
439 changed files with 92449 additions and 82047 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@ config-all-devices.*
 config-all-disas.*
 config-host.*
 config-target.*
+config.status
 trace/generated-tracers.h
 trace/generated-tracers.c
 trace/generated-tracers-dtrace.h
--- a/40
+++ b/40
@@ -219,6 +219,13 @@ F: *win32*

 ARM Machines
 ------------
+Allwinner-a10
+M: Li Guang <lig.fnst@cn.fujitsu.com>
+S: Maintained
+F: hw/*/allwinner-a10*
+F: include/hw/*/allwinner-a10*
+F: hw/arm/cubieboard.c
+
 Exynos
 M: Evgeny Voevodin <e.voevodin@samsung.com>
 M: Maksim Kozlov <m.kozlov@samsung.com>
@@ -233,6 +240,12 @@ S: Supported
 F: hw/arm/highbank.c
 F: hw/net/xgmac.c

+Canon DIGIC
+M: Antony Pavlov <antonynpavlov@gmail.com>
+S: Maintained
+F: include/hw/arm/digic.h
+F: hw/*/digic*
+
 Gumstix
 M: qemu-devel@nongnu.org
 S: Orphan
@@ -500,9 +513,23 @@ X86 Machines
 ------------
 PC
 M: Anthony Liguori <aliguori@amazon.com>
+M: Michael S. Tsirkin <mst@redhat.com>
 S: Supported
-F: hw/i386/pc.[ch]
-F: hw/i386/pc_piix.c
+F: include/hw/i386/
+F: hw/i386/
+F: hw/pci-host/piix.c
+F: hw/pci-host/q35.c
+F: hw/pci-host/pam.c
+F: include/hw/pci-host/q35.h
+F: include/hw/pci-host/pam.h
+F: hw/isa/piix4.c
+F: hw/isa/lpc_ich9.c
+F: hw/i2c/smbus_ich9.c
+F: hw/acpi/piix4.c
+F: hw/acpi/ich9.c
+F: include/hw/acpi/ich9.h
+F: include/hw/acpi/piix.h
+

 Xtensa Machines
 ---------------
@@ -710,6 +737,14 @@ S: Maintained
 F: net/
 T: git git://github.com/stefanha/qemu.git net

+Netmap network backend
+M: Luigi Rizzo <rizzo@iet.unipi.it>
+M: Giuseppe Lettieri <g.lettieri@iet.unipi.it>
+M: Vincenzo Maffione <v.maffione@gmail.com>
+W: http://info.iet.unipi.it/~luigi/netmap/
+S: Maintained
+F: net/netmap.c
+
 Network Block Device (NBD)
 M: Paolo Bonzini <pbonzini@redhat.com>
 S: Odd Fixes
@@ -879,6 +914,7 @@ F: block/rbd.c
 Sheepdog
 M: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
 M: Liu Yuan <namei.unix@gmail.com>
+L: sheepdog@lists.wpkg.org
 S: Supported
 F: block/sheepdog.c

--- a/2
+++ b/2
@@ -1 +1 @@
-1.7.2
+1.7.50
--- a/aio-posix.c
+++ b/aio-posix.c
@@ -217,11 +217,6 @@ bool aio_poll(AioContext *ctx, bool blocking)

    ctx->walking_handlers--;

-    /* early return if we only have the aio_notify() fd */
-    if (ctx->pollfds->len == 1) {
-        return progress;
-    }
-
    /* wait until next event */
    ret = qemu_poll_ns((GPollFD *)ctx->pollfds->data,
                         ctx->pollfds->len,
--- a/aio-win32.c
+++ b/aio-win32.c
@@ -161,11 +161,6 @@ bool aio_poll(AioContext *ctx, bool blocking)

    ctx->walking_handlers--;

-    /* early return if we only have the aio_notify() fd */
-    if (count == 1) {
-        return progress;
-    }
-
    /* wait until next event */
    while (count > 0) {
        int ret;
--- a/arch_init.c
+++ b/arch_init.c
@@ -857,60 +857,64 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
 {
    ram_addr_t addr;
    int flags, ret = 0;
+    int error;
    static uint64_t seq_iter;

    seq_iter++;

-    if (version_id != 4) {
+    if (version_id < 4 || version_id > 4) {
        return -EINVAL;
    }

-    while (!ret) {
+    do {
        addr = qemu_get_be64(f);

        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        if (flags & RAM_SAVE_FLAG_MEM_SIZE) {
-            /* Synchronize RAM block list */
-            char id[256];
-            ram_addr_t length;
-            ram_addr_t total_ram_bytes = addr;
+            if (version_id == 4) {
+                /* Synchronize RAM block list */
+                char id[256];
+                ram_addr_t length;
+                ram_addr_t total_ram_bytes = addr;

-            while (total_ram_bytes) {
-                RAMBlock *block;
-                uint8_t len;
+                while (total_ram_bytes) {
+                    RAMBlock *block;
+                    uint8_t len;

-                len = qemu_get_byte(f);
-                qemu_get_buffer(f, (uint8_t *)id, len);
-                id[len] = 0;
-                length = qemu_get_be64(f);
+                    len = qemu_get_byte(f);
+                    qemu_get_buffer(f, (uint8_t *)id, len);
+                    id[len] = 0;
+                    length = qemu_get_be64(f);

-                QTAILQ_FOREACH(block, &ram_list.blocks, next) {
-                    if (!strncmp(id, block->idstr, sizeof(id))) {
-                        if (block->length != length) {
-                            fprintf(stderr,
-                                    "Length mismatch: %s: " RAM_ADDR_FMT
-                                    " in != " RAM_ADDR_FMT "\n", id, length,
-                                    block->length);
-                            ret =  -EINVAL;
+                    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
+                        if (!strncmp(id, block->idstr, sizeof(id))) {
+                            if (block->length != length) {
+                                fprintf(stderr,
+                                        "Length mismatch: %s: " RAM_ADDR_FMT
+                                        " in != " RAM_ADDR_FMT "\n", id, length,
+                                        block->length);
+                                ret =  -EINVAL;
+                                goto done;
+                            }
+                            break;
                        }
-                        break;
                    }
-                }

-                if (!block) {
-                    fprintf(stderr, "Unknown ramblock \"%s\", cannot "
-                            "accept migration\n", id);
-                    ret = -EINVAL;
-                }
-                if (ret) {
-                    break;
-                }
+                    if (!block) {
+                        fprintf(stderr, "Unknown ramblock \"%s\", cannot "
+                                "accept migration\n", id);
+                        ret = -EINVAL;
+                        goto done;
+                    }

-                total_ram_bytes -= length;
+                    total_ram_bytes -= length;
+                }
            }
-        } else if (flags & RAM_SAVE_FLAG_COMPRESS) {
+        }
+
+        if (flags & RAM_SAVE_FLAG_COMPRESS) {
            void *host;
            uint8_t ch;

@@ -937,24 +941,20 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
            }

            if (load_xbzrle(f, addr, host) < 0) {
-                error_report("Failed to decompress XBZRLE page at "
-                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
-                break;
+                goto done;
            }
        } else if (flags & RAM_SAVE_FLAG_HOOK) {
            ram_control_load_hook(f, flags);
-        } else if (flags & RAM_SAVE_FLAG_EOS) {
-            /* normal exit */
-            break;
-        } else {
-            error_report("Unknown migration flags: %#x", flags);
-            ret = -EINVAL;
-            break;
        }
-        ret = qemu_file_get_error(f);
-    }
+        error = qemu_file_get_error(f);
+        if (error) {
+            ret = error;
+            goto done;
+        }
+    } while (!(flags & RAM_SAVE_FLAG_EOS));

+done:
    DPRINTF("Completed load of VM with exit code %d seq iteration "
            "%" PRIu64 "\n", ret, seq_iter);
    return ret;
--- a/async.c
+++ b/async.c
@@ -117,21 +117,15 @@ void qemu_bh_schedule_idle(QEMUBH *bh)

 void qemu_bh_schedule(QEMUBH *bh)
 {
-    AioContext *ctx;
-
    if (bh->scheduled)
        return;
-    ctx = bh->ctx;
    bh->idle = 0;
-    /* Make sure that:
-     * 1. idle & any writes needed by the callback are done before the
-     *    locations are read in the aio_bh_poll.
-     * 2. ctx is loaded before scheduled is set and the callback has a chance
-     *    to execute.
+    /* Make sure that idle & any writes needed by the callback are done
+     * before the locations are read in the aio_bh_poll.
     */
-    smp_mb();
+    smp_wmb();
    bh->scheduled = 1;
-    aio_notify(ctx);
+    aio_notify(bh->ctx);
 }


--- a/audio/audio.c
+++ b/audio/audio.c
@@ -95,7 +95,7 @@ static struct {
        }
    },

-    .period = { .hertz = 250 },
+    .period = { .hertz = 100 },
    .plive = 0,
    .log_to_monitor = 0,
    .try_poll_in = 1,
--- a/audio/paaudio.c
+++ b/audio/paaudio.c
@@ -547,11 +547,11 @@ static int qpa_init_out (HWVoiceOut *hw, struct audsettings *as)
    ss.rate = as->freq;

    /*
-     * qemu audio tick runs at 250 Hz (by default), so processing
-     * data chunks worth 4 ms of sound should be a good fit.
+     * qemu audio tick runs at 100 Hz (by default), so processing
+     * data chunks worth 10 ms of sound should be a good fit.
     */
-    ba.tlength = pa_usec_to_bytes (4 * 1000, &ss);
-    ba.minreq = pa_usec_to_bytes (2 * 1000, &ss);
+    ba.tlength = pa_usec_to_bytes (10 * 1000, &ss);
+    ba.minreq = pa_usec_to_bytes (5 * 1000, &ss);
    ba.maxlength = -1;
    ba.prebuf = -1;

--- a/block-migration.c
+++ b/block-migration.c
@@ -58,6 +58,7 @@ typedef struct BlkMigDevState {
    /* Protected by block migration lock.  */
    unsigned long *aio_bitmap;
    int64_t completed_sectors;
+    BdrvDirtyBitmap *dirty_bitmap;
 } BlkMigDevState;

 typedef struct BlkMigBlock {
@@ -309,12 +310,21 @@ static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)

 /* Called with iothread lock taken.  */

-static void set_dirty_tracking(int enable)
+static void set_dirty_tracking(void)
 {
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
-        bdrv_set_dirty_tracking(bmds->bs, enable ? BLOCK_SIZE : 0);
+        bmds->dirty_bitmap = bdrv_create_dirty_bitmap(bmds->bs, BLOCK_SIZE);
+    }
+}
+
+static void unset_dirty_tracking(void)
+{
+    BlkMigDevState *bmds;
+
+    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
+        bdrv_release_dirty_bitmap(bmds->bs, bmds->dirty_bitmap);
    }
 }

@@ -432,7 +442,7 @@ static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds,
        } else {
            blk_mig_unlock();
        }
-        if (bdrv_get_dirty(bmds->bs, sector)) {
+        if (bdrv_get_dirty(bmds->bs, bmds->dirty_bitmap, sector)) {

            if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - sector;
@@ -554,7 +564,7 @@ static int64_t get_remaining_dirty(void)
    int64_t dirty = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
-        dirty += bdrv_get_dirty_count(bmds->bs);
+        dirty += bdrv_get_dirty_count(bmds->bs, bmds->dirty_bitmap);
    }

    return dirty << BDRV_SECTOR_BITS;
@@ -569,7 +579,7 @@ static void blk_mig_cleanup(void)

    bdrv_drain_all();

-    set_dirty_tracking(0);
+    unset_dirty_tracking();

    blk_mig_lock();
    while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
@@ -604,7 +614,7 @@ static int block_save_setup(QEMUFile *f, void *opaque)
    init_blk_migration(f);

    /* start track dirty blocks */
-    set_dirty_tracking(1);
+    set_dirty_tracking();
    qemu_mutex_unlock_iothread();

    ret = flush_blks(f);
@@ -780,7 +790,8 @@ static int block_load(QEMUFile *f, void *opaque, int version_id)
            }

            if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
-                ret = bdrv_write_zeroes(bs, addr, nr_sectors);
+                ret = bdrv_write_zeroes(bs, addr, nr_sectors,
+                                        BDRV_REQ_MAY_UNMAP);
            } else {
                buf = g_malloc(BLOCK_SIZE);
                qemu_get_buffer(f, buf, BLOCK_SIZE);
--- a/block.c
+++ b/block.c
@@ -49,12 +49,12 @@
 #include <windows.h>
 #endif

-#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
+struct BdrvDirtyBitmap {
+    HBitmap *bitmap;
+    QLIST_ENTRY(BdrvDirtyBitmap) list;
+};

-typedef enum {
-    BDRV_REQ_COPY_ON_READ = 0x1,
-    BDRV_REQ_ZERO_WRITE   = 0x2,
-} BdrvRequestFlags;
+#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
@@ -79,12 +79,13 @@ static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
+                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
 static void coroutine_fn bdrv_co_do_rw(void *opaque);
 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors);
+    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);
@@ -323,6 +324,7 @@ BlockDriverState *bdrv_new(const char *device_name)
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
+    QLIST_INIT(&bs->dirty_bitmaps);
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
@@ -966,14 +968,14 @@ fail:
 */
 int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
 {
-    char *backing_filename = g_malloc0(PATH_MAX);
-    int back_flags, ret = 0;
+    char backing_filename[PATH_MAX];
+    int back_flags, ret;
    BlockDriver *back_drv = NULL;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
-        goto free_exit;
+        return 0;
    }

    /* NULL means an empty set of options */
@@ -986,9 +988,10 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
-        goto free_exit;
+        return 0;
    } else {
-        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
+        bdrv_get_full_backing_filename(bs, backing_filename,
+                                       sizeof(backing_filename));
    }

    bs->backing_hd = bdrv_new("");
@@ -1011,14 +1014,11 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
-        goto free_exit;
+        return ret;
    }
    pstrcpy(bs->backing_file, sizeof(bs->backing_file),
            bs->backing_hd->file->filename);
-    ret = 0;
-free_exit:
-    g_free(backing_filename);
-    return ret;
+    return 0;
 }

 /*
@@ -1034,8 +1034,7 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
 {
    int ret;
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
-    char *backing_filename = NULL;
-    char *tmp_filename = g_malloc0(PATH_MAX + 1);
+    char tmp_filename[PATH_MAX + 1];
    BlockDriverState *file = NULL;
    QDict *file_options = NULL;
    const char *drvname;
@@ -1055,21 +1054,16 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
        int64_t total_size;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *create_options;
-        backing_filename = g_malloc0(PATH_MAX);
-
-        if (qdict_size(options) != 0) {
-            error_setg(errp, "Can't use snapshot=on with driver-specific options");
-            ret = -EINVAL;
-            goto fail;
-        }
-        assert(filename != NULL);
+        QDict *snapshot_options;

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

+        /* Get the required size from the image */
        bs1 = bdrv_new("");
-        ret = bdrv_open(bs1, filename, NULL, BDRV_O_NO_BACKING, drv,
-                        &local_err);
+        QINCREF(options);
+        ret = bdrv_open(bs1, filename, options, BDRV_O_NO_BACKING,
+                        drv, &local_err);
        if (ret < 0) {
            bdrv_unref(bs1);
            goto fail;
@@ -1078,32 +1072,18 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,

        bdrv_unref(bs1);

-        ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
+        /* Create the temporary image */
+        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not get temporary filename");
            goto fail;
        }

-        /* Real path is meaningless for protocols */
-        if (path_has_protocol(filename)) {
-            snprintf(backing_filename, PATH_MAX, "%s", filename);
-        } else if (!realpath(filename, backing_filename)) {
-            ret = -errno;
-            error_setg_errno(errp, errno, "Could not resolve path '%s'", filename);
-            goto fail;
-        }
-
        bdrv_qcow2 = bdrv_find_format("qcow2");
        create_options = parse_option_parameters("", bdrv_qcow2->create_options,
                                                 NULL);

        set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);
-        set_option_parameter(create_options, BLOCK_OPT_BACKING_FILE,
-                             backing_filename);
-        if (drv) {
-            set_option_parameter(create_options, BLOCK_OPT_BACKING_FMT,
-                drv->format_name);
-        }

        ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
        free_option_parameters(create_options);
@@ -1116,6 +1096,22 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
            goto fail;
        }

+        /* Prepare a new options QDict for the temporary file, where user
+         * options refer to the backing file */
+        if (filename) {
+            qdict_put(options, "file.filename", qstring_from_str(filename));
+        }
+        if (drv) {
+            qdict_put(options, "driver", qstring_from_str(drv->format_name));
+        }
+
+        snapshot_options = qdict_new();
+        qdict_put(snapshot_options, "backing", options);
+        qdict_flatten(snapshot_options);
+
+        bs->options = snapshot_options;
+        options = qdict_clone_shallow(bs->options);
+
        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
@@ -1208,8 +1204,6 @@ fail:
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
-    g_free(tmp_filename);
-    g_free(backing_filename);
    return ret;

 close_and_fail:
@@ -1218,8 +1212,6 @@ close_and_fail:
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
-    g_free(tmp_filename);
-    g_free(backing_filename);
    return ret;
 }

@@ -1565,13 +1557,8 @@ void bdrv_drain_all(void)
    BlockDriverState *bs;

    while (busy) {
-        /* FIXME: We do not have timer support here, so this is effectively
-         * a busy wait.
-         */
        QTAILQ_FOREACH(bs, &bdrv_states, list) {
-            if (bdrv_start_throttled_reqs(bs)) {
-                busy = true;
-            }
+            bdrv_start_throttled_reqs(bs);
        }

        busy = bdrv_requests_pending_all();
@@ -1628,7 +1615,7 @@ static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
    bs_dest->iostatus           = bs_src->iostatus;

    /* dirty bitmap */
-    bs_dest->dirty_bitmap       = bs_src->dirty_bitmap;
+    bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;

    /* reference count */
    bs_dest->refcnt             = bs_src->refcnt;
@@ -1661,7 +1648,7 @@ void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)

    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
    assert(bs_new->device_name[0] == '\0');
-    assert(bs_new->dirty_bitmap == NULL);
+    assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
    assert(bs_new->job == NULL);
    assert(bs_new->dev == NULL);
    assert(bs_new->in_use == 0);
@@ -1722,6 +1709,7 @@ static void bdrv_delete(BlockDriverState *bs)
    assert(!bs->job);
    assert(!bs->in_use);
    assert(!bs->refcnt);
+    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    bdrv_close(bs);

@@ -2277,10 +2265,6 @@ static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
 {
-    if (nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
-        return -EIO;
-    }
-
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
 }
@@ -2407,10 +2391,53 @@ int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov)
    return bdrv_rwv_co(bs, sector_num, qiov, true, 0);
 }

-int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
+int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
+                      int nb_sectors, BdrvRequestFlags flags)
 {
    return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
-                      BDRV_REQ_ZERO_WRITE);
+                      BDRV_REQ_ZERO_WRITE | flags);
+}
+
+/*
+ * Completely zero out a block device with the help of bdrv_write_zeroes.
+ * The operation is sped up by checking the block status and only writing
+ * zeroes to the device if they currently do not return zeroes. Optional
+ * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
+ *
+ * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
+ */
+int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
+{
+    int64_t target_size = bdrv_getlength(bs) / BDRV_SECTOR_SIZE;
+    int64_t ret, nb_sectors, sector_num = 0;
+    int n;
+
+    for (;;) {
+        nb_sectors = target_size - sector_num;
+        if (nb_sectors <= 0) {
+            return 0;
+        }
+        if (nb_sectors > INT_MAX) {
+            nb_sectors = INT_MAX;
+        }
+        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
+        if (ret < 0) {
+            error_report("error getting block status at sector %" PRId64 ": %s",
+                         sector_num, strerror(-ret));
+            return ret;
+        }
+        if (ret & BDRV_BLOCK_ZERO) {
+            sector_num += n;
+            continue;
+        }
+        ret = bdrv_write_zeroes(bs, sector_num, n, flags);
+        if (ret < 0) {
+            error_report("error writing zeroes at sector %" PRId64 ": %s",
+                         sector_num, strerror(-ret));
+            return ret;
+        }
+        sector_num += n;
+    }
 }

 int bdrv_pread(BlockDriverState *bs, int64_t offset,
@@ -2592,7 +2619,7 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
-                                      cluster_nb_sectors);
+                                      cluster_nb_sectors, 0);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
@@ -2725,33 +2752,77 @@ int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
                            BDRV_REQ_COPY_ON_READ);
 }

+/* if no limit is specified in the BlockLimits use a default
+ * of 32768 512-byte sectors (16 MiB) per request.
+ */
+#define MAX_WRITE_ZEROES_DEFAULT 32768
+
 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors)
+    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
 {
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
-    struct iovec iov;
-    int ret;
+    struct iovec iov = {0};
+    int ret = 0;

-    /* TODO Emulate only part of misaligned requests instead of letting block
-     * drivers return -ENOTSUP and emulate everything */
+    int max_write_zeroes = bs->bl.max_write_zeroes ?
+                           bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;

-    /* First try the efficient write zeroes operation */
-    if (drv->bdrv_co_write_zeroes) {
-        ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
-        if (ret != -ENOTSUP) {
-            return ret;
+    while (nb_sectors > 0 && !ret) {
+        int num = nb_sectors;
+
+        /* Align request.  Block drivers can expect the "bulk" of the request
+         * to be aligned.
+         */
+        if (bs->bl.write_zeroes_alignment
+            && num > bs->bl.write_zeroes_alignment) {
+            if (sector_num % bs->bl.write_zeroes_alignment != 0) {
+                /* Make a small request up to the first aligned sector.  */
+                num = bs->bl.write_zeroes_alignment;
+                num -= sector_num % bs->bl.write_zeroes_alignment;
+            } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
+                /* Shorten the request to the last aligned sector.  num cannot
+                 * underflow because num > bs->bl.write_zeroes_alignment.
+                 */
+                num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
+            }
        }
+
+        /* limit request size */
+        if (num > max_write_zeroes) {
+            num = max_write_zeroes;
+        }
+
+        ret = -ENOTSUP;
+        /* First try the efficient write zeroes operation */
+        if (drv->bdrv_co_write_zeroes) {
+            ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
+        }
+
+        if (ret == -ENOTSUP) {
+            /* Fall back to bounce buffer if write zeroes is unsupported */
+            iov.iov_len = num * BDRV_SECTOR_SIZE;
+            if (iov.iov_base == NULL) {
+                iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
+                memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
+            }
+            qemu_iovec_init_external(&qiov, &iov, 1);
+
+            ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
+
+            /* Keep bounce buffer around if it is big enough for all
+             * all future requests.
+             */
+            if (num < max_write_zeroes) {
+                qemu_vfree(iov.iov_base);
+                iov.iov_base = NULL;
+            }
+        }
+
+        sector_num += num;
+        nb_sectors -= num;
    }

-    /* Fall back to bounce buffer if write zeroes is unsupported */
-    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
-    iov.iov_base = qemu_blockalign(bs, iov.iov_len);
-    memset(iov.iov_base, 0, iov.iov_len);
-    qemu_iovec_init_external(&qiov, &iov, 1);
-
-    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);
-
    qemu_vfree(iov.iov_base);
    return ret;
 }
@@ -2793,7 +2864,7 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
-        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
+        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
    } else {
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }
@@ -2802,9 +2873,7 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
        ret = bdrv_co_flush(bs);
    }

-    if (bs->dirty_bitmap) {
-        bdrv_set_dirty(bs, sector_num, nb_sectors);
-    }
+    bdrv_set_dirty(bs, sector_num, nb_sectors);

    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
@@ -2827,12 +2896,17 @@ int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
 }

 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
-                                      int64_t sector_num, int nb_sectors)
+                                      int64_t sector_num, int nb_sectors,
+                                      BdrvRequestFlags flags)
 {
-    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
+    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
+
+    if (!(bs->open_flags & BDRV_O_UNMAP)) {
+        flags &= ~BDRV_REQ_MAY_UNMAP;
+    }

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
-                             BDRV_REQ_ZERO_WRITE);
+                             BDRV_REQ_ZERO_WRITE | flags);
 }

 /**
@@ -3112,6 +3186,36 @@ int bdrv_has_zero_init(BlockDriverState *bs)
    return 0;
 }

+bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
+{
+    BlockDriverInfo bdi;
+
+    if (bs->backing_hd) {
+        return false;
+    }
+
+    if (bdrv_get_info(bs, &bdi) == 0) {
+        return bdi.unallocated_blocks_are_zero;
+    }
+
+    return false;
+}
+
+bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
+{
+    BlockDriverInfo bdi;
+
+    if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
+        return false;
+    }
+
+    if (bdrv_get_info(bs, &bdi) == 0) {
+        return bdi.can_write_zeroes_with_unmap;
+    }
+
+    return false;
+}
+
 typedef struct BdrvCoGetBlockStatusData {
    BlockDriverState *bs;
    BlockDriverState *base;
@@ -3181,8 +3285,8 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
                                     *pnum, pnum);
    }

-    if (!(ret & BDRV_BLOCK_DATA)) {
-        if (bdrv_has_zero_init(bs)) {
+    if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
+        if (bdrv_unallocated_blocks_are_zero(bs)) {
            ret |= BDRV_BLOCK_ZERO;
        } else if (bs->backing_hd) {
            BlockDriverState *bs2 = bs->backing_hd;
@@ -3340,7 +3444,7 @@ int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
    if (bdrv_check_request(bs, sector_num, nb_sectors))
        return -EIO;

-    assert(!bs->dirty_bitmap);
+    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
 }
@@ -3429,6 +3533,19 @@ int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
    return -ENOTSUP;
 }

+int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
+{
+    while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
+        bs = bs->file;
+    }
+
+    if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
+        return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
+    }
+
+    return -ENOTSUP;
+}
+
 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
 {
    while (bs && bs->drv && !bs->drv->bdrv_debug_resume) {
@@ -3564,7 +3681,7 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
 {
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

-    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
+    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, false);
 }

@@ -3574,7 +3691,18 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
 {
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

-    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
+    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
+                                 cb, opaque, true);
+}
+
+BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
+
+    return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
+                                 BDRV_REQ_ZERO_WRITE | flags,
                                 cb, opaque, true);
 }

@@ -3746,8 +3874,10 @@ int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
-        bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
-            reqs[i].nb_sectors, multiwrite_cb, mcb);
+        bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
+                              reqs[i].nb_sectors, reqs[i].flags,
+                              multiwrite_cb, mcb,
+                              true);
    }

    return 0;
@@ -3889,10 +4019,10 @@ static void coroutine_fn bdrv_co_do_rw(void *opaque)

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
-            acb->req.nb_sectors, acb->req.qiov, 0);
+            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
-            acb->req.nb_sectors, acb->req.qiov, 0);
+            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    }

    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
@@ -3903,6 +4033,7 @@ static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
+                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write)
@@ -3914,6 +4045,7 @@ static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
+    acb->req.flags = flags;
    acb->is_write = is_write;
    acb->done = NULL;

@@ -4189,9 +4321,16 @@ static void coroutine_fn bdrv_discard_co_entry(void *opaque)
    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
 }

+/* if no limit is specified in the BlockLimits use a default
+ * of 32768 512-byte sectors (16 MiB) per request.
+ */
+#define MAX_DISCARD_DEFAULT 32768
+
 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
 {
+    int max_discard;
+
    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
@@ -4200,34 +4339,62 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
        return -EROFS;
    }

-    if (bs->dirty_bitmap) {
-        bdrv_reset_dirty(bs, sector_num, nb_sectors);
-    }
+    bdrv_reset_dirty(bs, sector_num, nb_sectors);

    /* Do nothing if disabled.  */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

-    if (bs->drv->bdrv_co_discard) {
-        return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
-    } else if (bs->drv->bdrv_aio_discard) {
-        BlockDriverAIOCB *acb;
-        CoroutineIOCompletion co = {
-            .coroutine = qemu_coroutine_self(),
-        };
-
-        acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
-                                        bdrv_co_io_em_complete, &co);
-        if (acb == NULL) {
-            return -EIO;
-        } else {
-            qemu_coroutine_yield();
-            return co.ret;
-        }
-    } else {
+    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
        return 0;
    }
+
+    max_discard = bs->bl.max_discard ?  bs->bl.max_discard : MAX_DISCARD_DEFAULT;
+    while (nb_sectors > 0) {
+        int ret;
+        int num = nb_sectors;
+
+        /* align request */
+        if (bs->bl.discard_alignment &&
+            num >= bs->bl.discard_alignment &&
+            sector_num % bs->bl.discard_alignment) {
+            if (num > bs->bl.discard_alignment) {
+                num = bs->bl.discard_alignment;
+            }
+            num -= sector_num % bs->bl.discard_alignment;
+        }
+
+        /* limit request size */
+        if (num > max_discard) {
+            num = max_discard;
+        }
+
+        if (bs->drv->bdrv_co_discard) {
+            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
+        } else {
+            BlockDriverAIOCB *acb;
+            CoroutineIOCompletion co = {
+                .coroutine = qemu_coroutine_self(),
+            };
+
+            acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
+                                            bdrv_co_io_em_complete, &co);
+            if (acb == NULL) {
+                return -EIO;
+            } else {
+                qemu_coroutine_yield();
+                ret = co.ret;
+            }
+        }
+        if (ret && ret != -ENOTSUP) {
+            return ret;
+        }
+
+        sector_num += num;
+        nb_sectors -= num;
+    }
+    return 0;
 }

 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
@@ -4364,60 +4531,92 @@ bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
    return true;
 }

-void bdrv_set_dirty_tracking(BlockDriverState *bs, int granularity)
+BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity)
 {
    int64_t bitmap_size;
+    BdrvDirtyBitmap *bitmap;

    assert((granularity & (granularity - 1)) == 0);

-    if (granularity) {
-        granularity >>= BDRV_SECTOR_BITS;
-        assert(!bs->dirty_bitmap);
-        bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
-        bs->dirty_bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
-    } else {
-        if (bs->dirty_bitmap) {
-            hbitmap_free(bs->dirty_bitmap);
-            bs->dirty_bitmap = NULL;
+    granularity >>= BDRV_SECTOR_BITS;
+    assert(granularity);
+    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
+    bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
+    bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
+    QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
+    return bitmap;
+}
+
+void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
+{
+    BdrvDirtyBitmap *bm, *next;
+    QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
+        if (bm == bitmap) {
+            QLIST_REMOVE(bitmap, list);
+            hbitmap_free(bitmap->bitmap);
+            g_free(bitmap);
+            return;
        }
    }
 }

-int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
+BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
 {
-    if (bs->dirty_bitmap) {
-        return hbitmap_get(bs->dirty_bitmap, sector);
+    BdrvDirtyBitmap *bm;
+    BlockDirtyInfoList *list = NULL;
+    BlockDirtyInfoList **plist = &list;
+
+    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
+        BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
+        BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
+        info->count = bdrv_get_dirty_count(bs, bm);
+        info->granularity =
+            ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
+        entry->value = info;
+        *plist = entry;
+        plist = &entry->next;
+    }
+
+    return list;
+}
+
+int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
+{
+    if (bitmap) {
+        return hbitmap_get(bitmap->bitmap, sector);
    } else {
        return 0;
    }
 }

-void bdrv_dirty_iter_init(BlockDriverState *bs, HBitmapIter *hbi)
+void bdrv_dirty_iter_init(BlockDriverState *bs,
+                          BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
 {
-    hbitmap_iter_init(hbi, bs->dirty_bitmap, 0);
+    hbitmap_iter_init(hbi, bitmap->bitmap, 0);
 }

 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                    int nr_sectors)
 {
-    hbitmap_set(bs->dirty_bitmap, cur_sector, nr_sectors);
-}
-
-void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
-                      int nr_sectors)
-{
-    hbitmap_reset(bs->dirty_bitmap, cur_sector, nr_sectors);
-}
-
-int64_t bdrv_get_dirty_count(BlockDriverState *bs)
-{
-    if (bs->dirty_bitmap) {
-        return hbitmap_count(bs->dirty_bitmap);
-    } else {
-        return 0;
+    BdrvDirtyBitmap *bitmap;
+    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
+        hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
    }
 }

+void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
+{
+    BdrvDirtyBitmap *bitmap;
+    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
+        hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
+    }
+}
+
+int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
+{
+    return hbitmap_count(bitmap->bitmap);
+}
+
 /* Get a reference to bs */
 void bdrv_ref(BlockDriverState *bs)
 {
@@ -4514,7 +4713,6 @@ void bdrv_img_create(const char *filename, const char *fmt,
 {
    QEMUOptionParameter *param = NULL, *create_options = NULL;
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
-    BlockDriverState *bs = NULL;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    Error *local_err = NULL;
@@ -4593,6 +4791,7 @@ void bdrv_img_create(const char *filename, const char *fmt,
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
    if (size && size->value.n == -1) {
        if (backing_file && backing_file->value.s) {
+            BlockDriverState *bs;
            uint64_t size;
            char buf[32];
            int back_flags;
@@ -4611,6 +4810,7 @@ void bdrv_img_create(const char *filename, const char *fmt,
                                 error_get_pretty(local_err));
                error_free(local_err);
                local_err = NULL;
+                bdrv_unref(bs);
                goto out;
            }
            bdrv_get_geometry(bs, &size);
@@ -4618,6 +4818,8 @@ void bdrv_img_create(const char *filename, const char *fmt,

            snprintf(buf, sizeof(buf), "%" PRId64, size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
+
+            bdrv_unref(bs);
        } else {
            error_setg(errp, "Image creation needs a size parameter");
            goto out;
@@ -4648,9 +4850,6 @@ out:
    free_option_parameters(create_options);
    free_option_parameters(param);

-    if (bs) {
-        bdrv_unref(bs);
-    }
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -10,7 +10,7 @@ block-obj-$(CONFIG_POSIX) += raw-posix.o
 block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o

 ifeq ($(CONFIG_POSIX),y)
-block-obj-y += nbd.o sheepdog.o
+block-obj-y += nbd.o nbd-client.o sheepdog.o
 block-obj-$(CONFIG_LIBISCSI) += iscsi.o
 block-obj-$(CONFIG_CURL) += curl.o
 block-obj-$(CONFIG_RBD) += rbd.o
--- a/block/backup.c
+++ b/block/backup.c
@@ -138,7 +138,8 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,

        if (buffer_is_zero(iov.iov_base, iov.iov_len)) {
            ret = bdrv_co_write_zeroes(job->target,
-                                       start * BACKUP_SECTORS_PER_CLUSTER, n);
+                                       start * BACKUP_SECTORS_PER_CLUSTER,
+                                       n, BDRV_REQ_MAY_UNMAP);
        } else {
            ret = bdrv_co_writev(job->target,
                                 start * BACKUP_SECTORS_PER_CLUSTER, n,
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -594,9 +594,9 @@ static int blkdebug_debug_breakpoint(BlockDriverState *bs, const char *event,
 static int blkdebug_debug_resume(BlockDriverState *bs, const char *tag)
 {
    BDRVBlkdebugState *s = bs->opaque;
-    BlkdebugSuspendedReq *r;
+    BlkdebugSuspendedReq *r, *next;

-    QLIST_FOREACH(r, &s->suspended_reqs, next) {
+    QLIST_FOREACH_SAFE(r, &s->suspended_reqs, next, next) {
        if (!strcmp(r->tag, tag)) {
            qemu_coroutine_enter(r->co, NULL);
            return 0;
@@ -605,6 +605,31 @@ static int blkdebug_debug_resume(BlockDriverState *bs, const char *tag)
    return -ENOENT;
 }

+static int blkdebug_debug_remove_breakpoint(BlockDriverState *bs,
+                                            const char *tag)
+{
+    BDRVBlkdebugState *s = bs->opaque;
+    BlkdebugSuspendedReq *r, *r_next;
+    BlkdebugRule *rule, *next;
+    int i, ret = -ENOENT;
+
+    for (i = 0; i < BLKDBG_EVENT_MAX; i++) {
+        QLIST_FOREACH_SAFE(rule, &s->rules[i], next, next) {
+            if (rule->action == ACTION_SUSPEND &&
+                !strcmp(rule->options.suspend.tag, tag)) {
+                remove_rule(rule);
+                ret = 0;
+            }
+        }
+    }
+    QLIST_FOREACH_SAFE(r, &s->suspended_reqs, next, r_next) {
+        if (!strcmp(r->tag, tag)) {
+            qemu_coroutine_enter(r->co, NULL);
+            ret = 0;
+        }
+    }
+    return ret;
+}

 static bool blkdebug_debug_is_suspended(BlockDriverState *bs, const char *tag)
 {
@@ -639,6 +664,8 @@ static BlockDriver bdrv_blkdebug = {

    .bdrv_debug_event           = blkdebug_debug_event,
    .bdrv_debug_breakpoint      = blkdebug_debug_breakpoint,
+    .bdrv_debug_remove_breakpoint
+                                = blkdebug_debug_remove_breakpoint,
    .bdrv_debug_resume          = blkdebug_debug_resume,
    .bdrv_debug_is_suspended    = blkdebug_debug_is_suspended,
 };
--- a/block/bochs.c
+++ b/block/bochs.c
@@ -39,41 +39,56 @@
 // not allocated: 0xffffffff

 // always little-endian
-struct bochs_header {
-    char magic[32];     /* "Bochs Virtual HD Image" */
-    char type[16];      /* "Redolog" */
-    char subtype[16];   /* "Undoable" / "Volatile" / "Growing" */
+struct bochs_header_v1 {
+    char magic[32]; // "Bochs Virtual HD Image"
+    char type[16]; // "Redolog"
+    char subtype[16]; // "Undoable" / "Volatile" / "Growing"
    uint32_t version;
-    uint32_t header;    /* size of header */
-
-    uint32_t catalog;   /* num of entries */
-    uint32_t bitmap;    /* bitmap size */
-    uint32_t extent;    /* extent size */
+    uint32_t header; // size of header

    union {
-        struct {
-            uint32_t reserved;  /* for ??? */
-            uint64_t disk;      /* disk size */
-            char padding[HEADER_SIZE - 64 - 20 - 12];
-        } QEMU_PACKED redolog;
-        struct {
-            uint64_t disk;      /* disk size */
-            char padding[HEADER_SIZE - 64 - 20 - 8];
-        } QEMU_PACKED redolog_v1;
-        char padding[HEADER_SIZE - 64 - 20];
+	struct {
+	    uint32_t catalog; // num of entries
+	    uint32_t bitmap; // bitmap size
+	    uint32_t extent; // extent size
+	    uint64_t disk; // disk size
+	    char padding[HEADER_SIZE - 64 - 8 - 20];
+	} redolog;
+	char padding[HEADER_SIZE - 64 - 8];
    } extra;
-} QEMU_PACKED;
+};
+
+// always little-endian
+struct bochs_header {
+    char magic[32]; // "Bochs Virtual HD Image"
+    char type[16]; // "Redolog"
+    char subtype[16]; // "Undoable" / "Volatile" / "Growing"
+    uint32_t version;
+    uint32_t header; // size of header
+
+    union {
+	struct {
+	    uint32_t catalog; // num of entries
+	    uint32_t bitmap; // bitmap size
+	    uint32_t extent; // extent size
+	    uint32_t reserved; // for ???
+	    uint64_t disk; // disk size
+	    char padding[HEADER_SIZE - 64 - 8 - 24];
+	} redolog;
+	char padding[HEADER_SIZE - 64 - 8];
+    } extra;
+};

 typedef struct BDRVBochsState {
    CoMutex lock;
    uint32_t *catalog_bitmap;
-    uint32_t catalog_size;
+    int catalog_size;

-    uint32_t data_offset;
+    int data_offset;

-    uint32_t bitmap_blocks;
-    uint32_t extent_blocks;
-    uint32_t extent_size;
+    int bitmap_blocks;
+    int extent_blocks;
+    int extent_size;
 } BDRVBochsState;

 static int bochs_probe(const uint8_t *buf, int buf_size, const char *filename)
@@ -97,8 +112,9 @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags,
                      Error **errp)
 {
    BDRVBochsState *s = bs->opaque;
-    uint32_t i;
+    int i;
    struct bochs_header bochs;
+    struct bochs_header_v1 header_v1;
    int ret;

    bs->read_only = 1; // no write support yet
@@ -117,19 +133,13 @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags,
    }

    if (le32_to_cpu(bochs.version) == HEADER_V1) {
-        bs->total_sectors = le64_to_cpu(bochs.extra.redolog_v1.disk) / 512;
+      memcpy(&header_v1, &bochs, sizeof(bochs));
+      bs->total_sectors = le64_to_cpu(header_v1.extra.redolog.disk) / 512;
    } else {
-        bs->total_sectors = le64_to_cpu(bochs.extra.redolog.disk) / 512;
-    }
-
-    /* Limit to 1M entries to avoid unbounded allocation. This is what is
-     * needed for the largest image that bximage can create (~8 TB). */
-    s->catalog_size = le32_to_cpu(bochs.catalog);
-    if (s->catalog_size > 0x100000) {
-        error_setg(errp, "Catalog size is too large");
-        return -EFBIG;
+      bs->total_sectors = le64_to_cpu(bochs.extra.redolog.disk) / 512;
    }

+    s->catalog_size = le32_to_cpu(bochs.extra.redolog.catalog);
    s->catalog_bitmap = g_malloc(s->catalog_size * 4);

    ret = bdrv_pread(bs->file, le32_to_cpu(bochs.header), s->catalog_bitmap,
@@ -143,24 +153,10 @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags,

    s->data_offset = le32_to_cpu(bochs.header) + (s->catalog_size * 4);

-    s->bitmap_blocks = 1 + (le32_to_cpu(bochs.bitmap) - 1) / 512;
-    s->extent_blocks = 1 + (le32_to_cpu(bochs.extent) - 1) / 512;
+    s->bitmap_blocks = 1 + (le32_to_cpu(bochs.extra.redolog.bitmap) - 1) / 512;
+    s->extent_blocks = 1 + (le32_to_cpu(bochs.extra.redolog.extent) - 1) / 512;

-    s->extent_size = le32_to_cpu(bochs.extent);
-    if (s->extent_size == 0) {
-        error_setg(errp, "Extent size may not be zero");
-        return -EINVAL;
-    } else if (s->extent_size > 0x800000) {
-        error_setg(errp, "Extent size %" PRIu32 " is too large",
-                   s->extent_size);
-        return -EINVAL;
-    }
-
-    if (s->catalog_size < bs->total_sectors / s->extent_size) {
-        error_setg(errp, "Catalog size is too small for this disk size");
-        ret = -EINVAL;
-        goto fail;
-    }
+    s->extent_size = le32_to_cpu(bochs.extra.redolog.extent);

    qemu_co_mutex_init(&s->lock);
    return 0;
@@ -173,8 +169,8 @@ fail:
 static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num)
 {
    BDRVBochsState *s = bs->opaque;
-    uint64_t offset = sector_num * 512;
-    uint64_t extent_index, extent_offset, bitmap_offset;
+    int64_t offset = sector_num * 512;
+    int64_t extent_index, extent_offset, bitmap_offset;
    char bitmap_entry;

    // seek to sector
@@ -185,9 +181,8 @@ static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num)
 	return -1; /* not allocated */
    }

-    bitmap_offset = s->data_offset +
-        (512 * (uint64_t) s->catalog_bitmap[extent_index] *
-        (s->extent_blocks + s->bitmap_blocks));
+    bitmap_offset = s->data_offset + (512 * s->catalog_bitmap[extent_index] *
+	(s->extent_blocks + s->bitmap_blocks));

    /* read in bitmap for current extent */
    if (bdrv_pread(bs->file, bitmap_offset + (extent_offset / 8),
--- a/block/cloop.c
+++ b/block/cloop.c
@@ -26,9 +26,6 @@
 #include "qemu/module.h"
 #include <zlib.h>

-/* Maximum compressed block size */
-#define MAX_BLOCK_SIZE (64 * 1024 * 1024)
-
 typedef struct BDRVCloopState {
    CoMutex lock;
    uint32_t block_size;
@@ -71,26 +68,6 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags,
        return ret;
    }
    s->block_size = be32_to_cpu(s->block_size);
-    if (s->block_size % 512) {
-        error_setg(errp, "block_size %u must be a multiple of 512",
-                   s->block_size);
-        return -EINVAL;
-    }
-    if (s->block_size == 0) {
-        error_setg(errp, "block_size cannot be zero");
-        return -EINVAL;
-    }
-
-    /* cloop's create_compressed_fs.c warns about block sizes beyond 256 KB but
-     * we can accept more.  Prevent ridiculous values like 4 GB - 1 since we
-     * need a buffer this big.
-     */
-    if (s->block_size > MAX_BLOCK_SIZE) {
-        error_setg(errp, "block_size %u must be %u MB or less",
-                   s->block_size,
-                   MAX_BLOCK_SIZE / (1024 * 1024));
-        return -EINVAL;
-    }

    ret = bdrv_pread(bs->file, 128 + 4, &s->n_blocks, 4);
    if (ret < 0) {
@@ -99,23 +76,7 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags,
    s->n_blocks = be32_to_cpu(s->n_blocks);

    /* read offsets */
-    if (s->n_blocks > (UINT32_MAX - 1) / sizeof(uint64_t)) {
-        /* Prevent integer overflow */
-        error_setg(errp, "n_blocks %u must be %zu or less",
-                   s->n_blocks,
-                   (UINT32_MAX - 1) / sizeof(uint64_t));
-        return -EINVAL;
-    }
-    offsets_size = (s->n_blocks + 1) * sizeof(uint64_t);
-    if (offsets_size > 512 * 1024 * 1024) {
-        /* Prevent ridiculous offsets_size which causes memory allocation to
-         * fail or overflows bdrv_pread() size.  In practice the 512 MB
-         * offsets[] limit supports 16 TB images at 256 KB block size.
-         */
-        error_setg(errp, "image requires too many offsets, "
-                   "try increasing block size");
-        return -EINVAL;
-    }
+    offsets_size = s->n_blocks * sizeof(uint64_t);
    s->offsets = g_malloc(offsets_size);

    ret = bdrv_pread(bs->file, 128 + 4 + 4, s->offsets, offsets_size);
@@ -123,37 +84,13 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags,
        goto fail;
    }

-    for (i = 0; i < s->n_blocks + 1; i++) {
-        uint64_t size;
-
+    for(i=0;i<s->n_blocks;i++) {
        s->offsets[i] = be64_to_cpu(s->offsets[i]);
-        if (i == 0) {
-            continue;
-        }
-
-        if (s->offsets[i] < s->offsets[i - 1]) {
-            error_setg(errp, "offsets not monotonically increasing at "
-                       "index %u, image file is corrupt", i);
-            ret = -EINVAL;
-            goto fail;
-        }
-
-        size = s->offsets[i] - s->offsets[i - 1];
-
-        /* Compressed blocks should be smaller than the uncompressed block size
-         * but maybe compression performed poorly so the compressed block is
-         * actually bigger.  Clamp down on unrealistic values to prevent
-         * ridiculous s->compressed_block allocation.
-         */
-        if (size > 2 * MAX_BLOCK_SIZE) {
-            error_setg(errp, "invalid compressed block size at index %u, "
-                       "image file is corrupt", i);
-            ret = -EINVAL;
-            goto fail;
-        }
-
-        if (size > max_compressed_block_size) {
-            max_compressed_block_size = size;
+        if (i > 0) {
+            uint32_t size = s->offsets[i] - s->offsets[i - 1];
+            if (size > max_compressed_block_size) {
+                max_compressed_block_size = size;
+            }
        }
    }

@@ -243,7 +180,9 @@ static coroutine_fn int cloop_co_read(BlockDriverState *bs, int64_t sector_num,
 static void cloop_close(BlockDriverState *bs)
 {
    BDRVCloopState *s = bs->opaque;
-    g_free(s->offsets);
+    if (s->n_blocks > 0) {
+        g_free(s->offsets);
+    }
    g_free(s->compressed_block);
    g_free(s->uncompressed_block);
    inflateEnd(&s->zstream);
--- a/block/cow.c
+++ b/block/cow.c
@@ -103,40 +103,18 @@ static int cow_open(BlockDriverState *bs, QDict *options, int flags,
    return ret;
 }

-/*
- * XXX(hch): right now these functions are extremely inefficient.
- * We should just read the whole bitmap we'll need in one go instead.
- */
-static inline int cow_set_bit(BlockDriverState *bs, int64_t bitnum, bool *first)
+static inline void cow_set_bits(uint8_t *bitmap, int start, int64_t nb_sectors)
 {
-    uint64_t offset = sizeof(struct cow_header_v2) + bitnum / 8;
-    uint8_t bitmap;
-    int ret;
-
-    ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap));
-    if (ret < 0) {
-       return ret;
-    }
-
-    if (bitmap & (1 << (bitnum % 8))) {
-        return 0;
-    }
-
-    if (*first) {
-        ret = bdrv_flush(bs->file);
-        if (ret < 0) {
-            return ret;
+    int64_t bitnum = start, last = start + nb_sectors;
+    while (bitnum < last) {
+        if ((bitnum & 7) == 0 && bitnum + 8 <= last) {
+            bitmap[bitnum / 8] = 0xFF;
+            bitnum += 8;
+            continue;
        }
-        *first = false;
+        bitmap[bitnum/8] |= (1 << (bitnum % 8));
+        bitnum++;
    }
-
-    bitmap |= (1 << (bitnum % 8));
-
-    ret = bdrv_pwrite(bs->file, offset, &bitmap, sizeof(bitmap));
-    if (ret < 0) {
-       return ret;
-    }
-    return 0;
 }

 #define BITS_PER_BITMAP_SECTOR (512 * 8)
@@ -174,18 +152,34 @@ static int coroutine_fn cow_co_is_allocated(BlockDriverState *bs,
 {
    int64_t bitnum = sector_num + sizeof(struct cow_header_v2) * 8;
    uint64_t offset = (bitnum / 8) & -BDRV_SECTOR_SIZE;
-    uint8_t bitmap[BDRV_SECTOR_SIZE];
-    int ret;
-    int changed;
+    bool first = true;
+    int changed = 0, same = 0;

-    ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap));
-    if (ret < 0) {
-        return ret;
-    }
+    do {
+        int ret;
+        uint8_t bitmap[BDRV_SECTOR_SIZE];

-    bitnum &= BITS_PER_BITMAP_SECTOR - 1;
-    changed = cow_test_bit(bitnum, bitmap);
-    *num_same = cow_find_streak(bitmap, changed, bitnum, nb_sectors);
+        bitnum &= BITS_PER_BITMAP_SECTOR - 1;
+        int sector_bits = MIN(nb_sectors, BITS_PER_BITMAP_SECTOR - bitnum);
+
+        ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap));
+        if (ret < 0) {
+            return ret;
+        }
+
+        if (first) {
+            changed = cow_test_bit(bitnum, bitmap);
+            first = false;
+        }
+
+        same += cow_find_streak(bitmap, changed, bitnum, nb_sectors);
+
+        bitnum += sector_bits;
+        nb_sectors -= sector_bits;
+        offset += BDRV_SECTOR_SIZE;
+    } while (nb_sectors);
+
+    *num_same = same;
    return changed;
 }

@@ -204,18 +198,52 @@ static int64_t coroutine_fn cow_co_get_block_status(BlockDriverState *bs,
 static int cow_update_bitmap(BlockDriverState *bs, int64_t sector_num,
        int nb_sectors)
 {
-    int error = 0;
-    int i;
+    int64_t bitnum = sector_num + sizeof(struct cow_header_v2) * 8;
+    uint64_t offset = (bitnum / 8) & -BDRV_SECTOR_SIZE;
    bool first = true;
+    int sector_bits;

-    for (i = 0; i < nb_sectors; i++) {
-        error = cow_set_bit(bs, sector_num + i, &first);
-        if (error) {
-            break;
+    for ( ; nb_sectors;
+            bitnum += sector_bits,
+            nb_sectors -= sector_bits,
+            offset += BDRV_SECTOR_SIZE) {
+        int ret, set;
+        uint8_t bitmap[BDRV_SECTOR_SIZE];
+
+        bitnum &= BITS_PER_BITMAP_SECTOR - 1;
+        sector_bits = MIN(nb_sectors, BITS_PER_BITMAP_SECTOR - bitnum);
+
+        ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap));
+        if (ret < 0) {
+            return ret;
+        }
+
+        /* Skip over any already set bits */
+        set = cow_find_streak(bitmap, 1, bitnum, sector_bits);
+        bitnum += set;
+        sector_bits -= set;
+        nb_sectors -= set;
+        if (!sector_bits) {
+            continue;
+        }
+
+        if (first) {
+            ret = bdrv_flush(bs->file);
+            if (ret < 0) {
+                return ret;
+            }
+            first = false;
+        }
+
+        cow_set_bits(bitmap, bitnum, sector_bits);
+
+        ret = bdrv_pwrite(bs->file, offset, &bitmap, sizeof(bitmap));
+        if (ret < 0) {
+            return ret;
        }
    }

-    return error;
+    return 0;
 }

 static int coroutine_fn cow_read(BlockDriverState *bs, int64_t sector_num,
--- a/block/curl.c
+++ b/block/curl.c
@@ -34,11 +34,6 @@
 #define DPRINTF(fmt, ...) do { } while (0)
 #endif

-#if LIBCURL_VERSION_NUM >= 0x071000
-/* The multi interface timer callback was introduced in 7.16.0 */
-#define NEED_CURL_TIMER_CALLBACK
-#endif
-
 #define PROTOCOLS (CURLPROTO_HTTP | CURLPROTO_HTTPS | \
                   CURLPROTO_FTP | CURLPROTO_FTPS | \
                   CURLPROTO_TFTP)
@@ -82,7 +77,6 @@ typedef struct CURLState

 typedef struct BDRVCURLState {
    CURLM *multi;
-    QEMUTimer timer;
    size_t len;
    CURLState states[CURL_NUM_STATES];
    char *url;
@@ -93,23 +87,6 @@ typedef struct BDRVCURLState {
 static void curl_clean_state(CURLState *s);
 static void curl_multi_do(void *arg);

-#ifdef NEED_CURL_TIMER_CALLBACK
-static int curl_timer_cb(CURLM *multi, long timeout_ms, void *opaque)
-{
-    BDRVCURLState *s = opaque;
-
-    DPRINTF("CURL: timer callback timeout_ms %ld\n", timeout_ms);
-    if (timeout_ms == -1) {
-        timer_del(&s->timer);
-    } else {
-        int64_t timeout_ns = (int64_t)timeout_ms * 1000 * 1000;
-        timer_mod(&s->timer,
-                  qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ns);
-    }
-    return 0;
-}
-#endif
-
 static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action,
                        void *s, void *sp)
 {
@@ -157,11 +134,6 @@ static size_t curl_read_cb(void *ptr, size_t size, size_t nmemb, void *opaque)
    if (!s || !s->orig_buf)
        goto read_end;

-    if (s->buf_off >= s->buf_len) {
-        /* buffer full, read nothing */
-        return 0;
-    }
-    realsize = MIN(realsize, s->buf_len - s->buf_off);
    memcpy(s->orig_buf + s->buf_off, ptr, realsize);
    s->buf_off += realsize;

@@ -237,10 +209,20 @@ static int curl_find_buf(BDRVCURLState *s, size_t start, size_t len,
    return FIND_RET_NONE;
 }

-static void curl_multi_read(BDRVCURLState *s)
+static void curl_multi_do(void *arg)
 {
+    BDRVCURLState *s = (BDRVCURLState *)arg;
+    int running;
+    int r;
    int msgs_in_queue;

+    if (!s->multi)
+        return;
+
+    do {
+        r = curl_multi_socket_all(s->multi, &running);
+    } while(r == CURLM_CALL_MULTI_PERFORM);
+
    /* Try to find done transfers, so we can free the easy
     * handle again. */
    do {
@@ -284,41 +266,6 @@ static void curl_multi_read(BDRVCURLState *s)
    } while(msgs_in_queue);
 }

-static void curl_multi_do(void *arg)
-{
-    BDRVCURLState *s = (BDRVCURLState *)arg;
-    int running;
-    int r;
-
-    if (!s->multi) {
-        return;
-    }
-
-    do {
-        r = curl_multi_socket_all(s->multi, &running);
-    } while(r == CURLM_CALL_MULTI_PERFORM);
-
-    curl_multi_read(s);
-}
-
-static void curl_multi_timeout_do(void *arg)
-{
-#ifdef NEED_CURL_TIMER_CALLBACK
-    BDRVCURLState *s = (BDRVCURLState *)arg;
-    int running;
-
-    if (!s->multi) {
-        return;
-    }
-
-    curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running);
-
-    curl_multi_read(s);
-#else
-    abort();
-#endif
-}
-
 static CURLState *curl_init_state(BDRVCURLState *s)
 {
    CURLState *state = NULL;
@@ -526,20 +473,12 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags,
    curl_easy_cleanup(state->curl);
    state->curl = NULL;

-    aio_timer_init(bdrv_get_aio_context(bs), &s->timer,
-                   QEMU_CLOCK_REALTIME, SCALE_NS,
-                   curl_multi_timeout_do, s);
-
    // Now we know the file exists and its size, so let's
    // initialize the multi interface!

    s->multi = curl_multi_init();
    curl_multi_setopt(s->multi, CURLMOPT_SOCKETDATA, s);
    curl_multi_setopt(s->multi, CURLMOPT_SOCKETFUNCTION, curl_sock_cb);
-#ifdef NEED_CURL_TIMER_CALLBACK
-    curl_multi_setopt(s->multi, CURLMOPT_TIMERDATA, s);
-    curl_multi_setopt(s->multi, CURLMOPT_TIMERFUNCTION, curl_timer_cb);
-#endif
    curl_multi_do(s);

    qemu_opts_del(opts);
@@ -658,9 +597,6 @@ static void curl_close(BlockDriverState *bs)
    }
    if (s->multi)
        curl_multi_cleanup(s->multi);
-
-    timer_del(&s->timer);
-
    g_free(s->url);
 }

--- a/block/dmg.c
+++ b/block/dmg.c
@@ -27,14 +27,6 @@
 #include "qemu/module.h"
 #include <zlib.h>

-enum {
-    /* Limit chunk sizes to prevent unreasonable amounts of memory being used
-     * or truncating when converting to 32-bit types
-     */
-    DMG_LENGTHS_MAX = 64 * 1024 * 1024, /* 64 MB */
-    DMG_SECTORCOUNTS_MAX = DMG_LENGTHS_MAX / 512,
-};
-
 typedef struct BDRVDMGState {
    CoMutex lock;
    /* each chunk contains a certain number of sectors,
@@ -100,44 +92,13 @@ static int read_uint32(BlockDriverState *bs, int64_t offset, uint32_t *result)
    return 0;
 }

-/* Increase max chunk sizes, if necessary.  This function is used to calculate
- * the buffer sizes needed for compressed/uncompressed chunk I/O.
- */
-static void update_max_chunk_size(BDRVDMGState *s, uint32_t chunk,
-                                  uint32_t *max_compressed_size,
-                                  uint32_t *max_sectors_per_chunk)
-{
-    uint32_t compressed_size = 0;
-    uint32_t uncompressed_sectors = 0;
-
-    switch (s->types[chunk]) {
-    case 0x80000005: /* zlib compressed */
-        compressed_size = s->lengths[chunk];
-        uncompressed_sectors = s->sectorcounts[chunk];
-        break;
-    case 1: /* copy */
-        uncompressed_sectors = (s->lengths[chunk] + 511) / 512;
-        break;
-    case 2: /* zero */
-        uncompressed_sectors = s->sectorcounts[chunk];
-        break;
-    }
-
-    if (compressed_size > *max_compressed_size) {
-        *max_compressed_size = compressed_size;
-    }
-    if (uncompressed_sectors > *max_sectors_per_chunk) {
-        *max_sectors_per_chunk = uncompressed_sectors;
-    }
-}
-
 static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
                    Error **errp)
 {
    BDRVDMGState *s = bs->opaque;
-    uint64_t info_begin, info_end, last_in_offset, last_out_offset;
+    uint64_t info_begin,info_end,last_in_offset,last_out_offset;
    uint32_t count, tmp;
-    uint32_t max_compressed_size = 1, max_sectors_per_chunk = 1, i;
+    uint32_t max_compressed_size=1,max_sectors_per_chunk=1,i;
    int64_t offset;
    int ret;

@@ -199,40 +160,37 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
            goto fail;
        }

-        if (type == 0x6d697368 && count >= 244) {
-            size_t new_size;
-            uint32_t chunk_count;
+	if (type == 0x6d697368 && count >= 244) {
+	    int new_size, chunk_count;

            offset += 4;
            offset += 200;

-            chunk_count = (count - 204) / 40;
-            new_size = sizeof(uint64_t) * (s->n_chunks + chunk_count);
-            s->types = g_realloc(s->types, new_size / 2);
-            s->offsets = g_realloc(s->offsets, new_size);
-            s->lengths = g_realloc(s->lengths, new_size);
-            s->sectors = g_realloc(s->sectors, new_size);
-            s->sectorcounts = g_realloc(s->sectorcounts, new_size);
+	    chunk_count = (count-204)/40;
+	    new_size = sizeof(uint64_t) * (s->n_chunks + chunk_count);
+	    s->types = g_realloc(s->types, new_size/2);
+	    s->offsets = g_realloc(s->offsets, new_size);
+	    s->lengths = g_realloc(s->lengths, new_size);
+	    s->sectors = g_realloc(s->sectors, new_size);
+	    s->sectorcounts = g_realloc(s->sectorcounts, new_size);

            for (i = s->n_chunks; i < s->n_chunks + chunk_count; i++) {
                ret = read_uint32(bs, offset, &s->types[i]);
                if (ret < 0) {
                    goto fail;
                }
-                offset += 4;
-                if (s->types[i] != 0x80000005 && s->types[i] != 1 &&
-                    s->types[i] != 2) {
-                    if (s->types[i] == 0xffffffff && i > 0) {
-                        last_in_offset = s->offsets[i - 1] + s->lengths[i - 1];
-                        last_out_offset = s->sectors[i - 1] +
-                                          s->sectorcounts[i - 1];
-                    }
-                    chunk_count--;
-                    i--;
-                    offset += 36;
-                    continue;
-                }
-                offset += 4;
+		offset += 4;
+		if(s->types[i]!=0x80000005 && s->types[i]!=1 && s->types[i]!=2) {
+		    if(s->types[i]==0xffffffff) {
+			last_in_offset = s->offsets[i-1]+s->lengths[i-1];
+			last_out_offset = s->sectors[i-1]+s->sectorcounts[i-1];
+		    }
+		    chunk_count--;
+		    i--;
+		    offset += 36;
+		    continue;
+		}
+		offset += 4;

                ret = read_uint64(bs, offset, &s->sectors[i]);
                if (ret < 0) {
@@ -247,14 +205,6 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
                }
                offset += 8;

-                if (s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) {
-                    error_report("sector count %" PRIu64 " for chunk %u is "
-                                 "larger than max (%u)",
-                                 s->sectorcounts[i], i, DMG_SECTORCOUNTS_MAX);
-                    ret = -EINVAL;
-                    goto fail;
-                }
-
                ret = read_uint64(bs, offset, &s->offsets[i]);
                if (ret < 0) {
                    goto fail;
@@ -268,25 +218,19 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
                }
                offset += 8;

-                if (s->lengths[i] > DMG_LENGTHS_MAX) {
-                    error_report("length %" PRIu64 " for chunk %u is larger "
-                                 "than max (%u)",
-                                 s->lengths[i], i, DMG_LENGTHS_MAX);
-                    ret = -EINVAL;
-                    goto fail;
-                }
-
-                update_max_chunk_size(s, i, &max_compressed_size,
-                                      &max_sectors_per_chunk);
-            }
-            s->n_chunks += chunk_count;
-        }
+		if(s->lengths[i]>max_compressed_size)
+		    max_compressed_size = s->lengths[i];
+		if(s->sectorcounts[i]>max_sectors_per_chunk)
+		    max_sectors_per_chunk = s->sectorcounts[i];
+	    }
+	    s->n_chunks+=chunk_count;
+	}
    }

    /* initialize zlib engine */
-    s->compressed_chunk = g_malloc(max_compressed_size + 1);
-    s->uncompressed_chunk = g_malloc(512 * max_sectors_per_chunk);
-    if (inflateInit(&s->zstream) != Z_OK) {
+    s->compressed_chunk = g_malloc(max_compressed_size+1);
+    s->uncompressed_chunk = g_malloc(512*max_sectors_per_chunk);
+    if(inflateInit(&s->zstream) != Z_OK) {
        ret = -EINVAL;
        goto fail;
    }
@@ -308,82 +252,83 @@ fail:
 }

 static inline int is_sector_in_chunk(BDRVDMGState* s,
-                uint32_t chunk_num, uint64_t sector_num)
+		uint32_t chunk_num,int sector_num)
 {
-    if (chunk_num >= s->n_chunks || s->sectors[chunk_num] > sector_num ||
-            s->sectors[chunk_num] + s->sectorcounts[chunk_num] <= sector_num) {
-        return 0;
-    } else {
-        return -1;
-    }
+    if(chunk_num>=s->n_chunks || s->sectors[chunk_num]>sector_num ||
+	    s->sectors[chunk_num]+s->sectorcounts[chunk_num]<=sector_num)
+	return 0;
+    else
+	return -1;
 }

-static inline uint32_t search_chunk(BDRVDMGState *s, uint64_t sector_num)
+static inline uint32_t search_chunk(BDRVDMGState* s,int sector_num)
 {
    /* binary search */
-    uint32_t chunk1 = 0, chunk2 = s->n_chunks, chunk3;
-    while (chunk1 != chunk2) {
-        chunk3 = (chunk1 + chunk2) / 2;
-        if (s->sectors[chunk3] > sector_num) {
-            chunk2 = chunk3;
-        } else if (s->sectors[chunk3] + s->sectorcounts[chunk3] > sector_num) {
-            return chunk3;
-        } else {
-            chunk1 = chunk3;
-        }
+    uint32_t chunk1=0,chunk2=s->n_chunks,chunk3;
+    while(chunk1!=chunk2) {
+	chunk3 = (chunk1+chunk2)/2;
+	if(s->sectors[chunk3]>sector_num)
+	    chunk2 = chunk3;
+	else if(s->sectors[chunk3]+s->sectorcounts[chunk3]>sector_num)
+	    return chunk3;
+	else
+	    chunk1 = chunk3;
    }
    return s->n_chunks; /* error */
 }

-static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num)
+static inline int dmg_read_chunk(BlockDriverState *bs, int sector_num)
 {
    BDRVDMGState *s = bs->opaque;

-    if (!is_sector_in_chunk(s, s->current_chunk, sector_num)) {
-        int ret;
-        uint32_t chunk = search_chunk(s, sector_num);
+    if(!is_sector_in_chunk(s,s->current_chunk,sector_num)) {
+	int ret;
+	uint32_t chunk = search_chunk(s,sector_num);

-        if (chunk >= s->n_chunks) {
-            return -1;
-        }
+	if(chunk>=s->n_chunks)
+	    return -1;

-        s->current_chunk = s->n_chunks;
-        switch (s->types[chunk]) {
-        case 0x80000005: { /* zlib compressed */
-            /* we need to buffer, because only the chunk as whole can be
-             * inflated. */
-            ret = bdrv_pread(bs->file, s->offsets[chunk],
-                             s->compressed_chunk, s->lengths[chunk]);
-            if (ret != s->lengths[chunk]) {
-                return -1;
-            }
+	s->current_chunk = s->n_chunks;
+	switch(s->types[chunk]) {
+	case 0x80000005: { /* zlib compressed */
+	    int i;

-            s->zstream.next_in = s->compressed_chunk;
-            s->zstream.avail_in = s->lengths[chunk];
-            s->zstream.next_out = s->uncompressed_chunk;
-            s->zstream.avail_out = 512 * s->sectorcounts[chunk];
-            ret = inflateReset(&s->zstream);
-            if (ret != Z_OK) {
-                return -1;
-            }
-            ret = inflate(&s->zstream, Z_FINISH);
-            if (ret != Z_STREAM_END ||
-                s->zstream.total_out != 512 * s->sectorcounts[chunk]) {
-                return -1;
-            }
-            break; }
-        case 1: /* copy */
-            ret = bdrv_pread(bs->file, s->offsets[chunk],
+	    /* we need to buffer, because only the chunk as whole can be
+	     * inflated. */
+	    i=0;
+	    do {
+                ret = bdrv_pread(bs->file, s->offsets[chunk] + i,
+                                 s->compressed_chunk+i, s->lengths[chunk]-i);
+		if(ret<0 && errno==EINTR)
+		    ret=0;
+		i+=ret;
+	    } while(ret>=0 && ret+i<s->lengths[chunk]);
+
+	    if (ret != s->lengths[chunk])
+		return -1;
+
+	    s->zstream.next_in = s->compressed_chunk;
+	    s->zstream.avail_in = s->lengths[chunk];
+	    s->zstream.next_out = s->uncompressed_chunk;
+	    s->zstream.avail_out = 512*s->sectorcounts[chunk];
+	    ret = inflateReset(&s->zstream);
+	    if(ret != Z_OK)
+		return -1;
+	    ret = inflate(&s->zstream, Z_FINISH);
+	    if(ret != Z_STREAM_END || s->zstream.total_out != 512*s->sectorcounts[chunk])
+		return -1;
+	    break; }
+	case 1: /* copy */
+	    ret = bdrv_pread(bs->file, s->offsets[chunk],
                             s->uncompressed_chunk, s->lengths[chunk]);
-            if (ret != s->lengths[chunk]) {
-                return -1;
-            }
-            break;
-        case 2: /* zero */
-            memset(s->uncompressed_chunk, 0, 512 * s->sectorcounts[chunk]);
-            break;
-        }
-        s->current_chunk = chunk;
+	    if (ret != s->lengths[chunk])
+		return -1;
+	    break;
+	case 2: /* zero */
+	    memset(s->uncompressed_chunk, 0, 512*s->sectorcounts[chunk]);
+	    break;
+	}
+	s->current_chunk = chunk;
    }
    return 0;
 }
@@ -394,14 +339,12 @@ static int dmg_read(BlockDriverState *bs, int64_t sector_num,
    BDRVDMGState *s = bs->opaque;
    int i;

-    for (i = 0; i < nb_sectors; i++) {
-        uint32_t sector_offset_in_chunk;
-        if (dmg_read_chunk(bs, sector_num + i) != 0) {
-            return -1;
-        }
-        sector_offset_in_chunk = sector_num + i - s->sectors[s->current_chunk];
-        memcpy(buf + i * 512,
-               s->uncompressed_chunk + sector_offset_in_chunk * 512, 512);
+    for(i=0;i<nb_sectors;i++) {
+	uint32_t sector_offset_in_chunk;
+	if(dmg_read_chunk(bs, sector_num+i) != 0)
+	    return -1;
+	sector_offset_in_chunk = sector_num+i-s->sectors[s->current_chunk];
+	memcpy(buf+i*512,s->uncompressed_chunk+sector_offset_in_chunk*512,512);
    }
    return 0;
 }
@@ -433,12 +376,12 @@ static void dmg_close(BlockDriverState *bs)
 }

 static BlockDriver bdrv_dmg = {
-    .format_name    = "dmg",
-    .instance_size  = sizeof(BDRVDMGState),
-    .bdrv_probe     = dmg_probe,
-    .bdrv_open      = dmg_open,
-    .bdrv_read      = dmg_co_read,
-    .bdrv_close     = dmg_close,
+    .format_name	= "dmg",
+    .instance_size	= sizeof(BDRVDMGState),
+    .bdrv_probe		= dmg_probe,
+    .bdrv_open		= dmg_open,
+    .bdrv_read          = dmg_co_read,
+    .bdrv_close		= dmg_close,
 };

 static void bdrv_dmg_init(void)
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -2,6 +2,7 @@
 * QEMU Block driver for iSCSI images
 *
 * Copyright (c) 2010-2011 Ronnie Sahlberg <ronniesahlberg@gmail.com>
+ * Copyright (c) 2012-2013 Peter Lieven <pl@kamp.de>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -54,8 +55,10 @@ typedef struct IscsiLun {
    QEMUTimer *nop_timer;
    uint8_t lbpme;
    uint8_t lbprz;
+    uint8_t has_write_same;
    struct scsi_inquiry_logical_block_provisioning lbp;
    struct scsi_inquiry_block_limits bl;
+    unsigned char *zeroblock;
 } IscsiLun;

 typedef struct IscsiTask {
@@ -65,7 +68,6 @@ typedef struct IscsiTask {
    int do_retry;
    struct scsi_task *task;
    Coroutine *co;
-    QEMUBH *bh;
 } IscsiTask;

 typedef struct IscsiAIOCB {
@@ -88,7 +90,6 @@ typedef struct IscsiAIOCB {
 #define NOP_INTERVAL 5000
 #define MAX_NOP_FAILURES 3
 #define ISCSI_CMD_RETRIES 5
-#define ISCSI_MAX_UNMAP 131072

 static void
 iscsi_bh_cb(void *p)
@@ -122,13 +123,6 @@ iscsi_schedule_bh(IscsiAIOCB *acb)
    qemu_bh_schedule(acb->bh);
 }

-static void iscsi_co_generic_bh_cb(void *opaque)
-{
-    struct IscsiTask *iTask = opaque;
-    qemu_bh_delete(iTask->bh);
-    qemu_coroutine_enter(iTask->co, NULL);
-}
-
 static void
 iscsi_co_generic_cb(struct iscsi_context *iscsi, int status,
                        void *command_data, void *opaque)
@@ -143,19 +137,17 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int status,

    if (iTask->retries-- > 0 && status == SCSI_STATUS_CHECK_CONDITION
        && task->sense.key == SCSI_SENSE_UNIT_ATTENTION) {
-        error_report("iSCSI CheckCondition: %s", iscsi_get_error(iscsi));
        iTask->do_retry = 1;
        goto out;
    }

    if (status != SCSI_STATUS_GOOD) {
-        error_report("iSCSI Failure: %s", iscsi_get_error(iscsi));
+        error_report("iSCSI: Failure. %s", iscsi_get_error(iscsi));
    }

 out:
    if (iTask->co) {
-        iTask->bh = qemu_bh_new(iscsi_co_generic_bh_cb, iTask);
-        qemu_bh_schedule(iTask->bh);
+        qemu_coroutine_enter(iTask->co, NULL);
    }
 }

@@ -247,44 +239,6 @@ iscsi_process_write(void *arg)
    iscsi_set_events(iscsilun);
 }

-static int
-iscsi_aio_writev_acb(IscsiAIOCB *acb);
-
-static void
-iscsi_aio_write16_cb(struct iscsi_context *iscsi, int status,
-                     void *command_data, void *opaque)
-{
-    IscsiAIOCB *acb = opaque;
-
-    trace_iscsi_aio_write16_cb(iscsi, status, acb, acb->canceled);
-
-    g_free(acb->buf);
-    acb->buf = NULL;
-
-    if (acb->canceled != 0) {
-        return;
-    }
-
-    acb->status = 0;
-    if (status != 0) {
-        if (status == SCSI_STATUS_CHECK_CONDITION
-            && acb->task->sense.key == SCSI_SENSE_UNIT_ATTENTION
-            && acb->retries-- > 0) {
-            scsi_free_scsi_task(acb->task);
-            acb->task = NULL;
-            if (iscsi_aio_writev_acb(acb) == 0) {
-                iscsi_set_events(acb->iscsilun);
-                return;
-            }
-        }
-        error_report("Failed to write16 data to iSCSI lun. %s",
-                     iscsi_get_error(iscsi));
-        acb->status = -EIO;
-    }
-
-    iscsi_schedule_bh(acb);
-}
-
 static int64_t sector_lun2qemu(int64_t sector, IscsiLun *iscsilun)
 {
    return sector * iscsilun->block_size / BDRV_SECTOR_SIZE;
@@ -309,324 +263,172 @@ static bool is_request_lun_aligned(int64_t sector_num, int nb_sectors,
    return 1;
 }

-static int
-iscsi_aio_writev_acb(IscsiAIOCB *acb)
+static int coroutine_fn iscsi_co_writev(BlockDriverState *bs,
+                                        int64_t sector_num, int nb_sectors,
+                                        QEMUIOVector *iov)
 {
-    struct iscsi_context *iscsi = acb->iscsilun->iscsi;
-    size_t size;
-    uint32_t num_sectors;
+    IscsiLun *iscsilun = bs->opaque;
+    struct IscsiTask iTask;
    uint64_t lba;
+    uint32_t num_sectors;
+    uint8_t *data = NULL;
+    uint8_t *buf = NULL;
+
+    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
+        return -EINVAL;
+    }
+
+    lba = sector_qemu2lun(sector_num, iscsilun);
+    num_sectors = sector_qemu2lun(nb_sectors, iscsilun);
 #if !defined(LIBISCSI_FEATURE_IOVECTOR)
-    struct iscsi_data data;
-#endif
-    int ret;
-
-    acb->canceled   = 0;
-    acb->bh         = NULL;
-    acb->status     = -EINPROGRESS;
-    acb->buf        = NULL;
-
-    /* this will allow us to get rid of 'buf' completely */
-    size = acb->nb_sectors * BDRV_SECTOR_SIZE;
-
-#if !defined(LIBISCSI_FEATURE_IOVECTOR)
-    data.size = MIN(size, acb->qiov->size);
-
    /* if the iovec only contains one buffer we can pass it directly */
-    if (acb->qiov->niov == 1) {
-        data.data = acb->qiov->iov[0].iov_base;
+    if (iov->niov == 1) {
+        data = iov->iov[0].iov_base;
    } else {
-        acb->buf = g_malloc(data.size);
-        qemu_iovec_to_buf(acb->qiov, 0, acb->buf, data.size);
-        data.data = acb->buf;
+        size_t size = MIN(nb_sectors * BDRV_SECTOR_SIZE, iov->size);
+        buf = g_malloc(size);
+        qemu_iovec_to_buf(iov, 0, buf, size);
+        data = buf;
    }
 #endif
-
-    acb->task = malloc(sizeof(struct scsi_task));
-    if (acb->task == NULL) {
-        error_report("iSCSI: Failed to allocate task for scsi WRITE16 "
-                     "command. %s", iscsi_get_error(iscsi));
-        return -1;
+    iscsi_co_init_iscsitask(iscsilun, &iTask);
+retry:
+    iTask.task = iscsi_write16_task(iscsilun->iscsi, iscsilun->lun, lba,
+                                    data, num_sectors * iscsilun->block_size,
+                                    iscsilun->block_size, 0, 0, 0, 0, 0,
+                                    iscsi_co_generic_cb, &iTask);
+    if (iTask.task == NULL) {
+        g_free(buf);
+        return -EIO;
    }
-    memset(acb->task, 0, sizeof(struct scsi_task));
-
-    acb->task->xfer_dir = SCSI_XFER_WRITE;
-    acb->task->cdb_size = 16;
-    acb->task->cdb[0] = 0x8a;
-    lba = sector_qemu2lun(acb->sector_num, acb->iscsilun);
-    *(uint32_t *)&acb->task->cdb[2]  = htonl(lba >> 32);
-    *(uint32_t *)&acb->task->cdb[6]  = htonl(lba & 0xffffffff);
-    num_sectors = sector_qemu2lun(acb->nb_sectors, acb->iscsilun);
-    *(uint32_t *)&acb->task->cdb[10] = htonl(num_sectors);
-    acb->task->expxferlen = size;
-
 #if defined(LIBISCSI_FEATURE_IOVECTOR)
-    ret = iscsi_scsi_command_async(iscsi, acb->iscsilun->lun, acb->task,
-                                   iscsi_aio_write16_cb,
-                                   NULL,
-                                   acb);
-#else
-    ret = iscsi_scsi_command_async(iscsi, acb->iscsilun->lun, acb->task,
-                                   iscsi_aio_write16_cb,
-                                   &data,
-                                   acb);
+    scsi_task_set_iov_out(iTask.task, (struct scsi_iovec *) iov->iov,
+                          iov->niov);
 #endif
-    if (ret != 0) {
-        scsi_free_scsi_task(acb->task);
-        g_free(acb->buf);
-        return -1;
+    while (!iTask.complete) {
+        iscsi_set_events(iscsilun);
+        qemu_coroutine_yield();
    }

-#if defined(LIBISCSI_FEATURE_IOVECTOR)
-    scsi_task_set_iov_out(acb->task, (struct scsi_iovec*) acb->qiov->iov, acb->qiov->niov);
-#endif
+    if (iTask.task != NULL) {
+        scsi_free_scsi_task(iTask.task);
+        iTask.task = NULL;
+    }
+
+    if (iTask.do_retry) {
+        goto retry;
+    }
+
+    g_free(buf);
+
+    if (iTask.status != SCSI_STATUS_GOOD) {
+        return -EIO;
+    }

    return 0;
 }

-static BlockDriverAIOCB *
-iscsi_aio_writev(BlockDriverState *bs, int64_t sector_num,
-                 QEMUIOVector *qiov, int nb_sectors,
-                 BlockDriverCompletionFunc *cb,
-                 void *opaque)
+static int coroutine_fn iscsi_co_readv(BlockDriverState *bs,
+                                       int64_t sector_num, int nb_sectors,
+                                       QEMUIOVector *iov)
 {
    IscsiLun *iscsilun = bs->opaque;
-    IscsiAIOCB *acb;
-
-    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
-        return NULL;
-    }
-
-    acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);
-    trace_iscsi_aio_writev(iscsilun->iscsi, sector_num, nb_sectors, opaque, acb);
-
-    acb->iscsilun    = iscsilun;
-    acb->qiov        = qiov;
-    acb->nb_sectors  = nb_sectors;
-    acb->sector_num  = sector_num;
-    acb->retries     = ISCSI_CMD_RETRIES;
-
-    if (iscsi_aio_writev_acb(acb) != 0) {
-        qemu_aio_release(acb);
-        return NULL;
-    }
-
-    iscsi_set_events(iscsilun);
-    return &acb->common;
-}
-
-static int
-iscsi_aio_readv_acb(IscsiAIOCB *acb);
-
-static void
-iscsi_aio_read16_cb(struct iscsi_context *iscsi, int status,
-                    void *command_data, void *opaque)
-{
-    IscsiAIOCB *acb = opaque;
-
-    trace_iscsi_aio_read16_cb(iscsi, status, acb, acb->canceled);
-
-    if (acb->canceled != 0) {
-        return;
-    }
-
-    acb->status = 0;
-    if (status != 0) {
-        if (status == SCSI_STATUS_CHECK_CONDITION
-            && acb->task->sense.key == SCSI_SENSE_UNIT_ATTENTION
-            && acb->retries-- > 0) {
-            scsi_free_scsi_task(acb->task);
-            acb->task = NULL;
-            if (iscsi_aio_readv_acb(acb) == 0) {
-                iscsi_set_events(acb->iscsilun);
-                return;
-            }
-        }
-        error_report("Failed to read16 data from iSCSI lun. %s",
-                     iscsi_get_error(iscsi));
-        acb->status = -EIO;
-    }
-
-    iscsi_schedule_bh(acb);
-}
-
-static int
-iscsi_aio_readv_acb(IscsiAIOCB *acb)
-{
-    struct iscsi_context *iscsi = acb->iscsilun->iscsi;
-    size_t size;
+    struct IscsiTask iTask;
    uint64_t lba;
    uint32_t num_sectors;
-    int ret;
 #if !defined(LIBISCSI_FEATURE_IOVECTOR)
    int i;
 #endif

-    acb->canceled    = 0;
-    acb->bh          = NULL;
-    acb->status      = -EINPROGRESS;
-    acb->buf         = NULL;
-
-    size = acb->nb_sectors * BDRV_SECTOR_SIZE;
-
-    acb->task = malloc(sizeof(struct scsi_task));
-    if (acb->task == NULL) {
-        error_report("iSCSI: Failed to allocate task for scsi READ16 "
-                     "command. %s", iscsi_get_error(iscsi));
-        return -1;
+    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
+        return -EINVAL;
    }
-    memset(acb->task, 0, sizeof(struct scsi_task));

-    acb->task->xfer_dir = SCSI_XFER_READ;
-    acb->task->expxferlen = size;
-    lba = sector_qemu2lun(acb->sector_num, acb->iscsilun);
-    num_sectors = sector_qemu2lun(acb->nb_sectors, acb->iscsilun);
+    lba = sector_qemu2lun(sector_num, iscsilun);
+    num_sectors = sector_qemu2lun(nb_sectors, iscsilun);

-    switch (acb->iscsilun->type) {
+    iscsi_co_init_iscsitask(iscsilun, &iTask);
+retry:
+    switch (iscsilun->type) {
    case TYPE_DISK:
-        acb->task->cdb_size = 16;
-        acb->task->cdb[0]  = 0x88;
-        *(uint32_t *)&acb->task->cdb[2]  = htonl(lba >> 32);
-        *(uint32_t *)&acb->task->cdb[6]  = htonl(lba & 0xffffffff);
-        *(uint32_t *)&acb->task->cdb[10] = htonl(num_sectors);
+        iTask.task = iscsi_read16_task(iscsilun->iscsi, iscsilun->lun, lba,
+                                       num_sectors * iscsilun->block_size,
+                                       iscsilun->block_size, 0, 0, 0, 0, 0,
+                                       iscsi_co_generic_cb, &iTask);
        break;
    default:
-        acb->task->cdb_size = 10;
-        acb->task->cdb[0]  = 0x28;
-        *(uint32_t *)&acb->task->cdb[2] = htonl(lba);
-        *(uint16_t *)&acb->task->cdb[7] = htons(num_sectors);
+        iTask.task = iscsi_read10_task(iscsilun->iscsi, iscsilun->lun, lba,
+                                       num_sectors * iscsilun->block_size,
+                                       iscsilun->block_size, 0, 0, 0, 0, 0,
+                                       iscsi_co_generic_cb, &iTask);
        break;
    }
-
-    ret = iscsi_scsi_command_async(iscsi, acb->iscsilun->lun, acb->task,
-                                   iscsi_aio_read16_cb,
-                                   NULL,
-                                   acb);
-    if (ret != 0) {
-        scsi_free_scsi_task(acb->task);
-        return -1;
+    if (iTask.task == NULL) {
+        return -EIO;
    }
-
 #if defined(LIBISCSI_FEATURE_IOVECTOR)
-    scsi_task_set_iov_in(acb->task, (struct scsi_iovec*) acb->qiov->iov, acb->qiov->niov);
+    scsi_task_set_iov_in(iTask.task, (struct scsi_iovec *) iov->iov, iov->niov);
 #else
-    for (i = 0; i < acb->qiov->niov; i++) {
-        scsi_task_add_data_in_buffer(acb->task,
-                acb->qiov->iov[i].iov_len,
-                acb->qiov->iov[i].iov_base);
+    for (i = 0; i < iov->niov; i++) {
+        scsi_task_add_data_in_buffer(iTask.task,
+                                     iov->iov[i].iov_len,
+                                     iov->iov[i].iov_base);
    }
 #endif
-    return 0;
-}

-static BlockDriverAIOCB *
-iscsi_aio_readv(BlockDriverState *bs, int64_t sector_num,
-                QEMUIOVector *qiov, int nb_sectors,
-                BlockDriverCompletionFunc *cb,
-                void *opaque)
-{
-    IscsiLun *iscsilun = bs->opaque;
-    IscsiAIOCB *acb;
-
-    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
-        return NULL;
+    while (!iTask.complete) {
+        iscsi_set_events(iscsilun);
+        qemu_coroutine_yield();
    }

-    acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);
-    trace_iscsi_aio_readv(iscsilun->iscsi, sector_num, nb_sectors, opaque, acb);
-
-    acb->nb_sectors  = nb_sectors;
-    acb->sector_num  = sector_num;
-    acb->iscsilun    = iscsilun;
-    acb->qiov        = qiov;
-    acb->retries     = ISCSI_CMD_RETRIES;
-
-    if (iscsi_aio_readv_acb(acb) != 0) {
-        qemu_aio_release(acb);
-        return NULL;
+    if (iTask.task != NULL) {
+        scsi_free_scsi_task(iTask.task);
+        iTask.task = NULL;
    }

-    iscsi_set_events(iscsilun);
-    return &acb->common;
-}
-
-static int
-iscsi_aio_flush_acb(IscsiAIOCB *acb);
-
-static void
-iscsi_synccache10_cb(struct iscsi_context *iscsi, int status,
-                     void *command_data, void *opaque)
-{
-    IscsiAIOCB *acb = opaque;
-
-    if (acb->canceled != 0) {
-        return;
+    if (iTask.do_retry) {
+        goto retry;
    }

-    acb->status = 0;
-    if (status != 0) {
-        if (status == SCSI_STATUS_CHECK_CONDITION
-            && acb->task->sense.key == SCSI_SENSE_UNIT_ATTENTION
-            && acb->retries-- > 0) {
-            scsi_free_scsi_task(acb->task);
-            acb->task = NULL;
-            if (iscsi_aio_flush_acb(acb) == 0) {
-                iscsi_set_events(acb->iscsilun);
-                return;
-            }
-        }
-        error_report("Failed to sync10 data on iSCSI lun. %s",
-                     iscsi_get_error(iscsi));
-        acb->status = -EIO;
-    }
-
-    iscsi_schedule_bh(acb);
-}
-
-static int
-iscsi_aio_flush_acb(IscsiAIOCB *acb)
-{
-    struct iscsi_context *iscsi = acb->iscsilun->iscsi;
-
-    acb->canceled   = 0;
-    acb->bh         = NULL;
-    acb->status     = -EINPROGRESS;
-    acb->buf        = NULL;
-
-    acb->task = iscsi_synchronizecache10_task(iscsi, acb->iscsilun->lun,
-                                         0, 0, 0, 0,
-                                         iscsi_synccache10_cb,
-                                         acb);
-    if (acb->task == NULL) {
-        error_report("iSCSI: Failed to send synchronizecache10 command. %s",
-                     iscsi_get_error(iscsi));
-        return -1;
+    if (iTask.status != SCSI_STATUS_GOOD) {
+        return -EIO;
    }

    return 0;
 }

-static BlockDriverAIOCB *
-iscsi_aio_flush(BlockDriverState *bs,
-                BlockDriverCompletionFunc *cb, void *opaque)
+static int coroutine_fn iscsi_co_flush(BlockDriverState *bs)
 {
    IscsiLun *iscsilun = bs->opaque;
+    struct IscsiTask iTask;

-    IscsiAIOCB *acb;
+    iscsi_co_init_iscsitask(iscsilun, &iTask);

-    acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);
-
-    acb->iscsilun    = iscsilun;
-    acb->retries     = ISCSI_CMD_RETRIES;
-
-    if (iscsi_aio_flush_acb(acb) != 0) {
-        qemu_aio_release(acb);
-        return NULL;
+retry:
+    if (iscsi_synchronizecache10_task(iscsilun->iscsi, iscsilun->lun, 0, 0, 0,
+                                      0, iscsi_co_generic_cb, &iTask) == NULL) {
+        return -EIO;
    }

-    iscsi_set_events(iscsilun);
+    while (!iTask.complete) {
+        iscsi_set_events(iscsilun);
+        qemu_coroutine_yield();
+    }

-    return &acb->common;
+    if (iTask.task != NULL) {
+        scsi_free_scsi_task(iTask.task);
+        iTask.task = NULL;
+    }
+
+    if (iTask.do_retry) {
+        goto retry;
+    }
+
+    if (iTask.status != SCSI_STATUS_GOOD) {
+        return -EIO;
+    }
+
+    return 0;
 }

 #ifdef __linux__
@@ -869,7 +671,6 @@ retry:
            scsi_free_scsi_task(iTask.task);
            iTask.task = NULL;
        }
-        iTask.complete = 0;
        goto retry;
    }

@@ -923,8 +724,6 @@ coroutine_fn iscsi_co_discard(BlockDriverState *bs, int64_t sector_num,
    IscsiLun *iscsilun = bs->opaque;
    struct IscsiTask iTask;
    struct unmap_list list;
-    uint32_t nb_blocks;
-    uint32_t max_unmap;

    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
        return -EINVAL;
@@ -936,58 +735,115 @@ coroutine_fn iscsi_co_discard(BlockDriverState *bs, int64_t sector_num,
    }

    list.lba = sector_qemu2lun(sector_num, iscsilun);
-    nb_blocks = sector_qemu2lun(nb_sectors, iscsilun);
+    list.num = sector_qemu2lun(nb_sectors, iscsilun);

-    max_unmap = iscsilun->bl.max_unmap;
-    if (max_unmap == 0xffffffff) {
-        max_unmap = ISCSI_MAX_UNMAP;
+    iscsi_co_init_iscsitask(iscsilun, &iTask);
+retry:
+    if (iscsi_unmap_task(iscsilun->iscsi, iscsilun->lun, 0, 0, &list, 1,
+                     iscsi_co_generic_cb, &iTask) == NULL) {
+        return -EIO;
    }

-    while (nb_blocks > 0) {
-        iscsi_co_init_iscsitask(iscsilun, &iTask);
-        list.num = nb_blocks;
-        if (list.num > max_unmap) {
-            list.num = max_unmap;
-        }
-retry:
-        if (iscsi_unmap_task(iscsilun->iscsi, iscsilun->lun, 0, 0, &list, 1,
-                         iscsi_co_generic_cb, &iTask) == NULL) {
-            return -EIO;
-        }
+    while (!iTask.complete) {
+        iscsi_set_events(iscsilun);
+        qemu_coroutine_yield();
+    }

-        while (!iTask.complete) {
-            iscsi_set_events(iscsilun);
-            qemu_coroutine_yield();
-        }
+    if (iTask.task != NULL) {
+        scsi_free_scsi_task(iTask.task);
+        iTask.task = NULL;
+    }

-        if (iTask.task != NULL) {
-            scsi_free_scsi_task(iTask.task);
-            iTask.task = NULL;
-        }
+    if (iTask.do_retry) {
+        goto retry;
+    }

-        if (iTask.do_retry) {
-            iTask.complete = 0;
-            goto retry;
-        }
+    if (iTask.status == SCSI_STATUS_CHECK_CONDITION) {
+        /* the target might fail with a check condition if it
+           is not happy with the alignment of the UNMAP request
+           we silently fail in this case */
+        return 0;
+    }

-        if (iTask.status == SCSI_STATUS_CHECK_CONDITION) {
-            /* the target might fail with a check condition if it
-               is not happy with the alignment of the UNMAP request
-               we silently fail in this case */
-            return 0;
-        }
-
-        if (iTask.status != SCSI_STATUS_GOOD) {
-            return -EIO;
-        }
-
-        list.lba += list.num;
-        nb_blocks -= list.num;
+    if (iTask.status != SCSI_STATUS_GOOD) {
+        return -EIO;
    }

    return 0;
 }

+#if defined(SCSI_SENSE_ASCQ_CAPACITY_DATA_HAS_CHANGED)
+
+static int
+coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num,
+                                   int nb_sectors, BdrvRequestFlags flags)
+{
+    IscsiLun *iscsilun = bs->opaque;
+    struct IscsiTask iTask;
+    uint64_t lba;
+    uint32_t nb_blocks;
+
+    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
+        return -EINVAL;
+    }
+
+    if (!(flags & BDRV_REQ_MAY_UNMAP) && !iscsilun->has_write_same) {
+        /* WRITE SAME without UNMAP is not supported by the target */
+        return -ENOTSUP;
+    }
+
+    if ((flags & BDRV_REQ_MAY_UNMAP) && !iscsilun->lbp.lbpws) {
+        /* WRITE SAME with UNMAP is not supported by the target */
+        return -ENOTSUP;
+    }
+
+    lba = sector_qemu2lun(sector_num, iscsilun);
+    nb_blocks = sector_qemu2lun(nb_sectors, iscsilun);
+
+    if (iscsilun->zeroblock == NULL) {
+        iscsilun->zeroblock = g_malloc0(iscsilun->block_size);
+    }
+
+    iscsi_co_init_iscsitask(iscsilun, &iTask);
+retry:
+    if (iscsi_writesame16_task(iscsilun->iscsi, iscsilun->lun, lba,
+                               iscsilun->zeroblock, iscsilun->block_size,
+                               nb_blocks, 0, !!(flags & BDRV_REQ_MAY_UNMAP),
+                               0, 0, iscsi_co_generic_cb, &iTask) == NULL) {
+        return -EIO;
+    }
+
+    while (!iTask.complete) {
+        iscsi_set_events(iscsilun);
+        qemu_coroutine_yield();
+    }
+
+    if (iTask.task != NULL) {
+        scsi_free_scsi_task(iTask.task);
+        iTask.task = NULL;
+    }
+
+    if (iTask.do_retry) {
+        goto retry;
+    }
+
+    if (iTask.status != SCSI_STATUS_GOOD) {
+        if (iTask.status == SCSI_STATUS_CHECK_CONDITION &&
+            iTask.task->sense.key == SCSI_SENSE_ILLEGAL_REQUEST &&
+            iTask.task->sense.ascq == SCSI_SENSE_ASCQ_INVALID_OPERATION_CODE) {
+            /* WRITE SAME is not supported by the target */
+            iscsilun->has_write_same = false;
+            return -ENOTSUP;
+        }
+
+        return -EIO;
+    }
+
+    return 0;
+}
+
+#endif /* SCSI_SENSE_ASCQ_CAPACITY_DATA_HAS_CHANGED */
+
 static int parse_chap(struct iscsi_context *iscsi, const char *target)
 {
    QemuOptsList *list;
@@ -1343,6 +1199,7 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
    }

    iscsilun->type = inq->periperal_device_type;
+    iscsilun->has_write_same = true;

    if ((ret = iscsi_readcapacity_sync(iscsilun)) != 0) {
        goto out;
@@ -1396,6 +1253,23 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
               sizeof(struct scsi_inquiry_block_limits));
        scsi_free_scsi_task(task);
        task = NULL;
+
+        if (iscsilun->bl.max_unmap < 0xffffffff) {
+            bs->bl.max_discard = sector_lun2qemu(iscsilun->bl.max_unmap,
+                                                 iscsilun);
+        }
+        bs->bl.discard_alignment = sector_lun2qemu(iscsilun->bl.opt_unmap_gran,
+                                                   iscsilun);
+
+        if (iscsilun->bl.max_ws_len < 0xffffffff) {
+            bs->bl.max_write_zeroes = sector_lun2qemu(iscsilun->bl.max_ws_len,
+                                                      iscsilun);
+        }
+        bs->bl.write_zeroes_alignment = sector_lun2qemu(iscsilun->bl.opt_unmap_gran,
+                                                        iscsilun);
+
+        bs->bl.opt_transfer_length = sector_lun2qemu(iscsilun->bl.opt_xfer_len,
+                                                     iscsilun);
    }

 #if defined(LIBISCSI_FEATURE_NOP_COUNTER)
@@ -1436,6 +1310,7 @@ static void iscsi_close(BlockDriverState *bs)
    }
    qemu_aio_set_fd_handler(iscsi_get_fd(iscsi), NULL, NULL, NULL);
    iscsi_destroy_context(iscsi);
+    g_free(iscsilun->zeroblock);
    memset(iscsilun, 0, sizeof(IscsiLun));
 }

@@ -1459,11 +1334,6 @@ static int iscsi_truncate(BlockDriverState *bs, int64_t offset)
    return 0;
 }

-static int iscsi_has_zero_init(BlockDriverState *bs)
-{
-    return 0;
-}
-
 static int iscsi_create(const char *filename, QEMUOptionParameter *options,
                        Error **errp)
 {
@@ -1518,6 +1388,21 @@ out:
    return ret;
 }

+static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+    IscsiLun *iscsilun = bs->opaque;
+    bdi->unallocated_blocks_are_zero = !!iscsilun->lbprz;
+    bdi->can_write_zeroes_with_unmap = iscsilun->lbprz && iscsilun->lbp.lbpws;
+    /* Guess the internal cluster (page) size of the iscsi target by the means
+     * of opt_unmap_gran. Transfer the unmap granularity only if it has a
+     * reasonable size for bdi->cluster_size */
+    if (iscsilun->bl.opt_unmap_gran * iscsilun->block_size >= 64 * 1024 &&
+        iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 1024) {
+        bdi->cluster_size = iscsilun->bl.opt_unmap_gran * iscsilun->block_size;
+    }
+    return 0;
+}
+
 static QEMUOptionParameter iscsi_create_options[] = {
    {
        .name = BLOCK_OPT_SIZE,
@@ -1539,18 +1424,19 @@ static BlockDriver bdrv_iscsi = {
    .create_options  = iscsi_create_options,

    .bdrv_getlength  = iscsi_getlength,
+    .bdrv_get_info   = iscsi_get_info,
    .bdrv_truncate   = iscsi_truncate,

 #if defined(LIBISCSI_FEATURE_IOVECTOR)
    .bdrv_co_get_block_status = iscsi_co_get_block_status,
 #endif
    .bdrv_co_discard      = iscsi_co_discard,
-
-    .bdrv_aio_readv  = iscsi_aio_readv,
-    .bdrv_aio_writev = iscsi_aio_writev,
-    .bdrv_aio_flush  = iscsi_aio_flush,
-
-    .bdrv_has_zero_init = iscsi_has_zero_init,
+#if defined(SCSI_SENSE_ASCQ_CAPACITY_DATA_HAS_CHANGED)
+    .bdrv_co_write_zeroes = iscsi_co_write_zeroes,
+#endif
+    .bdrv_co_readv         = iscsi_co_readv,
+    .bdrv_co_writev        = iscsi_co_writev,
+    .bdrv_co_flush_to_disk = iscsi_co_flush,

 #ifdef __linux__
    .bdrv_ioctl       = iscsi_ioctl,
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -39,6 +39,7 @@ typedef struct MirrorBlockJob {
    int64_t granularity;
    size_t buf_size;
    unsigned long *cow_bitmap;
+    BdrvDirtyBitmap *dirty_bitmap;
    HBitmapIter hbi;
    uint8_t *buf;
    QSIMPLEQ_HEAD(, MirrorBuffer) buf_free;
@@ -95,14 +96,7 @@ static void mirror_iteration_done(MirrorOp *op, int ret)
    }

    g_slice_free(MirrorOp, op);
-
-    /* Enter coroutine when it is not sleeping.  The coroutine sleeps to
-     * rate-limit itself.  The coroutine will eventually resume since there is
-     * a sleep timeout so don't wake it early.
-     */
-    if (s->common.busy) {
-        qemu_coroutine_enter(s->common.co, NULL);
-    }
+    qemu_coroutine_enter(s->common.co, NULL);
 }

 static void mirror_write_complete(void *opaque, int ret)
@@ -143,19 +137,19 @@ static void mirror_read_complete(void *opaque, int ret)
                    mirror_write_complete, op);
 }

-static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
+static void coroutine_fn mirror_iteration(MirrorBlockJob *s)
 {
    BlockDriverState *source = s->common.bs;
    int nb_sectors, sectors_per_chunk, nb_chunks;
    int64_t end, sector_num, next_chunk, next_sector, hbitmap_next_sector;
-    uint64_t delay_ns;
    MirrorOp *op;

    s->sector_num = hbitmap_iter_next(&s->hbi);
    if (s->sector_num < 0) {
-        bdrv_dirty_iter_init(source, &s->hbi);
+        bdrv_dirty_iter_init(source, s->dirty_bitmap, &s->hbi);
        s->sector_num = hbitmap_iter_next(&s->hbi);
-        trace_mirror_restart_iter(s, bdrv_get_dirty_count(source));
+        trace_mirror_restart_iter(s,
+                                  bdrv_get_dirty_count(source, s->dirty_bitmap));
        assert(s->sector_num >= 0);
    }

@@ -191,7 +185,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
    do {
        int added_sectors, added_chunks;

-        if (!bdrv_get_dirty(source, next_sector) ||
+        if (!bdrv_get_dirty(source, s->dirty_bitmap, next_sector) ||
            test_bit(next_chunk, s->in_flight_bitmap)) {
            assert(nb_sectors > 0);
            break;
@@ -235,12 +229,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
        nb_chunks += added_chunks;
        next_sector += added_sectors;
        next_chunk += added_chunks;
-        if (!s->synced && s->common.speed) {
-            delay_ns = ratelimit_calculate_delay(&s->limit, added_sectors);
-        } else {
-            delay_ns = 0;
-        }
-    } while (delay_ns == 0 && next_sector < end);
+    } while (next_sector < end);

    /* Allocate a MirrorOp that is used as an AIO callback.  */
    op = g_slice_new(MirrorOp);
@@ -262,7 +251,8 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
        /* Advance the HBitmapIter in parallel, so that we do not examine
         * the same sector twice.
         */
-        if (next_sector > hbitmap_next_sector && bdrv_get_dirty(source, next_sector)) {
+        if (next_sector > hbitmap_next_sector
+            && bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) {
            hbitmap_next_sector = hbitmap_iter_next(&s->hbi);
        }

@@ -276,7 +266,6 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
    trace_mirror_one_iteration(s, sector_num, nb_sectors);
    bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
                   mirror_read_complete, op);
-    return delay_ns;
 }

 static void mirror_free_init(MirrorBlockJob *s)
@@ -369,10 +358,10 @@ static void coroutine_fn mirror_run(void *opaque)
        }
    }

-    bdrv_dirty_iter_init(bs, &s->hbi);
+    bdrv_dirty_iter_init(bs, s->dirty_bitmap, &s->hbi);
    last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    for (;;) {
-        uint64_t delay_ns = 0;
+        uint64_t delay_ns;
        int64_t cnt;
        bool should_complete;

@@ -381,7 +370,7 @@ static void coroutine_fn mirror_run(void *opaque)
            goto immediate_exit;
        }

-        cnt = bdrv_get_dirty_count(bs);
+        cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap);

        /* Note that even when no rate limit is applied we need to yield
         * periodically with no pending I/O so that qemu_aio_flush() returns.
@@ -396,10 +385,8 @@ static void coroutine_fn mirror_run(void *opaque)
                qemu_coroutine_yield();
                continue;
            } else if (cnt != 0) {
-                delay_ns = mirror_iteration(s);
-                if (delay_ns == 0) {
-                    continue;
-                }
+                mirror_iteration(s);
+                continue;
            }
        }

@@ -425,7 +412,7 @@ static void coroutine_fn mirror_run(void *opaque)

                should_complete = s->should_complete ||
                    block_job_is_cancelled(&s->common);
-                cnt = bdrv_get_dirty_count(bs);
+                cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap);
            }
        }

@@ -440,14 +427,21 @@ static void coroutine_fn mirror_run(void *opaque)
             */
            trace_mirror_before_drain(s, cnt);
            bdrv_drain_all();
-            cnt = bdrv_get_dirty_count(bs);
+            cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap);
        }

        ret = 0;
-        trace_mirror_before_sleep(s, cnt, s->synced, delay_ns);
+        trace_mirror_before_sleep(s, cnt, s->synced);
        if (!s->synced) {
            /* Publish progress */
            s->common.offset = (end - cnt) * BDRV_SECTOR_SIZE;
+
+            if (s->common.speed) {
+                delay_ns = ratelimit_calculate_delay(&s->limit, sectors_per_chunk);
+            } else {
+                delay_ns = 0;
+            }
+
            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
            if (block_job_is_cancelled(&s->common)) {
                break;
@@ -480,7 +474,7 @@ immediate_exit:
    qemu_vfree(s->buf);
    g_free(s->cow_bitmap);
    g_free(s->in_flight_bitmap);
-    bdrv_set_dirty_tracking(bs, 0);
+    bdrv_release_dirty_bitmap(bs, s->dirty_bitmap);
    bdrv_iostatus_disable(s->target);
    if (s->should_complete && ret == 0) {
        if (bdrv_get_flags(s->target) != bdrv_get_flags(s->common.bs)) {
@@ -584,7 +578,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target,
    s->granularity = granularity;
    s->buf_size = MAX(buf_size, granularity);

-    bdrv_set_dirty_tracking(bs, granularity);
+    s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity);
    bdrv_set_enable_write_cache(s->target, true);
    bdrv_set_on_error(s->target, on_target_error, on_target_error);
    bdrv_iostatus_enable(s->target);
--- a/block/nbd-client.c
+++ b/block/nbd-client.c
@@ -0,0 +1,385 @@
+/*
+ * QEMU Block driver for  NBD
+ *
+ * Copyright (C) 2008 Bull S.A.S.
+ *     Author: Laurent Vivier <Laurent.Vivier@bull.net>
+ *
+ * Some parts:
+ *    Copyright (C) 2007 Anthony Liguori <anthony@codemonkey.ws>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "nbd-client.h"
+#include "qemu/sockets.h"
+
+#define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)bs))
+#define INDEX_TO_HANDLE(bs, index)  ((index)  ^ ((uint64_t)(intptr_t)bs))
+
+static void nbd_recv_coroutines_enter_all(NbdClientSession *s)
+{
+    int i;
+
+    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
+        if (s->recv_coroutine[i]) {
+            qemu_coroutine_enter(s->recv_coroutine[i], NULL);
+        }
+    }
+}
+
+static void nbd_reply_ready(void *opaque)
+{
+    NbdClientSession *s = opaque;
+    uint64_t i;
+    int ret;
+
+    if (s->reply.handle == 0) {
+        /* No reply already in flight.  Fetch a header.  It is possible
+         * that another thread has done the same thing in parallel, so
+         * the socket is not readable anymore.
+         */
+        ret = nbd_receive_reply(s->sock, &s->reply);
+        if (ret == -EAGAIN) {
+            return;
+        }
+        if (ret < 0) {
+            s->reply.handle = 0;
+            goto fail;
+        }
+    }
+
+    /* There's no need for a mutex on the receive side, because the
+     * handler acts as a synchronization point and ensures that only
+     * one coroutine is called until the reply finishes.  */
+    i = HANDLE_TO_INDEX(s, s->reply.handle);
+    if (i >= MAX_NBD_REQUESTS) {
+        goto fail;
+    }
+
+    if (s->recv_coroutine[i]) {
+        qemu_coroutine_enter(s->recv_coroutine[i], NULL);
+        return;
+    }
+
+fail:
+    nbd_recv_coroutines_enter_all(s);
+}
+
+static void nbd_restart_write(void *opaque)
+{
+    NbdClientSession *s = opaque;
+
+    qemu_coroutine_enter(s->send_coroutine, NULL);
+}
+
+static int nbd_co_send_request(NbdClientSession *s,
+    struct nbd_request *request,
+    QEMUIOVector *qiov, int offset)
+{
+    int rc, ret;
+
+    qemu_co_mutex_lock(&s->send_mutex);
+    s->send_coroutine = qemu_coroutine_self();
+    qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, nbd_restart_write, s);
+    if (qiov) {
+        if (!s->is_unix) {
+            socket_set_cork(s->sock, 1);
+        }
+        rc = nbd_send_request(s->sock, request);
+        if (rc >= 0) {
+            ret = qemu_co_sendv(s->sock, qiov->iov, qiov->niov,
+                                offset, request->len);
+            if (ret != request->len) {
+                rc = -EIO;
+            }
+        }
+        if (!s->is_unix) {
+            socket_set_cork(s->sock, 0);
+        }
+    } else {
+        rc = nbd_send_request(s->sock, request);
+    }
+    qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, NULL, s);
+    s->send_coroutine = NULL;
+    qemu_co_mutex_unlock(&s->send_mutex);
+    return rc;
+}
+
+static void nbd_co_receive_reply(NbdClientSession *s,
+    struct nbd_request *request, struct nbd_reply *reply,
+    QEMUIOVector *qiov, int offset)
+{
+    int ret;
+
+    /* Wait until we're woken up by the read handler.  TODO: perhaps
+     * peek at the next reply and avoid yielding if it's ours?  */
+    qemu_coroutine_yield();
+    *reply = s->reply;
+    if (reply->handle != request->handle) {
+        reply->error = EIO;
+    } else {
+        if (qiov && reply->error == 0) {
+            ret = qemu_co_recvv(s->sock, qiov->iov, qiov->niov,
+                                offset, request->len);
+            if (ret != request->len) {
+                reply->error = EIO;
+            }
+        }
+
+        /* Tell the read handler to read another header.  */
+        s->reply.handle = 0;
+    }
+}
+
+static void nbd_coroutine_start(NbdClientSession *s,
+   struct nbd_request *request)
+{
+    int i;
+
+    /* Poor man semaphore.  The free_sema is locked when no other request
+     * can be accepted, and unlocked after receiving one reply.  */
+    if (s->in_flight >= MAX_NBD_REQUESTS - 1) {
+        qemu_co_mutex_lock(&s->free_sema);
+        assert(s->in_flight < MAX_NBD_REQUESTS);
+    }
+    s->in_flight++;
+
+    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
+        if (s->recv_coroutine[i] == NULL) {
+            s->recv_coroutine[i] = qemu_coroutine_self();
+            break;
+        }
+    }
+
+    assert(i < MAX_NBD_REQUESTS);
+    request->handle = INDEX_TO_HANDLE(s, i);
+}
+
+static void nbd_coroutine_end(NbdClientSession *s,
+    struct nbd_request *request)
+{
+    int i = HANDLE_TO_INDEX(s, request->handle);
+    s->recv_coroutine[i] = NULL;
+    if (s->in_flight-- == MAX_NBD_REQUESTS) {
+        qemu_co_mutex_unlock(&s->free_sema);
+    }
+}
+
+static int nbd_co_readv_1(NbdClientSession *client, int64_t sector_num,
+                          int nb_sectors, QEMUIOVector *qiov,
+                          int offset)
+{
+    struct nbd_request request = { .type = NBD_CMD_READ };
+    struct nbd_reply reply;
+    ssize_t ret;
+
+    request.from = sector_num * 512;
+    request.len = nb_sectors * 512;
+
+    nbd_coroutine_start(client, &request);
+    ret = nbd_co_send_request(client, &request, NULL, 0);
+    if (ret < 0) {
+        reply.error = -ret;
+    } else {
+        nbd_co_receive_reply(client, &request, &reply, qiov, offset);
+    }
+    nbd_coroutine_end(client, &request);
+    return -reply.error;
+
+}
+
+static int nbd_co_writev_1(NbdClientSession *client, int64_t sector_num,
+                           int nb_sectors, QEMUIOVector *qiov,
+                           int offset)
+{
+    struct nbd_request request = { .type = NBD_CMD_WRITE };
+    struct nbd_reply reply;
+    ssize_t ret;
+
+    if (!bdrv_enable_write_cache(client->bs) &&
+        (client->nbdflags & NBD_FLAG_SEND_FUA)) {
+        request.type |= NBD_CMD_FLAG_FUA;
+    }
+
+    request.from = sector_num * 512;
+    request.len = nb_sectors * 512;
+
+    nbd_coroutine_start(client, &request);
+    ret = nbd_co_send_request(client, &request, qiov, offset);
+    if (ret < 0) {
+        reply.error = -ret;
+    } else {
+        nbd_co_receive_reply(client, &request, &reply, NULL, 0);
+    }
+    nbd_coroutine_end(client, &request);
+    return -reply.error;
+}
+
+/* qemu-nbd has a limit of slightly less than 1M per request.  Try to
+ * remain aligned to 4K. */
+#define NBD_MAX_SECTORS 2040
+
+int nbd_client_session_co_readv(NbdClientSession *client, int64_t sector_num,
+    int nb_sectors, QEMUIOVector *qiov)
+{
+    int offset = 0;
+    int ret;
+    while (nb_sectors > NBD_MAX_SECTORS) {
+        ret = nbd_co_readv_1(client, sector_num,
+                             NBD_MAX_SECTORS, qiov, offset);
+        if (ret < 0) {
+            return ret;
+        }
+        offset += NBD_MAX_SECTORS * 512;
+        sector_num += NBD_MAX_SECTORS;
+        nb_sectors -= NBD_MAX_SECTORS;
+    }
+    return nbd_co_readv_1(client, sector_num, nb_sectors, qiov, offset);
+}
+
+int nbd_client_session_co_writev(NbdClientSession *client, int64_t sector_num,
+                                 int nb_sectors, QEMUIOVector *qiov)
+{
+    int offset = 0;
+    int ret;
+    while (nb_sectors > NBD_MAX_SECTORS) {
+        ret = nbd_co_writev_1(client, sector_num,
+                              NBD_MAX_SECTORS, qiov, offset);
+        if (ret < 0) {
+            return ret;
+        }
+        offset += NBD_MAX_SECTORS * 512;
+        sector_num += NBD_MAX_SECTORS;
+        nb_sectors -= NBD_MAX_SECTORS;
+    }
+    return nbd_co_writev_1(client, sector_num, nb_sectors, qiov, offset);
+}
+
+int nbd_client_session_co_flush(NbdClientSession *client)
+{
+    struct nbd_request request = { .type = NBD_CMD_FLUSH };
+    struct nbd_reply reply;
+    ssize_t ret;
+
+    if (!(client->nbdflags & NBD_FLAG_SEND_FLUSH)) {
+        return 0;
+    }
+
+    if (client->nbdflags & NBD_FLAG_SEND_FUA) {
+        request.type |= NBD_CMD_FLAG_FUA;
+    }
+
+    request.from = 0;
+    request.len = 0;
+
+    nbd_coroutine_start(client, &request);
+    ret = nbd_co_send_request(client, &request, NULL, 0);
+    if (ret < 0) {
+        reply.error = -ret;
+    } else {
+        nbd_co_receive_reply(client, &request, &reply, NULL, 0);
+    }
+    nbd_coroutine_end(client, &request);
+    return -reply.error;
+}
+
+int nbd_client_session_co_discard(NbdClientSession *client, int64_t sector_num,
+    int nb_sectors)
+{
+    struct nbd_request request = { .type = NBD_CMD_TRIM };
+    struct nbd_reply reply;
+    ssize_t ret;
+
+    if (!(client->nbdflags & NBD_FLAG_SEND_TRIM)) {
+        return 0;
+    }
+    request.from = sector_num * 512;
+    request.len = nb_sectors * 512;
+
+    nbd_coroutine_start(client, &request);
+    ret = nbd_co_send_request(client, &request, NULL, 0);
+    if (ret < 0) {
+        reply.error = -ret;
+    } else {
+        nbd_co_receive_reply(client, &request, &reply, NULL, 0);
+    }
+    nbd_coroutine_end(client, &request);
+    return -reply.error;
+
+}
+
+static void nbd_teardown_connection(NbdClientSession *client)
+{
+    struct nbd_request request = {
+        .type = NBD_CMD_DISC,
+        .from = 0,
+        .len = 0
+    };
+
+    nbd_send_request(client->sock, &request);
+
+    /* finish any pending coroutines */
+    shutdown(client->sock, 2);
+    nbd_recv_coroutines_enter_all(client);
+
+    qemu_aio_set_fd_handler(client->sock, NULL, NULL, NULL);
+    closesocket(client->sock);
+    client->sock = -1;
+}
+
+void nbd_client_session_close(NbdClientSession *client)
+{
+    if (!client->bs) {
+        return;
+    }
+
+    nbd_teardown_connection(client);
+    client->bs = NULL;
+}
+
+int nbd_client_session_init(NbdClientSession *client, BlockDriverState *bs,
+    int sock, const char *export)
+{
+    int ret;
+
+    /* NBD handshake */
+    logout("session init %s\n", export);
+    qemu_set_block(sock);
+    ret = nbd_receive_negotiate(sock, export,
+                                &client->nbdflags, &client->size,
+                                &client->blocksize);
+    if (ret < 0) {
+        logout("Failed to negotiate with the NBD server\n");
+        closesocket(sock);
+        return ret;
+    }
+
+    qemu_co_mutex_init(&client->send_mutex);
+    qemu_co_mutex_init(&client->free_sema);
+    client->bs = bs;
+    client->sock = sock;
+
+    /* Now that we're connected, set the socket to be non-blocking and
+     * kick the reply mechanism.  */
+    qemu_set_nonblock(sock);
+    qemu_aio_set_fd_handler(sock, nbd_reply_ready, NULL, client);
+
+    logout("Established connection with NBD server\n");
+    return 0;
+}
--- a/block/nbd-client.h
+++ b/block/nbd-client.h
@@ -0,0 +1,50 @@
+#ifndef NBD_CLIENT_H
+#define NBD_CLIENT_H
+
+#include "qemu-common.h"
+#include "block/nbd.h"
+#include "block/block_int.h"
+
+/* #define DEBUG_NBD */
+
+#if defined(DEBUG_NBD)
+#define logout(fmt, ...) \
+    fprintf(stderr, "nbd\t%-24s" fmt, __func__, ##__VA_ARGS__)
+#else
+#define logout(fmt, ...) ((void)0)
+#endif
+
+#define MAX_NBD_REQUESTS    16
+
+typedef struct NbdClientSession {
+    int sock;
+    uint32_t nbdflags;
+    off_t size;
+    size_t blocksize;
+
+    CoMutex send_mutex;
+    CoMutex free_sema;
+    Coroutine *send_coroutine;
+    int in_flight;
+
+    Coroutine *recv_coroutine[MAX_NBD_REQUESTS];
+    struct nbd_reply reply;
+
+    bool is_unix;
+
+    BlockDriverState *bs;
+} NbdClientSession;
+
+int nbd_client_session_init(NbdClientSession *client, BlockDriverState *bs,
+                            int sock, const char *export_name);
+void nbd_client_session_close(NbdClientSession *client);
+
+int nbd_client_session_co_discard(NbdClientSession *client, int64_t sector_num,
+                                  int nb_sectors);
+int nbd_client_session_co_flush(NbdClientSession *client);
+int nbd_client_session_co_writev(NbdClientSession *client, int64_t sector_num,
+                                 int nb_sectors, QEMUIOVector *qiov);
+int nbd_client_session_co_readv(NbdClientSession *client, int64_t sector_num,
+                                int nb_sectors, QEMUIOVector *qiov);
+
+#endif /* NBD_CLIENT_H */
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -26,8 +26,7 @@
 * THE SOFTWARE.
 */

-#include "qemu-common.h"
-#include "block/nbd.h"
+#include "block/nbd-client.h"
 #include "qemu/uri.h"
 #include "block/block_int.h"
 #include "qemu/module.h"
@@ -40,37 +39,9 @@

 #define EN_OPTSTR ":exportname="

-/* #define DEBUG_NBD */
-
-#if defined(DEBUG_NBD)
-#define logout(fmt, ...) \
-                fprintf(stderr, "nbd\t%-24s" fmt, __func__, ##__VA_ARGS__)
-#else
-#define logout(fmt, ...) ((void)0)
-#endif
-
-#define MAX_NBD_REQUESTS	16
-#define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)bs))
-#define INDEX_TO_HANDLE(bs, index)  ((index)  ^ ((uint64_t)(intptr_t)bs))
-
 typedef struct BDRVNBDState {
-    int sock;
-    uint32_t nbdflags;
-    off_t size;
-    size_t blocksize;
-
-    CoMutex send_mutex;
-    CoMutex free_sema;
-    Coroutine *send_coroutine;
-    int in_flight;
-
-    Coroutine *recv_coroutine[MAX_NBD_REQUESTS];
-    struct nbd_reply reply;
-
-    bool is_unix;
+    NbdClientSession client;
    QemuOpts *socket_opts;
-
-    char *export_name; /* An NBD server may export several devices */
 } BDRVNBDState;

 static int nbd_parse_uri(const char *filename, QDict *options)
@@ -217,7 +188,7 @@ out:
    g_free(file);
 }

-static int nbd_config(BDRVNBDState *s, QDict *options)
+static int nbd_config(BDRVNBDState *s, QDict *options, char **export)
 {
    Error *local_err = NULL;

@@ -227,9 +198,9 @@ static int nbd_config(BDRVNBDState *s, QDict *options)
                          "be used at the same time.");
            return -EINVAL;
        }
-        s->is_unix = true;
+        s->client.is_unix = true;
    } else if (qdict_haskey(options, "host")) {
-        s->is_unix = false;
+        s->client.is_unix = false;
    } else {
        return -EINVAL;
    }
@@ -247,162 +218,20 @@ static int nbd_config(BDRVNBDState *s, QDict *options)
        qemu_opt_set_number(s->socket_opts, "port", NBD_DEFAULT_PORT);
    }

-    s->export_name = g_strdup(qdict_get_try_str(options, "export"));
-    if (s->export_name) {
+    *export = g_strdup(qdict_get_try_str(options, "export"));
+    if (*export) {
        qdict_del(options, "export");
    }

    return 0;
 }

-
-static void nbd_coroutine_start(BDRVNBDState *s, struct nbd_request *request)
-{
-    int i;
-
-    /* Poor man semaphore.  The free_sema is locked when no other request
-     * can be accepted, and unlocked after receiving one reply.  */
-    if (s->in_flight >= MAX_NBD_REQUESTS - 1) {
-        qemu_co_mutex_lock(&s->free_sema);
-        assert(s->in_flight < MAX_NBD_REQUESTS);
-    }
-    s->in_flight++;
-
-    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
-        if (s->recv_coroutine[i] == NULL) {
-            s->recv_coroutine[i] = qemu_coroutine_self();
-            break;
-        }
-    }
-
-    assert(i < MAX_NBD_REQUESTS);
-    request->handle = INDEX_TO_HANDLE(s, i);
-}
-
-static void nbd_reply_ready(void *opaque)
-{
-    BDRVNBDState *s = opaque;
-    uint64_t i;
-    int ret;
-
-    if (s->reply.handle == 0) {
-        /* No reply already in flight.  Fetch a header.  It is possible
-         * that another thread has done the same thing in parallel, so
-         * the socket is not readable anymore.
-         */
-        ret = nbd_receive_reply(s->sock, &s->reply);
-        if (ret == -EAGAIN) {
-            return;
-        }
-        if (ret < 0) {
-            s->reply.handle = 0;
-            goto fail;
-        }
-    }
-
-    /* There's no need for a mutex on the receive side, because the
-     * handler acts as a synchronization point and ensures that only
-     * one coroutine is called until the reply finishes.  */
-    i = HANDLE_TO_INDEX(s, s->reply.handle);
-    if (i >= MAX_NBD_REQUESTS) {
-        goto fail;
-    }
-
-    if (s->recv_coroutine[i]) {
-        qemu_coroutine_enter(s->recv_coroutine[i], NULL);
-        return;
-    }
-
-fail:
-    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
-        if (s->recv_coroutine[i]) {
-            qemu_coroutine_enter(s->recv_coroutine[i], NULL);
-        }
-    }
-}
-
-static void nbd_restart_write(void *opaque)
-{
-    BDRVNBDState *s = opaque;
-    qemu_coroutine_enter(s->send_coroutine, NULL);
-}
-
-static int nbd_co_send_request(BDRVNBDState *s, struct nbd_request *request,
-                               QEMUIOVector *qiov, int offset)
-{
-    int rc, ret;
-
-    qemu_co_mutex_lock(&s->send_mutex);
-    s->send_coroutine = qemu_coroutine_self();
-    qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, nbd_restart_write, s);
-    if (qiov) {
-        if (!s->is_unix) {
-            socket_set_cork(s->sock, 1);
-        }
-        rc = nbd_send_request(s->sock, request);
-        if (rc >= 0) {
-            ret = qemu_co_sendv(s->sock, qiov->iov, qiov->niov,
-                                offset, request->len);
-            if (ret != request->len) {
-                rc = -EIO;
-            }
-        }
-        if (!s->is_unix) {
-            socket_set_cork(s->sock, 0);
-        }
-    } else {
-        rc = nbd_send_request(s->sock, request);
-    }
-    qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, NULL, s);
-    s->send_coroutine = NULL;
-    qemu_co_mutex_unlock(&s->send_mutex);
-    return rc;
-}
-
-static void nbd_co_receive_reply(BDRVNBDState *s, struct nbd_request *request,
-                                 struct nbd_reply *reply,
-                                 QEMUIOVector *qiov, int offset)
-{
-    int ret;
-
-    /* Wait until we're woken up by the read handler.  TODO: perhaps
-     * peek at the next reply and avoid yielding if it's ours?  */
-    qemu_coroutine_yield();
-    *reply = s->reply;
-    if (reply->handle != request->handle) {
-        reply->error = EIO;
-    } else {
-        if (qiov && reply->error == 0) {
-            ret = qemu_co_recvv(s->sock, qiov->iov, qiov->niov,
-                                offset, request->len);
-            if (ret != request->len) {
-                reply->error = EIO;
-            }
-        }
-
-        /* Tell the read handler to read another header.  */
-        s->reply.handle = 0;
-    }
-}
-
-static void nbd_coroutine_end(BDRVNBDState *s, struct nbd_request *request)
-{
-    int i = HANDLE_TO_INDEX(s, request->handle);
-    s->recv_coroutine[i] = NULL;
-    if (s->in_flight-- == MAX_NBD_REQUESTS) {
-        qemu_co_mutex_unlock(&s->free_sema);
-    }
-}
-
 static int nbd_establish_connection(BlockDriverState *bs)
 {
    BDRVNBDState *s = bs->opaque;
    int sock;
-    int ret;
-    off_t size;
-    size_t blocksize;

-    if (s->is_unix) {
+    if (s->client.is_unix) {
        sock = unix_socket_outgoing(qemu_opt_get(s->socket_opts, "path"));
    } else {
        sock = tcp_socket_outgoing_opts(s->socket_opts);
@@ -417,53 +246,18 @@ static int nbd_establish_connection(BlockDriverState *bs)
        return -errno;
    }

-    /* NBD handshake */
-    ret = nbd_receive_negotiate(sock, s->export_name, &s->nbdflags, &size,
-                                &blocksize);
-    if (ret < 0) {
-        logout("Failed to negotiate with the NBD server\n");
-        closesocket(sock);
-        return ret;
-    }
-
-    /* Now that we're connected, set the socket to be non-blocking and
-     * kick the reply mechanism.  */
-    qemu_set_nonblock(sock);
-    qemu_aio_set_fd_handler(sock, nbd_reply_ready, NULL, s);
-
-    s->sock = sock;
-    s->size = size;
-    s->blocksize = blocksize;
-
-    logout("Established connection with NBD server\n");
-    return 0;
-}
-
-static void nbd_teardown_connection(BlockDriverState *bs)
-{
-    BDRVNBDState *s = bs->opaque;
-    struct nbd_request request;
-
-    request.type = NBD_CMD_DISC;
-    request.from = 0;
-    request.len = 0;
-    nbd_send_request(s->sock, &request);
-
-    qemu_aio_set_fd_handler(s->sock, NULL, NULL, NULL);
-    closesocket(s->sock);
+    return sock;
 }

 static int nbd_open(BlockDriverState *bs, QDict *options, int flags,
                    Error **errp)
 {
    BDRVNBDState *s = bs->opaque;
-    int result;
-
-    qemu_co_mutex_init(&s->send_mutex);
-    qemu_co_mutex_init(&s->free_sema);
+    char *export = NULL;
+    int result, sock;

    /* Pop the config into our state object. Exit if invalid. */
-    result = nbd_config(s, options);
+    result = nbd_config(s, options, &export);
    if (result != 0) {
        return result;
    }
@@ -471,172 +265,64 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags,
    /* establish TCP connection, return error if it fails
     * TODO: Configurable retry-until-timeout behaviour.
     */
-    result = nbd_establish_connection(bs);
+    sock = nbd_establish_connection(bs);
+    if (sock < 0) {
+        return sock;
+    }

+    /* NBD handshake */
+    result = nbd_client_session_init(&s->client, bs, sock, export);
+    g_free(export);
    return result;
 }

-static int nbd_co_readv_1(BlockDriverState *bs, int64_t sector_num,
-                          int nb_sectors, QEMUIOVector *qiov,
-                          int offset)
-{
-    BDRVNBDState *s = bs->opaque;
-    struct nbd_request request;
-    struct nbd_reply reply;
-    ssize_t ret;
-
-    request.type = NBD_CMD_READ;
-    request.from = sector_num * 512;
-    request.len = nb_sectors * 512;
-
-    nbd_coroutine_start(s, &request);
-    ret = nbd_co_send_request(s, &request, NULL, 0);
-    if (ret < 0) {
-        reply.error = -ret;
-    } else {
-        nbd_co_receive_reply(s, &request, &reply, qiov, offset);
-    }
-    nbd_coroutine_end(s, &request);
-    return -reply.error;
-
-}
-
-static int nbd_co_writev_1(BlockDriverState *bs, int64_t sector_num,
-                           int nb_sectors, QEMUIOVector *qiov,
-                           int offset)
-{
-    BDRVNBDState *s = bs->opaque;
-    struct nbd_request request;
-    struct nbd_reply reply;
-    ssize_t ret;
-
-    request.type = NBD_CMD_WRITE;
-    if (!bdrv_enable_write_cache(bs) && (s->nbdflags & NBD_FLAG_SEND_FUA)) {
-        request.type |= NBD_CMD_FLAG_FUA;
-    }
-
-    request.from = sector_num * 512;
-    request.len = nb_sectors * 512;
-
-    nbd_coroutine_start(s, &request);
-    ret = nbd_co_send_request(s, &request, qiov, offset);
-    if (ret < 0) {
-        reply.error = -ret;
-    } else {
-        nbd_co_receive_reply(s, &request, &reply, NULL, 0);
-    }
-    nbd_coroutine_end(s, &request);
-    return -reply.error;
-}
-
-/* qemu-nbd has a limit of slightly less than 1M per request.  Try to
- * remain aligned to 4K. */
-#define NBD_MAX_SECTORS 2040
-
 static int nbd_co_readv(BlockDriverState *bs, int64_t sector_num,
                        int nb_sectors, QEMUIOVector *qiov)
 {
-    int offset = 0;
-    int ret;
-    while (nb_sectors > NBD_MAX_SECTORS) {
-        ret = nbd_co_readv_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset);
-        if (ret < 0) {
-            return ret;
-        }
-        offset += NBD_MAX_SECTORS * 512;
-        sector_num += NBD_MAX_SECTORS;
-        nb_sectors -= NBD_MAX_SECTORS;
-    }
-    return nbd_co_readv_1(bs, sector_num, nb_sectors, qiov, offset);
+    BDRVNBDState *s = bs->opaque;
+
+    return nbd_client_session_co_readv(&s->client, sector_num,
+                                       nb_sectors, qiov);
 }

 static int nbd_co_writev(BlockDriverState *bs, int64_t sector_num,
                         int nb_sectors, QEMUIOVector *qiov)
 {
-    int offset = 0;
-    int ret;
-    while (nb_sectors > NBD_MAX_SECTORS) {
-        ret = nbd_co_writev_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset);
-        if (ret < 0) {
-            return ret;
-        }
-        offset += NBD_MAX_SECTORS * 512;
-        sector_num += NBD_MAX_SECTORS;
-        nb_sectors -= NBD_MAX_SECTORS;
-    }
-    return nbd_co_writev_1(bs, sector_num, nb_sectors, qiov, offset);
+    BDRVNBDState *s = bs->opaque;
+
+    return nbd_client_session_co_writev(&s->client, sector_num,
+                                        nb_sectors, qiov);
 }

 static int nbd_co_flush(BlockDriverState *bs)
 {
    BDRVNBDState *s = bs->opaque;
-    struct nbd_request request;
-    struct nbd_reply reply;
-    ssize_t ret;

-    if (!(s->nbdflags & NBD_FLAG_SEND_FLUSH)) {
-        return 0;
-    }
-
-    request.type = NBD_CMD_FLUSH;
-    if (s->nbdflags & NBD_FLAG_SEND_FUA) {
-        request.type |= NBD_CMD_FLAG_FUA;
-    }
-
-    request.from = 0;
-    request.len = 0;
-
-    nbd_coroutine_start(s, &request);
-    ret = nbd_co_send_request(s, &request, NULL, 0);
-    if (ret < 0) {
-        reply.error = -ret;
-    } else {
-        nbd_co_receive_reply(s, &request, &reply, NULL, 0);
-    }
-    nbd_coroutine_end(s, &request);
-    return -reply.error;
+    return nbd_client_session_co_flush(&s->client);
 }

 static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num,
                          int nb_sectors)
 {
    BDRVNBDState *s = bs->opaque;
-    struct nbd_request request;
-    struct nbd_reply reply;
-    ssize_t ret;

-    if (!(s->nbdflags & NBD_FLAG_SEND_TRIM)) {
-        return 0;
-    }
-    request.type = NBD_CMD_TRIM;
-    request.from = sector_num * 512;
-    request.len = nb_sectors * 512;
-
-    nbd_coroutine_start(s, &request);
-    ret = nbd_co_send_request(s, &request, NULL, 0);
-    if (ret < 0) {
-        reply.error = -ret;
-    } else {
-        nbd_co_receive_reply(s, &request, &reply, NULL, 0);
-    }
-    nbd_coroutine_end(s, &request);
-    return -reply.error;
+    return nbd_client_session_co_discard(&s->client, sector_num,
+                                         nb_sectors);
 }

 static void nbd_close(BlockDriverState *bs)
 {
    BDRVNBDState *s = bs->opaque;
-    g_free(s->export_name);
-    qemu_opts_del(s->socket_opts);

-    nbd_teardown_connection(bs);
+    qemu_opts_del(s->socket_opts);
+    nbd_client_session_close(&s->client);
 }

 static int64_t nbd_getlength(BlockDriverState *bs)
 {
    BDRVNBDState *s = bs->opaque;

-    return s->size;
+    return s->client.size;
 }

 static BlockDriver bdrv_nbd = {
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -49,9 +49,9 @@ typedef struct BDRVParallelsState {
    CoMutex lock;

    uint32_t *catalog_bitmap;
-    unsigned int catalog_size;
+    int catalog_size;

-    unsigned int tracks;
+    int tracks;
 } BDRVParallelsState;

 static int parallels_probe(const uint8_t *buf, int buf_size, const char *filename)
@@ -92,18 +92,8 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
    bs->total_sectors = le32_to_cpu(ph.nb_sectors);

    s->tracks = le32_to_cpu(ph.tracks);
-    if (s->tracks == 0) {
-        error_setg(errp, "Invalid image: Zero sectors per track");
-        ret = -EINVAL;
-        goto fail;
-    }

    s->catalog_size = le32_to_cpu(ph.catalog_entries);
-    if (s->catalog_size > INT_MAX / 4) {
-        error_setg(errp, "Catalog too large");
-        ret = -EFBIG;
-        goto fail;
-    }
    s->catalog_bitmap = g_malloc(s->catalog_size * 4);

    ret = bdrv_pread(bs->file, 64, s->catalog_bitmap, s->catalog_size * 4);
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -204,12 +204,9 @@ void bdrv_query_info(BlockDriverState *bs,
        info->io_status = bs->iostatus;
    }

-    if (bs->dirty_bitmap) {
-        info->has_dirty = true;
-        info->dirty = g_malloc0(sizeof(*info->dirty));
-        info->dirty->count = bdrv_get_dirty_count(bs) * BDRV_SECTOR_SIZE;
-        info->dirty->granularity =
-         ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bs->dirty_bitmap));
+    if (!QLIST_EMPTY(&bs->dirty_bitmaps)) {
+        info->has_dirty_bitmaps = true;
+        info->dirty_bitmaps = bdrv_query_dirty_bitmaps(bs);
    }

    if (bs->drv) {
@@ -471,7 +468,6 @@ static void dump_qobject(fprintf_function func_fprintf, void *f,
        case QTYPE_QERROR: {
            QString *value = qerror_human((QError *)obj);
            func_fprintf(f, "%s", qstring_get_str(value));
-            QDECREF(value);
            break;
        }
        case QTYPE_NONE:
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -48,10 +48,9 @@ typedef struct QCowHeader {
    uint64_t size; /* in bytes */
    uint8_t cluster_bits;
    uint8_t l2_bits;
-    uint16_t padding;
    uint32_t crypt_method;
    uint64_t l1_table_offset;
-} QEMU_PACKED QCowHeader;
+} QCowHeader;

 #define L2_CACHE_SIZE 16

@@ -61,7 +60,7 @@ typedef struct BDRVQcowState {
    int cluster_sectors;
    int l2_bits;
    int l2_size;
-    unsigned int l1_size;
+    int l1_size;
    uint64_t cluster_offset_mask;
    uint64_t l1_table_offset;
    uint64_t *l1_table;
@@ -97,8 +96,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
                     Error **errp)
 {
    BDRVQcowState *s = bs->opaque;
-    unsigned int len, i, shift;
-    int ret;
+    int len, i, shift, ret;
    QCowHeader header;

    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
@@ -127,25 +125,10 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
        goto fail;
    }

-    if (header.size <= 1) {
-        error_setg(errp, "Image size is too small (must be at least 2 bytes)");
+    if (header.size <= 1 || header.cluster_bits < 9) {
        ret = -EINVAL;
        goto fail;
    }
-    if (header.cluster_bits < 9 || header.cluster_bits > 16) {
-        error_setg(errp, "Cluster size must be between 512 and 64k");
-        ret = -EINVAL;
-        goto fail;
-    }
-
-    /* l2_bits specifies number of entries; storing a uint64_t in each entry,
-     * so bytes = num_entries << 3. */
-    if (header.l2_bits < 9 - 3 || header.l2_bits > 16 - 3) {
-        error_setg(errp, "L2 table size must be between 512 and 64k");
-        ret = -EINVAL;
-        goto fail;
-    }
-
    if (header.crypt_method > QCOW_CRYPT_AES) {
        ret = -EINVAL;
        goto fail;
@@ -164,19 +147,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,

    /* read the level 1 table */
    shift = s->cluster_bits + s->l2_bits;
-    if (header.size > UINT64_MAX - (1LL << shift)) {
-        error_setg(errp, "Image too large");
-        ret = -EINVAL;
-        goto fail;
-    } else {
-        uint64_t l1_size = (header.size + (1LL << shift) - 1) >> shift;
-        if (l1_size > INT_MAX / sizeof(uint64_t)) {
-            error_setg(errp, "Image too large");
-            ret = -EINVAL;
-            goto fail;
-        }
-        s->l1_size = l1_size;
-    }
+    s->l1_size = (header.size + (1LL << shift) - 1) >> shift;

    s->l1_table_offset = header.l1_table_offset;
    s->l1_table = g_malloc(s->l1_size * sizeof(uint64_t));
@@ -200,9 +171,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
    if (header.backing_file_offset != 0) {
        len = header.backing_file_size;
        if (len > 1023) {
-            error_setg(errp, "Backing file name too long");
-            ret = -EINVAL;
-            goto fail;
+            len = 1023;
        }
        ret = bdrv_pread(bs->file, header.backing_file_offset,
                   bs->backing_file, len);
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -55,7 +55,7 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
        }
    }

-    if (new_l1_size > INT_MAX / sizeof(uint64_t)) {
+    if (new_l1_size > INT_MAX) {
        return -EFBIG;
    }

@@ -359,6 +359,15 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs,
    struct iovec iov;
    int n, ret;

+    /*
+     * If this is the last cluster and it is only partially used, we must only
+     * copy until the end of the image, or bdrv_check_request will fail for the
+     * bdrv_read/write calls below.
+     */
+    if (start_sect + n_end > bs->total_sectors) {
+        n_end = bs->total_sectors - start_sect;
+    }
+
    n = n_end - n_start;
    if (n <= 0) {
        return 0;
@@ -1392,7 +1401,7 @@ int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,

    /* Round start up and end down */
    offset = align_offset(offset, s->cluster_size);
-    end_offset &= ~(s->cluster_size - 1);
+    end_offset = start_of_cluster(s, end_offset);

    if (offset > end_offset) {
        return 0;
@@ -1604,7 +1613,7 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
            }

            ret = bdrv_write_zeroes(bs->file, offset / BDRV_SECTOR_SIZE,
-                                    s->cluster_sectors);
+                                    s->cluster_sectors, 0);
            if (ret < 0) {
                if (!preallocated) {
                    qcow2_free_clusters(bs, offset, s->cluster_size,
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -28,7 +28,7 @@
 #include "qemu/range.h"
 #include "qapi/qmp/types.h"

-static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
+static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size);
 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                            int64_t offset, int64_t length,
                            int addend, enum qcow2_discard_type type);
@@ -40,10 +40,8 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
 int qcow2_refcount_init(BlockDriverState *bs)
 {
    BDRVQcowState *s = bs->opaque;
-    unsigned int refcount_table_size2, i;
-    int ret;
+    int ret, refcount_table_size2, i;

-    assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
    refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
    s->refcount_table = g_malloc(refcount_table_size2);
    if (s->refcount_table_size > 0) {
@@ -89,7 +87,7 @@ static int load_refcount_block(BlockDriverState *bs,
 static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
 {
    BDRVQcowState *s = bs->opaque;
-    uint64_t refcount_table_index, block_index;
+    int refcount_table_index, block_index;
    int64_t refcount_block_offset;
    int ret;
    uint16_t *refcount_block;
@@ -193,11 +191,10 @@ static int alloc_refcount_block(BlockDriverState *bs,
     *   they can describe them themselves.
     *
     * - We need to consider that at this point we are inside update_refcounts
-     *   and potentially doing an initial refcount increase. This means that
-     *   some clusters have already been allocated by the caller, but their
-     *   refcount isn't accurate yet. If we allocate clusters for metadata, we
-     *   need to return -EAGAIN to signal the caller that it needs to restart
-     *   the search for free clusters.
+     *   and doing the initial refcount increase. This means that some clusters
+     *   have already been allocated by the caller, but their refcount isn't
+     *   accurate yet. free_cluster_index tells us where this allocation ends
+     *   as long as we don't overwrite it by freeing clusters.
     *
     * - alloc_clusters_noref and qcow2_free_clusters may load a different
     *   refcount block into the cache
@@ -282,10 +279,7 @@ static int alloc_refcount_block(BlockDriverState *bs,
        }

        s->refcount_table[refcount_table_index] = new_block;
-
-        /* The new refcount block may be where the caller intended to put its
-         * data, so let it restart the search. */
-        return -EAGAIN;
+        return 0;
    }

    ret = qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block);
@@ -308,11 +302,8 @@ static int alloc_refcount_block(BlockDriverState *bs,

    /* Calculate the number of refcount blocks needed so far */
    uint64_t refcount_block_clusters = 1 << (s->cluster_bits - REFCOUNT_SHIFT);
-    uint64_t blocks_used = DIV_ROUND_UP(cluster_index, refcount_block_clusters);
-
-    if (blocks_used > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
-        return -EFBIG;
-    }
+    uint64_t blocks_used = (s->free_cluster_index +
+        refcount_block_clusters - 1) / refcount_block_clusters;

    /* And now we need at least one block more for the new metadata */
    uint64_t table_size = next_refcount_table_size(s, blocks_used + 1);
@@ -345,6 +336,8 @@ static int alloc_refcount_block(BlockDriverState *bs,
    uint16_t *new_blocks = g_malloc0(blocks_clusters * s->cluster_size);
    uint64_t *new_table = g_malloc0(table_size * sizeof(uint64_t));

+    assert(meta_offset >= (s->free_cluster_index * s->cluster_size));
+
    /* Fill the new refcount table */
    memcpy(new_table, s->refcount_table,
        s->refcount_table_size * sizeof(uint64_t));
@@ -407,19 +400,18 @@ static int alloc_refcount_block(BlockDriverState *bs,
    s->refcount_table_size = table_size;
    s->refcount_table_offset = table_offset;

-    /* Free old table. */
+    /* Free old table. Remember, we must not change free_cluster_index */
+    uint64_t old_free_cluster_index = s->free_cluster_index;
    qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
                        QCOW2_DISCARD_OTHER);
+    s->free_cluster_index = old_free_cluster_index;

    ret = load_refcount_block(bs, new_block, (void**) refcount_block);
    if (ret < 0) {
        return ret;
    }

-    /* If we were trying to do the initial refcount update for some cluster
-     * allocation, we might have used the same clusters to store newly
-     * allocated metadata. Make the caller search some new space. */
-    return -EAGAIN;
+    return 0;

 fail_table:
    g_free(new_table);
@@ -523,8 +515,8 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
            s->l2_table_cache);
    }

-    start = offset & ~(s->cluster_size - 1);
-    last = (offset + length - 1) & ~(s->cluster_size - 1);
+    start = start_of_cluster(s, offset);
+    last = start_of_cluster(s, offset + length - 1);
    for(cluster_offset = start; cluster_offset <= last;
        cluster_offset += s->cluster_size)
    {
@@ -634,16 +626,15 @@ int qcow2_update_cluster_refcount(BlockDriverState *bs,


 /* return < 0 if error */
-static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size)
+static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size)
 {
    BDRVQcowState *s = bs->opaque;
-    uint64_t i, nb_clusters;
-    int refcount;
+    int i, nb_clusters, refcount;

    nb_clusters = size_to_clusters(s, size);
 retry:
    for(i = 0; i < nb_clusters; i++) {
-        uint64_t next_cluster_index = s->free_cluster_index++;
+        int64_t next_cluster_index = s->free_cluster_index++;
        refcount = get_refcount(bs, next_cluster_index);

        if (refcount < 0) {
@@ -660,21 +651,18 @@ retry:
    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
 }

-int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
+int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size)
 {
    int64_t offset;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
-    do {
-        offset = alloc_clusters_noref(bs, size);
-        if (offset < 0) {
-            return offset;
-        }
-
-        ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER);
-    } while (ret == -EAGAIN);
+    offset = alloc_clusters_noref(bs, size);
+    if (offset < 0) {
+        return offset;
+    }

+    ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER);
    if (ret < 0) {
        return ret;
    }
@@ -687,36 +675,33 @@ int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
 {
    BDRVQcowState *s = bs->opaque;
    uint64_t cluster_index;
-    uint64_t i;
-    int refcount, ret;
+    uint64_t old_free_cluster_index;
+    int i, refcount, ret;

-    assert(nb_clusters >= 0);
-    if (nb_clusters == 0) {
-        return 0;
+    /* Check how many clusters there are free */
+    cluster_index = offset >> s->cluster_bits;
+    for(i = 0; i < nb_clusters; i++) {
+        refcount = get_refcount(bs, cluster_index++);
+
+        if (refcount < 0) {
+            return refcount;
+        } else if (refcount != 0) {
+            break;
+        }
    }

-    do {
-        /* Check how many clusters there are free */
-        cluster_index = offset >> s->cluster_bits;
-        for(i = 0; i < nb_clusters; i++) {
-            refcount = get_refcount(bs, cluster_index++);
-
-            if (refcount < 0) {
-                return refcount;
-            } else if (refcount != 0) {
-                break;
-            }
-        }
-
-        /* And then allocate them */
-        ret = update_refcount(bs, offset, i << s->cluster_bits, 1,
-                              QCOW2_DISCARD_NEVER);
-    } while (ret == -EAGAIN);
+    /* And then allocate them */
+    old_free_cluster_index = s->free_cluster_index;
+    s->free_cluster_index = cluster_index + i;

+    ret = update_refcount(bs, offset, i << s->cluster_bits, 1,
+                          QCOW2_DISCARD_NEVER);
    if (ret < 0) {
        return ret;
    }

+    s->free_cluster_index = old_free_cluster_index;
+
    return i;
 }

@@ -739,7 +724,7 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
    }
 redo:
    free_in_cluster = s->cluster_size -
-        (s->free_byte_offset & (s->cluster_size - 1));
+        offset_into_cluster(s, s->free_byte_offset);
    if (size <= free_in_cluster) {
        /* enough space in current cluster */
        offset = s->free_byte_offset;
@@ -747,7 +732,7 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
        free_in_cluster -= size;
        if (free_in_cluster == 0)
            s->free_byte_offset = 0;
-        if ((offset & (s->cluster_size - 1)) != 0)
+        if (offset_into_cluster(s, offset) != 0)
            qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
                                          QCOW2_DISCARD_NEVER);
    } else {
@@ -755,7 +740,7 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
        if (offset < 0) {
            return offset;
        }
-        cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1);
+        cluster_offset = start_of_cluster(s, s->free_byte_offset);
        if ((cluster_offset + s->cluster_size) == offset) {
            /* we are lucky: contiguous data */
            offset = s->free_byte_offset;
@@ -1019,17 +1004,22 @@ static void inc_refcounts(BlockDriverState *bs,
                          int64_t offset, int64_t size)
 {
    BDRVQcowState *s = bs->opaque;
-    uint64_t start, last, cluster_offset, k;
+    int64_t start, last, cluster_offset;
+    int k;

    if (size <= 0)
        return;

-    start = offset & ~(s->cluster_size - 1);
-    last = (offset + size - 1) & ~(s->cluster_size - 1);
+    start = start_of_cluster(s, offset);
+    last = start_of_cluster(s, offset + size - 1);
    for(cluster_offset = start; cluster_offset <= last;
        cluster_offset += s->cluster_size) {
        k = cluster_offset >> s->cluster_bits;
-        if (k >= refcount_table_size) {
+        if (k < 0) {
+            fprintf(stderr, "ERROR: invalid cluster offset=0x%" PRIx64 "\n",
+                cluster_offset);
+            res->corruptions++;
+        } else if (k >= refcount_table_size) {
            fprintf(stderr, "Warning: cluster offset=0x%" PRIx64 " is after "
                "the end of the image file, can't properly check refcounts.\n",
                cluster_offset);
@@ -1132,7 +1122,7 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
                offset, s->cluster_size);

            /* Correct offsets are cluster aligned */
-            if (offset & (s->cluster_size - 1)) {
+            if (offset_into_cluster(s, offset)) {
                fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
                    "properly aligned; L2 entry corrupted.\n", offset);
                res->corruptions++;
@@ -1204,7 +1194,7 @@ static int check_refcounts_l1(BlockDriverState *bs,
                l2_offset, s->cluster_size);

            /* L2 tables are cluster aligned */
-            if (l2_offset & (s->cluster_size - 1)) {
+            if (offset_into_cluster(s, l2_offset)) {
                fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
                    "cluster aligned; L1 entry corrupted\n", l2_offset);
                res->corruptions++;
@@ -1433,7 +1423,7 @@ static int64_t realloc_refcount_block(BlockDriverState *bs, int reftable_index,
    }

    /* update refcount table */
-    assert(!(new_offset & (s->cluster_size - 1)));
+    assert(!offset_into_cluster(s, new_offset));
    s->refcount_table[reftable_index] = new_offset;
    ret = write_reftable_entry(bs, reftable_index);
    if (ret < 0) {
@@ -1470,19 +1460,14 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
                          BdrvCheckMode fix)
 {
    BDRVQcowState *s = bs->opaque;
-    int64_t size, i, highest_cluster, nb_clusters;
-    int refcount1, refcount2;
+    int64_t size, i, highest_cluster;
+    int nb_clusters, refcount1, refcount2;
    QCowSnapshot *sn;
    uint16_t *refcount_table;
    int ret;

    size = bdrv_getlength(bs->file);
    nb_clusters = size_to_clusters(s, size);
-    if (nb_clusters > INT_MAX) {
-        res->check_errors++;
-        return -EFBIG;
-    }
-
    refcount_table = g_malloc0(nb_clusters * sizeof(uint16_t));

    res->bfi.total_clusters =
@@ -1522,7 +1507,7 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
        cluster = offset >> s->cluster_bits;

        /* Refcount blocks are cluster aligned */
-        if (offset & (s->cluster_size - 1)) {
+        if (offset_into_cluster(s, offset)) {
            fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
                "cluster aligned; refcount table entry corrupted\n", i);
            res->corruptions++;
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -26,6 +26,31 @@
 #include "block/block_int.h"
 #include "block/qcow2.h"

+typedef struct QEMU_PACKED QCowSnapshotHeader {
+    /* header is 8 byte aligned */
+    uint64_t l1_table_offset;
+
+    uint32_t l1_size;
+    uint16_t id_str_size;
+    uint16_t name_size;
+
+    uint32_t date_sec;
+    uint32_t date_nsec;
+
+    uint64_t vm_clock_nsec;
+
+    uint32_t vm_state_size;
+    uint32_t extra_data_size; /* for extension */
+    /* extra data follows */
+    /* id_str follows */
+    /* name follows  */
+} QCowSnapshotHeader;
+
+typedef struct QEMU_PACKED QCowSnapshotExtraData {
+    uint64_t vm_state_size_large;
+    uint64_t disk_size;
+} QCowSnapshotExtraData;
+
 void qcow2_free_snapshots(BlockDriverState *bs)
 {
    BDRVQcowState *s = bs->opaque;
@@ -332,10 +357,6 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
    uint64_t *l1_table = NULL;
    int64_t l1_table_offset;

-    if (s->nb_snapshots >= QCOW_MAX_SNAPSHOTS) {
-        return -EFBIG;
-    }
-
    memset(sn, 0, sizeof(*sn));

    /* Generate an ID if it wasn't passed */
@@ -654,7 +675,10 @@ int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
    return s->nb_snapshots;
 }

-int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name)
+int qcow2_snapshot_load_tmp(BlockDriverState *bs,
+                            const char *snapshot_id,
+                            const char *name,
+                            Error **errp)
 {
    int i, snapshot_index;
    BDRVQcowState *s = bs->opaque;
@@ -666,22 +690,21 @@ int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name)
    assert(bs->read_only);

    /* Search the snapshot */
-    snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_name);
+    snapshot_index = find_snapshot_by_id_and_name(bs, snapshot_id, name);
    if (snapshot_index < 0) {
+        error_setg(errp,
+                   "Can't find snapshot");
        return -ENOENT;
    }
    sn = &s->snapshots[snapshot_index];

    /* Allocate and read in the snapshot's L1 table */
-    if (sn->l1_size > QCOW_MAX_L1_SIZE) {
-        error_report("Snapshot L1 table too large");
-        return -EFBIG;
-    }
-    new_l1_bytes = sn->l1_size * sizeof(uint64_t);
+    new_l1_bytes = s->l1_size * sizeof(uint64_t);
    new_l1_table = g_malloc0(align_offset(new_l1_bytes, 512));

    ret = bdrv_pread(bs->file, sn->l1_table_offset, new_l1_table, new_l1_bytes);
    if (ret < 0) {
+        error_setg(errp, "Failed to read l1 table for snapshot");
        g_free(new_l1_table);
        return ret;
    }
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -269,15 +269,12 @@ static int qcow2_mark_clean(BlockDriverState *bs)
    BDRVQcowState *s = bs->opaque;

    if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
-        int ret;
-
-        s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY;
-
-        ret = bdrv_flush(bs);
+        int ret = bdrv_flush(bs);
        if (ret < 0) {
            return ret;
        }

+        s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY;
        return qcow2_update_header(bs);
    }
    return 0;
@@ -332,32 +329,6 @@ static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result,
    return ret;
 }

-static int validate_table_offset(BlockDriverState *bs, uint64_t offset,
-                                 uint64_t entries, size_t entry_len)
-{
-    BDRVQcowState *s = bs->opaque;
-    uint64_t size;
-
-    /* Use signed INT64_MAX as the maximum even for uint64_t header fields,
-     * because values will be passed to qemu functions taking int64_t. */
-    if (entries > INT64_MAX / entry_len) {
-        return -EINVAL;
-    }
-
-    size = entries * entry_len;
-
-    if (INT64_MAX - size < offset) {
-        return -EINVAL;
-    }
-
-    /* Tables must be cluster aligned */
-    if (offset & (s->cluster_size - 1)) {
-        return -EINVAL;
-    }
-
-    return 0;
-}
-
 static QemuOptsList qcow2_runtime_opts = {
    .name = "qcow2",
    .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head),
@@ -448,8 +419,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
                      Error **errp)
 {
    BDRVQcowState *s = bs->opaque;
-    unsigned int len, i;
-    int ret = 0;
+    int len, i, ret = 0;
    QCowHeader header;
    QemuOpts *opts;
    Error *local_err = NULL;
@@ -490,18 +460,6 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,

    s->qcow_version = header.version;

-    /* Initialise cluster size */
-    if (header.cluster_bits < MIN_CLUSTER_BITS ||
-        header.cluster_bits > MAX_CLUSTER_BITS) {
-        error_setg(errp, "Unsupported cluster size: 2^%i", header.cluster_bits);
-        ret = -EINVAL;
-        goto fail;
-    }
-
-    s->cluster_bits = header.cluster_bits;
-    s->cluster_size = 1 << s->cluster_bits;
-    s->cluster_sectors = 1 << (s->cluster_bits - 9);
-
    /* Initialise version 3 header fields */
    if (header.version == 2) {
        header.incompatible_features    = 0;
@@ -515,18 +473,6 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
        be64_to_cpus(&header.autoclear_features);
        be32_to_cpus(&header.refcount_order);
        be32_to_cpus(&header.header_length);
-
-        if (header.header_length < 104) {
-            error_setg(errp, "qcow2 header too short");
-            ret = -EINVAL;
-            goto fail;
-        }
-    }
-
-    if (header.header_length > s->cluster_size) {
-        error_setg(errp, "qcow2 header exceeds cluster size");
-        ret = -EINVAL;
-        goto fail;
    }

    if (header.header_length > sizeof(header)) {
@@ -541,12 +487,6 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
        }
    }

-    if (header.backing_file_offset > s->cluster_size) {
-        error_setg(errp, "Invalid backing file offset");
-        ret = -EINVAL;
-        goto fail;
-    }
-
    if (header.backing_file_offset) {
        ext_end = header.backing_file_offset;
    } else {
@@ -589,6 +529,12 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
    }
    s->refcount_order = header.refcount_order;

+    if (header.cluster_bits < MIN_CLUSTER_BITS ||
+        header.cluster_bits > MAX_CLUSTER_BITS) {
+        error_setg(errp, "Unsupported cluster size: 2^%i", header.cluster_bits);
+        ret = -EINVAL;
+        goto fail;
+    }
    if (header.crypt_method > QCOW_CRYPT_AES) {
        error_setg(errp, "Unsupported encryption method: %i",
                   header.crypt_method);
@@ -599,52 +545,23 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
    if (s->crypt_method_header) {
        bs->encrypted = 1;
    }
-
+    s->cluster_bits = header.cluster_bits;
+    s->cluster_size = 1 << s->cluster_bits;
+    s->cluster_sectors = 1 << (s->cluster_bits - 9);
    s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
    s->l2_size = 1 << s->l2_bits;
    bs->total_sectors = header.size / 512;
    s->csize_shift = (62 - (s->cluster_bits - 8));
    s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
    s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
-
    s->refcount_table_offset = header.refcount_table_offset;
    s->refcount_table_size =
        header.refcount_table_clusters << (s->cluster_bits - 3);

-    if (header.refcount_table_clusters > qcow2_max_refcount_clusters(s)) {
-        error_setg(errp, "Reference count table too large");
-        ret = -EINVAL;
-        goto fail;
-    }
-
-    ret = validate_table_offset(bs, s->refcount_table_offset,
-                                s->refcount_table_size, sizeof(uint64_t));
-    if (ret < 0) {
-        error_setg(errp, "Invalid reference count table offset");
-        goto fail;
-    }
-
-    /* Snapshot table offset/length */
-    if (header.nb_snapshots > QCOW_MAX_SNAPSHOTS) {
-        error_setg(errp, "Too many snapshots");
-        ret = -EINVAL;
-        goto fail;
-    }
-
-    ret = validate_table_offset(bs, header.snapshots_offset,
-                                header.nb_snapshots,
-                                sizeof(QCowSnapshotHeader));
-    if (ret < 0) {
-        error_setg(errp, "Invalid snapshot table offset");
-        goto fail;
-    }
+    s->snapshots_offset = header.snapshots_offset;
+    s->nb_snapshots = header.nb_snapshots;

    /* read the level 1 table */
-    if (header.l1_size > QCOW_MAX_L1_SIZE) {
-        error_setg(errp, "Active L1 table too large");
-        ret = -EFBIG;
-        goto fail;
-    }
    s->l1_size = header.l1_size;

    l1_vm_state_index = size_to_l1(s, header.size);
@@ -662,16 +579,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
        ret = -EINVAL;
        goto fail;
    }
-
-    ret = validate_table_offset(bs, header.l1_table_offset,
-                                header.l1_size, sizeof(uint64_t));
-    if (ret < 0) {
-        error_setg(errp, "Invalid L1 table offset");
-        goto fail;
-    }
    s->l1_table_offset = header.l1_table_offset;
-
-
    if (s->l1_size > 0) {
        s->l1_table = g_malloc0(
            align_offset(s->l1_size * sizeof(uint64_t), 512));
@@ -717,10 +625,8 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
    /* read the backing file name */
    if (header.backing_file_offset != 0) {
        len = header.backing_file_size;
-        if (len > MIN(1023, s->cluster_size - header.backing_file_offset)) {
-            error_setg(errp, "Backing file name too long");
-            ret = -EINVAL;
-            goto fail;
+        if (len > 1023) {
+            len = 1023;
        }
        ret = bdrv_pread(bs->file, header.backing_file_offset,
                         bs->backing_file, len);
@@ -731,10 +637,6 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
        bs->backing_file[len] = '\0';
    }

-    /* Internal snapshots */
-    s->snapshots_offset = header.snapshots_offset;
-    s->nb_snapshots = header.nb_snapshots;
-
    ret = qcow2_read_snapshots(bs);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read snapshots");
@@ -816,6 +718,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
    }

    qemu_opts_del(opts);
+    bs->bl.write_zeroes_alignment = s->cluster_sectors;

    if (s->use_lazy_refcounts && s->qcow_version < 3) {
        error_setg(errp, "Lazy refcounts require a qcow2 image with at least "
@@ -890,25 +793,11 @@ static int qcow2_set_key(BlockDriverState *bs, const char *key)
    return 0;
 }

-/* We have no actual commit/abort logic for qcow2, but we need to write out any
- * unwritten data if we reopen read-only. */
+/* We have nothing to do for QCOW2 reopen, stubs just return
+ * success */
 static int qcow2_reopen_prepare(BDRVReopenState *state,
                                BlockReopenQueue *queue, Error **errp)
 {
-    int ret;
-
-    if ((state->flags & BDRV_O_RDWR) == 0) {
-        ret = bdrv_flush(state->bs);
-        if (ret < 0) {
-            return ret;
-        }
-
-        ret = qcow2_mark_clean(state->bs);
-        if (ret < 0) {
-            return ret;
-        }
-    }
-
    return 0;
 }

@@ -1584,7 +1473,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
     */
    BlockDriverState* bs;
    QCowHeader *header;
-    uint64_t* refcount_table;
+    uint8_t* refcount_table;
    Error *local_err = NULL;
    int ret;

@@ -1634,10 +1523,9 @@ static int qcow2_create2(const char *filename, int64_t total_size,
        goto out;
    }

-    /* Write a refcount table with one refcount block */
-    refcount_table = g_malloc0(2 * cluster_size);
-    refcount_table[0] = cpu_to_be64(2 * cluster_size);
-    ret = bdrv_pwrite(bs, cluster_size, refcount_table, 2 * cluster_size);
+    /* Write an empty refcount table */
+    refcount_table = g_malloc0(cluster_size);
+    ret = bdrv_pwrite(bs, cluster_size, refcount_table, cluster_size);
    g_free(refcount_table);

    if (ret < 0) {
@@ -1661,7 +1549,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
        goto out;
    }

-    ret = qcow2_alloc_clusters(bs, 3 * cluster_size);
+    ret = qcow2_alloc_clusters(bs, 2 * cluster_size);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not allocate clusters for qcow2 "
                         "header and refcount table");
@@ -1814,7 +1702,7 @@ static int qcow2_make_empty(BlockDriverState *bs)
 }

 static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors)
+    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
 {
    int ret;
    BDRVQcowState *s = bs->opaque;
@@ -2010,6 +1898,8 @@ static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
 static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
 {
    BDRVQcowState *s = bs->opaque;
+    bdi->unallocated_blocks_are_zero = true;
+    bdi->can_write_zeroes_with_unmap = (s->qcow_version >= 3);
    bdi->cluster_size = s->cluster_size;
    bdi->vm_state_offset = qcow2_vm_state_offset(s);
    return 0;
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -38,15 +38,6 @@
 #define QCOW_CRYPT_AES  1

 #define QCOW_MAX_CRYPT_CLUSTERS 32
-#define QCOW_MAX_SNAPSHOTS 65536
-
-/* 8 MB refcount table is enough for 2 PB images at 64k cluster size
- * (128 GB for 512 byte clusters, 2 EB for 2 MB clusters) */
-#define QCOW_MAX_REFTABLE_SIZE 0x800000
-
-/* 32 MB L1 table is enough for 2 PB images at 64k cluster size
- * (128 GB for 512 byte clusters, 2 EB for 2 MB clusters) */
-#define QCOW_MAX_L1_SIZE 0x2000000

 /* indicate that the refcount of the referenced cluster is exactly one. */
 #define QCOW_OFLAG_COPIED     (1ULL << 63)
@@ -106,32 +97,6 @@ typedef struct QCowHeader {
    uint32_t header_length;
 } QEMU_PACKED QCowHeader;

-typedef struct QEMU_PACKED QCowSnapshotHeader {
-    /* header is 8 byte aligned */
-    uint64_t l1_table_offset;
-
-    uint32_t l1_size;
-    uint16_t id_str_size;
-    uint16_t name_size;
-
-    uint32_t date_sec;
-    uint32_t date_nsec;
-
-    uint64_t vm_clock_nsec;
-
-    uint32_t vm_state_size;
-    uint32_t extra_data_size; /* for extension */
-    /* extra data follows */
-    /* id_str follows */
-    /* name follows  */
-} QCowSnapshotHeader;
-
-typedef struct QEMU_PACKED QCowSnapshotExtraData {
-    uint64_t vm_state_size_large;
-    uint64_t disk_size;
-} QCowSnapshotExtraData;
-
-
 typedef struct QCowSnapshot {
    uint64_t l1_table_offset;
    uint32_t l1_size;
@@ -226,8 +191,8 @@ typedef struct BDRVQcowState {
    uint64_t *refcount_table;
    uint64_t refcount_table_offset;
    uint32_t refcount_table_size;
-    uint64_t free_cluster_index;
-    uint64_t free_byte_offset;
+    int64_t free_cluster_index;
+    int64_t free_byte_offset;

    CoMutex lock;

@@ -237,7 +202,7 @@ typedef struct BDRVQcowState {
    AES_KEY aes_decrypt_key;
    uint64_t snapshots_offset;
    int snapshots_size;
-    unsigned int nb_snapshots;
+    int nb_snapshots;
    QCowSnapshot *snapshots;

    int flags;
@@ -418,11 +383,6 @@ static inline int64_t qcow2_vm_state_offset(BDRVQcowState *s)
    return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits);
 }

-static inline uint64_t qcow2_max_refcount_clusters(BDRVQcowState *s)
-{
-    return QCOW_MAX_REFTABLE_SIZE >> s->cluster_bits;
-}
-
 static inline int qcow2_get_cluster_type(uint64_t l2_entry)
 {
    if (l2_entry & QCOW_OFLAG_COMPRESSED) {
@@ -471,7 +431,7 @@ void qcow2_refcount_close(BlockDriverState *bs);
 int qcow2_update_cluster_refcount(BlockDriverState *bs, int64_t cluster_index,
                                  int addend, enum qcow2_discard_type type);

-int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size);
+int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size);
 int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
    int nb_clusters);
 int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size);
@@ -528,7 +488,10 @@ int qcow2_snapshot_delete(BlockDriverState *bs,
                          const char *name,
                          Error **errp);
 int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab);
-int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name);
+int qcow2_snapshot_load_tmp(BlockDriverState *bs,
+                            const char *snapshot_id,
+                            const char *name,
+                            Error **errp);

 void qcow2_free_snapshots(BlockDriverState *bs);
 int qcow2_read_snapshots(BlockDriverState *bs);
--- a/block/qed.c
+++ b/block/qed.c
@@ -495,6 +495,7 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
        }
    }

+    bs->bl.write_zeroes_alignment = s->header.cluster_size >> BDRV_SECTOR_BITS;
    s->need_check_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                            qed_need_check_timer_cb, s);

@@ -1397,7 +1398,8 @@ static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret)

 static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs,
                                                 int64_t sector_num,
-                                                 int nb_sectors)
+                                                 int nb_sectors,
+                                                 BdrvRequestFlags flags)
 {
    BlockDriverAIOCB *blockacb;
    BDRVQEDState *s = bs->opaque;
@@ -1474,6 +1476,8 @@ static int bdrv_qed_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
    memset(bdi, 0, sizeof(*bdi));
    bdi->cluster_size = s->header.cluster_size;
    bdi->is_dirty = s->header.features & QED_F_NEED_CHECK;
+    bdi->unallocated_blocks_are_zero = true;
+    bdi->can_write_zeroes_with_unmap = true;
    return 0;
 }

--- a/block/raw-aio.h
+++ b/block/raw-aio.h
@@ -21,9 +21,10 @@
 #define QEMU_AIO_IOCTL        0x0004
 #define QEMU_AIO_FLUSH        0x0008
 #define QEMU_AIO_DISCARD      0x0010
+#define QEMU_AIO_WRITE_ZEROES 0x0020
 #define QEMU_AIO_TYPE_MASK \
        (QEMU_AIO_READ|QEMU_AIO_WRITE|QEMU_AIO_IOCTL|QEMU_AIO_FLUSH| \
-         QEMU_AIO_DISCARD)
+         QEMU_AIO_DISCARD|QEMU_AIO_WRITE_ZEROES)

 /* AIO flags */
 #define QEMU_AIO_MISALIGNED   0x1000
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -139,9 +139,11 @@ typedef struct BDRVRawState {
    void *aio_ctx;
 #endif
 #ifdef CONFIG_XFS
-    bool is_xfs : 1;
+    bool is_xfs:1;
 #endif
-    bool has_discard : 1;
+    bool has_discard:1;
+    bool has_write_zeroes:1;
+    bool discard_zeroes:1;
 } BDRVRawState;

 typedef struct BDRVRawReopenState {
@@ -283,6 +285,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
    Error *local_err = NULL;
    const char *filename;
    int fd, ret;
+    struct stat st;

    opts = qemu_opts_create_nofail(&raw_runtime_opts);
    qemu_opts_absorb_qdict(opts, options, &local_err);
@@ -323,10 +326,38 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
    }
 #endif

-    s->has_discard = 1;
+    s->has_discard = true;
+    s->has_write_zeroes = true;
+
+    if (fstat(s->fd, &st) < 0) {
+        error_setg_errno(errp, errno, "Could not stat file");
+        goto fail;
+    }
+    if (S_ISREG(st.st_mode)) {
+        s->discard_zeroes = true;
+    }
+    if (S_ISBLK(st.st_mode)) {
+#ifdef BLKDISCARDZEROES
+        unsigned int arg;
+        if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) {
+            s->discard_zeroes = true;
+        }
+#endif
+#ifdef __linux__
+        /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache.  Do
+         * not rely on the contents of discarded blocks unless using O_DIRECT.
+         * Same for BLKZEROOUT.
+         */
+        if (!(bs->open_flags & BDRV_O_NOCACHE)) {
+            s->discard_zeroes = false;
+            s->has_write_zeroes = false;
+        }
+#endif
+    }
+
 #ifdef CONFIG_XFS
    if (platform_test_xfs_fd(s->fd)) {
-        s->is_xfs = 1;
+        s->is_xfs = true;
    }
 #endif

@@ -675,6 +706,23 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb)
 }

 #ifdef CONFIG_XFS
+static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes)
+{
+    struct xfs_flock64 fl;
+
+    memset(&fl, 0, sizeof(fl));
+    fl.l_whence = SEEK_SET;
+    fl.l_start = offset;
+    fl.l_len = bytes;
+
+    if (xfsctl(NULL, s->fd, XFS_IOC_ZERO_RANGE, &fl) < 0) {
+        DEBUG_BLOCK_PRINT("cannot write zero range (%s)\n", strerror(errno));
+        return -errno;
+    }
+
+    return 0;
+}
+
 static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes)
 {
    struct xfs_flock64 fl;
@@ -693,13 +741,49 @@ static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes)
 }
 #endif

+static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb)
+{
+    int ret = -EOPNOTSUPP;
+    BDRVRawState *s = aiocb->bs->opaque;
+
+    if (s->has_write_zeroes == 0) {
+        return -ENOTSUP;
+    }
+
+    if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
+#ifdef BLKZEROOUT
+        do {
+            uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
+            if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
+                return 0;
+            }
+        } while (errno == EINTR);
+
+        ret = -errno;
+#endif
+    } else {
+#ifdef CONFIG_XFS
+        if (s->is_xfs) {
+            return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes);
+        }
+#endif
+    }
+
+    if (ret == -ENODEV || ret == -ENOSYS || ret == -EOPNOTSUPP ||
+        ret == -ENOTTY) {
+        s->has_write_zeroes = false;
+        ret = -ENOTSUP;
+    }
+    return ret;
+}
+
 static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb)
 {
    int ret = -EOPNOTSUPP;
    BDRVRawState *s = aiocb->bs->opaque;

-    if (s->has_discard == 0) {
-        return 0;
+    if (!s->has_discard) {
+        return -ENOTSUP;
    }

    if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
@@ -734,8 +818,8 @@ static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb)

    if (ret == -ENODEV || ret == -ENOSYS || ret == -EOPNOTSUPP ||
        ret == -ENOTTY) {
-        s->has_discard = 0;
-        ret = 0;
+        s->has_discard = false;
+        ret = -ENOTSUP;
    }
    return ret;
 }
@@ -777,6 +861,9 @@ static int aio_worker(void *arg)
    case QEMU_AIO_DISCARD:
        ret = handle_aiocb_discard(aiocb);
        break;
+    case QEMU_AIO_WRITE_ZEROES:
+        ret = handle_aiocb_write_zeroes(aiocb);
+        break;
    default:
        fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
        ret = -EINVAL;
@@ -787,6 +874,29 @@ static int aio_worker(void *arg)
    return ret;
 }

+static int paio_submit_co(BlockDriverState *bs, int fd,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        int type)
+{
+    RawPosixAIOData *acb = g_slice_new(RawPosixAIOData);
+    ThreadPool *pool;
+
+    acb->bs = bs;
+    acb->aio_type = type;
+    acb->aio_fildes = fd;
+
+    if (qiov) {
+        acb->aio_iov = qiov->iov;
+        acb->aio_niov = qiov->niov;
+    }
+    acb->aio_nbytes = nb_sectors * 512;
+    acb->aio_offset = sector_num * 512;
+
+    trace_paio_submit_co(sector_num, nb_sectors, type);
+    pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
+    return thread_pool_submit_co(pool, aio_worker, acb);
+}
+
 static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque, int type)
@@ -1199,6 +1309,31 @@ static coroutine_fn BlockDriverAIOCB *raw_aio_discard(BlockDriverState *bs,
                       cb, opaque, QEMU_AIO_DISCARD);
 }

+static int coroutine_fn raw_co_write_zeroes(
+    BlockDriverState *bs, int64_t sector_num,
+    int nb_sectors, BdrvRequestFlags flags)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
+        return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
+                              QEMU_AIO_WRITE_ZEROES);
+    } else if (s->discard_zeroes) {
+        return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
+                              QEMU_AIO_DISCARD);
+    }
+    return -ENOTSUP;
+}
+
+static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+    BDRVRawState *s = bs->opaque;
+
+    bdi->unallocated_blocks_are_zero = s->discard_zeroes;
+    bdi->can_write_zeroes_with_unmap = s->discard_zeroes;
+    return 0;
+}
+
 static QEMUOptionParameter raw_create_options[] = {
    {
        .name = BLOCK_OPT_SIZE,
@@ -1222,6 +1357,7 @@ static BlockDriver bdrv_file = {
    .bdrv_create = raw_create,
    .bdrv_has_zero_init = bdrv_has_zero_init_1,
    .bdrv_co_get_block_status = raw_co_get_block_status,
+    .bdrv_co_write_zeroes = raw_co_write_zeroes,

    .bdrv_aio_readv = raw_aio_readv,
    .bdrv_aio_writev = raw_aio_writev,
@@ -1230,6 +1366,7 @@ static BlockDriver bdrv_file = {

    .bdrv_truncate = raw_truncate,
    .bdrv_getlength = raw_getlength,
+    .bdrv_get_info = raw_get_info,
    .bdrv_get_allocated_file_size
                        = raw_get_allocated_file_size,

@@ -1525,6 +1662,26 @@ static coroutine_fn BlockDriverAIOCB *hdev_aio_discard(BlockDriverState *bs,
                       cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
 }

+static coroutine_fn int hdev_co_write_zeroes(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
+{
+    BDRVRawState *s = bs->opaque;
+    int rc;
+
+    rc = fd_open(bs);
+    if (rc < 0) {
+        return rc;
+    }
+    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
+        return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
+                              QEMU_AIO_WRITE_ZEROES|QEMU_AIO_BLKDEV);
+    } else if (s->discard_zeroes) {
+        return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
+                              QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
+    }
+    return -ENOTSUP;
+}
+
 static int hdev_create(const char *filename, QEMUOptionParameter *options,
                       Error **errp)
 {
@@ -1577,6 +1734,7 @@ static BlockDriver bdrv_host_device = {
    .bdrv_reopen_abort   = raw_reopen_abort,
    .bdrv_create        = hdev_create,
    .create_options     = raw_create_options,
+    .bdrv_co_write_zeroes = hdev_co_write_zeroes,

    .bdrv_aio_readv	= raw_aio_readv,
    .bdrv_aio_writev	= raw_aio_writev,
@@ -1585,6 +1743,7 @@ static BlockDriver bdrv_host_device = {

    .bdrv_truncate      = raw_truncate,
    .bdrv_getlength	= raw_getlength,
+    .bdrv_get_info = raw_get_info,
    .bdrv_get_allocated_file_size
                        = raw_get_allocated_file_size,

--- a/block/raw_bsd.c
+++ b/block/raw_bsd.c
@@ -68,9 +68,10 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
 }

 static int coroutine_fn raw_co_write_zeroes(BlockDriverState *bs,
-                                            int64_t sector_num, int nb_sectors)
+                                            int64_t sector_num, int nb_sectors,
+                                            BdrvRequestFlags flags)
 {
-    return bdrv_co_write_zeroes(bs->file, sector_num, nb_sectors);
+    return bdrv_co_write_zeroes(bs->file, sector_num, nb_sectors, flags);
 }

 static int coroutine_fn raw_co_discard(BlockDriverState *bs,
@@ -149,6 +150,7 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
                    Error **errp)
 {
    bs->sg = bs->file->sg;
+    bs->bl = bs->file->bl;
    return 0;
 }

--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -91,6 +91,14 @@
 #define SD_NR_VDIS   (1U << 24)
 #define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22)
 #define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)
+/*
+ * For erasure coding, we use at most SD_EC_MAX_STRIP for data strips and
+ * (SD_EC_MAX_STRIP - 1) for parity strips
+ *
+ * SD_MAX_COPIES is sum of number of data strips and parity strips.
+ */
+#define SD_EC_MAX_STRIP 16
+#define SD_MAX_COPIES (SD_EC_MAX_STRIP * 2 - 1)

 #define SD_INODE_SIZE (sizeof(SheepdogInode))
 #define CURRENT_VDI_ID 0
@@ -1464,9 +1472,7 @@ out:
    return ret;
 }

-static int do_sd_create(BDRVSheepdogState *s, char *filename, int64_t vdi_size,
-                        uint32_t base_vid, uint32_t *vdi_id, int snapshot,
-                        uint8_t copy_policy)
+static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot)
 {
    SheepdogVdiReq hdr;
    SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
@@ -1483,11 +1489,11 @@ static int do_sd_create(BDRVSheepdogState *s, char *filename, int64_t vdi_size,
     * does not fit in buf?  For now, just truncate and avoid buffer overrun.
     */
    memset(buf, 0, sizeof(buf));
-    pstrcpy(buf, sizeof(buf), filename);
+    pstrcpy(buf, sizeof(buf), s->name);

    memset(&hdr, 0, sizeof(hdr));
    hdr.opcode = SD_OP_NEW_VDI;
-    hdr.vdi_id = base_vid;
+    hdr.vdi_id = s->inode.vdi_id;

    wlen = SD_MAX_VDI_LEN;

@@ -1495,8 +1501,9 @@ static int do_sd_create(BDRVSheepdogState *s, char *filename, int64_t vdi_size,
    hdr.snapid = snapshot;

    hdr.data_length = wlen;
-    hdr.vdi_size = vdi_size;
-    hdr.copy_policy = copy_policy;
+    hdr.vdi_size = s->inode.vdi_size;
+    hdr.copy_policy = s->inode.copy_policy;
+    hdr.copies = s->inode.nr_copies;

    ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);

@@ -1507,7 +1514,7 @@ static int do_sd_create(BDRVSheepdogState *s, char *filename, int64_t vdi_size,
    }

    if (rsp->result != SD_RES_SUCCESS) {
-        error_report("%s, %s", sd_strerror(rsp->result), filename);
+        error_report("%s, %s", sd_strerror(rsp->result), s->inode.name);
        return -EIO;
    }

@@ -1564,27 +1571,79 @@ out:
    return ret;
 }

+/*
+ * Sheepdog support two kinds of redundancy, full replication and erasure
+ * coding.
+ *
+ * # create a fully replicated vdi with x copies
+ * -o redundancy=x (1 <= x <= SD_MAX_COPIES)
+ *
+ * # create a erasure coded vdi with x data strips and y parity strips
+ * -o redundancy=x:y (x must be one of {2,4,8,16} and 1 <= y < SD_EC_MAX_STRIP)
+ */
+static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
+{
+    struct SheepdogInode *inode = &s->inode;
+    const char *n1, *n2;
+    long copy, parity;
+    char p[10];
+
+    pstrcpy(p, sizeof(p), opt);
+    n1 = strtok(p, ":");
+    n2 = strtok(NULL, ":");
+
+    if (!n1) {
+        return -EINVAL;
+    }
+
+    copy = strtol(n1, NULL, 10);
+    if (copy > SD_MAX_COPIES || copy < 1) {
+        return -EINVAL;
+    }
+    if (!n2) {
+        inode->copy_policy = 0;
+        inode->nr_copies = copy;
+        return 0;
+    }
+
+    if (copy != 2 && copy != 4 && copy != 8 && copy != 16) {
+        return -EINVAL;
+    }
+
+    parity = strtol(n2, NULL, 10);
+    if (parity >= SD_EC_MAX_STRIP || parity < 1) {
+        return -EINVAL;
+    }
+
+    /*
+     * 4 bits for parity and 4 bits for data.
+     * We have to compress upper data bits because it can't represent 16
+     */
+    inode->copy_policy = ((copy / 2) << 4) + parity;
+    inode->nr_copies = copy + parity;
+
+    return 0;
+}
+
 static int sd_create(const char *filename, QEMUOptionParameter *options,
                     Error **errp)
 {
    int ret = 0;
-    uint32_t vid = 0, base_vid = 0;
-    int64_t vdi_size = 0;
+    uint32_t vid = 0;
    char *backing_file = NULL;
    BDRVSheepdogState *s;
-    char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
+    char tag[SD_MAX_VDI_TAG_LEN];
    uint32_t snapid;
    bool prealloc = false;
    Error *local_err = NULL;

    s = g_malloc0(sizeof(BDRVSheepdogState));

-    memset(vdi, 0, sizeof(vdi));
    memset(tag, 0, sizeof(tag));
    if (strstr(filename, "://")) {
-        ret = sd_parse_uri(s, filename, vdi, &snapid, tag);
+        ret = sd_parse_uri(s, filename, s->name, &snapid, tag);
    } else {
-        ret = parse_vdiname(s, filename, vdi, &snapid, tag);
+        ret = parse_vdiname(s, filename, s->name, &snapid, tag);
    }
    if (ret < 0) {
        goto out;
@@ -1592,7 +1651,7 @@ static int sd_create(const char *filename, QEMUOptionParameter *options,

    while (options && options->name) {
        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
-            vdi_size = options->value.n;
+            s->inode.vdi_size = options->value.n;
        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
            backing_file = options->value.s;
        } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) {
@@ -1606,11 +1665,18 @@ static int sd_create(const char *filename, QEMUOptionParameter *options,
                ret = -EINVAL;
                goto out;
            }
+        } else if (!strcmp(options->name, BLOCK_OPT_REDUNDANCY)) {
+            if (options->value.s) {
+                ret = parse_redundancy(s, options->value.s);
+                if (ret < 0) {
+                    goto out;
+                }
+            }
        }
        options++;
    }

-    if (vdi_size > SD_MAX_VDI_SIZE) {
+    if (s->inode.vdi_size > SD_MAX_VDI_SIZE) {
        error_report("too big image size");
        ret = -EINVAL;
        goto out;
@@ -1645,12 +1711,10 @@ static int sd_create(const char *filename, QEMUOptionParameter *options,
            goto out;
        }

-        base_vid = s->inode.vdi_id;
        bdrv_unref(bs);
    }

-    /* TODO: allow users to specify copy number */
-    ret = do_sd_create(s, vdi, vdi_size, base_vid, &vid, 0, 0);
+    ret = do_sd_create(s, &vid, 0);
    if (!prealloc || ret) {
        goto out;
    }
@@ -1833,8 +1897,7 @@ static int sd_create_branch(BDRVSheepdogState *s)
     * false bail out.
     */
    deleted = sd_delete(s);
-    ret = do_sd_create(s, s->name, s->inode.vdi_size, s->inode.vdi_id, &vid,
-                       !deleted, s->inode.copy_policy);
+    ret = do_sd_create(s, &vid, !deleted);
    if (ret) {
        goto out;
    }
@@ -2082,7 +2145,6 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
    strncpy(s->inode.tag, sn_info->name, sizeof(s->inode.tag));
    /* we don't need to update entire object */
    datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
-    inode = g_malloc(datalen);

    /* refresh inode. */
    fd = connect_to_sdog(s);
@@ -2098,14 +2160,15 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
        goto cleanup;
    }

-    ret = do_sd_create(s, s->name, s->inode.vdi_size, s->inode.vdi_id, &new_vid,
-                       1, s->inode.copy_policy);
+    ret = do_sd_create(s, &new_vid, 1);
    if (ret < 0) {
        error_report("failed to create inode for snapshot. %s",
                     strerror(errno));
        goto cleanup;
    }

+    inode = (SheepdogInode *)g_malloc(datalen);
+
    ret = read_object(fd, (char *)inode, vid_to_vdi_oid(new_vid),
                      s->inode.nr_copies, datalen, 0, s->cache_flags);

@@ -2119,7 +2182,6 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
            s->inode.name, s->inode.snap_id, s->inode.vdi_id);

 cleanup:
-    g_free(inode);
    closesocket(fd);
    return ret;
 }
@@ -2407,6 +2469,22 @@ sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
    return ret;
 }

+static int64_t sd_get_allocated_file_size(BlockDriverState *bs)
+{
+    BDRVSheepdogState *s = bs->opaque;
+    SheepdogInode *inode = &s->inode;
+    unsigned long i, last = DIV_ROUND_UP(inode->vdi_size, SD_DATA_OBJ_SIZE);
+    uint64_t size = 0;
+
+    for (i = 0; i < last; i++) {
+        if (inode->data_vdi_id[i] == 0) {
+            continue;
+        }
+        size += SD_DATA_OBJ_SIZE;
+    }
+    return size;
+}
+
 static QEMUOptionParameter sd_create_options[] = {
    {
        .name = BLOCK_OPT_SIZE,
@@ -2423,6 +2501,11 @@ static QEMUOptionParameter sd_create_options[] = {
        .type = OPT_STRING,
        .help = "Preallocation mode (allowed values: off, full)"
    },
+    {
+        .name = BLOCK_OPT_REDUNDANCY,
+        .type = OPT_STRING,
+        .help = "Redundancy of the image"
+    },
    { NULL }
 };

@@ -2436,6 +2519,7 @@ static BlockDriver bdrv_sheepdog = {
    .bdrv_create    = sd_create,
    .bdrv_has_zero_init = bdrv_has_zero_init_1,
    .bdrv_getlength = sd_getlength,
+    .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
    .bdrv_truncate  = sd_truncate,

    .bdrv_co_readv  = sd_co_readv,
@@ -2465,6 +2549,7 @@ static BlockDriver bdrv_sheepdog_tcp = {
    .bdrv_create    = sd_create,
    .bdrv_has_zero_init = bdrv_has_zero_init_1,
    .bdrv_getlength = sd_getlength,
+    .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
    .bdrv_truncate  = sd_truncate,

    .bdrv_co_readv  = sd_co_readv,
@@ -2494,6 +2579,7 @@ static BlockDriver bdrv_sheepdog_unix = {
    .bdrv_create    = sd_create,
    .bdrv_has_zero_init = bdrv_has_zero_init_1,
    .bdrv_getlength = sd_getlength,
+    .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
    .bdrv_truncate  = sd_truncate,

    .bdrv_co_readv  = sd_co_readv,
--- a/block/snapshot.c
+++ b/block/snapshot.c
@@ -25,6 +25,24 @@
 #include "block/snapshot.h"
 #include "block/block_int.h"

+QemuOptsList internal_snapshot_opts = {
+    .name = "snapshot",
+    .head = QTAILQ_HEAD_INITIALIZER(internal_snapshot_opts.head),
+    .desc = {
+        {
+            .name = SNAPSHOT_OPT_ID,
+            .type = QEMU_OPT_STRING,
+            .help = "snapshot id"
+        },{
+            .name = SNAPSHOT_OPT_NAME,
+            .type = QEMU_OPT_STRING,
+            .help = "snapshot name"
+        },{
+            /* end of list */
+        }
+    },
+};
+
 int bdrv_snapshot_find(BlockDriverState *bs, QEMUSnapshotInfo *sn_info,
                       const char *name)
 {
@@ -194,7 +212,7 @@ int bdrv_snapshot_goto(BlockDriverState *bs,
 * If only @snapshot_id is specified, delete the first one with id
 * @snapshot_id.
 * If only @name is specified, delete the first one with name @name.
- * if none is specified, return -ENINVAL.
+ * if none is specified, return -EINVAL.
 *
 * Returns: 0 on success, -errno on failure. If @bs is not inserted, return
 * -ENOMEDIUM. If @snapshot_id and @name are both NULL, return -EINVAL. If @bs
@@ -265,18 +283,71 @@ int bdrv_snapshot_list(BlockDriverState *bs,
    return -ENOTSUP;
 }

+/**
+ * Temporarily load an internal snapshot by @snapshot_id and @name.
+ * @bs: block device used in the operation
+ * @snapshot_id: unique snapshot ID, or NULL
+ * @name: snapshot name, or NULL
+ * @errp: location to store error
+ *
+ * If both @snapshot_id and @name are specified, load the first one with
+ * id @snapshot_id and name @name.
+ * If only @snapshot_id is specified, load the first one with id
+ * @snapshot_id.
+ * If only @name is specified, load the first one with name @name.
+ * if none is specified, return -EINVAL.
+ *
+ * Returns: 0 on success, -errno on fail. If @bs is not inserted, return
+ * -ENOMEDIUM. If @bs is not readonly, return -EINVAL. If @bs did not support
+ * internal snapshot, return -ENOTSUP. If qemu can't find a matching @id and
+ * @name, return -ENOENT. If @errp != NULL, it will always be filled on
+ * failure.
+ */
 int bdrv_snapshot_load_tmp(BlockDriverState *bs,
-        const char *snapshot_name)
+                           const char *snapshot_id,
+                           const char *name,
+                           Error **errp)
 {
    BlockDriver *drv = bs->drv;
+
    if (!drv) {
+        error_set(errp, QERR_DEVICE_HAS_NO_MEDIUM, bdrv_get_device_name(bs));
        return -ENOMEDIUM;
    }
+    if (!snapshot_id && !name) {
+        error_setg(errp, "snapshot_id and name are both NULL");
+        return -EINVAL;
+    }
    if (!bs->read_only) {
+        error_setg(errp, "Device is not readonly");
        return -EINVAL;
    }
    if (drv->bdrv_snapshot_load_tmp) {
-        return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
+        return drv->bdrv_snapshot_load_tmp(bs, snapshot_id, name, errp);
    }
+    error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
+              drv->format_name, bdrv_get_device_name(bs),
+              "temporarily load internal snapshot");
    return -ENOTSUP;
 }
+
+int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs,
+                                         const char *id_or_name,
+                                         Error **errp)
+{
+    int ret;
+    Error *local_err = NULL;
+
+    ret = bdrv_snapshot_load_tmp(bs, id_or_name, NULL, &local_err);
+    if (ret == -ENOENT || ret == -EINVAL) {
+        error_free(local_err);
+        local_err = NULL;
+        ret = bdrv_snapshot_load_tmp(bs, NULL, id_or_name, &local_err);
+    }
+
+    if (error_is_set(&local_err)) {
+        error_propagate(errp, local_err);
+    }
+
+    return ret;
+}
--- a/block/stream.c
+++ b/block/stream.c
@@ -88,6 +88,11 @@ static void coroutine_fn stream_run(void *opaque)
    int n = 0;
    void *buf;

+    if (!bs->backing_hd) {
+        block_job_completed(&s->common, 0);
+        return;
+    }
+
    s->common.len = bdrv_getlength(bs);
    if (s->common.len < 0) {
        block_job_completed(&s->common, s->common.len);
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -120,11 +120,6 @@ typedef unsigned char uuid_t[16];

 #define VDI_IS_ALLOCATED(X) ((X) < VDI_DISCARDED)

-/* max blocks in image is (0xffffffff / 4) */
-#define VDI_BLOCKS_IN_IMAGE_MAX  0x3fffffff
-#define VDI_DISK_SIZE_MAX        ((uint64_t)VDI_BLOCKS_IN_IMAGE_MAX * \
-                                  (uint64_t)DEFAULT_CLUSTER_SIZE)
-
 #if !defined(CONFIG_UUID)
 static inline void uuid_generate(uuid_t out)
 {
@@ -336,6 +331,7 @@ static int vdi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
    logout("\n");
    bdi->cluster_size = s->block_size;
    bdi->vm_state_offset = 0;
+    bdi->unallocated_blocks_are_zero = true;
    return 0;
 }

@@ -389,13 +385,6 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags,
    vdi_header_print(&header);
 #endif

-    if (header.disk_size > VDI_DISK_SIZE_MAX) {
-        logout("disk size is 0x%" PRIx64 ", max supported is 0x%" PRIx64,
-               header.disk_size, VDI_DISK_SIZE_MAX);
-        ret = -ENOTSUP;
-        goto fail;
-    }
-
    if (header.disk_size % SECTOR_SIZE != 0) {
        /* 'VBoxManage convertfromraw' can create images with odd disk sizes.
           We accept them but round the disk size to the next multiple of
@@ -428,7 +417,7 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags,
        logout("unsupported sector size %u B\n", header.sector_size);
        ret = -ENOTSUP;
        goto fail;
-    } else if (header.block_size != DEFAULT_CLUSTER_SIZE) {
+    } else if (header.block_size != 1 * MiB) {
        logout("unsupported block size %u B\n", header.block_size);
        ret = -ENOTSUP;
        goto fail;
@@ -445,11 +434,6 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags,
        logout("parent uuid != 0, unsupported\n");
        ret = -ENOTSUP;
        goto fail;
-    } else if (header.blocks_in_image > VDI_BLOCKS_IN_IMAGE_MAX) {
-        logout("too many blocks %u, max is %u)",
-               header.blocks_in_image, VDI_BLOCKS_IN_IMAGE_MAX);
-        ret = -ENOTSUP;
-        goto fail;
    }

    bs->total_sectors = header.disk_size / SECTOR_SIZE;
@@ -698,20 +682,11 @@ static int vdi_create(const char *filename, QEMUOptionParameter *options,
        options++;
    }

-    if (bytes > VDI_DISK_SIZE_MAX) {
-        result = -ENOTSUP;
-        logout("image size (size is 0x%" PRIx64
-               ", max supported is 0x%" PRIx64 ")",
-               bytes, VDI_DISK_SIZE_MAX);
-        goto exit;
-    }
-
    fd = qemu_open(filename,
                   O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
                   0644);
    if (fd < 0) {
-        result = -errno;
-        goto exit;
+        return -errno;
    }

    /* We need enough blocks to store the given disk size,
@@ -772,7 +747,6 @@ static int vdi_create(const char *filename, QEMUOptionParameter *options,
        result = -errno;
    }

-exit:
    return result;
 }

--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -785,20 +785,12 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s)
    le32_to_cpus(&s->logical_sector_size);
    le32_to_cpus(&s->physical_sector_size);

-    if (s->params.block_size < VHDX_BLOCK_SIZE_MIN ||
-        s->params.block_size > VHDX_BLOCK_SIZE_MAX) {
+    if (s->logical_sector_size == 0 || s->params.block_size == 0) {
        ret = -EINVAL;
        goto exit;
    }

-    /* only 2 supported sector sizes */
-    if (s->logical_sector_size != 512 && s->logical_sector_size != 4096) {
-        ret = -EINVAL;
-        goto exit;
-    }
-
-    /* Both block_size and sector_size are guaranteed powers of 2, below.
-       Due to range checks above, s->sectors_per_block can never be < 256 */
+    /* both block_size and sector_size are guaranteed powers of 2 */
    s->sectors_per_block = s->params.block_size / s->logical_sector_size;
    s->chunk_ratio = (VHDX_MAX_SECTORS_PER_BLOCK) *
                     (uint64_t)s->logical_sector_size /
@@ -1051,6 +1043,18 @@ static void vhdx_block_translate(BDRVVHDXState *s, int64_t sector_num,
 }


+static int vhdx_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+    BDRVVHDXState *s = bs->opaque;
+
+    bdi->cluster_size = s->block_size;
+
+    bdi->unallocated_blocks_are_zero =
+        (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) == 0;
+
+    return 0;
+}
+

 static coroutine_fn int vhdx_co_readv(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *qiov)
@@ -1893,6 +1897,7 @@ static BlockDriver bdrv_vhdx = {
    .bdrv_co_readv          = vhdx_co_readv,
    .bdrv_co_writev         = vhdx_co_writev,
    .bdrv_create            = vhdx_create,
+    .bdrv_get_info          = vhdx_get_info,

    .create_options         = vhdx_create_options,
 };
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -428,6 +428,10 @@ static int vmdk_add_extent(BlockDriverState *bs,
    extent->l2_size = l2_size;
    extent->cluster_sectors = flat ? sectors : cluster_sectors;

+    if (!flat) {
+        bs->bl.write_zeroes_alignment =
+            MAX(bs->bl.write_zeroes_alignment, cluster_sectors);
+    }
    if (s->num_extents > 1) {
        extent->end_sector = (*(extent - 1)).end_sector + extent->sectors;
    } else {
@@ -605,13 +609,20 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
        header = footer.header;
    }

-    if (le32_to_cpu(header.version) >= 3) {
+    if (le32_to_cpu(header.version) > 3) {
        char buf[64];
        snprintf(buf, sizeof(buf), "VMDK version %d",
                 le32_to_cpu(header.version));
        qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
                bs->device_name, "vmdk", buf);
        return -ENOTSUP;
+    } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) {
+        /* VMware KB 2064959 explains that version 3 added support for
+         * persistent changed block tracking (CBT), and backup software can
+         * read it as version=1 if it doesn't care about the changed area
+         * information. So we are safe to enable read only. */
+        error_setg(errp, "VMDK version 3 must be read only");
+        return -EINVAL;
    }

    if (le32_to_cpu(header.num_gtes_per_gt) > 512) {
@@ -1419,7 +1430,8 @@ static coroutine_fn int vmdk_co_write(BlockDriverState *bs, int64_t sector_num,

 static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs,
                                             int64_t sector_num,
-                                             int nb_sectors)
+                                             int nb_sectors,
+                                             BdrvRequestFlags flags)
 {
    int ret;
    BDRVVmdkState *s = bs->opaque;
@@ -1588,7 +1600,7 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options,
                       Error **errp)
 {
    int fd, idx = 0;
-    char desc[BUF_SIZE];
+    char *desc = NULL;
    int64_t total_size = 0, filesize;
    const char *adapter_type = NULL;
    const char *backing_file = NULL;
@@ -1596,7 +1608,7 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options,
    int flags = 0;
    int ret = 0;
    bool flat, split, compress;
-    char ext_desc_lines[BUF_SIZE] = "";
+    GString *ext_desc_lines;
    char path[PATH_MAX], prefix[PATH_MAX], postfix[PATH_MAX];
    const int64_t split_size = 0x80000000;  /* VMDK has constant split size */
    const char *desc_extent_line;
@@ -1624,8 +1636,11 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options,
        "ddb.geometry.sectors = \"63\"\n"
        "ddb.adapterType = \"%s\"\n";

+    ext_desc_lines = g_string_new(NULL);
+
    if (filename_decompose(filename, path, prefix, postfix, PATH_MAX, errp)) {
-        return -EINVAL;
+        ret = -EINVAL;
+        goto exit;
    }
    /* Read out options */
    while (options && options->name) {
@@ -1651,7 +1666,8 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options,
               strcmp(adapter_type, "lsilogic") &&
               strcmp(adapter_type, "legacyESX")) {
        error_setg(errp, "Unknown adapter type: '%s'", adapter_type);
-        return -EINVAL;
+        ret = -EINVAL;
+        goto exit;
    }
    if (strcmp(adapter_type, "ide") != 0) {
        /* that's the number of heads with which vmware operates when
@@ -1667,7 +1683,8 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options,
               strcmp(fmt, "twoGbMaxExtentFlat") &&
               strcmp(fmt, "streamOptimized")) {
        error_setg(errp, "Unknown subformat: '%s'", fmt);
-        return -EINVAL;
+        ret = -EINVAL;
+        goto exit;
    }
    split = !(strcmp(fmt, "twoGbMaxExtentFlat") &&
              strcmp(fmt, "twoGbMaxExtentSparse"));
@@ -1681,22 +1698,25 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options,
    }
    if (flat && backing_file) {
        error_setg(errp, "Flat image can't have backing file");
-        return -ENOTSUP;
+        ret = -ENOTSUP;
+        goto exit;
    }
    if (flat && zeroed_grain) {
        error_setg(errp, "Flat image can't enable zeroed grain");
-        return -ENOTSUP;
+        ret = -ENOTSUP;
+        goto exit;
    }
    if (backing_file) {
        BlockDriverState *bs = bdrv_new("");
        ret = bdrv_open(bs, backing_file, NULL, BDRV_O_NO_BACKING, NULL, errp);
        if (ret != 0) {
            bdrv_unref(bs);
-            return ret;
+            goto exit;
        }
        if (strcmp(bs->drv->format_name, "vmdk")) {
            bdrv_unref(bs);
-            return -EINVAL;
+            ret = -EINVAL;
+            goto exit;
        }
        parent_cid = vmdk_read_cid(bs, 0);
        bdrv_unref(bs);
@@ -1730,25 +1750,27 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options,

        if (vmdk_create_extent(ext_filename, size,
                               flat, compress, zeroed_grain)) {
-            return -EINVAL;
+            ret = -EINVAL;
+            goto exit;
        }
        filesize -= size;

        /* Format description line */
        snprintf(desc_line, sizeof(desc_line),
                    desc_extent_line, size / 512, desc_filename);
-        pstrcat(ext_desc_lines, sizeof(ext_desc_lines), desc_line);
+        g_string_append(ext_desc_lines, desc_line);
    }
    /* generate descriptor file */
-    snprintf(desc, sizeof(desc), desc_template,
-            (unsigned int)time(NULL),
-            parent_cid,
-            fmt,
-            parent_desc_line,
-            ext_desc_lines,
-            (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4),
-            total_size / (int64_t)(63 * number_heads * 512), number_heads,
-                adapter_type);
+    desc = g_strdup_printf(desc_template,
+                           (unsigned int)time(NULL),
+                           parent_cid,
+                           fmt,
+                           parent_desc_line,
+                           ext_desc_lines->str,
+                           (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4),
+                           total_size / (int64_t)(63 * number_heads * 512),
+                           number_heads,
+                           adapter_type);
    if (split || flat) {
        fd = qemu_open(filename,
                       O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
@@ -1759,21 +1781,25 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options,
                       0644);
    }
    if (fd < 0) {
-        return -errno;
+        ret = -errno;
+        goto exit;
    }
    /* the descriptor offset = 0x200 */
    if (!split && !flat && 0x200 != lseek(fd, 0x200, SEEK_SET)) {
        ret = -errno;
-        goto exit;
+        goto close_exit;
    }
    ret = qemu_write_full(fd, desc, strlen(desc));
    if (ret != strlen(desc)) {
        ret = -errno;
-        goto exit;
+        goto close_exit;
    }
    ret = 0;
-exit:
+close_exit:
    qemu_close(fd);
+exit:
+    g_free(desc);
+    g_string_free(ext_desc_lines, true);
    return ret;
 }

--- a/block/vpc.c
+++ b/block/vpc.c
@@ -45,8 +45,6 @@ enum vhd_type {
 // Seconds since Jan 1, 2000 0:00:00 (UTC)
 #define VHD_TIMESTAMP_BASE 946684800

-#define VHD_MAX_SECTORS       (65535LL * 255 * 255)
-
 // always big-endian
 typedef struct vhd_footer {
    char        creator[8]; // "conectix"
@@ -166,7 +164,6 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
    VHDDynDiskHeader *dyndisk_header;
    uint8_t buf[HEADER_SIZE];
    uint32_t checksum;
-    uint64_t computed_size;
    int disk_type = VHD_DYNAMIC;
    int ret;

@@ -224,7 +221,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
    }

    /* Allow a maximum disk size of approximately 2 TB */
-    if (bs->total_sectors >= VHD_MAX_SECTORS) {
+    if (bs->total_sectors >= 65535LL * 255 * 255) {
        ret = -EFBIG;
        goto fail;
    }
@@ -244,31 +241,10 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
        }

        s->block_size = be32_to_cpu(dyndisk_header->block_size);
-        if (!is_power_of_2(s->block_size) || s->block_size < BDRV_SECTOR_SIZE) {
-            error_setg(errp, "Invalid block size %" PRIu32, s->block_size);
-            ret = -EINVAL;
-            goto fail;
-        }
        s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511;

        s->max_table_entries = be32_to_cpu(dyndisk_header->max_table_entries);
-
-        if ((bs->total_sectors * 512) / s->block_size > 0xffffffffU) {
-            ret = -EINVAL;
-            goto fail;
-        }
-        if (s->max_table_entries > (VHD_MAX_SECTORS * 512) / s->block_size) {
-            ret = -EINVAL;
-            goto fail;
-        }
-
-        computed_size = (uint64_t) s->max_table_entries * s->block_size;
-        if (computed_size < bs->total_sectors * 512) {
-            ret = -EINVAL;
-            goto fail;
-        }
-
-        s->pagetable = qemu_blockalign(bs, s->max_table_entries * 4);
+        s->pagetable = g_malloc(s->max_table_entries * 4);

        s->bat_offset = be64_to_cpu(dyndisk_header->table_offset);

@@ -321,7 +297,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
    return 0;

 fail:
-    qemu_vfree(s->pagetable);
+    g_free(s->pagetable);
 #ifdef CACHE
    g_free(s->pageentry_u8);
 #endif
@@ -479,6 +455,19 @@ fail:
    return -1;
 }

+static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+    BDRVVPCState *s = (BDRVVPCState *)bs->opaque;
+    VHDFooter *footer = (VHDFooter *) s->footer_buf;
+
+    if (cpu_to_be32(footer->type) != VHD_FIXED) {
+        bdi->cluster_size = s->block_size;
+    }
+
+    bdi->unallocated_blocks_are_zero = true;
+    return 0;
+}
+
 static int vpc_read(BlockDriverState *bs, int64_t sector_num,
                    uint8_t *buf, int nb_sectors)
 {
@@ -843,7 +832,7 @@ static int vpc_has_zero_init(BlockDriverState *bs)
 static void vpc_close(BlockDriverState *bs)
 {
    BDRVVPCState *s = bs->opaque;
-    qemu_vfree(s->pagetable);
+    g_free(s->pagetable);
 #ifdef CACHE
    g_free(s->pageentry_u8);
 #endif
@@ -881,6 +870,8 @@ static BlockDriver bdrv_vpc = {
    .bdrv_read              = vpc_co_read,
    .bdrv_write             = vpc_co_write,

+    .bdrv_get_info          = vpc_get_info,
+
    .create_options         = vpc_create_options,
    .bdrv_has_zero_init     = vpc_has_zero_init,
 };
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -266,8 +266,7 @@ typedef struct mbr_t {
 } QEMU_PACKED mbr_t;

 typedef struct direntry_t {
-    uint8_t name[8];
-    uint8_t extension[3];
+    uint8_t name[8 + 3];
    uint8_t attributes;
    uint8_t reserved[2];
    uint16_t ctime;
@@ -518,11 +517,9 @@ static inline uint8_t fat_chksum(const direntry_t* entry)
    uint8_t chksum=0;
    int i;

-    for(i=0;i<11;i++) {
-        unsigned char c;
-
-        c = (i < 8) ? entry->name[i] : entry->extension[i-8];
-        chksum=(((chksum&0xfe)>>1)|((chksum&0x01)?0x80:0)) + c;
+    for (i = 0; i < ARRAY_SIZE(entry->name); i++) {
+        chksum = (((chksum & 0xfe) >> 1) |
+                  ((chksum & 0x01) ? 0x80 : 0)) + entry->name[i];
    }

    return chksum;
@@ -617,7 +614,7 @@ static inline direntry_t* create_short_and_long_name(BDRVVVFATState* s,

    if(is_dot) {
 	entry=array_get_next(&(s->directory));
-	memset(entry->name,0x20,11);
+        memset(entry->name, 0x20, sizeof(entry->name));
 	memcpy(entry->name,filename,strlen(filename));
 	return entry;
    }
@@ -632,12 +629,14 @@ static inline direntry_t* create_short_and_long_name(BDRVVVFATState* s,
 	i = 8;

    entry=array_get_next(&(s->directory));
-    memset(entry->name,0x20,11);
+    memset(entry->name, 0x20, sizeof(entry->name));
    memcpy(entry->name, filename, i);

-    if(j > 0)
-	for (i = 0; i < 3 && filename[j+1+i]; i++)
-	    entry->extension[i] = filename[j+1+i];
+    if (j > 0) {
+        for (i = 0; i < 3 && filename[j + 1 + i]; i++) {
+            entry->name[8 + i] = filename[j + 1 + i];
+        }
+    }

    /* upcase & remove unwanted characters */
    for(i=10;i>=0;i--) {
@@ -788,9 +787,7 @@ static int read_directory(BDRVVVFATState* s, int mapping_index)
 	    s->current_mapping->path=buffer;
 	    s->current_mapping->read_only =
 		(st.st_mode & (S_IWUSR | S_IWGRP | S_IWOTH)) == 0;
-        } else {
-            g_free(buffer);
-        }
+	}
    }
    closedir(dir);

@@ -863,8 +860,7 @@ static int init_directories(BDRVVVFATState* s,
    {
 	direntry_t* entry=array_get_next(&(s->directory));
 	entry->attributes=0x28; /* archive | volume label */
-	memcpy(entry->name,"QEMU VVF",8);
-	memcpy(entry->extension,"AT ",3);
+        memcpy(entry->name, "QEMU VVFAT ", sizeof(entry->name));
    }

    /* Now build FAT, and write back information into directory */
@@ -1593,17 +1589,20 @@ static int parse_short_name(BDRVVVFATState* s,
 	    lfn->name[i] = direntry->name[i];
    }

-    for (j = 2; j >= 0 && direntry->extension[j] == ' '; j--);
+    for (j = 2; j >= 0 && direntry->name[8 + j] == ' '; j--) {
+    }
    if (j >= 0) {
 	lfn->name[i++] = '.';
 	lfn->name[i + j + 1] = '\0';
 	for (;j >= 0; j--) {
-	    if (direntry->extension[j] <= ' ' || direntry->extension[j] > 0x7f)
-		return -2;
-	    else if (s->downcase_short_names)
-		lfn->name[i + j] = qemu_tolower(direntry->extension[j]);
-	    else
-		lfn->name[i + j] = direntry->extension[j];
+            uint8_t c = direntry->name[8 + j];
+            if (c <= ' ' || c > 0x7f) {
+                return -2;
+            } else if (s->downcase_short_names) {
+                lfn->name[i + j] = qemu_tolower(c);
+            } else {
+                lfn->name[i + j] = c;
+            }
 	}
    } else
 	lfn->name[i + j + 1] = '\0';
@@ -1868,7 +1867,7 @@ static int check_directory_consistency(BDRVVVFATState *s,

 	if (s->used_clusters[cluster_num] & USED_ANY) {
 	    fprintf(stderr, "cluster %d used more than once\n", (int)cluster_num);
-            goto fail;
+	    return 0;
 	}
 	s->used_clusters[cluster_num] = USED_DIRECTORY;

--- a/blockdev-nbd.c
+++ b/blockdev-nbd.c
@@ -27,9 +27,8 @@ static void nbd_accept(void *opaque)
    socklen_t addr_len = sizeof(addr);

    int fd = accept(server_fd, (struct sockaddr *)&addr, &addr_len);
-    if (fd >= 0 && !nbd_client_new(NULL, fd, nbd_client_put)) {
-        shutdown(fd, 2);
-        close(fd);
+    if (fd >= 0) {
+        nbd_client_new(NULL, fd, nbd_client_put);
    }
 }

@@ -92,10 +91,6 @@ void qmp_nbd_server_add(const char *device, bool has_writable, bool writable,
        error_set(errp, QERR_DEVICE_NOT_FOUND, device);
        return;
    }
-    if (!bdrv_is_inserted(bs)) {
-        error_set(errp, QERR_DEVICE_HAS_NO_MEDIUM, device);
-        return;
-    }

    if (!has_writable) {
        writable = false;
--- a/blockdev.c
+++ b/blockdev.c
@@ -334,7 +334,7 @@ static DriveInfo *blockdev_init(QDict *bs_opts,
    opts = qemu_opts_create(&qemu_common_drive_opts, id, 1, &error);
    if (error_is_set(&error)) {
        error_propagate(errp, error);
-        goto err_no_opts;
+        return NULL;
    }

    qemu_opts_absorb_qdict(opts, bs_opts, &error);
@@ -535,9 +535,8 @@ err:
    QTAILQ_REMOVE(&drives, dinfo, next);
    g_free(dinfo);
 early_err:
-    qemu_opts_del(opts);
-err_no_opts:
    QDECREF(bs_opts);
+    qemu_opts_del(opts);
    return NULL;
 }

@@ -868,7 +867,6 @@ DriveInfo *drive_init(QemuOpts *all_opts, BlockInterfaceType block_default_type)

    /* Actual block device init: Functionality shared with blockdev-add */
    dinfo = blockdev_init(bs_opts, type, &local_err);
-    bs_opts = NULL;
    if (dinfo == NULL) {
        if (error_is_set(&local_err)) {
            qerror_report_err(local_err);
@@ -905,7 +903,6 @@ DriveInfo *drive_init(QemuOpts *all_opts, BlockInterfaceType block_default_type)

 fail:
    qemu_opts_del(legacy_opts);
-    QDECREF(bs_opts);
    return dinfo;
 }

@@ -1526,7 +1523,7 @@ static void qmp_bdrv_open_encrypted(BlockDriverState *bs, const char *filename,
 }

 void qmp_change_blockdev(const char *device, const char *filename,
-                         bool has_format, const char *format, Error **errp)
+                         const char *format, Error **errp)
 {
    BlockDriverState *bs;
    BlockDriver *drv = NULL;
@@ -1789,10 +1786,6 @@ void qmp_block_commit(const char *device,
     */
    BlockdevOnError on_error = BLOCKDEV_ON_ERROR_REPORT;

-    if (!has_speed) {
-        speed = 0;
-    }
-
    /* drain all i/o before commits */
    bdrv_drain_all();

--- a/54
+++ b/54
@@ -169,6 +169,7 @@ curl=""
 curses=""
 docs=""
 fdt=""
+netmap="no"
 pixman=""
 sdl=""
 virtfs=""
@@ -488,6 +489,7 @@ FreeBSD)
  audio_possible_drivers="oss sdl esd pa"
  # needed for kinfo_getvmmap(3) in libutil.h
  LIBS="-lutil $LIBS"
+  netmap=""  # enable netmap autodetect
 ;;
 DragonFly)
  bsd="yes"
@@ -797,6 +799,10 @@ for opt do
  ;;
  --enable-vde) vde="yes"
  ;;
+  --disable-netmap) netmap="no"
+  ;;
+  --enable-netmap) netmap="yes"
+  ;;
  --disable-xen) xen="no"
  ;;
  --enable-xen) xen="yes"
@@ -1182,6 +1188,8 @@ echo "  --disable-uuid           disable uuid support"
 echo "  --enable-uuid            enable uuid support"
 echo "  --disable-vde            disable support for vde network"
 echo "  --enable-vde             enable support for vde network"
+echo "  --disable-netmap         disable support for netmap network"
+echo "  --enable-netmap          enable support for netmap network"
 echo "  --disable-linux-aio      disable Linux AIO support"
 echo "  --enable-linux-aio       enable Linux AIO support"
 echo "  --disable-cap-ng         disable libcap-ng support"
@@ -1357,11 +1365,6 @@ EOF
      pie="no"
    fi
  fi
-
-  if compile_prog "-fno-pie" "-nopie"; then
-    CFLAGS_NOPIE="-fno-pie"
-    LDFLAGS_NOPIE="-nopie"
-  fi
 fi

 ##########################################
@@ -2099,6 +2102,26 @@ EOF
  fi
 fi

+##########################################
+# netmap headers probe
+if test "$netmap" != "no" ; then
+  cat > $TMPC << EOF
+#include <inttypes.h>
+#include <net/if.h>
+#include <net/netmap.h>
+#include <net/netmap_user.h>
+int main(void) { return 0; }
+EOF
+  if compile_prog "" "" ; then
+    netmap=yes
+  else
+    if test "$netmap" = "yes" ; then
+      feature_not_found "netmap"
+    fi
+    netmap=no
+  fi
+fi
+
 ##########################################
 # libcap-ng library probe
 if test "$cap_ng" != "no" ; then
@@ -3030,13 +3053,13 @@ fi

 ##########################################
 # Do we have libiscsi
-# We check for iscsi_unmap_sync() to make sure we have a
-# recent enough version of libiscsi.
+# We check for iscsi_write16_sync() to make sure we have a
+# at least version 1.4.0 of libiscsi.
 if test "$libiscsi" != "no" ; then
  cat > $TMPC << EOF
 #include <stdio.h>
 #include <iscsi/iscsi.h>
-int main(void) { iscsi_unmap_sync(NULL,0,0,0,NULL,0); return 0; }
+int main(void) { iscsi_write16_sync(NULL,0,0,NULL,0,0,0,0,0,0,0); return 0; }
 EOF
  if $pkg_config --atleast-version=1.7.0 libiscsi; then
    libiscsi="yes"
@@ -3520,11 +3543,6 @@ fi

 int128=no
 cat > $TMPC << EOF
-#if defined(__clang_major__) && defined(__clang_minor__)
-# if ((__clang_major__ < 3) || (__clang_major__ == 3) && (__clang_minor__ < 2))
-#  error __int128_t does not work in CLANG before 3.2
-# endif
-#endif
 __int128_t a;
 __uint128_t b;
 int main (void) {
@@ -3761,6 +3779,7 @@ echo "uname -r          $uname_release"
 echo "GUEST_BASE        $guest_base"
 echo "PIE               $pie"
 echo "vde support       $vde"
+echo "netmap support    $netmap"
 echo "Linux AIO support $linux_aio"
 echo "ATTR/XATTR support $attr"
 echo "Install blobs     $blobs"
@@ -3898,6 +3917,9 @@ fi
 if test "$vde" = "yes" ; then
  echo "CONFIG_VDE=y" >> $config_host_mak
 fi
+if test "$netmap" = "yes" ; then
+  echo "CONFIG_NETMAP=y" >> $config_host_mak
+fi
 if test "$cap_ng" = "yes" ; then
  echo "CONFIG_LIBCAP=y" >> $config_host_mak
 fi
@@ -4298,7 +4320,6 @@ echo "LD=$ld" >> $config_host_mak
 echo "WINDRES=$windres" >> $config_host_mak
 echo "LIBTOOL=$libtool" >> $config_host_mak
 echo "CFLAGS=$CFLAGS" >> $config_host_mak
-echo "CFLAGS_NOPIE=$CFLAGS_NOPIE" >> $config_host_mak
 echo "QEMU_CFLAGS=$QEMU_CFLAGS" >> $config_host_mak
 echo "QEMU_INCLUDES=$QEMU_INCLUDES" >> $config_host_mak
 if test "$sparse" = "yes" ; then
@@ -4312,7 +4333,6 @@ else
  echo "AUTOCONF_HOST := "                             >> $config_host_mak
 fi
 echo "LDFLAGS=$LDFLAGS" >> $config_host_mak
-echo "LDFLAGS_NOPIE=$LDFLAGS_NOPIE" >> $config_host_mak
 echo "LIBTOOLFLAGS=$LIBTOOLFLAGS" >> $config_host_mak
 echo "LIBS+=$LIBS" >> $config_host_mak
 echo "LIBS_TOOLS+=$libs_tools" >> $config_host_mak
@@ -4418,7 +4438,7 @@ case "$target_name" in
  aarch64)
    TARGET_BASE_ARCH=arm
    bflt="yes"
-    gdb_xml_files="aarch64-core.xml"
+    gdb_xml_files="aarch64-core.xml aarch64-fpu.xml"
  ;;
  cris)
  ;;
@@ -4530,7 +4550,7 @@ case "$target_name" in
  *)
 esac
 case "$target_name" in
-  arm|i386|x86_64|ppcemb|ppc|ppc64|s390x)
+  aarch64|arm|i386|x86_64|ppcemb|ppc|ppc64|s390x)
    # Make sure the target and host cpus are compatible
    if test "$kvm" = "yes" -a "$target_softmmu" = "yes" -a \
      \( "$target_name" = "$cpu" -o \
--- a/coroutine-win32.c
+++ b/coroutine-win32.c
@@ -36,17 +36,8 @@ typedef struct
 static __thread CoroutineWin32 leader;
 static __thread Coroutine *current;

-/* This function is marked noinline to prevent GCC from inlining it
- * into coroutine_trampoline(). If we allow it to do that then it
- * hoists the code to get the address of the TLS variable "current"
- * out of the while() loop. This is an invalid transformation because
- * the SwitchToFiber() call may be called when running thread A but
- * return in thread B, and so we might be in a different thread
- * context each time round the loop.
- */
-CoroutineAction __attribute__((noinline))
-qemu_coroutine_switch(Coroutine *from_, Coroutine *to_,
-                      CoroutineAction action)
+CoroutineAction qemu_coroutine_switch(Coroutine *from_, Coroutine *to_,
+                                      CoroutineAction action)
 {
    CoroutineWin32 *from = DO_UPCAST(CoroutineWin32, base, from_);
    CoroutineWin32 *to = DO_UPCAST(CoroutineWin32, base, to_);
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -205,6 +205,9 @@ int cpu_exec(CPUArchState *env)
 #if !(defined(CONFIG_USER_ONLY) && \
      (defined(TARGET_M68K) || defined(TARGET_PPC) || defined(TARGET_S390X)))
    CPUClass *cc = CPU_GET_CLASS(cpu);
+#endif
+#ifdef TARGET_I386
+    X86CPU *x86_cpu = X86_CPU(cpu);
 #endif
    int ret, interrupt_request;
    TranslationBlock *tb;
@@ -320,24 +323,24 @@ int cpu_exec(CPUArchState *env)
 #if !defined(CONFIG_USER_ONLY)
                    if (interrupt_request & CPU_INTERRUPT_POLL) {
                        cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
-                        apic_poll_irq(env->apic_state);
+                        apic_poll_irq(x86_cpu->apic_state);
                    }
 #endif
                    if (interrupt_request & CPU_INTERRUPT_INIT) {
                            cpu_svm_check_intercept_param(env, SVM_EXIT_INIT,
                                                          0);
-                            do_cpu_init(x86_env_get_cpu(env));
+                            do_cpu_init(x86_cpu);
                            env->exception_index = EXCP_HALTED;
                            cpu_loop_exit(env);
                    } else if (interrupt_request & CPU_INTERRUPT_SIPI) {
-                            do_cpu_sipi(x86_env_get_cpu(env));
+                            do_cpu_sipi(x86_cpu);
                    } else if (env->hflags2 & HF2_GIF_MASK) {
                        if ((interrupt_request & CPU_INTERRUPT_SMI) &&
                            !(env->hflags & HF_SMM_MASK)) {
                            cpu_svm_check_intercept_param(env, SVM_EXIT_SMI,
                                                          0);
                            cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
-                            do_smm_enter(x86_env_get_cpu(env));
+                            do_smm_enter(x86_cpu);
                            next_tb = 0;
                        } else if ((interrupt_request & CPU_INTERRUPT_NMI) &&
                                   !(env->hflags2 & HF2_NMI_MASK)) {
@@ -684,6 +687,9 @@ int cpu_exec(CPUArchState *env)
 #if !(defined(CONFIG_USER_ONLY) && \
      (defined(TARGET_M68K) || defined(TARGET_PPC) || defined(TARGET_S390X)))
            cc = CPU_GET_CLASS(cpu);
+#endif
+#ifdef TARGET_I386
+            x86_cpu = X86_CPU(cpu);
 #endif
        }
    } /* for(;;) */
--- a/cpus.c
+++ b/cpus.c
@@ -1458,12 +1458,11 @@ void qmp_inject_nmi(Error **errp)

    CPU_FOREACH(cs) {
        X86CPU *cpu = X86_CPU(cs);
-        CPUX86State *env = &cpu->env;

-        if (!env->apic_state) {
+        if (!cpu->apic_state) {
            cpu_interrupt(cs, CPU_INTERRUPT_NMI);
        } else {
-            apic_deliver_nmi(env->apic_state);
+            apic_deliver_nmi(cpu->apic_state);
        }
    }
 #elif defined(TARGET_S390X)
--- a/cputlb.c
+++ b/cputlb.c
@@ -33,13 +33,6 @@
 /* statistics */
 int tlb_flush_count;

-static const CPUTLBEntry s_cputlb_empty_entry = {
-    .addr_read  = -1,
-    .addr_write = -1,
-    .addr_code  = -1,
-    .addend     = -1,
-};
-
 /* NOTE:
 * If flush_global is true (the usual case), flush all tlb entries.
 * If flush_global is false, flush (at least) all tlb entries not
@@ -55,7 +48,6 @@ static const CPUTLBEntry s_cputlb_empty_entry = {
 void tlb_flush(CPUArchState *env, int flush_global)
 {
    CPUState *cpu = ENV_GET_CPU(env);
-    int i;

 #if defined(DEBUG_TLB)
    printf("tlb_flush:\n");
@@ -64,15 +56,8 @@ void tlb_flush(CPUArchState *env, int flush_global)
       links while we are modifying them */
    cpu->current_tb = NULL;

-    for (i = 0; i < CPU_TLB_SIZE; i++) {
-        int mmu_idx;
-
-        for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
-            env->tlb_table[mmu_idx][i] = s_cputlb_empty_entry;
-        }
-    }
-
-    memset(env->tb_jmp_cache, 0, TB_JMP_CACHE_SIZE * sizeof (void *));
+    memset(env->tlb_table, -1, sizeof(env->tlb_table));
+    memset(env->tb_jmp_cache, 0, sizeof(env->tb_jmp_cache));

    env->tlb_flush_addr = -1;
    env->tlb_flush_mask = 0;
@@ -87,7 +72,7 @@ static inline void tlb_flush_entry(CPUTLBEntry *tlb_entry, target_ulong addr)
                 (TARGET_PAGE_MASK | TLB_INVALID_MASK)) ||
        addr == (tlb_entry->addr_code &
                 (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
-        *tlb_entry = s_cputlb_empty_entry;
+        memset(tlb_entry, -1, sizeof(*tlb_entry));
    }
 }

@@ -344,10 +329,8 @@ tb_page_addr_t get_page_addr_code(CPUArchState *env1, target_ulong addr)
 }

 #define MMUSUFFIX _cmmu
-#undef GETPC_ADJ
-#define GETPC_ADJ 0
-#undef GETRA
-#define GETRA() ((uintptr_t)0)
+#undef GETPC
+#define GETPC() ((uintptr_t)0)
 #define SOFTMMU_CODE_ACCESS

 #define SHIFT 0
--- a/default-configs/aarch64-softmmu.mak
+++ b/default-configs/aarch64-softmmu.mak
@@ -0,0 +1,6 @@
+# Default configuration for aarch64-softmmu
+
+# We support all the 32 bit boards so need all their config
+include arm-softmmu.mak
+
+# Currently no 64-bit specific config requirements
--- a/default-configs/arm-softmmu.mak
+++ b/default-configs/arm-softmmu.mak
@@ -41,6 +41,7 @@ CONFIG_ARM_GIC=y
 CONFIG_ARM_GIC_KVM=$(CONFIG_KVM)
 CONFIG_ARM_TIMER=y
 CONFIG_ARM_MPTIMER=y
+CONFIG_A9_GTIMER=y
 CONFIG_PL011=y
 CONFIG_PL022=y
 CONFIG_PL031=y
@@ -63,6 +64,7 @@ CONFIG_XILINX_SPIPS=y

 CONFIG_ARM11SCU=y
 CONFIG_A9SCU=y
+CONFIG_DIGIC=y
 CONFIG_MARVELL_88W8618=y
 CONFIG_OMAP=y
 CONFIG_TSC210X=y
@@ -81,3 +83,7 @@ CONFIG_VERSATILE_I2C=y

 CONFIG_SDHCI=y
 CONFIG_INTEGRATOR_DEBUG=y
+
+CONFIG_ALLWINNER_A10_PIT=y
+CONFIG_ALLWINNER_A10_PIC=y
+CONFIG_ALLWINNER_A10=y
--- a/device_tree.c
+++ b/device_tree.c
@@ -41,6 +41,10 @@ void *create_device_tree(int *sizep)
    if (ret < 0) {
        goto fail;
    }
+    ret = fdt_finish_reservemap(fdt);
+    if (ret < 0) {
+        goto fail;
+    }
    ret = fdt_begin_node(fdt, "");
    if (ret < 0) {
        goto fail;
@@ -127,12 +131,12 @@ static int findnode_nofail(void *fdt, const char *node_path)
    return offset;
 }

-int qemu_devtree_setprop(void *fdt, const char *node_path,
-                         const char *property, const void *val_array, int size)
+int qemu_fdt_setprop(void *fdt, const char *node_path,
+                     const char *property, const void *val, int size)
 {
    int r;

-    r = fdt_setprop(fdt, findnode_nofail(fdt, node_path), property, val_array, size);
+    r = fdt_setprop(fdt, findnode_nofail(fdt, node_path), property, val, size);
    if (r < 0) {
        fprintf(stderr, "%s: Couldn't set %s/%s: %s\n", __func__, node_path,
                property, fdt_strerror(r));
@@ -142,8 +146,8 @@ int qemu_devtree_setprop(void *fdt, const char *node_path,
    return r;
 }

-int qemu_devtree_setprop_cell(void *fdt, const char *node_path,
-                              const char *property, uint32_t val)
+int qemu_fdt_setprop_cell(void *fdt, const char *node_path,
+                          const char *property, uint32_t val)
 {
    int r;

@@ -157,15 +161,15 @@ int qemu_devtree_setprop_cell(void *fdt, const char *node_path,
    return r;
 }

-int qemu_devtree_setprop_u64(void *fdt, const char *node_path,
-                             const char *property, uint64_t val)
+int qemu_fdt_setprop_u64(void *fdt, const char *node_path,
+                         const char *property, uint64_t val)
 {
    val = cpu_to_be64(val);
-    return qemu_devtree_setprop(fdt, node_path, property, &val, sizeof(val));
+    return qemu_fdt_setprop(fdt, node_path, property, &val, sizeof(val));
 }

-int qemu_devtree_setprop_string(void *fdt, const char *node_path,
-                                const char *property, const char *string)
+int qemu_fdt_setprop_string(void *fdt, const char *node_path,
+                            const char *property, const char *string)
 {
    int r;

@@ -179,8 +183,8 @@ int qemu_devtree_setprop_string(void *fdt, const char *node_path,
    return r;
 }

-const void *qemu_devtree_getprop(void *fdt, const char *node_path,
-                                 const char *property, int *lenp)
+const void *qemu_fdt_getprop(void *fdt, const char *node_path,
+                             const char *property, int *lenp)
 {
    int len;
    const void *r;
@@ -196,11 +200,11 @@ const void *qemu_devtree_getprop(void *fdt, const char *node_path,
    return r;
 }

-uint32_t qemu_devtree_getprop_cell(void *fdt, const char *node_path,
-                                   const char *property)
+uint32_t qemu_fdt_getprop_cell(void *fdt, const char *node_path,
+                               const char *property)
 {
    int len;
-    const uint32_t *p = qemu_devtree_getprop(fdt, node_path, property, &len);
+    const uint32_t *p = qemu_fdt_getprop(fdt, node_path, property, &len);
    if (len != 4) {
        fprintf(stderr, "%s: %s/%s not 4 bytes long (not a cell?)\n",
                __func__, node_path, property);
@@ -209,7 +213,7 @@ uint32_t qemu_devtree_getprop_cell(void *fdt, const char *node_path,
    return be32_to_cpu(*p);
 }

-uint32_t qemu_devtree_get_phandle(void *fdt, const char *path)
+uint32_t qemu_fdt_get_phandle(void *fdt, const char *path)
 {
    uint32_t r;

@@ -223,15 +227,15 @@ uint32_t qemu_devtree_get_phandle(void *fdt, const char *path)
    return r;
 }

-int qemu_devtree_setprop_phandle(void *fdt, const char *node_path,
-                                 const char *property,
-                                 const char *target_node_path)
+int qemu_fdt_setprop_phandle(void *fdt, const char *node_path,
+                             const char *property,
+                             const char *target_node_path)
 {
-    uint32_t phandle = qemu_devtree_get_phandle(fdt, target_node_path);
-    return qemu_devtree_setprop_cell(fdt, node_path, property, phandle);
+    uint32_t phandle = qemu_fdt_get_phandle(fdt, target_node_path);
+    return qemu_fdt_setprop_cell(fdt, node_path, property, phandle);
 }

-uint32_t qemu_devtree_alloc_phandle(void *fdt)
+uint32_t qemu_fdt_alloc_phandle(void *fdt)
 {
    static int phandle = 0x0;

@@ -255,7 +259,7 @@ uint32_t qemu_devtree_alloc_phandle(void *fdt)
    return phandle++;
 }

-int qemu_devtree_nop_node(void *fdt, const char *node_path)
+int qemu_fdt_nop_node(void *fdt, const char *node_path)
 {
    int r;

@@ -269,7 +273,7 @@ int qemu_devtree_nop_node(void *fdt, const char *node_path)
    return r;
 }

-int qemu_devtree_add_subnode(void *fdt, const char *name)
+int qemu_fdt_add_subnode(void *fdt, const char *name)
 {
    char *dupname = g_strdup(name);
    char *basename = strrchr(dupname, '/');
@@ -299,7 +303,7 @@ int qemu_devtree_add_subnode(void *fdt, const char *name)
    return retval;
 }

-void qemu_devtree_dumpdtb(void *fdt, int size)
+void qemu_fdt_dumpdtb(void *fdt, int size)
 {
    const char *dumpdtb = qemu_opt_get(qemu_get_machine_opts(), "dumpdtb");

@@ -309,11 +313,11 @@ void qemu_devtree_dumpdtb(void *fdt, int size)
    }
 }

-int qemu_devtree_setprop_sized_cells_from_array(void *fdt,
-                                                const char *node_path,
-                                                const char *property,
-                                                int numvalues,
-                                                uint64_t *values)
+int qemu_fdt_setprop_sized_cells_from_array(void *fdt,
+                                            const char *node_path,
+                                            const char *property,
+                                            int numvalues,
+                                            uint64_t *values)
 {
    uint32_t *propcells;
    uint64_t value;
@@ -338,6 +342,6 @@ int qemu_devtree_setprop_sized_cells_from_array(void *fdt,
        propcells[cellnum++] = cpu_to_be32(value);
    }

-    return qemu_devtree_setprop(fdt, node_path, property, propcells,
-                                cellnum * sizeof(uint32_t));
+    return qemu_fdt_setprop(fdt, node_path, property, propcells,
+                            cellnum * sizeof(uint32_t));
 }
--- a/docs/migration.txt
+++ b/docs/migration.txt
@@ -139,6 +139,7 @@ static const VMStateDescription vmstate_kbd = {
    .name = "pckbd",
    .version_id = 3,
    .minimum_version_id = 3,
+    .minimum_version_id_old = 3,
    .fields      = (VMStateField []) {
        VMSTATE_UINT8(write_cmd, KBDState),
        VMSTATE_UINT8(status, KBDState),
@@ -167,13 +168,12 @@ You can see that there are several version fields:
 - minimum_version_id: the minimum version_id that VMState is able to understand
  for that device.
 - minimum_version_id_old: For devices that were not able to port to vmstate, we can
-  assign a function that knows how to read this old state. This field is
-  ignored if there is no load_state_old handler.
+  assign a function that knows how to read this old state.

 So, VMState is able to read versions from minimum_version_id to
-version_id.  And the function load_state_old() (if present) is able to
-load state from minimum_version_id_old to minimum_version_id.  This
-function is deprecated and will be removed when no more users are left.
+version_id.  And the function load_state_old() is able to load state
+from minimum_version_id_old to minimum_version_id.  This function is
+deprecated and will be removed when no more users are left.

 ===  Massaging functions ===

@@ -255,6 +255,7 @@ const VMStateDescription vmstate_ide_drive_pio_state = {
    .name = "ide_drive/pio_state",
    .version_id = 1,
    .minimum_version_id = 1,
+    .minimum_version_id_old = 1,
    .pre_save = ide_drive_pio_pre_save,
    .post_load = ide_drive_pio_post_load,
    .fields      = (VMStateField []) {
@@ -274,6 +275,7 @@ const VMStateDescription vmstate_ide_drive = {
    .name = "ide_drive",
    .version_id = 3,
    .minimum_version_id = 0,
+    .minimum_version_id_old = 0,
    .post_load = ide_drive_post_load,
    .fields      = (VMStateField []) {
        .... several fields ....
--- a/exec.c
+++ b/exec.c
@@ -50,6 +50,9 @@
 #include "translate-all.h"

 #include "exec/memory-internal.h"
+#include "qemu/cache-utils.h"
+
+#include "qemu/range.h"

 //#define DEBUG_SUBPAGE

@@ -84,15 +87,17 @@ typedef struct PhysPageEntry PhysPageEntry;

 struct PhysPageEntry {
    /* How many bits skip to next level (in units of L2_SIZE). 0 for a leaf. */
-    uint16_t skip : 1;
+    uint32_t skip : 6;
     /* index into phys_sections (!skip) or phys_map_nodes (skip) */
-    uint16_t ptr : 15;
+    uint32_t ptr : 26;
 };

-/* Size of the L2 (and L3, etc) page tables.  */
-#define ADDR_SPACE_BITS TARGET_PHYS_ADDR_SPACE_BITS
+#define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6)

-#define P_L2_BITS 10
+/* Size of the L2 (and L3, etc) page tables.  */
+#define ADDR_SPACE_BITS 64
+
+#define P_L2_BITS 9
 #define P_L2_SIZE (1 << P_L2_BITS)

 #define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)
@@ -130,8 +135,6 @@ typedef struct subpage_t {
 #define PHYS_SECTION_ROM 2
 #define PHYS_SECTION_WATCH 3

-#define PHYS_MAP_NODE_NIL (((uint16_t)~0) >> 1)
-
 static void io_mem_init(void);
 static void memory_map_init(void);

@@ -149,10 +152,10 @@ static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
    }
 }

-static uint16_t phys_map_node_alloc(PhysPageMap *map)
+static uint32_t phys_map_node_alloc(PhysPageMap *map)
 {
    unsigned i;
-    uint16_t ret;
+    uint32_t ret;

    ret = map->nodes_nb++;
    assert(ret != PHYS_MAP_NODE_NIL);
@@ -209,6 +212,68 @@ static void phys_page_set(AddressSpaceDispatch *d,
    phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
 }

+/* Compact a non leaf page entry. Simply detect that the entry has a single child,
+ * and update our entry so we can skip it and go directly to the destination.
+ */
+static void phys_page_compact(PhysPageEntry *lp, Node *nodes, unsigned long *compacted)
+{
+    unsigned valid_ptr = P_L2_SIZE;
+    int valid = 0;
+    PhysPageEntry *p;
+    int i;
+
+    if (lp->ptr == PHYS_MAP_NODE_NIL) {
+        return;
+    }
+
+    p = nodes[lp->ptr];
+    for (i = 0; i < P_L2_SIZE; i++) {
+        if (p[i].ptr == PHYS_MAP_NODE_NIL) {
+            continue;
+        }
+
+        valid_ptr = i;
+        valid++;
+        if (p[i].skip) {
+            phys_page_compact(&p[i], nodes, compacted);
+        }
+    }
+
+    /* We can only compress if there's only one child. */
+    if (valid != 1) {
+        return;
+    }
+
+    assert(valid_ptr < P_L2_SIZE);
+
+    /* Don't compress if it won't fit in the # of bits we have. */
+    if (lp->skip + p[valid_ptr].skip >= (1 << 3)) {
+        return;
+    }
+
+    lp->ptr = p[valid_ptr].ptr;
+    if (!p[valid_ptr].skip) {
+        /* If our only child is a leaf, make this a leaf. */
+        /* By design, we should have made this node a leaf to begin with so we
+         * should never reach here.
+         * But since it's so simple to handle this, let's do it just in case we
+         * change this rule.
+         */
+        lp->skip = 0;
+    } else {
+        lp->skip += p[valid_ptr].skip;
+    }
+}
+
+static void phys_page_compact_all(AddressSpaceDispatch *d, int nodes_nb)
+{
+    DECLARE_BITMAP(compacted, nodes_nb);
+
+    if (d->phys_map.skip) {
+        phys_page_compact(&d->phys_map, d->map.nodes, compacted);
+    }
+}
+
 static MemoryRegionSection *phys_page_find(PhysPageEntry lp, hwaddr addr,
                                           Node *nodes, MemoryRegionSection *sections)
 {
@@ -223,7 +288,14 @@ static MemoryRegionSection *phys_page_find(PhysPageEntry lp, hwaddr addr,
        p = nodes[lp.ptr];
        lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
    }
-    return &sections[lp.ptr];
+
+    if (sections[lp.ptr].size.hi ||
+        range_covers_byte(sections[lp.ptr].offset_within_address_space,
+                          sections[lp.ptr].size.lo, addr)) {
+        return &sections[lp.ptr];
+    } else {
+        return &sections[PHYS_SECTION_UNASSIGNED];
+    }
 }

 bool memory_region_is_unassigned(MemoryRegion *mr)
@@ -266,18 +338,6 @@ address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *x
    return section;
 }

-static inline bool memory_access_is_direct(MemoryRegion *mr, bool is_write)
-{
-    if (memory_region_is_ram(mr)) {
-        return !(is_write && mr->readonly);
-    }
-    if (memory_region_is_romd(mr)) {
-        return !is_write;
-    }
-
-    return false;
-}
-
 MemoryRegion *address_space_translate(AddressSpace *as, hwaddr addr,
                                      hwaddr *xlat, hwaddr *plen,
                                      bool is_write)
@@ -307,11 +367,6 @@ MemoryRegion *address_space_translate(AddressSpace *as, hwaddr addr,
        as = iotlb.target_as;
    }

-    if (memory_access_is_direct(mr, is_write)) {
-        hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr;
-        len = MIN(page, len);
-    }
-
    *plen = len;
    *xlat = addr;
    return mr;
@@ -923,6 +978,13 @@ static long gethugepagesize(const char *path)
    return fs.f_bsize;
 }

+static sigjmp_buf sigjump;
+
+static void sigbus_handler(int signal)
+{
+    siglongjmp(sigjump, 1);
+}
+
 static void *file_ram_alloc(RAMBlock *block,
                            ram_addr_t memory,
                            const char *path)
@@ -932,9 +994,6 @@ static void *file_ram_alloc(RAMBlock *block,
    char *c;
    void *area;
    int fd;
-#ifdef MAP_POPULATE
-    int flags;
-#endif
    unsigned long hpagesize;

    hpagesize = gethugepagesize(path);
@@ -982,21 +1041,52 @@ static void *file_ram_alloc(RAMBlock *block,
    if (ftruncate(fd, memory))
        perror("ftruncate");

-#ifdef MAP_POPULATE
-    /* NB: MAP_POPULATE won't exhaustively alloc all phys pages in the case
-     * MAP_PRIVATE is requested.  For mem_prealloc we mmap as MAP_SHARED
-     * to sidestep this quirk.
-     */
-    flags = mem_prealloc ? MAP_POPULATE | MAP_SHARED : MAP_PRIVATE;
-    area = mmap(0, memory, PROT_READ | PROT_WRITE, flags, fd, 0);
-#else
    area = mmap(0, memory, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
-#endif
    if (area == MAP_FAILED) {
        perror("file_ram_alloc: can't mmap RAM pages");
        close(fd);
        return (NULL);
    }
+
+    if (mem_prealloc) {
+        int ret, i;
+        struct sigaction act, oldact;
+        sigset_t set, oldset;
+
+        memset(&act, 0, sizeof(act));
+        act.sa_handler = &sigbus_handler;
+        act.sa_flags = 0;
+
+        ret = sigaction(SIGBUS, &act, &oldact);
+        if (ret) {
+            perror("file_ram_alloc: failed to install signal handler");
+            exit(1);
+        }
+
+        /* unblock SIGBUS */
+        sigemptyset(&set);
+        sigaddset(&set, SIGBUS);
+        pthread_sigmask(SIG_UNBLOCK, &set, &oldset);
+
+        if (sigsetjmp(sigjump, 1)) {
+            fprintf(stderr, "file_ram_alloc: failed to preallocate pages\n");
+            exit(1);
+        }
+
+        /* MAP_POPULATE silently ignores failures */
+        for (i = 0; i < (memory/hpagesize)-1; i++) {
+            memset(area + (hpagesize*i), 0, 1);
+        }
+
+        ret = sigaction(SIGBUS, &oldact, NULL);
+        if (ret) {
+            perror("file_ram_alloc: failed to reinstall signal handler");
+            exit(1);
+        }
+
+        pthread_sigmask(SIG_SETMASK, &oldset, NULL);
+    }
+
    block->fd = fd;
    return area;
 }
@@ -1679,6 +1769,8 @@ static void mem_commit(MemoryListener *listener)
    AddressSpaceDispatch *cur = as->dispatch;
    AddressSpaceDispatch *next = as->next_dispatch;

+    phys_page_compact_all(next, next->map.nodes_nb);
+
    as->dispatch = next;

    if (cur) {
@@ -1747,11 +1839,7 @@ static void memory_map_init(void)
 {
    system_memory = g_malloc(sizeof(*system_memory));

-    assert(ADDR_SPACE_BITS <= 64);
-
-    memory_region_init(system_memory, NULL, "system",
-                       ADDR_SPACE_BITS == 64 ?
-                       UINT64_MAX : (0x1ULL << ADDR_SPACE_BITS));
+    memory_region_init(system_memory, NULL, "system", UINT64_MAX);
    address_space_init(&address_space_memory, system_memory, "memory");

    system_io = g_malloc(sizeof(*system_io));
@@ -1832,6 +1920,18 @@ static void invalidate_and_set_dirty(hwaddr addr,
    xen_modified_memory(addr, length);
 }

+static inline bool memory_access_is_direct(MemoryRegion *mr, bool is_write)
+{
+    if (memory_region_is_ram(mr)) {
+        return !(is_write && mr->readonly);
+    }
+    if (memory_region_is_romd(mr)) {
+        return !is_write;
+    }
+
+    return false;
+}
+
 static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
 {
    unsigned access_size_max = mr->ops->valid.max_access_size;
@@ -1971,9 +2071,13 @@ void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
    address_space_rw(&address_space_memory, addr, buf, len, is_write);
 }

-/* used for ROM loading : can write in RAM and ROM */
-void cpu_physical_memory_write_rom(hwaddr addr,
-                                   const uint8_t *buf, int len)
+enum write_rom_type {
+    WRITE_DATA,
+    FLUSH_CACHE,
+};
+
+static inline void cpu_physical_memory_write_rom_internal(
+    hwaddr addr, const uint8_t *buf, int len, enum write_rom_type type)
 {
    hwaddr l;
    uint8_t *ptr;
@@ -1992,8 +2096,15 @@ void cpu_physical_memory_write_rom(hwaddr addr,
            addr1 += memory_region_get_ram_addr(mr);
            /* ROM/RAM case */
            ptr = qemu_get_ram_ptr(addr1);
-            memcpy(ptr, buf, l);
-            invalidate_and_set_dirty(addr1, l);
+            switch (type) {
+            case WRITE_DATA:
+                memcpy(ptr, buf, l);
+                invalidate_and_set_dirty(addr1, l);
+                break;
+            case FLUSH_CACHE:
+                flush_icache_range((uintptr_t)ptr, (uintptr_t)ptr + l);
+                break;
+            }
        }
        len -= l;
        buf += l;
@@ -2001,6 +2112,28 @@ void cpu_physical_memory_write_rom(hwaddr addr,
    }
 }

+/* used for ROM loading : can write in RAM and ROM */
+void cpu_physical_memory_write_rom(hwaddr addr,
+                                   const uint8_t *buf, int len)
+{
+    cpu_physical_memory_write_rom_internal(addr, buf, len, WRITE_DATA);
+}
+
+void cpu_flush_icache_range(hwaddr start, int len)
+{
+    /*
+     * This function should do the same thing as an icache flush that was
+     * triggered from within the guest. For TCG we are always cache coherent,
+     * so there is no need to flush anything. For KVM / Xen we need to flush
+     * the host's instruction cache at least.
+     */
+    if (tcg_enabled()) {
+        return;
+    }
+
+    cpu_physical_memory_write_rom_internal(start, NULL, len, FLUSH_CACHE);
+}
+
 typedef struct {
    MemoryRegion *mr;
    void *buffer;
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -6705,10 +6705,17 @@ int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
 /* min() and max() functions. These can't be implemented as
 * 'compare and pick one input' because that would mishandle
 * NaNs and +0 vs -0.
+ *
+ * minnum() and maxnum() functions. These are similar to the min()
+ * and max() functions but if one of the arguments is a QNaN and
+ * the other is numerical then the numerical argument is returned.
+ * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
+ * and maxNum() operations. min() and max() are the typical min/max
+ * semantics provided by many CPUs which predate that specification.
 */
-#define MINMAX(s, nan_exp)                                              \
+#define MINMAX(s)                                                       \
 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
-                                        int ismin STATUS_PARAM )        \
+                                        int ismin, int isieee STATUS_PARAM) \
 {                                                                       \
    flag aSign, bSign;                                                  \
    uint ## s ## _t av, bv;                                             \
@@ -6716,6 +6723,15 @@ INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
    b = float ## s ## _squash_input_denormal(b STATUS_VAR);             \
    if (float ## s ## _is_any_nan(a) ||                                 \
        float ## s ## _is_any_nan(b)) {                                 \
+        if (isieee) {                                                   \
+            if (float ## s ## _is_quiet_nan(a) &&                       \
+                !float ## s ##_is_any_nan(b)) {                         \
+                return b;                                               \
+            } else if (float ## s ## _is_quiet_nan(b) &&                \
+                       !float ## s ## _is_any_nan(a)) {                 \
+                return a;                                               \
+            }                                                           \
+        }                                                               \
        return propagateFloat ## s ## NaN(a, b STATUS_VAR);             \
    }                                                                   \
    aSign = extractFloat ## s ## Sign(a);                               \
@@ -6739,16 +6755,26 @@ INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
                                                                        \
 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM)  \
 {                                                                       \
-    return float ## s ## _minmax(a, b, 1 STATUS_VAR);                   \
+    return float ## s ## _minmax(a, b, 1, 0 STATUS_VAR);                \
 }                                                                       \
                                                                        \
 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM)  \
 {                                                                       \
-    return float ## s ## _minmax(a, b, 0 STATUS_VAR);                   \
+    return float ## s ## _minmax(a, b, 0, 0 STATUS_VAR);                \
+}                                                                       \
+                                                                        \
+float ## s float ## s ## _minnum(float ## s a, float ## s b STATUS_PARAM) \
+{                                                                       \
+    return float ## s ## _minmax(a, b, 1, 1 STATUS_VAR);                \
+}                                                                       \
+                                                                        \
+float ## s float ## s ## _maxnum(float ## s a, float ## s b STATUS_PARAM) \
+{                                                                       \
+    return float ## s ## _minmax(a, b, 0, 1 STATUS_VAR);                \
 }

-MINMAX(32, 0xff)
-MINMAX(64, 0x7ff)
+MINMAX(32)
+MINMAX(64)


 /* Multiply A by 2 raised to the power N.  */
--- a/gdb-xml/aarch64-fpu.xml
+++ b/gdb-xml/aarch64-fpu.xml
@@ -0,0 +1,86 @@
+<?xml version="1.0"?>
+<!-- Copyright (C) 2009-2012 Free Software Foundation, Inc.
+     Contributed by ARM Ltd.
+
+     Copying and distribution of this file, with or without modification,
+     are permitted in any medium without royalty provided the copyright
+     notice and this notice are preserved.  -->
+
+<!DOCTYPE feature SYSTEM "gdb-target.dtd">
+<feature name="org.gnu.gdb.aarch64.fpu">
+  <vector id="v2d" type="ieee_double" count="2"/>
+  <vector id="v2u" type="uint64" count="2"/>
+  <vector id="v2i" type="int64" count="2"/>
+  <vector id="v4f" type="ieee_single" count="4"/>
+  <vector id="v4u" type="uint32" count="4"/>
+  <vector id="v4i" type="int32" count="4"/>
+  <vector id="v8u" type="uint16" count="8"/>
+  <vector id="v8i" type="int16" count="8"/>
+  <vector id="v16u" type="uint8" count="16"/>
+  <vector id="v16i" type="int8" count="16"/>
+  <vector id="v1u" type="uint128" count="1"/>
+  <vector id="v1i" type="int128" count="1"/>
+  <union id="vnd">
+    <field name="f" type="v2d"/>
+    <field name="u" type="v2u"/>
+    <field name="s" type="v2i"/>
+  </union>
+  <union id="vns">
+    <field name="f" type="v4f"/>
+    <field name="u" type="v4u"/>
+    <field name="s" type="v4i"/>
+  </union>
+  <union id="vnh">
+    <field name="u" type="v8u"/>
+    <field name="s" type="v8i"/>
+  </union>
+  <union id="vnb">
+    <field name="u" type="v16u"/>
+    <field name="s" type="v16i"/>
+  </union>
+  <union id="vnq">
+    <field name="u" type="v1u"/>
+    <field name="s" type="v1i"/>
+  </union>
+  <union id="aarch64v">
+    <field name="d" type="vnd"/>
+    <field name="s" type="vns"/>
+    <field name="h" type="vnh"/>
+    <field name="b" type="vnb"/>
+    <field name="q" type="vnq"/>
+  </union>
+  <reg name="v0" bitsize="128" type="aarch64v" regnum="34"/>
+  <reg name="v1" bitsize="128" type="aarch64v" />
+  <reg name="v2" bitsize="128" type="aarch64v" />
+  <reg name="v3" bitsize="128" type="aarch64v" />
+  <reg name="v4" bitsize="128" type="aarch64v" />
+  <reg name="v5" bitsize="128" type="aarch64v" />
+  <reg name="v6" bitsize="128" type="aarch64v" />
+  <reg name="v7" bitsize="128" type="aarch64v" />
+  <reg name="v8" bitsize="128" type="aarch64v" />
+  <reg name="v9" bitsize="128" type="aarch64v" />
+  <reg name="v10" bitsize="128" type="aarch64v"/>
+  <reg name="v11" bitsize="128" type="aarch64v"/>
+  <reg name="v12" bitsize="128" type="aarch64v"/>
+  <reg name="v13" bitsize="128" type="aarch64v"/>
+  <reg name="v14" bitsize="128" type="aarch64v"/>
+  <reg name="v15" bitsize="128" type="aarch64v"/>
+  <reg name="v16" bitsize="128" type="aarch64v"/>
+  <reg name="v17" bitsize="128" type="aarch64v"/>
+  <reg name="v18" bitsize="128" type="aarch64v"/>
+  <reg name="v19" bitsize="128" type="aarch64v"/>
+  <reg name="v20" bitsize="128" type="aarch64v"/>
+  <reg name="v21" bitsize="128" type="aarch64v"/>
+  <reg name="v22" bitsize="128" type="aarch64v"/>
+  <reg name="v23" bitsize="128" type="aarch64v"/>
+  <reg name="v24" bitsize="128" type="aarch64v"/>
+  <reg name="v25" bitsize="128" type="aarch64v"/>
+  <reg name="v26" bitsize="128" type="aarch64v"/>
+  <reg name="v27" bitsize="128" type="aarch64v"/>
+  <reg name="v28" bitsize="128" type="aarch64v"/>
+  <reg name="v29" bitsize="128" type="aarch64v"/>
+  <reg name="v30" bitsize="128" type="aarch64v"/>
+  <reg name="v31" bitsize="128" type="aarch64v"/>
+  <reg name="fpsr" bitsize="32"/>
+  <reg name="fpcr" bitsize="32"/>
+</feature>
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1190,7 +1190,7 @@ ETEXI
    {
        .name       = "host_net_add",
        .args_type  = "device:s,opts:s?",
-        .params     = "tap|user|socket|vde|dump [options]",
+        .params     = "tap|user|socket|vde|netmap|dump [options]",
        .help       = "add host VLAN client",
        .mhandler.cmd = net_host_device_add,
    },
@@ -1218,7 +1218,7 @@ ETEXI
    {
        .name       = "netdev_add",
        .args_type  = "netdev:O",
-        .params     = "[user|tap|socket|hubport],id=str[,prop=value][,...]",
+        .params     = "[user|tap|socket|hubport|netmap],id=str[,prop=value][,...]",
        .help       = "add host network device",
        .mhandler.cmd = hmp_netdev_add,
    },
--- a/hw/9pfs/virtio-9p-device.c
+++ b/hw/9pfs/virtio-9p-device.c
@@ -41,15 +41,16 @@ static void virtio_9p_get_config(VirtIODevice *vdev, uint8_t *config)
    g_free(cfg);
 }

-static int virtio_9p_device_init(VirtIODevice *vdev)
+static void virtio_9p_device_realize(DeviceState *dev, Error **errp)
 {
-    V9fsState *s = VIRTIO_9P(vdev);
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    V9fsState *s = VIRTIO_9P(dev);
    int i, len;
    struct stat stat;
    FsDriverEntry *fse;
    V9fsPath path;

-    virtio_init(VIRTIO_DEVICE(s), "virtio-9p", VIRTIO_ID_9P,
+    virtio_init(vdev, "virtio-9p", VIRTIO_ID_9P,
                sizeof(struct virtio_9p_config) + MAX_TAG_LEN);

    /* initialize pdu allocator */
@@ -67,16 +68,16 @@ static int virtio_9p_device_init(VirtIODevice *vdev)

    if (!fse) {
        /* We don't have a fsdev identified by fsdev_id */
-        fprintf(stderr, "Virtio-9p device couldn't find fsdev with the "
-                "id = %s\n",
-                s->fsconf.fsdev_id ? s->fsconf.fsdev_id : "NULL");
+        error_setg(errp, "Virtio-9p device couldn't find fsdev with the "
+                   "id = %s",
+                   s->fsconf.fsdev_id ? s->fsconf.fsdev_id : "NULL");
        goto out;
    }

    if (!s->fsconf.tag) {
        /* we haven't specified a mount_tag */
-        fprintf(stderr, "fsdev with id %s needs mount_tag arguments\n",
-                s->fsconf.fsdev_id);
+        error_setg(errp, "fsdev with id %s needs mount_tag arguments",
+                   s->fsconf.fsdev_id);
        goto out;
    }

@@ -85,8 +86,8 @@ static int virtio_9p_device_init(VirtIODevice *vdev)
    s->ctx.exops.get_st_gen = NULL;
    len = strlen(s->fsconf.tag);
    if (len > MAX_TAG_LEN - 1) {
-        fprintf(stderr, "mount tag '%s' (%d bytes) is longer than "
-                "maximum (%d bytes)", s->fsconf.tag, len, MAX_TAG_LEN - 1);
+        error_setg(errp, "mount tag '%s' (%d bytes) is longer than "
+                   "maximum (%d bytes)", s->fsconf.tag, len, MAX_TAG_LEN - 1);
        goto out;
    }

@@ -99,12 +100,12 @@ static int virtio_9p_device_init(VirtIODevice *vdev)
    qemu_co_rwlock_init(&s->rename_lock);

    if (s->ops->init(&s->ctx) < 0) {
-        fprintf(stderr, "Virtio-9p Failed to initialize fs-driver with id:%s"
-                " and export path:%s\n", s->fsconf.fsdev_id, s->ctx.fs_root);
+        error_setg(errp, "Virtio-9p Failed to initialize fs-driver with id:%s"
+                   " and export path:%s", s->fsconf.fsdev_id, s->ctx.fs_root);
        goto out;
    }
    if (v9fs_init_worker_threads() < 0) {
-        fprintf(stderr, "worker thread initialization failed\n");
+        error_setg(errp, "worker thread initialization failed");
        goto out;
    }

@@ -114,28 +115,25 @@ static int virtio_9p_device_init(VirtIODevice *vdev)
     * use co-routines here.
     */
    if (s->ops->name_to_path(&s->ctx, NULL, "/", &path) < 0) {
-        fprintf(stderr,
-                "error in converting name to path %s", strerror(errno));
+        error_setg(errp,
+                   "error in converting name to path %s", strerror(errno));
        goto out;
    }
    if (s->ops->lstat(&s->ctx, &path, &stat)) {
-        fprintf(stderr, "share path %s does not exist\n", fse->path);
+        error_setg(errp, "share path %s does not exist", fse->path);
        goto out;
    } else if (!S_ISDIR(stat.st_mode)) {
-        fprintf(stderr, "share path %s is not a directory\n", fse->path);
+        error_setg(errp, "share path %s is not a directory", fse->path);
        goto out;
    }
    v9fs_path_free(&path);

-    return 0;
+    return;
 out:
    g_free(s->ctx.fs_root);
    g_free(s->tag);
    virtio_cleanup(vdev);
    v9fs_path_free(&path);
-
-    return -1;
-
 }

 /* virtio-9p device */
@@ -149,9 +147,10 @@ static void virtio_9p_class_init(ObjectClass *klass, void *data)
 {
    DeviceClass *dc = DEVICE_CLASS(klass);
    VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
+
    dc->props = virtio_9p_properties;
    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
-    vdc->init = virtio_9p_device_init;
+    vdc->realize = virtio_9p_device_realize;
    vdc->get_features = virtio_9p_get_features;
    vdc->get_config = virtio_9p_get_config;
 }
--- a/hw/arm/Makefile.objs
+++ b/hw/arm/Makefile.objs
@@ -1,7 +1,10 @@
 obj-y += boot.o collie.o exynos4_boards.o gumstix.o highbank.o
+obj-$(CONFIG_DIGIC) += digic_boards.o
 obj-y += integratorcp.o kzm.o mainstone.o musicpal.o nseries.o
 obj-y += omap_sx1.o palm.o realview.o spitz.o stellaris.o
-obj-y += tosa.o versatilepb.o vexpress.o xilinx_zynq.o z2.o
+obj-y += tosa.o versatilepb.o vexpress.o virt.o xilinx_zynq.o z2.o

 obj-y += armv7m.o exynos4210.o pxa2xx.o pxa2xx_gpio.o pxa2xx_pic.o
+obj-$(CONFIG_DIGIC) += digic.o
 obj-y += omap1.o omap2.o strongarm.o
+obj-$(CONFIG_ALLWINNER_A10) += allwinner-a10.o cubieboard.o
--- a/hw/arm/allwinner-a10.c
+++ b/hw/arm/allwinner-a10.c
@@ -0,0 +1,103 @@
+/*
+ * Allwinner A10 SoC emulation
+ *
+ * Copyright (C) 2013 Li Guang
+ * Written by Li Guang <lig.fnst@cn.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ */
+
+#include "hw/sysbus.h"
+#include "hw/devices.h"
+#include "hw/arm/allwinner-a10.h"
+
+static void aw_a10_init(Object *obj)
+{
+    AwA10State *s = AW_A10(obj);
+
+    object_initialize(&s->cpu, sizeof(s->cpu), "cortex-a8-" TYPE_ARM_CPU);
+    object_property_add_child(obj, "cpu", OBJECT(&s->cpu), NULL);
+
+    object_initialize(&s->intc, sizeof(s->intc), TYPE_AW_A10_PIC);
+    qdev_set_parent_bus(DEVICE(&s->intc), sysbus_get_default());
+
+    object_initialize(&s->timer, sizeof(s->timer), TYPE_AW_A10_PIT);
+    qdev_set_parent_bus(DEVICE(&s->timer), sysbus_get_default());
+}
+
+static void aw_a10_realize(DeviceState *dev, Error **errp)
+{
+    AwA10State *s = AW_A10(dev);
+    SysBusDevice *sysbusdev;
+    uint8_t i;
+    qemu_irq fiq, irq;
+    Error *err = NULL;
+
+    object_property_set_bool(OBJECT(&s->cpu), true, "realized", &err);
+    if (err != NULL) {
+        error_propagate(errp, err);
+        return;
+    }
+    irq = qdev_get_gpio_in(DEVICE(&s->cpu), ARM_CPU_IRQ);
+    fiq = qdev_get_gpio_in(DEVICE(&s->cpu), ARM_CPU_FIQ);
+
+    object_property_set_bool(OBJECT(&s->intc), true, "realized", &err);
+    if (err != NULL) {
+        error_propagate(errp, err);
+        return;
+    }
+    sysbusdev = SYS_BUS_DEVICE(&s->intc);
+    sysbus_mmio_map(sysbusdev, 0, AW_A10_PIC_REG_BASE);
+    sysbus_connect_irq(sysbusdev, 0, irq);
+    sysbus_connect_irq(sysbusdev, 1, fiq);
+    for (i = 0; i < AW_A10_PIC_INT_NR; i++) {
+        s->irq[i] = qdev_get_gpio_in(DEVICE(&s->intc), i);
+    }
+
+    object_property_set_bool(OBJECT(&s->timer), true, "realized", &err);
+    if (err != NULL) {
+        error_propagate(errp, err);
+        return;
+    }
+    sysbusdev = SYS_BUS_DEVICE(&s->timer);
+    sysbus_mmio_map(sysbusdev, 0, AW_A10_PIT_REG_BASE);
+    sysbus_connect_irq(sysbusdev, 0, s->irq[22]);
+    sysbus_connect_irq(sysbusdev, 1, s->irq[23]);
+    sysbus_connect_irq(sysbusdev, 2, s->irq[24]);
+    sysbus_connect_irq(sysbusdev, 3, s->irq[25]);
+    sysbus_connect_irq(sysbusdev, 4, s->irq[67]);
+    sysbus_connect_irq(sysbusdev, 5, s->irq[68]);
+
+    serial_mm_init(get_system_memory(), AW_A10_UART0_REG_BASE, 2, s->irq[1],
+                   115200, serial_hds[0], DEVICE_NATIVE_ENDIAN);
+}
+
+static void aw_a10_class_init(ObjectClass *oc, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(oc);
+
+    dc->realize = aw_a10_realize;
+}
+
+static const TypeInfo aw_a10_type_info = {
+    .name = TYPE_AW_A10,
+    .parent = TYPE_DEVICE,
+    .instance_size = sizeof(AwA10State),
+    .instance_init = aw_a10_init,
+    .class_init = aw_a10_class_init,
+};
+
+static void aw_a10_register_types(void)
+{
+    type_register_static(&aw_a10_type_info);
+}
+
+type_init(aw_a10_register_types)
--- a/hw/arm/boot.c
+++ b/hw/arm/boot.c
@@ -17,18 +17,55 @@
 #include "sysemu/device_tree.h"
 #include "qemu/config-file.h"

+/* Kernel boot protocol is specified in the kernel docs
+ * Documentation/arm/Booting and Documentation/arm64/booting.txt
+ * They have different preferred image load offsets from system RAM base.
+ */
 #define KERNEL_ARGS_ADDR 0x100
 #define KERNEL_LOAD_ADDR 0x00010000
+#define KERNEL64_LOAD_ADDR 0x00080000
+
+typedef enum {
+    FIXUP_NONE = 0,   /* do nothing */
+    FIXUP_TERMINATOR, /* end of insns */
+    FIXUP_BOARDID,    /* overwrite with board ID number */
+    FIXUP_ARGPTR,     /* overwrite with pointer to kernel args */
+    FIXUP_ENTRYPOINT, /* overwrite with kernel entry point */
+    FIXUP_GIC_CPU_IF, /* overwrite with GIC CPU interface address */
+    FIXUP_BOOTREG,    /* overwrite with boot register address */
+    FIXUP_DSB,        /* overwrite with correct DSB insn for cpu */
+    FIXUP_MAX,
+} FixupType;
+
+typedef struct ARMInsnFixup {
+    uint32_t insn;
+    FixupType fixup;
+} ARMInsnFixup;
+
+static const ARMInsnFixup bootloader_aarch64[] = {
+    { 0x580000c0 }, /* ldr x0, arg ; Load the lower 32-bits of DTB */
+    { 0xaa1f03e1 }, /* mov x1, xzr */
+    { 0xaa1f03e2 }, /* mov x2, xzr */
+    { 0xaa1f03e3 }, /* mov x3, xzr */
+    { 0x58000084 }, /* ldr x4, entry ; Load the lower 32-bits of kernel entry */
+    { 0xd61f0080 }, /* br x4      ; Jump to the kernel entry point */
+    { 0, FIXUP_ARGPTR }, /* arg: .word @DTB Lower 32-bits */
+    { 0 }, /* .word @DTB Higher 32-bits */
+    { 0, FIXUP_ENTRYPOINT }, /* entry: .word @Kernel Entry Lower 32-bits */
+    { 0 }, /* .word @Kernel Entry Higher 32-bits */
+    { 0, FIXUP_TERMINATOR }
+};

 /* The worlds second smallest bootloader.  Set r0-r2, then jump to kernel.  */
-static uint32_t bootloader[] = {
-  0xe3a00000, /* mov     r0, #0 */
-  0xe59f1004, /* ldr     r1, [pc, #4] */
-  0xe59f2004, /* ldr     r2, [pc, #4] */
-  0xe59ff004, /* ldr     pc, [pc, #4] */
-  0, /* Board ID */
-  0, /* Address of kernel args.  Set by integratorcp_init.  */
-  0  /* Kernel entry point.  Set by integratorcp_init.  */
+static const ARMInsnFixup bootloader[] = {
+    { 0xe3a00000 }, /* mov     r0, #0 */
+    { 0xe59f1004 }, /* ldr     r1, [pc, #4] */
+    { 0xe59f2004 }, /* ldr     r2, [pc, #4] */
+    { 0xe59ff004 }, /* ldr     pc, [pc, #4] */
+    { 0, FIXUP_BOARDID },
+    { 0, FIXUP_ARGPTR },
+    { 0, FIXUP_ENTRYPOINT },
+    { 0, FIXUP_TERMINATOR }
 };

 /* Handling for secondary CPU boot in a multicore system.
@@ -48,39 +85,83 @@ static uint32_t bootloader[] = {
 #define DSB_INSN 0xf57ff04f
 #define CP15_DSB_INSN 0xee070f9a /* mcr cp15, 0, r0, c7, c10, 4 */

-static uint32_t smpboot[] = {
-  0xe59f2028, /* ldr r2, gic_cpu_if */
-  0xe59f0028, /* ldr r0, startaddr */
-  0xe3a01001, /* mov r1, #1 */
-  0xe5821000, /* str r1, [r2] - set GICC_CTLR.Enable */
-  0xe3a010ff, /* mov r1, #0xff */
-  0xe5821004, /* str r1, [r2, 4] - set GIC_PMR.Priority to 0xff */
-  DSB_INSN,   /* dsb */
-  0xe320f003, /* wfi */
-  0xe5901000, /* ldr     r1, [r0] */
-  0xe1110001, /* tst     r1, r1 */
-  0x0afffffb, /* beq     <wfi> */
-  0xe12fff11, /* bx      r1 */
-  0,          /* gic_cpu_if: base address of GIC CPU interface */
-  0           /* bootreg: Boot register address is held here */
+static const ARMInsnFixup smpboot[] = {
+    { 0xe59f2028 }, /* ldr r2, gic_cpu_if */
+    { 0xe59f0028 }, /* ldr r0, bootreg_addr */
+    { 0xe3a01001 }, /* mov r1, #1 */
+    { 0xe5821000 }, /* str r1, [r2] - set GICC_CTLR.Enable */
+    { 0xe3a010ff }, /* mov r1, #0xff */
+    { 0xe5821004 }, /* str r1, [r2, 4] - set GIC_PMR.Priority to 0xff */
+    { 0, FIXUP_DSB },   /* dsb */
+    { 0xe320f003 }, /* wfi */
+    { 0xe5901000 }, /* ldr     r1, [r0] */
+    { 0xe1110001 }, /* tst     r1, r1 */
+    { 0x0afffffb }, /* beq     <wfi> */
+    { 0xe12fff11 }, /* bx      r1 */
+    { 0, FIXUP_GIC_CPU_IF }, /* gic_cpu_if: .word 0x.... */
+    { 0, FIXUP_BOOTREG }, /* bootreg_addr: .word 0x.... */
+    { 0, FIXUP_TERMINATOR }
 };

+static void write_bootloader(const char *name, hwaddr addr,
+                             const ARMInsnFixup *insns, uint32_t *fixupcontext)
+{
+    /* Fix up the specified bootloader fragment and write it into
+     * guest memory using rom_add_blob_fixed(). fixupcontext is
+     * an array giving the values to write in for the fixup types
+     * which write a value into the code array.
+     */
+    int i, len;
+    uint32_t *code;
+
+    len = 0;
+    while (insns[len].fixup != FIXUP_TERMINATOR) {
+        len++;
+    }
+
+    code = g_new0(uint32_t, len);
+
+    for (i = 0; i < len; i++) {
+        uint32_t insn = insns[i].insn;
+        FixupType fixup = insns[i].fixup;
+
+        switch (fixup) {
+        case FIXUP_NONE:
+            break;
+        case FIXUP_BOARDID:
+        case FIXUP_ARGPTR:
+        case FIXUP_ENTRYPOINT:
+        case FIXUP_GIC_CPU_IF:
+        case FIXUP_BOOTREG:
+        case FIXUP_DSB:
+            insn = fixupcontext[fixup];
+            break;
+        default:
+            abort();
+        }
+        code[i] = tswap32(insn);
+    }
+
+    rom_add_blob_fixed(name, code, len * sizeof(uint32_t), addr);
+
+    g_free(code);
+}
+
 static void default_write_secondary(ARMCPU *cpu,
                                    const struct arm_boot_info *info)
 {
-    int n;
-    smpboot[ARRAY_SIZE(smpboot) - 1] = info->smp_bootreg_addr;
-    smpboot[ARRAY_SIZE(smpboot) - 2] = info->gic_cpu_if_addr;
-    for (n = 0; n < ARRAY_SIZE(smpboot); n++) {
-        /* Replace DSB with the pre-v7 DSB if necessary. */
-        if (!arm_feature(&cpu->env, ARM_FEATURE_V7) &&
-            smpboot[n] == DSB_INSN) {
-            smpboot[n] = CP15_DSB_INSN;
-        }
-        smpboot[n] = tswap32(smpboot[n]);
+    uint32_t fixupcontext[FIXUP_MAX];
+
+    fixupcontext[FIXUP_GIC_CPU_IF] = info->gic_cpu_if_addr;
+    fixupcontext[FIXUP_BOOTREG] = info->smp_bootreg_addr;
+    if (arm_feature(&cpu->env, ARM_FEATURE_V7)) {
+        fixupcontext[FIXUP_DSB] = DSB_INSN;
+    } else {
+        fixupcontext[FIXUP_DSB] = CP15_DSB_INSN;
    }
-    rom_add_blob_fixed("smpboot", smpboot, sizeof(smpboot),
-                       info->smp_loader_start);
+
+    write_bootloader("smpboot", info->smp_loader_start,
+                     smpboot, fixupcontext);
 }

 static void default_reset_secondary(ARMCPU *cpu,
@@ -228,26 +309,34 @@ static void set_kernel_args_old(const struct arm_boot_info *info)
 static int load_dtb(hwaddr addr, const struct arm_boot_info *binfo)
 {
    void *fdt = NULL;
-    char *filename;
    int size, rc;
    uint32_t acells, scells;

-    filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, binfo->dtb_filename);
-    if (!filename) {
-        fprintf(stderr, "Couldn't open dtb file %s\n", binfo->dtb_filename);
-        goto fail;
-    }
+    if (binfo->dtb_filename) {
+        char *filename;
+        filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, binfo->dtb_filename);
+        if (!filename) {
+            fprintf(stderr, "Couldn't open dtb file %s\n", binfo->dtb_filename);
+            goto fail;
+        }

-    fdt = load_device_tree(filename, &size);
-    if (!fdt) {
-        fprintf(stderr, "Couldn't open dtb file %s\n", filename);
+        fdt = load_device_tree(filename, &size);
+        if (!fdt) {
+            fprintf(stderr, "Couldn't open dtb file %s\n", filename);
+            g_free(filename);
+            goto fail;
+        }
        g_free(filename);
-        goto fail;
+    } else if (binfo->get_dtb) {
+        fdt = binfo->get_dtb(binfo, &size);
+        if (!fdt) {
+            fprintf(stderr, "Board was unable to create a dtb blob\n");
+            goto fail;
+        }
    }
-    g_free(filename);

-    acells = qemu_devtree_getprop_cell(fdt, "/", "#address-cells");
-    scells = qemu_devtree_getprop_cell(fdt, "/", "#size-cells");
+    acells = qemu_fdt_getprop_cell(fdt, "/", "#address-cells");
+    scells = qemu_fdt_getprop_cell(fdt, "/", "#size-cells");
    if (acells == 0 || scells == 0) {
        fprintf(stderr, "dtb file invalid (#address-cells or #size-cells 0)\n");
        goto fail;
@@ -262,17 +351,17 @@ static int load_dtb(hwaddr addr, const struct arm_boot_info *binfo)
        goto fail;
    }

-    rc = qemu_devtree_setprop_sized_cells(fdt, "/memory", "reg",
-                                          acells, binfo->loader_start,
-                                          scells, binfo->ram_size);
+    rc = qemu_fdt_setprop_sized_cells(fdt, "/memory", "reg",
+                                      acells, binfo->loader_start,
+                                      scells, binfo->ram_size);
    if (rc < 0) {
        fprintf(stderr, "couldn't set /memory/reg\n");
        goto fail;
    }

    if (binfo->kernel_cmdline && *binfo->kernel_cmdline) {
-        rc = qemu_devtree_setprop_string(fdt, "/chosen", "bootargs",
-                                          binfo->kernel_cmdline);
+        rc = qemu_fdt_setprop_string(fdt, "/chosen", "bootargs",
+                                     binfo->kernel_cmdline);
        if (rc < 0) {
            fprintf(stderr, "couldn't set /chosen/bootargs\n");
            goto fail;
@@ -280,15 +369,15 @@ static int load_dtb(hwaddr addr, const struct arm_boot_info *binfo)
    }

    if (binfo->initrd_size) {
-        rc = qemu_devtree_setprop_cell(fdt, "/chosen", "linux,initrd-start",
-                binfo->initrd_start);
+        rc = qemu_fdt_setprop_cell(fdt, "/chosen", "linux,initrd-start",
+                                   binfo->initrd_start);
        if (rc < 0) {
            fprintf(stderr, "couldn't set /chosen/linux,initrd-start\n");
            goto fail;
        }

-        rc = qemu_devtree_setprop_cell(fdt, "/chosen", "linux,initrd-end",
-                    binfo->initrd_start + binfo->initrd_size);
+        rc = qemu_fdt_setprop_cell(fdt, "/chosen", "linux,initrd-end",
+                                   binfo->initrd_start + binfo->initrd_size);
        if (rc < 0) {
            fprintf(stderr, "couldn't set /chosen/linux,initrd-end\n");
            goto fail;
@@ -299,7 +388,7 @@ static int load_dtb(hwaddr addr, const struct arm_boot_info *binfo)
        binfo->modify_dtb(binfo, fdt);
    }

-    qemu_devtree_dumpdtb(fdt, size);
+    qemu_fdt_dumpdtb(fdt, size);

    cpu_physical_memory_write(addr, fdt, size);

@@ -326,7 +415,12 @@ static void do_cpu_reset(void *opaque)
            env->thumb = info->entry & 1;
        } else {
            if (CPU(cpu) == first_cpu) {
-                env->regs[15] = info->loader_start;
+                if (env->aarch64) {
+                    env->pc = info->loader_start;
+                } else {
+                    env->regs[15] = info->loader_start;
+                }
+
                if (!info->dtb_filename) {
                    if (old_param) {
                        set_kernel_args_old(info);
@@ -346,11 +440,11 @@ void arm_load_kernel(ARMCPU *cpu, struct arm_boot_info *info)
    CPUState *cs = CPU(cpu);
    int kernel_size;
    int initrd_size;
-    int n;
    int is_linux = 0;
    uint64_t elf_entry;
-    hwaddr entry;
+    hwaddr entry, kernel_load_offset;
    int big_endian;
+    static const ARMInsnFixup *primary_loader;

    /* Load the kernel.  */
    if (!info->kernel_filename) {
@@ -360,6 +454,14 @@ void arm_load_kernel(ARMCPU *cpu, struct arm_boot_info *info)
        return;
    }

+    if (arm_feature(&cpu->env, ARM_FEATURE_AARCH64)) {
+        primary_loader = bootloader_aarch64;
+        kernel_load_offset = KERNEL64_LOAD_ADDR;
+    } else {
+        primary_loader = bootloader;
+        kernel_load_offset = KERNEL_LOAD_ADDR;
+    }
+
    info->dtb_filename = qemu_opt_get(qemu_get_machine_opts(), "dtb");

    if (!info->secondary_cpu_reset_hook) {
@@ -400,9 +502,9 @@ void arm_load_kernel(ARMCPU *cpu, struct arm_boot_info *info)
                                  &is_linux);
    }
    if (kernel_size < 0) {
-        entry = info->loader_start + KERNEL_LOAD_ADDR;
+        entry = info->loader_start + kernel_load_offset;
        kernel_size = load_image_targphys(info->kernel_filename, entry,
-                                          info->ram_size - KERNEL_LOAD_ADDR);
+                                          info->ram_size - kernel_load_offset);
        is_linux = 1;
    }
    if (kernel_size < 0) {
@@ -412,6 +514,8 @@ void arm_load_kernel(ARMCPU *cpu, struct arm_boot_info *info)
    }
    info->entry = entry;
    if (is_linux) {
+        uint32_t fixupcontext[FIXUP_MAX];
+
        if (info->initrd_filename) {
            initrd_size = load_ramdisk(info->initrd_filename,
                                       info->initrd_start,
@@ -433,12 +537,12 @@ void arm_load_kernel(ARMCPU *cpu, struct arm_boot_info *info)
        }
        info->initrd_size = initrd_size;

-        bootloader[4] = info->board_id;
+        fixupcontext[FIXUP_BOARDID] = info->board_id;

        /* for device tree boot, we pass the DTB directly in r2. Otherwise
         * we point to the kernel args.
         */
-        if (info->dtb_filename) {
+        if (info->dtb_filename || info->get_dtb) {
            /* Place the DTB after the initrd in memory. Note that some
             * kernels will trash anything in the 4K page the initrd
             * ends in, so make sure the DTB isn't caught up in that.
@@ -448,9 +552,9 @@ void arm_load_kernel(ARMCPU *cpu, struct arm_boot_info *info)
            if (load_dtb(dtb_start, info)) {
                exit(1);
            }
-            bootloader[5] = dtb_start;
+            fixupcontext[FIXUP_ARGPTR] = dtb_start;
        } else {
-            bootloader[5] = info->loader_start + KERNEL_ARGS_ADDR;
+            fixupcontext[FIXUP_ARGPTR] = info->loader_start + KERNEL_ARGS_ADDR;
            if (info->ram_size >= (1ULL << 32)) {
                fprintf(stderr, "qemu: RAM size must be less than 4GB to boot"
                        " Linux kernel using ATAGS (try passing a device tree"
@@ -458,12 +562,11 @@ void arm_load_kernel(ARMCPU *cpu, struct arm_boot_info *info)
                exit(1);
            }
        }
-        bootloader[6] = entry;
-        for (n = 0; n < sizeof(bootloader) / 4; n++) {
-            bootloader[n] = tswap32(bootloader[n]);
-        }
-        rom_add_blob_fixed("bootloader", bootloader, sizeof(bootloader),
-                           info->loader_start);
+        fixupcontext[FIXUP_ENTRYPOINT] = entry;
+
+        write_bootloader("bootloader", info->loader_start,
+                         primary_loader, fixupcontext);
+
        if (info->nb_cpus > 1) {
            info->write_secondary_boot(cpu, info);
        }
--- a/hw/arm/cubieboard.c
+++ b/hw/arm/cubieboard.c
@@ -0,0 +1,69 @@
+/*
+ * cubieboard emulation
+ *
+ * Copyright (C) 2013 Li Guang
+ * Written by Li Guang <lig.fnst@cn.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ */
+
+#include "hw/sysbus.h"
+#include "hw/devices.h"
+#include "hw/boards.h"
+#include "hw/arm/allwinner-a10.h"
+
+static struct arm_boot_info cubieboard_binfo = {
+    .loader_start = AW_A10_SDRAM_BASE,
+    .board_id = 0x1008,
+};
+
+typedef struct CubieBoardState {
+    AwA10State *a10;
+    MemoryRegion sdram;
+} CubieBoardState;
+
+static void cubieboard_init(QEMUMachineInitArgs *args)
+{
+    CubieBoardState *s = g_new(CubieBoardState, 1);
+    Error *err = NULL;
+
+    s->a10 = AW_A10(object_new(TYPE_AW_A10));
+    object_property_set_bool(OBJECT(s->a10), true, "realized", &err);
+    if (err != NULL) {
+        error_report("Couldn't realize Allwinner A10: %s\n",
+                error_get_pretty(err));
+        exit(1);
+    }
+
+    memory_region_init_ram(&s->sdram, NULL, "cubieboard.ram", args->ram_size);
+    vmstate_register_ram_global(&s->sdram);
+    memory_region_add_subregion(get_system_memory(), AW_A10_SDRAM_BASE,
+                                &s->sdram);
+
+    cubieboard_binfo.ram_size = args->ram_size;
+    cubieboard_binfo.kernel_filename = args->kernel_filename;
+    cubieboard_binfo.kernel_cmdline = args->kernel_cmdline;
+    arm_load_kernel(&s->a10->cpu, &cubieboard_binfo);
+}
+
+static QEMUMachine cubieboard_machine = {
+    .name = "cubieboard",
+    .desc = "cubietech cubieboard",
+    .init = cubieboard_init,
+};
+
+
+static void cubieboard_machine_init(void)
+{
+    qemu_register_machine(&cubieboard_machine);
+}
+
+machine_init(cubieboard_machine_init)
--- a/hw/arm/digic.c
+++ b/hw/arm/digic.c
@@ -0,0 +1,115 @@
+/*
+ * QEMU model of the Canon DIGIC SoC.
+ *
+ * Copyright (C) 2013 Antony Pavlov <antonynpavlov@gmail.com>
+ *
+ * This model is based on reverse engineering efforts
+ * made by CHDK (http://chdk.wikia.com) and
+ * Magic Lantern (http://www.magiclantern.fm) projects
+ * contributors.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include "hw/arm/digic.h"
+
+#define DIGIC4_TIMER_BASE(n)    (0xc0210000 + (n) * 0x100)
+
+#define DIGIC_UART_BASE          0xc0800000
+
+static void digic_init(Object *obj)
+{
+    DigicState *s = DIGIC(obj);
+    DeviceState *dev;
+    int i;
+
+    object_initialize(&s->cpu, sizeof(s->cpu), "arm946-" TYPE_ARM_CPU);
+    object_property_add_child(obj, "cpu", OBJECT(&s->cpu), NULL);
+
+    for (i = 0; i < DIGIC4_NB_TIMERS; i++) {
+#define DIGIC_TIMER_NAME_MLEN    11
+        char name[DIGIC_TIMER_NAME_MLEN];
+
+        object_initialize(&s->timer[i], sizeof(s->timer[i]), TYPE_DIGIC_TIMER);
+        dev = DEVICE(&s->timer[i]);
+        qdev_set_parent_bus(dev, sysbus_get_default());
+        snprintf(name, DIGIC_TIMER_NAME_MLEN, "timer[%d]", i);
+        object_property_add_child(obj, name, OBJECT(&s->timer[i]), NULL);
+    }
+
+    object_initialize(&s->uart, sizeof(s->uart), TYPE_DIGIC_UART);
+    dev = DEVICE(&s->uart);
+    qdev_set_parent_bus(dev, sysbus_get_default());
+    object_property_add_child(obj, "uart", OBJECT(&s->uart), NULL);
+}
+
+static void digic_realize(DeviceState *dev, Error **errp)
+{
+    DigicState *s = DIGIC(dev);
+    Error *err = NULL;
+    SysBusDevice *sbd;
+    int i;
+
+    object_property_set_bool(OBJECT(&s->cpu), true, "reset-hivecs", &err);
+    if (err != NULL) {
+        error_propagate(errp, err);
+        return;
+    }
+
+    object_property_set_bool(OBJECT(&s->cpu), true, "realized", &err);
+    if (err != NULL) {
+        error_propagate(errp, err);
+        return;
+    }
+
+    for (i = 0; i < DIGIC4_NB_TIMERS; i++) {
+        object_property_set_bool(OBJECT(&s->timer[i]), true, "realized", &err);
+        if (err != NULL) {
+            error_propagate(errp, err);
+            return;
+        }
+
+        sbd = SYS_BUS_DEVICE(&s->timer[i]);
+        sysbus_mmio_map(sbd, 0, DIGIC4_TIMER_BASE(i));
+    }
+
+    object_property_set_bool(OBJECT(&s->uart), true, "realized", &err);
+    if (err != NULL) {
+        error_propagate(errp, err);
+        return;
+    }
+
+    sbd = SYS_BUS_DEVICE(&s->uart);
+    sysbus_mmio_map(sbd, 0, DIGIC_UART_BASE);
+}
+
+static void digic_class_init(ObjectClass *oc, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(oc);
+
+    dc->realize = digic_realize;
+}
+
+static const TypeInfo digic_type_info = {
+    .name = TYPE_DIGIC,
+    .parent = TYPE_DEVICE,
+    .instance_size = sizeof(DigicState),
+    .instance_init = digic_init,
+    .class_init = digic_class_init,
+};
+
+static void digic_register_types(void)
+{
+    type_register_static(&digic_type_info);
+}
+
+type_init(digic_register_types)
--- a/hw/arm/digic_boards.c
+++ b/hw/arm/digic_boards.c
@@ -0,0 +1,162 @@
+/*
+ * QEMU model of the Canon DIGIC boards (cameras indeed :).
+ *
+ * Copyright (C) 2013 Antony Pavlov <antonynpavlov@gmail.com>
+ *
+ * This model is based on reverse engineering efforts
+ * made by CHDK (http://chdk.wikia.com) and
+ * Magic Lantern (http://www.magiclantern.fm) projects
+ * contributors.
+ *
+ * See docs here:
+ *   http://magiclantern.wikia.com/wiki/Register_Map
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include "hw/boards.h"
+#include "exec/address-spaces.h"
+#include "qemu/error-report.h"
+#include "hw/arm/digic.h"
+#include "hw/block/flash.h"
+#include "hw/loader.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/qtest.h"
+
+#define DIGIC4_ROM0_BASE      0xf0000000
+#define DIGIC4_ROM1_BASE      0xf8000000
+#define DIGIC4_ROM_MAX_SIZE   0x08000000
+
+typedef struct DigicBoardState {
+    DigicState *digic;
+    MemoryRegion ram;
+} DigicBoardState;
+
+typedef struct DigicBoard {
+    hwaddr ram_size;
+    void (*add_rom0)(DigicBoardState *, hwaddr, const char *);
+    const char *rom0_def_filename;
+    void (*add_rom1)(DigicBoardState *, hwaddr, const char *);
+    const char *rom1_def_filename;
+} DigicBoard;
+
+static void digic4_board_setup_ram(DigicBoardState *s, hwaddr ram_size)
+{
+    memory_region_init_ram(&s->ram, NULL, "ram", ram_size);
+    memory_region_add_subregion(get_system_memory(), 0, &s->ram);
+    vmstate_register_ram_global(&s->ram);
+}
+
+static void digic4_board_init(DigicBoard *board)
+{
+    Error *err = NULL;
+
+    DigicBoardState *s = g_new(DigicBoardState, 1);
+
+    s->digic = DIGIC(object_new(TYPE_DIGIC));
+    object_property_set_bool(OBJECT(s->digic), true, "realized", &err);
+    if (err != NULL) {
+        error_report("Couldn't realize DIGIC SoC: %s\n",
+                     error_get_pretty(err));
+        exit(1);
+    }
+
+    digic4_board_setup_ram(s, board->ram_size);
+
+    if (board->add_rom0) {
+        board->add_rom0(s, DIGIC4_ROM0_BASE, board->rom0_def_filename);
+    }
+
+    if (board->add_rom1) {
+        board->add_rom1(s, DIGIC4_ROM1_BASE, board->rom1_def_filename);
+    }
+}
+
+static void digic_load_rom(DigicBoardState *s, hwaddr addr,
+                           hwaddr max_size, const char *def_filename)
+{
+    target_long rom_size;
+    const char *filename;
+
+    if (qtest_enabled()) {
+        /* qtest runs no code so don't attempt a ROM load which
+         * could fail and result in a spurious test failure.
+         */
+        return;
+    }
+
+    if (bios_name) {
+        filename = bios_name;
+    } else {
+        filename = def_filename;
+    }
+
+    if (filename) {
+        char *fn = qemu_find_file(QEMU_FILE_TYPE_BIOS, filename);
+
+        if (!fn) {
+            error_report("Couldn't find rom image '%s'.\n", filename);
+            exit(1);
+        }
+
+        rom_size = load_image_targphys(fn, addr, max_size);
+        if (rom_size < 0 || rom_size > max_size) {
+            error_report("Couldn't load rom image '%s'.\n", filename);
+            exit(1);
+        }
+    }
+}
+
+/*
+ * Samsung K8P3215UQB
+ * 64M Bit (4Mx16) Page Mode / Multi-Bank NOR Flash Memory
+ */
+static void digic4_add_k8p3215uqb_rom(DigicBoardState *s, hwaddr addr,
+                                      const char *def_filename)
+{
+#define FLASH_K8P3215UQB_SIZE (4 * 1024 * 1024)
+#define FLASH_K8P3215UQB_SECTOR_SIZE (64 * 1024)
+
+    pflash_cfi02_register(addr, NULL, "pflash", FLASH_K8P3215UQB_SIZE,
+                          NULL, FLASH_K8P3215UQB_SECTOR_SIZE,
+                          FLASH_K8P3215UQB_SIZE / FLASH_K8P3215UQB_SECTOR_SIZE,
+                          DIGIC4_ROM_MAX_SIZE / FLASH_K8P3215UQB_SIZE,
+                          4,
+                          0x00EC, 0x007E, 0x0003, 0x0001,
+                          0x0555, 0x2aa, 0);
+
+    digic_load_rom(s, addr, FLASH_K8P3215UQB_SIZE, def_filename);
+}
+
+static DigicBoard digic4_board_canon_a1100 = {
+    .ram_size = 64 * 1024 * 1024,
+    .add_rom1 = digic4_add_k8p3215uqb_rom,
+    .rom1_def_filename = "canon-a1100-rom1.bin",
+};
+
+static void canon_a1100_init(QEMUMachineInitArgs *args)
+{
+    digic4_board_init(&digic4_board_canon_a1100);
+}
+
+static QEMUMachine canon_a1100 = {
+    .name = "canon-a1100",
+    .desc = "Canon PowerShot A1100 IS",
+    .init = &canon_a1100_init,
+};
+
+static void digic_register_machines(void)
+{
+    qemu_register_machine(&canon_a1100);
+}
+
+machine_init(digic_register_machines)
--- a/hw/arm/highbank.c
+++ b/hw/arm/highbank.c
@@ -26,12 +26,13 @@
 #include "hw/boards.h"
 #include "sysemu/blockdev.h"
 #include "exec/address-spaces.h"
+#include "qemu/error-report.h"

-#define SMP_BOOT_ADDR 0x100
-#define SMP_BOOT_REG  0x40
-#define GIC_BASE_ADDR 0xfff10000
+#define SMP_BOOT_ADDR           0x100
+#define SMP_BOOT_REG            0x40
+#define MPCORE_PERIPHBASE       0xfff10000

-#define NIRQ_GIC      160
+#define NIRQ_GIC                160

 /* Board init.  */

@@ -54,7 +55,7 @@ static void hb_write_secondary(ARMCPU *cpu, const struct arm_boot_info *info)
        0xe1110001, /* tst     r1, r1 */
        0x0afffffb, /* beq     <wfi> */
        0xe12fff11, /* bx      r1 */
-        GIC_BASE_ADDR      /* privbase: gic address.  */
+        MPCORE_PERIPHBASE   /* privbase: MPCore peripheral base address.  */
    };
    for (n = 0; n < ARRAY_SIZE(smpboot); n++) {
        smpboot[n] = tswap32(smpboot[n]);
@@ -229,15 +230,23 @@ static void calxeda_init(QEMUMachineInitArgs *args, enum cxmachines machine)
    }

    for (n = 0; n < smp_cpus; n++) {
+        ObjectClass *oc = cpu_class_by_name(TYPE_ARM_CPU, cpu_model);
        ARMCPU *cpu;
-        cpu = cpu_arm_init(cpu_model);
-        if (cpu == NULL) {
-            fprintf(stderr, "Unable to find CPU definition\n");
+        Error *err = NULL;
+
+        cpu = ARM_CPU(object_new(object_class_get_name(oc)));
+
+        object_property_set_int(OBJECT(cpu), MPCORE_PERIPHBASE, "reset-cbar",
+                                &err);
+        if (err) {
+            error_report("%s", error_get_pretty(err));
+            exit(1);
+        }
+        object_property_set_bool(OBJECT(cpu), true, "realized", &err);
+        if (err) {
+            error_report("%s", error_get_pretty(err));
            exit(1);
        }
-
-        /* This will become a QOM property eventually */
-        cpu->reset_cbar = GIC_BASE_ADDR;
        cpu_irq[n] = qdev_get_gpio_in(DEVICE(cpu), ARM_CPU_IRQ);
    }

@@ -279,7 +288,7 @@ static void calxeda_init(QEMUMachineInitArgs *args, enum cxmachines machine)
    qdev_prop_set_uint32(dev, "num-irq", NIRQ_GIC);
    qdev_init_nofail(dev);
    busdev = SYS_BUS_DEVICE(dev);
-    sysbus_mmio_map(busdev, 0, GIC_BASE_ADDR);
+    sysbus_mmio_map(busdev, 0, MPCORE_PERIPHBASE);
    for (n = 0; n < smp_cpus; n++) {
        sysbus_connect_irq(busdev, n, cpu_irq[n]);
    }
--- a/hw/arm/integratorcp.c
+++ b/hw/arm/integratorcp.c
@@ -36,6 +36,7 @@ typedef struct IntegratorCMState {
    uint32_t cm_init;
    uint32_t cm_flags;
    uint32_t cm_nvflags;
+    uint32_t cm_refcnt_offset;
    uint32_t int_level;
    uint32_t irq_enabled;
    uint32_t fiq_enabled;
@@ -82,9 +83,13 @@ static uint64_t integratorcm_read(void *opaque, hwaddr offset,
        return s->cm_sdram;
    case 9: /* CM_INIT */
        return s->cm_init;
-    case 10: /* CM_REFCT */
-        /* ??? High frequency timer.  */
-        hw_error("integratorcm_read: CM_REFCT");
+    case 10: /* CM_REFCNT */
+        /* This register, CM_REFCNT, provides a 32-bit count value.
+         * The count increments at the fixed reference clock frequency of 24MHz
+         * and can be used as a real-time counter.
+         */
+        return (uint32_t)muldiv64(qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL), 24,
+                                  1000) - s->cm_refcnt_offset;
    case 12: /* CM_FLAGS */
        return s->cm_flags;
    case 14: /* CM_NVFLAGS */
@@ -257,6 +262,8 @@ static int integratorcm_init(SysBusDevice *dev)
    }
    memcpy(integrator_spd + 73, "QEMU-MEMORY", 11);
    s->cm_init = 0x00000112;
+    s->cm_refcnt_offset = muldiv64(qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL), 24,
+                                   1000);
    memory_region_init_ram(&s->flash, OBJECT(s), "integrator.flash", 0x100000);
    vmstate_register_ram_global(&s->flash);

--- a/hw/arm/mainstone.c
+++ b/hw/arm/mainstone.c
@@ -75,18 +75,9 @@ static struct keymap map[0xE0] = {
    [0x2c] = {4,3}, /* z */
    [0xc7] = {5,0}, /* Home */
    [0x2a] = {5,1}, /* shift */
-    /*
-     * There are two matrix positions which map to space,
-     * but QEMU can only use one of them for the reverse
-     * mapping, so simply use the second one.
-     */
-    /* [0x39] = {5,2}, space */
+    [0x39] = {5,2}, /* space */
    [0x39] = {5,3}, /* space */
-    /*
-     * Matrix position {5,4} and other keys are missing here.
-     * TODO: Compare with Linux code and test real hardware.
-     */
-    [0x1c] = {5,5}, /* enter (TODO: might be wrong) */
+    [0x1c] = {5,5}, /*  enter */
    [0xc8] = {6,0}, /* up */
    [0xd0] = {6,1}, /* down */
    [0xcb] = {6,2}, /* left */
--- a/hw/arm/musicpal.c
+++ b/hw/arm/musicpal.c
@@ -92,6 +92,8 @@
 #define MP_ETH_CRDP3            0x4AC
 #define MP_ETH_CTDP0            0x4E0
 #define MP_ETH_CTDP1            0x4E4
+#define MP_ETH_CTDP2            0x4E8
+#define MP_ETH_CTDP3            0x4EC

 /* MII PHY access */
 #define MP_ETH_SMIR_DATA        0x0000FFFF
@@ -306,7 +308,7 @@ static uint64_t mv88w8618_eth_read(void *opaque, hwaddr offset,
    case MP_ETH_CRDP0 ... MP_ETH_CRDP3:
        return s->rx_queue[(offset - MP_ETH_CRDP0)/4];

-    case MP_ETH_CTDP0 ... MP_ETH_CTDP1:
+    case MP_ETH_CTDP0 ... MP_ETH_CTDP3:
        return s->tx_queue[(offset - MP_ETH_CTDP0)/4];

    default:
@@ -360,7 +362,7 @@ static void mv88w8618_eth_write(void *opaque, hwaddr offset,
            s->cur_rx[(offset - MP_ETH_CRDP0)/4] = value;
        break;

-    case MP_ETH_CTDP0 ... MP_ETH_CTDP1:
+    case MP_ETH_CTDP0 ... MP_ETH_CTDP3:
        s->tx_queue[(offset - MP_ETH_CTDP0)/4] = value;
        break;
    }
--- a/hw/arm/omap1.c
+++ b/hw/arm/omap1.c
@@ -172,7 +172,7 @@ static void omap_timer_clk_update(void *opaque, int line, int on)
 static void omap_timer_clk_setup(struct omap_mpu_timer_s *timer)
 {
    omap_clk_adduser(timer->clk,
-                    qemu_allocate_irq(omap_timer_clk_update, timer, 0));
+                    qemu_allocate_irqs(omap_timer_clk_update, timer, 1)[0]);
    timer->rate = omap_clk_getrate(timer->clk);
 }

@@ -2094,7 +2094,7 @@ static struct omap_mpuio_s *omap_mpuio_init(MemoryRegion *memory,
                          "omap-mpuio", 0x800);
    memory_region_add_subregion(memory, base, &s->iomem);

-    omap_clk_adduser(clk, qemu_allocate_irq(omap_mpuio_onoff, s, 0));
+    omap_clk_adduser(clk, qemu_allocate_irqs(omap_mpuio_onoff, s, 1)[0]);

    return s;
 }
@@ -2397,7 +2397,7 @@ static struct omap_pwl_s *omap_pwl_init(MemoryRegion *system_memory,
                          "omap-pwl", 0x800);
    memory_region_add_subregion(system_memory, base, &s->iomem);

-    omap_clk_adduser(clk, qemu_allocate_irq(omap_pwl_clk_update, s, 0));
+    omap_clk_adduser(clk, qemu_allocate_irqs(omap_pwl_clk_update, s, 1)[0]);
    return s;
 }

@@ -3481,8 +3481,8 @@ static void omap_mcbsp_i2s_start(void *opaque, int line, int level)
 void omap_mcbsp_i2s_attach(struct omap_mcbsp_s *s, I2SCodec *slave)
 {
    s->codec = slave;
-    slave->rx_swallow = qemu_allocate_irq(omap_mcbsp_i2s_swallow, s, 0);
-    slave->tx_start = qemu_allocate_irq(omap_mcbsp_i2s_start, s, 0);
+    slave->rx_swallow = qemu_allocate_irqs(omap_mcbsp_i2s_swallow, s, 1)[0];
+    slave->tx_start = qemu_allocate_irqs(omap_mcbsp_i2s_start, s, 1)[0];
 }

 /* LED Pulse Generators */
@@ -3630,7 +3630,7 @@ static struct omap_lpg_s *omap_lpg_init(MemoryRegion *system_memory,
    memory_region_init_io(&s->iomem, NULL, &omap_lpg_ops, s, "omap-lpg", 0x800);
    memory_region_add_subregion(system_memory, base, &s->iomem);

-    omap_clk_adduser(clk, qemu_allocate_irq(omap_lpg_clk_update, s, 0));
+    omap_clk_adduser(clk, qemu_allocate_irqs(omap_lpg_clk_update, s, 1)[0]);

    return s;
 }
@@ -3844,7 +3844,7 @@ struct omap_mpu_state_s *omap310_mpu_init(MemoryRegion *system_memory,
    s->sdram_size = sdram_size;
    s->sram_size = OMAP15XX_SRAM_SIZE;

-    s->wakeup = qemu_allocate_irq(omap_mpu_wakeup, s, 0);
+    s->wakeup = qemu_allocate_irqs(omap_mpu_wakeup, s, 1)[0];

    /* Clocks */
    omap_clk_init(s);
--- a/hw/arm/omap2.c
+++ b/hw/arm/omap2.c
@@ -2260,7 +2260,7 @@ struct omap_mpu_state_s *omap2420_mpu_init(MemoryRegion *sysmem,
    s->sdram_size = sdram_size;
    s->sram_size = OMAP242X_SRAM_SIZE;

-    s->wakeup = qemu_allocate_irq(omap_mpu_wakeup, s, 0);
+    s->wakeup = qemu_allocate_irqs(omap_mpu_wakeup, s, 1)[0];

    /* Clocks */
    omap_clk_init(s);
--- a/hw/arm/pxa2xx.c
+++ b/hw/arm/pxa2xx.c
@@ -742,7 +742,7 @@ static void pxa2xx_ssp_save(QEMUFile *f, void *opaque)
 static int pxa2xx_ssp_load(QEMUFile *f, void *opaque, int version_id)
 {
    PXA2xxSSPState *s = (PXA2xxSSPState *) opaque;
-    int i, v;
+    int i;

    s->enable = qemu_get_be32(f);

@@ -756,11 +756,7 @@ static int pxa2xx_ssp_load(QEMUFile *f, void *opaque, int version_id)
    qemu_get_8s(f, &s->ssrsa);
    qemu_get_8s(f, &s->ssacd);

-    v = qemu_get_byte(f);
-    if (v < 0 || v > ARRAY_SIZE(s->rx_fifo)) {
-        return -EINVAL;
-    }
-    s->rx_level = v;
+    s->rx_level = qemu_get_byte(f);
    s->rx_start = 0;
    for (i = 0; i < s->rx_level; i ++)
        s->rx_fifo[i] = qemu_get_byte(f);
@@ -2057,7 +2053,7 @@ PXA2xxState *pxa270_init(MemoryRegion *address_space,
        fprintf(stderr, "Unable to find CPU definition\n");
        exit(1);
    }
-    s->reset = qemu_allocate_irq(pxa2xx_reset, s, 0);
+    s->reset = qemu_allocate_irqs(pxa2xx_reset, s, 1)[0];

    /* SDRAM & Internal Memory Storage */
    memory_region_init_ram(&s->sdram, NULL, "pxa270.sdram", sdram_size);
@@ -2188,7 +2184,7 @@ PXA2xxState *pxa255_init(MemoryRegion *address_space, unsigned int sdram_size)
        fprintf(stderr, "Unable to find CPU definition\n");
        exit(1);
    }
-    s->reset = qemu_allocate_irq(pxa2xx_reset, s, 0);
+    s->reset = qemu_allocate_irqs(pxa2xx_reset, s, 1)[0];

    /* SDRAM & Internal Memory Storage */
    memory_region_init_ram(&s->sdram, NULL, "pxa255.sdram", sdram_size);
--- a/hw/arm/spitz.c
+++ b/hw/arm/spitz.c
@@ -743,7 +743,7 @@ static void spitz_i2c_setup(PXA2xxState *cpu)

    spitz_wm8750_addr(wm, 0, 0);
    qdev_connect_gpio_out(cpu->gpio, SPITZ_GPIO_WM,
-                          qemu_allocate_irq(spitz_wm8750_addr, wm, 0));
+                    qemu_allocate_irqs(spitz_wm8750_addr, wm, 1)[0]);
    /* .. and to the sound interface.  */
    cpu->i2s->opaque = wm;
    cpu->i2s->codec_out = wm8750_dac_dat;
@@ -849,7 +849,7 @@ static void spitz_gpio_setup(PXA2xxState *cpu, int slots)
     * wouldn't guarantee that a guest ever exits the loop.
     */
    spitz_hsync = 0;
-    lcd_hsync = qemu_allocate_irq(spitz_lcd_hsync_handler, cpu, 0);
+    lcd_hsync = qemu_allocate_irqs(spitz_lcd_hsync_handler, cpu, 1)[0];
    pxa2xx_gpio_read_notifier(cpu->gpio, lcd_hsync);
    pxa2xx_lcd_vsync_notifier(cpu->lcd, lcd_hsync);

--- a/hw/arm/vexpress.c
+++ b/hw/arm/vexpress.c
@@ -419,13 +419,13 @@ static int add_virtio_mmio_node(void *fdt, uint32_t acells, uint32_t scells,
    int rc;
    char *nodename = g_strdup_printf("/virtio_mmio@%" PRIx64, addr);

-    rc = qemu_devtree_add_subnode(fdt, nodename);
-    rc |= qemu_devtree_setprop_string(fdt, nodename,
-                                      "compatible", "virtio,mmio");
-    rc |= qemu_devtree_setprop_sized_cells(fdt, nodename, "reg",
-                                           acells, addr, scells, size);
-    qemu_devtree_setprop_cells(fdt, nodename, "interrupt-parent", intc);
-    qemu_devtree_setprop_cells(fdt, nodename, "interrupts", 0, irq, 1);
+    rc = qemu_fdt_add_subnode(fdt, nodename);
+    rc |= qemu_fdt_setprop_string(fdt, nodename,
+                                  "compatible", "virtio,mmio");
+    rc |= qemu_fdt_setprop_sized_cells(fdt, nodename, "reg",
+                                       acells, addr, scells, size);
+    qemu_fdt_setprop_cells(fdt, nodename, "interrupt-parent", intc);
+    qemu_fdt_setprop_cells(fdt, nodename, "interrupts", 0, irq, 1);
    g_free(nodename);
    if (rc) {
        return -1;
@@ -456,8 +456,8 @@ static void vexpress_modify_dtb(const struct arm_boot_info *info, void *fdt)
    uint32_t acells, scells, intc;
    const VEDBoardInfo *daughterboard = (const VEDBoardInfo *)info;

-    acells = qemu_devtree_getprop_cell(fdt, "/", "#address-cells");
-    scells = qemu_devtree_getprop_cell(fdt, "/", "#size-cells");
+    acells = qemu_fdt_getprop_cell(fdt, "/", "#address-cells");
+    scells = qemu_fdt_getprop_cell(fdt, "/", "#size-cells");
    intc = find_int_controller(fdt);
    if (!intc) {
        /* Not fatal, we just won't provide virtio. This will
@@ -480,6 +480,36 @@ static void vexpress_modify_dtb(const struct arm_boot_info *info, void *fdt)
    }
 }

+
+/* Open code a private version of pflash registration since we
+ * need to set non-default device width for VExpress platform.
+ */
+static pflash_t *ve_pflash_cfi01_register(hwaddr base, const char *name,
+                                          DriveInfo *di)
+{
+    DeviceState *dev = qdev_create(NULL, "cfi.pflash01");
+
+    if (di && qdev_prop_set_drive(dev, "drive", di->bdrv)) {
+        abort();
+    }
+
+    qdev_prop_set_uint32(dev, "num-blocks",
+                         VEXPRESS_FLASH_SIZE / VEXPRESS_FLASH_SECT_SIZE);
+    qdev_prop_set_uint64(dev, "sector-length", VEXPRESS_FLASH_SECT_SIZE);
+    qdev_prop_set_uint8(dev, "width", 4);
+    qdev_prop_set_uint8(dev, "device-width", 2);
+    qdev_prop_set_uint8(dev, "big-endian", 0);
+    qdev_prop_set_uint16(dev, "id0", 0x89);
+    qdev_prop_set_uint16(dev, "id1", 0x18);
+    qdev_prop_set_uint16(dev, "id2", 0x00);
+    qdev_prop_set_uint16(dev, "id3", 0x00);
+    qdev_prop_set_string(dev, "name", name);
+    qdev_init_nofail(dev);
+
+    sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, base);
+    return OBJECT_CHECK(pflash_t, (dev), "cfi.pflash01");
+}
+
 static void vexpress_common_init(VEDBoardInfo *daughterboard,
                                 QEMUMachineInitArgs *args)
 {
@@ -561,11 +591,8 @@ static void vexpress_common_init(VEDBoardInfo *daughterboard,
    sysbus_create_simple("pl111", map[VE_CLCD], pic[14]);

    dinfo = drive_get_next(IF_PFLASH);
-    pflash0 = pflash_cfi01_register(map[VE_NORFLASH0], NULL, "vexpress.flash0",
-            VEXPRESS_FLASH_SIZE, dinfo ? dinfo->bdrv : NULL,
-            VEXPRESS_FLASH_SECT_SIZE,
-            VEXPRESS_FLASH_SIZE / VEXPRESS_FLASH_SECT_SIZE, 4,
-            0x00, 0x89, 0x00, 0x18, 0);
+    pflash0 = ve_pflash_cfi01_register(map[VE_NORFLASH0], "vexpress.flash0",
+                                       dinfo);
    if (!pflash0) {
        fprintf(stderr, "vexpress: error registering flash 0.\n");
        exit(1);
@@ -580,11 +607,8 @@ static void vexpress_common_init(VEDBoardInfo *daughterboard,
    }

    dinfo = drive_get_next(IF_PFLASH);
-    if (!pflash_cfi01_register(map[VE_NORFLASH1], NULL, "vexpress.flash1",
-            VEXPRESS_FLASH_SIZE, dinfo ? dinfo->bdrv : NULL,
-            VEXPRESS_FLASH_SECT_SIZE,
-            VEXPRESS_FLASH_SIZE / VEXPRESS_FLASH_SECT_SIZE, 4,
-            0x00, 0x89, 0x00, 0x18, 0)) {
+    if (!ve_pflash_cfi01_register(map[VE_NORFLASH1], "vexpress.flash1",
+                                  dinfo)) {
        fprintf(stderr, "vexpress: error registering flash 1.\n");
        exit(1);
    }
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -0,0 +1,452 @@
+/*
+ * ARM mach-virt emulation
+ *
+ * Copyright (c) 2013 Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2 or later, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Emulate a virtual board which works by passing Linux all the information
+ * it needs about what devices are present via the device tree.
+ * There are some restrictions about what we can do here:
+ *  + we can only present devices whose Linux drivers will work based
+ *    purely on the device tree with no platform data at all
+ *  + we want to present a very stripped-down minimalist platform,
+ *    both because this reduces the security attack surface from the guest
+ *    and also because it reduces our exposure to being broken when
+ *    the kernel updates its device tree bindings and requires further
+ *    information in a device binding that we aren't providing.
+ * This is essentially the same approach kvmtool uses.
+ */
+
+#include "hw/sysbus.h"
+#include "hw/arm/arm.h"
+#include "hw/arm/primecell.h"
+#include "hw/devices.h"
+#include "net/net.h"
+#include "sysemu/device_tree.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/kvm.h"
+#include "hw/boards.h"
+#include "exec/address-spaces.h"
+#include "qemu/bitops.h"
+#include "qemu/error-report.h"
+
+#define NUM_VIRTIO_TRANSPORTS 32
+
+/* Number of external interrupt lines to configure the GIC with */
+#define NUM_IRQS 128
+
+#define GIC_FDT_IRQ_TYPE_SPI 0
+#define GIC_FDT_IRQ_TYPE_PPI 1
+
+#define GIC_FDT_IRQ_FLAGS_EDGE_LO_HI 1
+#define GIC_FDT_IRQ_FLAGS_EDGE_HI_LO 2
+#define GIC_FDT_IRQ_FLAGS_LEVEL_HI 4
+#define GIC_FDT_IRQ_FLAGS_LEVEL_LO 8
+
+#define GIC_FDT_IRQ_PPI_CPU_START 8
+#define GIC_FDT_IRQ_PPI_CPU_WIDTH 8
+
+enum {
+    VIRT_FLASH,
+    VIRT_MEM,
+    VIRT_CPUPERIPHS,
+    VIRT_GIC_DIST,
+    VIRT_GIC_CPU,
+    VIRT_UART,
+    VIRT_MMIO,
+};
+
+typedef struct MemMapEntry {
+    hwaddr base;
+    hwaddr size;
+} MemMapEntry;
+
+typedef struct VirtBoardInfo {
+    struct arm_boot_info bootinfo;
+    const char *cpu_model;
+    const char *qdevname;
+    const char *gic_compatible;
+    const MemMapEntry *memmap;
+    const int *irqmap;
+    int smp_cpus;
+    void *fdt;
+    int fdt_size;
+    uint32_t clock_phandle;
+} VirtBoardInfo;
+
+/* Addresses and sizes of our components.
+ * 0..128MB is space for a flash device so we can run bootrom code such as UEFI.
+ * 128MB..256MB is used for miscellaneous device I/O.
+ * 256MB..1GB is reserved for possible future PCI support (ie where the
+ * PCI memory window will go if we add a PCI host controller).
+ * 1GB and up is RAM (which may happily spill over into the
+ * high memory region beyond 4GB).
+ * This represents a compromise between how much RAM can be given to
+ * a 32 bit VM and leaving space for expansion and in particular for PCI.
+ */
+static const MemMapEntry a15memmap[] = {
+    /* Space up to 0x8000000 is reserved for a boot ROM */
+    [VIRT_FLASH] = { 0, 0x8000000 },
+    [VIRT_CPUPERIPHS] = { 0x8000000, 0x8000 },
+    /* GIC distributor and CPU interfaces sit inside the CPU peripheral space */
+    [VIRT_GIC_DIST] = { 0x8001000, 0x1000 },
+    [VIRT_GIC_CPU] = { 0x8002000, 0x1000 },
+    [VIRT_UART] = { 0x9000000, 0x1000 },
+    [VIRT_MMIO] = { 0xa000000, 0x200 },
+    /* ...repeating for a total of NUM_VIRTIO_TRANSPORTS, each of that size */
+    /* 0x10000000 .. 0x40000000 reserved for PCI */
+    [VIRT_MEM] = { 0x40000000, 30ULL * 1024 * 1024 * 1024 },
+};
+
+static const int a15irqmap[] = {
+    [VIRT_UART] = 1,
+    [VIRT_MMIO] = 16, /* ...to 16 + NUM_VIRTIO_TRANSPORTS - 1 */
+};
+
+static VirtBoardInfo machines[] = {
+    {
+        .cpu_model = "cortex-a15",
+        .qdevname = "a15mpcore_priv",
+        .gic_compatible = "arm,cortex-a15-gic",
+        .memmap = a15memmap,
+        .irqmap = a15irqmap,
+    },
+    {
+        .cpu_model = "host",
+        /* We use the A15 private peripheral model to get a V2 GIC */
+        .qdevname = "a15mpcore_priv",
+        .gic_compatible = "arm,cortex-a15-gic",
+        .memmap = a15memmap,
+        .irqmap = a15irqmap,
+    },
+};
+
+static VirtBoardInfo *find_machine_info(const char *cpu)
+{
+    int i;
+
+    for (i = 0; i < ARRAY_SIZE(machines); i++) {
+        if (strcmp(cpu, machines[i].cpu_model) == 0) {
+            return &machines[i];
+        }
+    }
+    return NULL;
+}
+
+static void create_fdt(VirtBoardInfo *vbi)
+{
+    void *fdt = create_device_tree(&vbi->fdt_size);
+
+    if (!fdt) {
+        error_report("create_device_tree() failed");
+        exit(1);
+    }
+
+    vbi->fdt = fdt;
+
+    /* Header */
+    qemu_fdt_setprop_string(fdt, "/", "compatible", "linux,dummy-virt");
+    qemu_fdt_setprop_cell(fdt, "/", "#address-cells", 0x2);
+    qemu_fdt_setprop_cell(fdt, "/", "#size-cells", 0x2);
+
+    /*
+     * /chosen and /memory nodes must exist for load_dtb
+     * to fill in necessary properties later
+     */
+    qemu_fdt_add_subnode(fdt, "/chosen");
+    qemu_fdt_add_subnode(fdt, "/memory");
+    qemu_fdt_setprop_string(fdt, "/memory", "device_type", "memory");
+
+    /* Clock node, for the benefit of the UART. The kernel device tree
+     * binding documentation claims the PL011 node clock properties are
+     * optional but in practice if you omit them the kernel refuses to
+     * probe for the device.
+     */
+    vbi->clock_phandle = qemu_fdt_alloc_phandle(fdt);
+    qemu_fdt_add_subnode(fdt, "/apb-pclk");
+    qemu_fdt_setprop_string(fdt, "/apb-pclk", "compatible", "fixed-clock");
+    qemu_fdt_setprop_cell(fdt, "/apb-pclk", "#clock-cells", 0x0);
+    qemu_fdt_setprop_cell(fdt, "/apb-pclk", "clock-frequency", 24000000);
+    qemu_fdt_setprop_string(fdt, "/apb-pclk", "clock-output-names",
+                                "clk24mhz");
+    qemu_fdt_setprop_cell(fdt, "/apb-pclk", "phandle", vbi->clock_phandle);
+
+    /* No PSCI for TCG yet */
+    if (kvm_enabled()) {
+        qemu_fdt_add_subnode(fdt, "/psci");
+        qemu_fdt_setprop_string(fdt, "/psci", "compatible", "arm,psci");
+        qemu_fdt_setprop_string(fdt, "/psci", "method", "hvc");
+        qemu_fdt_setprop_cell(fdt, "/psci", "cpu_suspend",
+                                  PSCI_FN_CPU_SUSPEND);
+        qemu_fdt_setprop_cell(fdt, "/psci", "cpu_off", PSCI_FN_CPU_OFF);
+        qemu_fdt_setprop_cell(fdt, "/psci", "cpu_on", PSCI_FN_CPU_ON);
+        qemu_fdt_setprop_cell(fdt, "/psci", "migrate", PSCI_FN_MIGRATE);
+    }
+}
+
+static void fdt_add_timer_nodes(const VirtBoardInfo *vbi)
+{
+    /* Note that on A15 h/w these interrupts are level-triggered,
+     * but for the GIC implementation provided by both QEMU and KVM
+     * they are edge-triggered.
+     */
+    uint32_t irqflags = GIC_FDT_IRQ_FLAGS_EDGE_LO_HI;
+
+    irqflags = deposit32(irqflags, GIC_FDT_IRQ_PPI_CPU_START,
+                         GIC_FDT_IRQ_PPI_CPU_WIDTH, (1 << vbi->smp_cpus) - 1);
+
+    qemu_fdt_add_subnode(vbi->fdt, "/timer");
+    qemu_fdt_setprop_string(vbi->fdt, "/timer",
+                                "compatible", "arm,armv7-timer");
+    qemu_fdt_setprop_cells(vbi->fdt, "/timer", "interrupts",
+                               GIC_FDT_IRQ_TYPE_PPI, 13, irqflags,
+                               GIC_FDT_IRQ_TYPE_PPI, 14, irqflags,
+                               GIC_FDT_IRQ_TYPE_PPI, 11, irqflags,
+                               GIC_FDT_IRQ_TYPE_PPI, 10, irqflags);
+}
+
+static void fdt_add_cpu_nodes(const VirtBoardInfo *vbi)
+{
+    int cpu;
+
+    qemu_fdt_add_subnode(vbi->fdt, "/cpus");
+    qemu_fdt_setprop_cell(vbi->fdt, "/cpus", "#address-cells", 0x1);
+    qemu_fdt_setprop_cell(vbi->fdt, "/cpus", "#size-cells", 0x0);
+
+    for (cpu = vbi->smp_cpus - 1; cpu >= 0; cpu--) {
+        char *nodename = g_strdup_printf("/cpus/cpu@%d", cpu);
+        ARMCPU *armcpu = ARM_CPU(qemu_get_cpu(cpu));
+
+        qemu_fdt_add_subnode(vbi->fdt, nodename);
+        qemu_fdt_setprop_string(vbi->fdt, nodename, "device_type", "cpu");
+        qemu_fdt_setprop_string(vbi->fdt, nodename, "compatible",
+                                    armcpu->dtb_compatible);
+
+        if (vbi->smp_cpus > 1) {
+            qemu_fdt_setprop_string(vbi->fdt, nodename,
+                                        "enable-method", "psci");
+        }
+
+        qemu_fdt_setprop_cell(vbi->fdt, nodename, "reg", cpu);
+        g_free(nodename);
+    }
+}
+
+static void fdt_add_gic_node(const VirtBoardInfo *vbi)
+{
+    uint32_t gic_phandle;
+
+    gic_phandle = qemu_fdt_alloc_phandle(vbi->fdt);
+    qemu_fdt_setprop_cell(vbi->fdt, "/", "interrupt-parent", gic_phandle);
+
+    qemu_fdt_add_subnode(vbi->fdt, "/intc");
+    qemu_fdt_setprop_string(vbi->fdt, "/intc", "compatible",
+                                vbi->gic_compatible);
+    qemu_fdt_setprop_cell(vbi->fdt, "/intc", "#interrupt-cells", 3);
+    qemu_fdt_setprop(vbi->fdt, "/intc", "interrupt-controller", NULL, 0);
+    qemu_fdt_setprop_sized_cells(vbi->fdt, "/intc", "reg",
+                                     2, vbi->memmap[VIRT_GIC_DIST].base,
+                                     2, vbi->memmap[VIRT_GIC_DIST].size,
+                                     2, vbi->memmap[VIRT_GIC_CPU].base,
+                                     2, vbi->memmap[VIRT_GIC_CPU].size);
+    qemu_fdt_setprop_cell(vbi->fdt, "/intc", "phandle", gic_phandle);
+}
+
+static void create_uart(const VirtBoardInfo *vbi, qemu_irq *pic)
+{
+    char *nodename;
+    hwaddr base = vbi->memmap[VIRT_UART].base;
+    hwaddr size = vbi->memmap[VIRT_UART].size;
+    int irq = vbi->irqmap[VIRT_UART];
+    const char compat[] = "arm,pl011\0arm,primecell";
+    const char clocknames[] = "uartclk\0apb_pclk";
+
+    sysbus_create_simple("pl011", base, pic[irq]);
+
+    nodename = g_strdup_printf("/pl011@%" PRIx64, base);
+    qemu_fdt_add_subnode(vbi->fdt, nodename);
+    /* Note that we can't use setprop_string because of the embedded NUL */
+    qemu_fdt_setprop(vbi->fdt, nodename, "compatible",
+                         compat, sizeof(compat));
+    qemu_fdt_setprop_sized_cells(vbi->fdt, nodename, "reg",
+                                     2, base, 2, size);
+    qemu_fdt_setprop_cells(vbi->fdt, nodename, "interrupts",
+                               GIC_FDT_IRQ_TYPE_SPI, irq,
+                               GIC_FDT_IRQ_FLAGS_EDGE_LO_HI);
+    qemu_fdt_setprop_cells(vbi->fdt, nodename, "clocks",
+                               vbi->clock_phandle, vbi->clock_phandle);
+    qemu_fdt_setprop(vbi->fdt, nodename, "clock-names",
+                         clocknames, sizeof(clocknames));
+    g_free(nodename);
+}
+
+static void create_virtio_devices(const VirtBoardInfo *vbi, qemu_irq *pic)
+{
+    int i;
+    hwaddr size = vbi->memmap[VIRT_MMIO].size;
+
+    /* Note that we have to create the transports in forwards order
+     * so that command line devices are inserted lowest address first,
+     * and then add dtb nodes in reverse order so that they appear in
+     * the finished device tree lowest address first.
+     */
+    for (i = 0; i < NUM_VIRTIO_TRANSPORTS; i++) {
+        int irq = vbi->irqmap[VIRT_MMIO] + i;
+        hwaddr base = vbi->memmap[VIRT_MMIO].base + i * size;
+
+        sysbus_create_simple("virtio-mmio", base, pic[irq]);
+    }
+
+    for (i = NUM_VIRTIO_TRANSPORTS - 1; i >= 0; i--) {
+        char *nodename;
+        int irq = vbi->irqmap[VIRT_MMIO] + i;
+        hwaddr base = vbi->memmap[VIRT_MMIO].base + i * size;
+
+        nodename = g_strdup_printf("/virtio_mmio@%" PRIx64, base);
+        qemu_fdt_add_subnode(vbi->fdt, nodename);
+        qemu_fdt_setprop_string(vbi->fdt, nodename,
+                                "compatible", "virtio,mmio");
+        qemu_fdt_setprop_sized_cells(vbi->fdt, nodename, "reg",
+                                     2, base, 2, size);
+        qemu_fdt_setprop_cells(vbi->fdt, nodename, "interrupts",
+                               GIC_FDT_IRQ_TYPE_SPI, irq,
+                               GIC_FDT_IRQ_FLAGS_EDGE_LO_HI);
+        g_free(nodename);
+    }
+}
+
+static void *machvirt_dtb(const struct arm_boot_info *binfo, int *fdt_size)
+{
+    const VirtBoardInfo *board = (const VirtBoardInfo *)binfo;
+
+    *fdt_size = board->fdt_size;
+    return board->fdt;
+}
+
+static void machvirt_init(QEMUMachineInitArgs *args)
+{
+    qemu_irq pic[NUM_IRQS];
+    MemoryRegion *sysmem = get_system_memory();
+    int n;
+    MemoryRegion *ram = g_new(MemoryRegion, 1);
+    DeviceState *dev;
+    SysBusDevice *busdev;
+    const char *cpu_model = args->cpu_model;
+    VirtBoardInfo *vbi;
+
+    if (!cpu_model) {
+        cpu_model = "cortex-a15";
+    }
+
+    vbi = find_machine_info(cpu_model);
+
+    if (!vbi) {
+        error_report("mach-virt: CPU %s not supported", cpu_model);
+        exit(1);
+    }
+
+    vbi->smp_cpus = smp_cpus;
+
+    /*
+     * Only supported method of starting secondary CPUs is PSCI and
+     * PSCI is not yet supported with TCG, so limit smp_cpus to 1
+     * if we're not using KVM.
+     */
+    if (!kvm_enabled() && smp_cpus > 1) {
+        error_report("mach-virt: must enable KVM to use multiple CPUs");
+        exit(1);
+    }
+
+    if (args->ram_size > vbi->memmap[VIRT_MEM].size) {
+        error_report("mach-virt: cannot model more than 30GB RAM");
+        exit(1);
+    }
+
+    create_fdt(vbi);
+    fdt_add_timer_nodes(vbi);
+
+    for (n = 0; n < smp_cpus; n++) {
+        ObjectClass *oc = cpu_class_by_name(TYPE_ARM_CPU, cpu_model);
+        Object *cpuobj;
+
+        if (!oc) {
+            fprintf(stderr, "Unable to find CPU definition\n");
+            exit(1);
+        }
+        cpuobj = object_new(object_class_get_name(oc));
+
+        /* Secondary CPUs start in PSCI powered-down state */
+        if (n > 0) {
+            object_property_set_bool(cpuobj, true, "start-powered-off", NULL);
+        }
+        object_property_set_bool(cpuobj, true, "realized", NULL);
+    }
+    fdt_add_cpu_nodes(vbi);
+
+    memory_region_init_ram(ram, NULL, "mach-virt.ram", args->ram_size);
+    vmstate_register_ram_global(ram);
+    memory_region_add_subregion(sysmem, vbi->memmap[VIRT_MEM].base, ram);
+
+    dev = qdev_create(NULL, vbi->qdevname);
+    qdev_prop_set_uint32(dev, "num-cpu", smp_cpus);
+    /* Note that the num-irq property counts both internal and external
+     * interrupts; there are always 32 of the former (mandated by GIC spec).
+     */
+    qdev_prop_set_uint32(dev, "num-irq", NUM_IRQS + 32);
+    qdev_init_nofail(dev);
+    busdev = SYS_BUS_DEVICE(dev);
+    sysbus_mmio_map(busdev, 0, vbi->memmap[VIRT_CPUPERIPHS].base);
+    fdt_add_gic_node(vbi);
+    for (n = 0; n < smp_cpus; n++) {
+        DeviceState *cpudev = DEVICE(qemu_get_cpu(n));
+
+        sysbus_connect_irq(busdev, n, qdev_get_gpio_in(cpudev, ARM_CPU_IRQ));
+    }
+
+    for (n = 0; n < NUM_IRQS; n++) {
+        pic[n] = qdev_get_gpio_in(dev, n);
+    }
+
+    create_uart(vbi, pic);
+
+    /* Create mmio transports, so the user can create virtio backends
+     * (which will be automatically plugged in to the transports). If
+     * no backend is created the transport will just sit harmlessly idle.
+     */
+    create_virtio_devices(vbi, pic);
+
+    vbi->bootinfo.ram_size = args->ram_size;
+    vbi->bootinfo.kernel_filename = args->kernel_filename;
+    vbi->bootinfo.kernel_cmdline = args->kernel_cmdline;
+    vbi->bootinfo.initrd_filename = args->initrd_filename;
+    vbi->bootinfo.nb_cpus = smp_cpus;
+    vbi->bootinfo.board_id = -1;
+    vbi->bootinfo.loader_start = vbi->memmap[VIRT_MEM].base;
+    vbi->bootinfo.get_dtb = machvirt_dtb;
+    arm_load_kernel(ARM_CPU(first_cpu), &vbi->bootinfo);
+}
+
+static QEMUMachine machvirt_a15_machine = {
+    .name = "virt",
+    .desc = "ARM Virtual Machine",
+    .init = machvirt_init,
+    .max_cpus = 4,
+};
+
+static void machvirt_machine_init(void)
+{
+    qemu_register_machine(&machvirt_a15_machine);
+}
+
+machine_init(machvirt_machine_init);
--- a/hw/arm/xilinx_zynq.c
+++ b/hw/arm/xilinx_zynq.c
@@ -25,6 +25,7 @@
 #include "sysemu/blockdev.h"
 #include "hw/loader.h"
 #include "hw/ssi.h"
+#include "qemu/error-report.h"

 #define NUM_SPI_FLASHES 4
 #define NUM_QSPI_FLASHES 2
@@ -35,6 +36,8 @@

 #define IRQ_OFFSET 32 /* pic interrupts start from index 32 */

+#define MPCORE_PERIPHBASE 0xF8F00000
+
 static const int dma_irqs[8] = {
    46, 47, 48, 49, 72, 73, 74, 75
 };
@@ -102,6 +105,7 @@ static void zynq_init(QEMUMachineInitArgs *args)
    const char *kernel_filename = args->kernel_filename;
    const char *kernel_cmdline = args->kernel_cmdline;
    const char *initrd_filename = args->initrd_filename;
+    ObjectClass *cpu_oc;
    ARMCPU *cpu;
    MemoryRegion *address_space_mem = get_system_memory();
    MemoryRegion *ext_ram = g_new(MemoryRegion, 1);
@@ -110,15 +114,24 @@ static void zynq_init(QEMUMachineInitArgs *args)
    SysBusDevice *busdev;
    qemu_irq pic[64];
    NICInfo *nd;
+    Error *err = NULL;
    int n;

    if (!cpu_model) {
        cpu_model = "cortex-a9";
    }
+    cpu_oc = cpu_class_by_name(TYPE_ARM_CPU, cpu_model);

-    cpu = cpu_arm_init(cpu_model);
-    if (!cpu) {
-        fprintf(stderr, "Unable to find CPU definition\n");
+    cpu = ARM_CPU(object_new(object_class_get_name(cpu_oc)));
+
+    object_property_set_int(OBJECT(cpu), MPCORE_PERIPHBASE, "reset-cbar", &err);
+    if (err) {
+        error_report("%s", error_get_pretty(err));
+        exit(1);
+    }
+    object_property_set_bool(OBJECT(cpu), true, "realized", &err);
+    if (err) {
+        error_report("%s", error_get_pretty(err));
        exit(1);
    }

@@ -154,7 +167,7 @@ static void zynq_init(QEMUMachineInitArgs *args)
    qdev_prop_set_uint32(dev, "num-cpu", 1);
    qdev_init_nofail(dev);
    busdev = SYS_BUS_DEVICE(dev);
-    sysbus_mmio_map(busdev, 0, 0xF8F00000);
+    sysbus_mmio_map(busdev, 0, MPCORE_PERIPHBASE);
    sysbus_connect_irq(busdev, 0,
                       qdev_get_gpio_in(DEVICE(cpu), ARM_CPU_IRQ));

--- a/hw/arm/z2.c
+++ b/hw/arm/z2.c
@@ -359,7 +359,7 @@ static void z2_init(QEMUMachineInitArgs *args)
    wm8750_data_req_set(wm, mpu->i2s->data_req, mpu->i2s);

    qdev_connect_gpio_out(mpu->gpio, Z2_GPIO_LCD_CS,
-                          qemu_allocate_irq(z2_lcd_cs, z2_lcd, 0));
+        qemu_allocate_irqs(z2_lcd_cs, z2_lcd, 1)[0]);

    z2_binfo.kernel_filename = kernel_filename;
    z2_binfo.kernel_cmdline = kernel_cmdline;
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -380,8 +380,9 @@ static void start_data_plane_bh(void *opaque)
                       s, QEMU_THREAD_JOINABLE);
 }

-bool virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *blk,
-                                  VirtIOBlockDataPlane **dataplane)
+void virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *blk,
+                                  VirtIOBlockDataPlane **dataplane,
+                                  Error **errp)
 {
    VirtIOBlockDataPlane *s;
    int fd;
@@ -389,33 +390,35 @@ bool virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *blk,
    *dataplane = NULL;

    if (!blk->data_plane) {
-        return true;
+        return;
    }

    if (blk->scsi) {
-        error_report("device is incompatible with x-data-plane, use scsi=off");
-        return false;
+        error_setg(errp,
+                   "device is incompatible with x-data-plane, use scsi=off");
+        return;
    }

    if (blk->config_wce) {
-        error_report("device is incompatible with x-data-plane, "
-                     "use config-wce=off");
-        return false;
+        error_setg(errp, "device is incompatible with x-data-plane, "
+                         "use config-wce=off");
+        return;
    }

    /* If dataplane is (re-)enabled while the guest is running there could be
     * block jobs that can conflict.
     */
    if (bdrv_in_use(blk->conf.bs)) {
-        error_report("cannot start dataplane thread while device is in use");
-        return false;
+        error_setg(errp,
+                   "cannot start dataplane thread while device is in use");
+        return;
    }

    fd = raw_get_aio_fd(blk->conf.bs);
    if (fd < 0) {
-        error_report("drive is incompatible with x-data-plane, "
-                     "use format=raw,cache=none,aio=native");
-        return false;
+        error_setg(errp, "drive is incompatible with x-data-plane, "
+                         "use format=raw,cache=none,aio=native");
+        return;
    }

    s = g_new0(VirtIOBlockDataPlane, 1);
@@ -427,7 +430,6 @@ bool virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *blk,
    bdrv_set_in_use(blk->conf.bs, 1);

    *dataplane = s;
-    return true;
 }

 void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s)
--- a/hw/block/dataplane/virtio-blk.h
+++ b/hw/block/dataplane/virtio-blk.h
@@ -19,8 +19,9 @@

 typedef struct VirtIOBlockDataPlane VirtIOBlockDataPlane;

-bool virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *blk,
-                                  VirtIOBlockDataPlane **dataplane);
+void virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *blk,
+                                  VirtIOBlockDataPlane **dataplane,
+                                  Error **errp);
 void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s);
 void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s);
 void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s);
--- a/hw/block/pflash_cfi01.c
+++ b/hw/block/pflash_cfi01.c
@@ -40,6 +40,7 @@
 #include "hw/block/flash.h"
 #include "block/block.h"
 #include "qemu/timer.h"
+#include "qemu/bitops.h"
 #include "exec/address-spaces.h"
 #include "qemu/host-utils.h"
 #include "hw/sysbus.h"
@@ -71,7 +72,9 @@ struct pflash_t {
    BlockDriverState *bs;
    uint32_t nb_blocs;
    uint64_t sector_len;
-    uint8_t width;
+    uint8_t bank_width;
+    uint8_t device_width; /* If 0, device width not specified. */
+    uint8_t max_device_width;  /* max device width in bytes */
    uint8_t be;
    uint8_t wcycle; /* if 0, the flash is read normally */
    int ro;
@@ -116,6 +119,119 @@ static void pflash_timer (void *opaque)
    pfl->cmd = 0;
 }

+/* Perform a CFI query based on the bank width of the flash.
+ * If this code is called we know we have a device_width set for
+ * this flash.
+ */
+static uint32_t pflash_cfi_query(pflash_t *pfl, hwaddr offset)
+{
+    int i;
+    uint32_t resp = 0;
+    hwaddr boff;
+
+    /* Adjust incoming offset to match expected device-width
+     * addressing. CFI query addresses are always specified in terms of
+     * the maximum supported width of the device.  This means that x8
+     * devices and x8/x16 devices in x8 mode behave differently.  For
+     * devices that are not used at their max width, we will be
+     * provided with addresses that use higher address bits than
+     * expected (based on the max width), so we will shift them lower
+     * so that they will match the addresses used when
+     * device_width==max_device_width.
+     */
+    boff = offset >> (ctz32(pfl->bank_width) +
+                      ctz32(pfl->max_device_width) - ctz32(pfl->device_width));
+
+    if (boff > pfl->cfi_len) {
+        return 0;
+    }
+    /* Now we will construct the CFI response generated by a single
+     * device, then replicate that for all devices that make up the
+     * bus.  For wide parts used in x8 mode, CFI query responses
+     * are different than native byte-wide parts.
+     */
+    resp = pfl->cfi_table[boff];
+    if (pfl->device_width != pfl->max_device_width) {
+        /* The only case currently supported is x8 mode for a
+         * wider part.
+         */
+        if (pfl->device_width != 1 || pfl->bank_width > 4) {
+            DPRINTF("%s: Unsupported device configuration: "
+                    "device_width=%d, max_device_width=%d\n",
+                    __func__, pfl->device_width,
+                    pfl->max_device_width);
+            return 0;
+        }
+        /* CFI query data is repeated, rather than zero padded for
+         * wide devices used in x8 mode.
+         */
+        for (i = 1; i < pfl->max_device_width; i++) {
+            resp = deposit32(resp, 8 * i, 8, pfl->cfi_table[boff]);
+        }
+    }
+    /* Replicate responses for each device in bank. */
+    if (pfl->device_width < pfl->bank_width) {
+        for (i = pfl->device_width;
+             i < pfl->bank_width; i += pfl->device_width) {
+            resp = deposit32(resp, 8 * i, 8 * pfl->device_width, resp);
+        }
+    }
+
+    return resp;
+}
+
+
+
+/* Perform a device id query based on the bank width of the flash. */
+static uint32_t pflash_devid_query(pflash_t *pfl, hwaddr offset)
+{
+    int i;
+    uint32_t resp;
+    hwaddr boff;
+
+    /* Adjust incoming offset to match expected device-width
+     * addressing. Device ID read addresses are always specified in
+     * terms of the maximum supported width of the device.  This means
+     * that x8 devices and x8/x16 devices in x8 mode behave
+     * differently. For devices that are not used at their max width,
+     * we will be provided with addresses that use higher address bits
+     * than expected (based on the max width), so we will shift them
+     * lower so that they will match the addresses used when
+     * device_width==max_device_width.
+     */
+    boff = offset >> (ctz32(pfl->bank_width) +
+                      ctz32(pfl->max_device_width) - ctz32(pfl->device_width));
+
+    /* Mask off upper bits which may be used in to query block
+     * or sector lock status at other addresses.
+     * Offsets 2/3 are block lock status, is not emulated.
+     */
+    switch (boff & 0xFF) {
+    case 0:
+        resp = pfl->ident0;
+        DPRINTF("%s: Manufacturer Code %04x\n", __func__, ret);
+        break;
+    case 1:
+        resp = pfl->ident1;
+        DPRINTF("%s: Device ID Code %04x\n", __func__, ret);
+        break;
+    default:
+        DPRINTF("%s: Read Device Information offset=%x\n", __func__,
+                (unsigned)offset);
+        return 0;
+        break;
+    }
+    /* Replicate responses for each device in bank. */
+    if (pfl->device_width < pfl->bank_width) {
+        for (i = pfl->device_width;
+              i < pfl->bank_width; i += pfl->device_width) {
+            resp = deposit32(resp, 8 * i, 8 * pfl->device_width, resp);
+        }
+    }
+
+    return resp;
+}
+
 static uint32_t pflash_read (pflash_t *pfl, hwaddr offset,
                             int width, int be)
 {
@@ -124,12 +240,6 @@ static uint32_t pflash_read (pflash_t *pfl, hwaddr offset,
    uint8_t *p;

    ret = -1;
-    boff = offset & 0xFF; /* why this here ?? */
-
-    if (pfl->width == 2)
-        boff = boff >> 1;
-    else if (pfl->width == 4)
-        boff = boff >> 2;

 #if 0
    DPRINTF("%s: reading offset " TARGET_FMT_plx " under cmd %02x width %d\n",
@@ -190,35 +300,88 @@ static uint32_t pflash_read (pflash_t *pfl, hwaddr offset,
    case 0x60: /* Block /un)lock */
    case 0x70: /* Status Register */
    case 0xe8: /* Write block */
-        /* Status register read */
+        /* Status register read.  Return status from each device in
+         * bank.
+         */
        ret = pfl->status;
-        if (width > 2) {
+        if (pfl->device_width && width > pfl->device_width) {
+            int shift = pfl->device_width * 8;
+            while (shift + pfl->device_width * 8 <= width * 8) {
+                ret |= pfl->status << shift;
+                shift += pfl->device_width * 8;
+            }
+        } else if (!pfl->device_width && width > 2) {
+            /* Handle 32 bit flash cases where device width is not
+             * set. (Existing behavior before device width added.)
+             */
            ret |= pfl->status << 16;
        }
        DPRINTF("%s: status %x\n", __func__, ret);
        break;
    case 0x90:
-        switch (boff) {
-        case 0:
-            ret = pfl->ident0 << 8 | pfl->ident1;
-            DPRINTF("%s: Manufacturer Code %04x\n", __func__, ret);
-            break;
-        case 1:
-            ret = pfl->ident2 << 8 | pfl->ident3;
-            DPRINTF("%s: Device ID Code %04x\n", __func__, ret);
-            break;
-        default:
-            DPRINTF("%s: Read Device Information boff=%x\n", __func__,
-                    (unsigned)boff);
-            ret = 0;
-            break;
+        if (!pfl->device_width) {
+            /* Preserve old behavior if device width not specified */
+            boff = offset & 0xFF;
+            if (pfl->bank_width == 2) {
+                boff = boff >> 1;
+            } else if (pfl->bank_width == 4) {
+                boff = boff >> 2;
+            }
+
+            switch (boff) {
+            case 0:
+                ret = pfl->ident0 << 8 | pfl->ident1;
+                DPRINTF("%s: Manufacturer Code %04x\n", __func__, ret);
+                break;
+            case 1:
+                ret = pfl->ident2 << 8 | pfl->ident3;
+                DPRINTF("%s: Device ID Code %04x\n", __func__, ret);
+                break;
+            default:
+                DPRINTF("%s: Read Device Information boff=%x\n", __func__,
+                        (unsigned)boff);
+                ret = 0;
+                break;
+            }
+        } else {
+            /* If we have a read larger than the bank_width, combine multiple
+             * manufacturer/device ID queries into a single response.
+             */
+            int i;
+            for (i = 0; i < width; i += pfl->bank_width) {
+                ret = deposit32(ret, i * 8, pfl->bank_width * 8,
+                                pflash_devid_query(pfl,
+                                                 offset + i * pfl->bank_width));
+            }
        }
        break;
    case 0x98: /* Query mode */
-        if (boff > pfl->cfi_len)
-            ret = 0;
-        else
-            ret = pfl->cfi_table[boff];
+        if (!pfl->device_width) {
+            /* Preserve old behavior if device width not specified */
+            boff = offset & 0xFF;
+            if (pfl->bank_width == 2) {
+                boff = boff >> 1;
+            } else if (pfl->bank_width == 4) {
+                boff = boff >> 2;
+            }
+
+            if (boff > pfl->cfi_len) {
+                ret = 0;
+            } else {
+                ret = pfl->cfi_table[boff];
+            }
+        } else {
+            /* If we have a read larger than the bank_width, combine multiple
+             * CFI queries into a single response.
+             */
+            int i;
+            for (i = 0; i < width; i += pfl->bank_width) {
+                ret = deposit32(ret, i * 8, pfl->bank_width * 8,
+                                pflash_cfi_query(pfl,
+                                                 offset + i * pfl->bank_width));
+            }
+        }
+
        break;
    }
    return ret;
@@ -378,6 +541,14 @@ static void pflash_write(pflash_t *pfl, hwaddr offset,

            break;
        case 0xe8:
+            /* Mask writeblock size based on device width, or bank width if
+             * device width not specified.
+             */
+            if (pfl->device_width) {
+                value = extract32(value, 0, pfl->device_width * 8);
+            } else {
+                value = extract32(value, 0, pfl->bank_width * 8);
+            }
            DPRINTF("%s: block write of %x bytes\n", __func__, value);
            pfl->counter = value;
            pfl->wcycle++;
@@ -613,6 +784,13 @@ static void pflash_cfi01_realize(DeviceState *dev, Error **errp)
        pfl->ro = 0;
    }

+    /* Default to devices being used at their maximum device width. This was
+     * assumed before the device_width support was added.
+     */
+    if (!pfl->max_device_width) {
+        pfl->max_device_width = pfl->device_width;
+    }
+
    pfl->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, pflash_timer, pfl);
    pfl->wcycle = 0;
    pfl->cmd = 0;
@@ -665,7 +843,7 @@ static void pflash_cfi01_realize(DeviceState *dev, Error **errp)
    pfl->cfi_table[0x28] = 0x02;
    pfl->cfi_table[0x29] = 0x00;
    /* Max number of bytes in multi-bytes write */
-    if (pfl->width == 1) {
+    if (pfl->bank_width == 1) {
        pfl->cfi_table[0x2A] = 0x08;
    } else {
        pfl->cfi_table[0x2A] = 0x0B;
@@ -706,7 +884,25 @@ static Property pflash_cfi01_properties[] = {
    DEFINE_PROP_DRIVE("drive", struct pflash_t, bs),
    DEFINE_PROP_UINT32("num-blocks", struct pflash_t, nb_blocs, 0),
    DEFINE_PROP_UINT64("sector-length", struct pflash_t, sector_len, 0),
-    DEFINE_PROP_UINT8("width", struct pflash_t, width, 0),
+    /* width here is the overall width of this QEMU device in bytes.
+     * The QEMU device may be emulating a number of flash devices
+     * wired up in parallel; the width of each individual flash
+     * device should be specified via device-width. If the individual
+     * devices have a maximum width which is greater than the width
+     * they are being used for, this maximum width should be set via
+     * max-device-width (which otherwise defaults to device-width).
+     * So for instance a 32-bit wide QEMU flash device made from four
+     * 16-bit flash devices used in 8-bit wide mode would be configured
+     * with width = 4, device-width = 1, max-device-width = 2.
+     *
+     * If device-width is not specified we default to backwards
+     * compatible behaviour which is a bad emulation of two
+     * 16 bit devices making up a 32 bit wide QEMU device. This
+     * is deprecated for new uses of this device.
+     */
+    DEFINE_PROP_UINT8("width", struct pflash_t, bank_width, 0),
+    DEFINE_PROP_UINT8("device-width", struct pflash_t, device_width, 0),
+    DEFINE_PROP_UINT8("max-device-width", struct pflash_t, max_device_width, 0),
    DEFINE_PROP_UINT8("big-endian", struct pflash_t, be, 0),
    DEFINE_PROP_UINT16("id0", struct pflash_t, ident0, 0),
    DEFINE_PROP_UINT16("id1", struct pflash_t, ident1, 0),
@@ -745,8 +941,8 @@ pflash_t *pflash_cfi01_register(hwaddr base,
                                DeviceState *qdev, const char *name,
                                hwaddr size,
                                BlockDriverState *bs,
-                                uint32_t sector_len, int nb_blocs, int width,
-                                uint16_t id0, uint16_t id1,
+                                uint32_t sector_len, int nb_blocs,
+                                int bank_width, uint16_t id0, uint16_t id1,
                                uint16_t id2, uint16_t id3, int be)
 {
    DeviceState *dev = qdev_create(NULL, TYPE_CFI_PFLASH01);
@@ -756,7 +952,7 @@ pflash_t *pflash_cfi01_register(hwaddr base,
    }
    qdev_prop_set_uint32(dev, "num-blocks", nb_blocs);
    qdev_prop_set_uint64(dev, "sector-length", sector_len);
-    qdev_prop_set_uint8(dev, "width", width);
+    qdev_prop_set_uint8(dev, "width", bank_width);
    qdev_prop_set_uint8(dev, "big-endian", !!be);
    qdev_prop_set_uint16(dev, "id0", id0);
    qdev_prop_set_uint16(dev, "id1", id1);
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -657,6 +657,7 @@ static void virtio_blk_migration_state_changed(Notifier *notifier, void *data)
    VirtIOBlock *s = container_of(notifier, VirtIOBlock,
                                  migration_state_notifier);
    MigrationState *mig = data;
+    Error *err = NULL;

    if (migration_in_setup(mig)) {
        if (!s->dataplane) {
@@ -671,31 +672,39 @@ static void virtio_blk_migration_state_changed(Notifier *notifier, void *data)
        }
        bdrv_drain_all(); /* complete in-flight non-dataplane requests */
        virtio_blk_data_plane_create(VIRTIO_DEVICE(s), &s->blk,
-                                     &s->dataplane);
+                                     &s->dataplane, &err);
+        if (err != NULL) {
+            error_report("%s", error_get_pretty(err));
+            error_free(err);
+        }
    }
 }
 #endif /* CONFIG_VIRTIO_BLK_DATA_PLANE */

-static int virtio_blk_device_init(VirtIODevice *vdev)
+static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
 {
-    DeviceState *qdev = DEVICE(vdev);
-    VirtIOBlock *s = VIRTIO_BLK(vdev);
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    VirtIOBlock *s = VIRTIO_BLK(dev);
    VirtIOBlkConf *blk = &(s->blk);
+#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE
+    Error *err = NULL;
+#endif
    static int virtio_blk_id;

    if (!blk->conf.bs) {
-        error_report("drive property not set");
-        return -1;
+        error_setg(errp, "drive property not set");
+        return;
    }
    if (!bdrv_is_inserted(blk->conf.bs)) {
-        error_report("Device needs media, but drive is empty");
-        return -1;
+        error_setg(errp, "Device needs media, but drive is empty");
+        return;
    }

    blkconf_serial(&blk->conf, &blk->serial);
    s->original_wce = bdrv_enable_write_cache(blk->conf.bs);
    if (blkconf_geometry(&blk->conf, NULL, 65535, 255, 255) < 0) {
-        return -1;
+        error_setg(errp, "Error setting geometry");
+        return;
    }

    virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK,
@@ -708,36 +717,39 @@ static int virtio_blk_device_init(VirtIODevice *vdev)

    s->vq = virtio_add_queue(vdev, 128, virtio_blk_handle_output);
 #ifdef CONFIG_VIRTIO_BLK_DATA_PLANE
-    if (!virtio_blk_data_plane_create(vdev, blk, &s->dataplane)) {
+    virtio_blk_data_plane_create(vdev, blk, &s->dataplane, &err);
+    if (err != NULL) {
+        error_propagate(errp, err);
        virtio_cleanup(vdev);
-        return -1;
+        return;
    }
    s->migration_state_notifier.notify = virtio_blk_migration_state_changed;
    add_migration_state_change_notifier(&s->migration_state_notifier);
 #endif

    s->change = qemu_add_vm_change_state_handler(virtio_blk_dma_restart_cb, s);
-    register_savevm(qdev, "virtio-blk", virtio_blk_id++, 2,
+    register_savevm(dev, "virtio-blk", virtio_blk_id++, 2,
                    virtio_blk_save, virtio_blk_load, s);
    bdrv_set_dev_ops(s->bs, &virtio_block_ops, s);
    bdrv_set_buffer_alignment(s->bs, s->conf->logical_block_size);

    bdrv_iostatus_enable(s->bs);

-    add_boot_device_path(s->conf->bootindex, qdev, "/disk@0,0");
-    return 0;
+    add_boot_device_path(s->conf->bootindex, dev, "/disk@0,0");
 }

-static void virtio_blk_device_exit(VirtIODevice *vdev)
+static void virtio_blk_device_unrealize(DeviceState *dev, Error **errp)
 {
-    VirtIOBlock *s = VIRTIO_BLK(vdev);
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    VirtIOBlock *s = VIRTIO_BLK(dev);
+
 #ifdef CONFIG_VIRTIO_BLK_DATA_PLANE
    remove_migration_state_change_notifier(&s->migration_state_notifier);
    virtio_blk_data_plane_destroy(s->dataplane);
    s->dataplane = NULL;
 #endif
    qemu_del_vm_change_state_handler(s->change);
-    unregister_savevm(DEVICE(vdev), "virtio-blk", s);
+    unregister_savevm(dev, "virtio-blk", s);
    blockdev_mark_auto_del(s->bs);
    virtio_cleanup(vdev);
 }
@@ -751,10 +763,11 @@ static void virtio_blk_class_init(ObjectClass *klass, void *data)
 {
    DeviceClass *dc = DEVICE_CLASS(klass);
    VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
+
    dc->props = virtio_blk_properties;
    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
-    vdc->init = virtio_blk_device_init;
-    vdc->exit = virtio_blk_device_exit;
+    vdc->realize = virtio_blk_device_realize;
+    vdc->unrealize = virtio_blk_device_unrealize;
    vdc->get_config = virtio_blk_update_config;
    vdc->set_config = virtio_blk_set_config;
    vdc->get_features = virtio_blk_get_features;
--- a/hw/char/Makefile.objs
+++ b/hw/char/Makefile.objs
@@ -14,6 +14,7 @@ obj-$(CONFIG_COLDFIRE) += mcf_uart.o
 obj-$(CONFIG_OMAP) += omap_uart.o
 obj-$(CONFIG_SH4) += sh_serial.o
 obj-$(CONFIG_PSERIES) += spapr_vty.o
+obj-$(CONFIG_DIGIC) += digic-uart.o

 common-obj-$(CONFIG_ETRAXFS) += etraxfs_ser.o
 common-obj-$(CONFIG_ISA_DEBUG) += debugcon.o
--- a/hw/char/cadence_uart.c
+++ b/hw/char/cadence_uart.c
@@ -120,8 +120,8 @@ typedef struct {
    uint64_t char_tx_time;
    CharDriverState *chr;
    qemu_irq irq;
-    struct QEMUTimer *fifo_trigger_handle;
-    struct QEMUTimer *tx_time_handle;
+    QEMUTimer *fifo_trigger_handle;
+    QEMUTimer *tx_time_handle;
 } UartState;

 static void uart_update_status(UartState *s)
--- a/hw/char/digic-uart.c
+++ b/hw/char/digic-uart.c
@@ -0,0 +1,195 @@
+/*
+ * QEMU model of the Canon DIGIC UART block.
+ *
+ * Copyright (C) 2013 Antony Pavlov <antonynpavlov@gmail.com>
+ *
+ * This model is based on reverse engineering efforts
+ * made by CHDK (http://chdk.wikia.com) and
+ * Magic Lantern (http://www.magiclantern.fm) projects
+ * contributors.
+ *
+ * See "Serial terminal" docs here:
+ *   http://magiclantern.wikia.com/wiki/Register_Map#Misc_Registers
+ *
+ * The QEMU model of the Milkymist UART block by Michael Walle
+ * is used as a template.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include "hw/hw.h"
+#include "hw/sysbus.h"
+#include "sysemu/char.h"
+
+#include "hw/char/digic-uart.h"
+
+enum {
+    ST_RX_RDY = (1 << 0),
+    ST_TX_RDY = (1 << 1),
+};
+
+static uint64_t digic_uart_read(void *opaque, hwaddr addr,
+                                unsigned size)
+{
+    DigicUartState *s = opaque;
+    uint64_t ret = 0;
+
+    addr >>= 2;
+
+    switch (addr) {
+    case R_RX:
+        s->reg_st &= ~(ST_RX_RDY);
+        ret = s->reg_rx;
+        break;
+
+    case R_ST:
+        ret = s->reg_st;
+        break;
+
+    default:
+        qemu_log_mask(LOG_UNIMP,
+                      "digic-uart: read access to unknown register 0x"
+                      TARGET_FMT_plx, addr << 2);
+    }
+
+    return ret;
+}
+
+static void digic_uart_write(void *opaque, hwaddr addr, uint64_t value,
+                             unsigned size)
+{
+    DigicUartState *s = opaque;
+    unsigned char ch = value;
+
+    addr >>= 2;
+
+    switch (addr) {
+    case R_TX:
+        if (s->chr) {
+            qemu_chr_fe_write_all(s->chr, &ch, 1);
+        }
+        break;
+
+    case R_ST:
+        /*
+         * Ignore write to R_ST.
+         *
+         * The point is that this register is actively used
+         * during receiving and transmitting symbols,
+         * but we don't know the function of most of bits.
+         *
+         * Ignoring writes to R_ST is only a simplification
+         * of the model. It has no perceptible side effects
+         * for existing guests.
+         */
+        break;
+
+    default:
+        qemu_log_mask(LOG_UNIMP,
+                      "digic-uart: write access to unknown register 0x"
+                      TARGET_FMT_plx, addr << 2);
+    }
+}
+
+static const MemoryRegionOps uart_mmio_ops = {
+    .read = digic_uart_read,
+    .write = digic_uart_write,
+    .valid = {
+        .min_access_size = 4,
+        .max_access_size = 4,
+    },
+    .endianness = DEVICE_NATIVE_ENDIAN,
+};
+
+static int uart_can_rx(void *opaque)
+{
+    DigicUartState *s = opaque;
+
+    return !(s->reg_st & ST_RX_RDY);
+}
+
+static void uart_rx(void *opaque, const uint8_t *buf, int size)
+{
+    DigicUartState *s = opaque;
+
+    assert(uart_can_rx(opaque));
+
+    s->reg_st |= ST_RX_RDY;
+    s->reg_rx = *buf;
+}
+
+static void uart_event(void *opaque, int event)
+{
+}
+
+static void digic_uart_reset(DeviceState *d)
+{
+    DigicUartState *s = DIGIC_UART(d);
+
+    s->reg_rx = 0;
+    s->reg_st = ST_TX_RDY;
+}
+
+static void digic_uart_realize(DeviceState *dev, Error **errp)
+{
+    DigicUartState *s = DIGIC_UART(dev);
+
+    s->chr = qemu_char_get_next_serial();
+    if (s->chr) {
+        qemu_chr_add_handlers(s->chr, uart_can_rx, uart_rx, uart_event, s);
+    }
+}
+
+static void digic_uart_init(Object *obj)
+{
+    DigicUartState *s = DIGIC_UART(obj);
+
+    memory_region_init_io(&s->regs_region, OBJECT(s), &uart_mmio_ops, s,
+                          TYPE_DIGIC_UART, 0x18);
+    sysbus_init_mmio(SYS_BUS_DEVICE(obj), &s->regs_region);
+}
+
+static const VMStateDescription vmstate_digic_uart = {
+    .name = "digic-uart",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .minimum_version_id_old = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT32(reg_rx, DigicUartState),
+        VMSTATE_UINT32(reg_st, DigicUartState),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static void digic_uart_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+
+    dc->realize = digic_uart_realize;
+    dc->reset = digic_uart_reset;
+    dc->vmsd = &vmstate_digic_uart;
+}
+
+static const TypeInfo digic_uart_info = {
+    .name = TYPE_DIGIC_UART,
+    .parent = TYPE_SYS_BUS_DEVICE,
+    .instance_size = sizeof(DigicUartState),
+    .instance_init = digic_uart_init,
+    .class_init = digic_uart_class_init,
+};
+
+static void digic_uart_register_types(void)
+{
+    type_register_static(&digic_uart_info);
+}
+
+type_init(digic_uart_register_types)
--- a/hw/char/virtio-serial-bus.c
+++ b/hw/char/virtio-serial-bus.c
@@ -670,7 +670,6 @@ static int virtio_serial_load(QEMUFile *f, void *opaque, int version_id)
    uint32_t max_nr_ports, nr_active_ports, ports_map;
    unsigned int i;
    int ret;
-    uint32_t tmp;

    if (version_id > 3) {
        return -EINVAL;
@@ -686,12 +685,17 @@ static int virtio_serial_load(QEMUFile *f, void *opaque, int version_id)
        return 0;
    }

-    /* Unused */
-    qemu_get_be16s(f, (uint16_t *) &tmp);
-    qemu_get_be16s(f, (uint16_t *) &tmp);
-    qemu_get_be32s(f, &tmp);
+    /* The config space */
+    qemu_get_be16s(f, &s->config.cols);
+    qemu_get_be16s(f, &s->config.rows);
+
+    qemu_get_be32s(f, &max_nr_ports);
+    tswap32s(&max_nr_ports);
+    if (max_nr_ports > tswap32(s->config.max_nr_ports)) {
+        /* Source could have had more ports than us. Fail migration. */
+        return -EINVAL;
+    }

-    max_nr_ports = tswap32(s->config.max_nr_ports);
    for (i = 0; i < (max_nr_ports + 31) / 32; i++) {
        qemu_get_be32s(f, &ports_map);

@@ -885,22 +889,24 @@ static int virtser_port_qdev_exit(DeviceState *qdev)
    return 0;
 }

-static int virtio_serial_device_init(VirtIODevice *vdev)
+static void virtio_serial_device_realize(DeviceState *dev, Error **errp)
 {
-    DeviceState *qdev = DEVICE(vdev);
-    VirtIOSerial *vser = VIRTIO_SERIAL(vdev);
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    VirtIOSerial *vser = VIRTIO_SERIAL(dev);
+    BusState *bus;
    uint32_t i, max_supported_ports;

    if (!vser->serial.max_virtserial_ports) {
-        return -1;
+        error_setg(errp, "Maximum number of serial ports not specified");
+        return;
    }

    /* Each port takes 2 queues, and one pair is for the control queue */
    max_supported_ports = VIRTIO_PCI_QUEUE_MAX / 2 - 1;

    if (vser->serial.max_virtserial_ports > max_supported_ports) {
-        error_report("maximum ports supported: %u", max_supported_ports);
-        return -1;
+        error_setg(errp, "maximum ports supported: %u", max_supported_ports);
+        return;
    }

    virtio_init(vdev, "virtio-serial", VIRTIO_ID_CONSOLE,
@@ -908,8 +914,9 @@ static int virtio_serial_device_init(VirtIODevice *vdev)

    /* Spawn a new virtio-serial bus on which the ports will ride as devices */
    qbus_create_inplace(&vser->bus, sizeof(vser->bus), TYPE_VIRTIO_SERIAL_BUS,
-                        qdev, vdev->bus_name);
-    vser->bus.qbus.allow_hotplug = 1;
+                        dev, vdev->bus_name);
+    bus = BUS(&vser->bus);
+    bus->allow_hotplug = 1;
    vser->bus.vser = vser;
    QTAILQ_INIT(&vser->ports);

@@ -957,10 +964,8 @@ static int virtio_serial_device_init(VirtIODevice *vdev)
     * Register for the savevm section with the virtio-console name
     * to preserve backward compat
     */
-    register_savevm(qdev, "virtio-console", -1, 3, virtio_serial_save,
+    register_savevm(dev, "virtio-console", -1, 3, virtio_serial_save,
                    virtio_serial_load, vser);
-
-    return 0;
 }

 static void virtio_serial_port_class_init(ObjectClass *klass, void *data)
@@ -983,11 +988,12 @@ static const TypeInfo virtio_serial_port_type_info = {
    .class_init = virtio_serial_port_class_init,
 };

-static void virtio_serial_device_exit(VirtIODevice *vdev)
+static void virtio_serial_device_unrealize(DeviceState *dev, Error **errp)
 {
-    VirtIOSerial *vser = VIRTIO_SERIAL(vdev);
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    VirtIOSerial *vser = VIRTIO_SERIAL(dev);

-    unregister_savevm(DEVICE(vdev), "virtio-console", vser);
+    unregister_savevm(dev, "virtio-console", vser);

    g_free(vser->ivqs);
    g_free(vser->ovqs);
@@ -1010,10 +1016,11 @@ static void virtio_serial_class_init(ObjectClass *klass, void *data)
 {
    DeviceClass *dc = DEVICE_CLASS(klass);
    VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
+
    dc->props = virtio_serial_properties;
    set_bit(DEVICE_CATEGORY_INPUT, dc->categories);
-    vdc->init = virtio_serial_device_init;
-    vdc->exit = virtio_serial_device_exit;
+    vdc->realize = virtio_serial_device_realize;
+    vdc->unrealize = virtio_serial_device_unrealize;
    vdc->get_features = get_features;
    vdc->get_config = get_config;
    vdc->set_config = set_config;
--- a/hw/core/irq.c
+++ b/hw/core/irq.c
@@ -102,7 +102,7 @@ qemu_irq qemu_irq_invert(qemu_irq irq)
 {
    /* The default state for IRQs is low, so raise the output now.  */
    qemu_irq_raise(irq);
-    return qemu_allocate_irq(qemu_notirq, irq, 0);
+    return qemu_allocate_irqs(qemu_notirq, irq, 1)[0];
 }

 static void qemu_splitirq(void *opaque, int line, int level)
@@ -117,7 +117,7 @@ qemu_irq qemu_irq_split(qemu_irq irq1, qemu_irq irq2)
    qemu_irq *s = g_malloc0(2 * sizeof(qemu_irq));
    s[0] = irq1;
    s[1] = irq2;
-    return qemu_allocate_irq(qemu_splitirq, s, 0);
+    return qemu_allocate_irqs(qemu_splitirq, s, 1)[0];
 }

 static void proxy_irq_handler(void *opaque, int n, int level)
--- a/hw/core/loader.c
+++ b/hw/core/loader.c
@@ -785,6 +785,13 @@ static void rom_reset(void *unused)
            g_free(rom->data);
            rom->data = NULL;
        }
+        /*
+         * The rom loader is really on the same level as firmware in the guest
+         * shadowing a ROM into RAM. Such a shadowing mechanism needs to ensure
+         * that the instruction cache for that new region is clear, so that the
+         * CPU definitely fetches its instructions from the just written data.
+         */
+        cpu_flush_icache_range(rom->addr, rom->datasize);
    }
 }

--- a/hw/cpu/a9mpcore.c
+++ b/hw/cpu/a9mpcore.c
@@ -24,11 +24,14 @@ static void a9mp_priv_initfn(Object *obj)
    memory_region_init(&s->container, obj, "a9mp-priv-container", 0x2000);
    sysbus_init_mmio(SYS_BUS_DEVICE(obj), &s->container);

+    object_initialize(&s->scu, sizeof(s->scu), TYPE_A9_SCU);
+    qdev_set_parent_bus(DEVICE(&s->scu), sysbus_get_default());
+
    object_initialize(&s->gic, sizeof(s->gic), TYPE_ARM_GIC);
    qdev_set_parent_bus(DEVICE(&s->gic), sysbus_get_default());

-    object_initialize(&s->scu, sizeof(s->scu), TYPE_A9_SCU);
-    qdev_set_parent_bus(DEVICE(&s->scu), sysbus_get_default());
+    object_initialize(&s->gtimer, sizeof(s->gtimer), TYPE_A9_GTIMER);
+    qdev_set_parent_bus(DEVICE(&s->gtimer), sysbus_get_default());

    object_initialize(&s->mptimer, sizeof(s->mptimer), TYPE_ARM_MPTIMER);
    qdev_set_parent_bus(DEVICE(&s->mptimer), sysbus_get_default());
@@ -41,11 +44,21 @@ static void a9mp_priv_realize(DeviceState *dev, Error **errp)
 {
    SysBusDevice *sbd = SYS_BUS_DEVICE(dev);
    A9MPPrivState *s = A9MPCORE_PRIV(dev);
-    DeviceState *gicdev, *scudev, *mptimerdev, *wdtdev;
-    SysBusDevice *timerbusdev, *wdtbusdev, *gicbusdev, *scubusdev;
+    DeviceState *scudev, *gicdev, *gtimerdev, *mptimerdev, *wdtdev;
+    SysBusDevice *scubusdev, *gicbusdev, *gtimerbusdev, *mptimerbusdev,
+                 *wdtbusdev;
    Error *err = NULL;
    int i;

+    scudev = DEVICE(&s->scu);
+    qdev_prop_set_uint32(scudev, "num-cpu", s->num_cpu);
+    object_property_set_bool(OBJECT(&s->scu), true, "realized", &err);
+    if (err != NULL) {
+        error_propagate(errp, err);
+        return;
+    }
+    scubusdev = SYS_BUS_DEVICE(&s->scu);
+
    gicdev = DEVICE(&s->gic);
    qdev_prop_set_uint32(gicdev, "num-cpu", s->num_cpu);
    qdev_prop_set_uint32(gicdev, "num-irq", s->num_irq);
@@ -62,14 +75,14 @@ static void a9mp_priv_realize(DeviceState *dev, Error **errp)
    /* Pass through inbound GPIO lines to the GIC */
    qdev_init_gpio_in(dev, a9mp_priv_set_irq, s->num_irq - 32);

-    scudev = DEVICE(&s->scu);
-    qdev_prop_set_uint32(scudev, "num-cpu", s->num_cpu);
-    object_property_set_bool(OBJECT(&s->scu), true, "realized", &err);
+    gtimerdev = DEVICE(&s->gtimer);
+    qdev_prop_set_uint32(gtimerdev, "num-cpu", s->num_cpu);
+    object_property_set_bool(OBJECT(&s->gtimer), true, "realized", &err);
    if (err != NULL) {
        error_propagate(errp, err);
        return;
    }
-    scubusdev = SYS_BUS_DEVICE(&s->scu);
+    gtimerbusdev = SYS_BUS_DEVICE(&s->gtimer);

    mptimerdev = DEVICE(&s->mptimer);
    qdev_prop_set_uint32(mptimerdev, "num-cpu", s->num_cpu);
@@ -78,7 +91,7 @@ static void a9mp_priv_realize(DeviceState *dev, Error **errp)
        error_propagate(errp, err);
        return;
    }
-    timerbusdev = SYS_BUS_DEVICE(&s->mptimer);
+    mptimerbusdev = SYS_BUS_DEVICE(&s->mptimer);

    wdtdev = DEVICE(&s->wdt);
    qdev_prop_set_uint32(wdtdev, "num-cpu", s->num_cpu);
@@ -97,30 +110,33 @@ static void a9mp_priv_realize(DeviceState *dev, Error **errp)
     *  0x0600-0x06ff -- private timers and watchdogs
     *  0x0700-0x0fff -- nothing
     *  0x1000-0x1fff -- GIC Distributor
-     *
-     * We should implement the global timer but don't currently do so.
     */
    memory_region_add_subregion(&s->container, 0,
                                sysbus_mmio_get_region(scubusdev, 0));
    /* GIC CPU interface */
    memory_region_add_subregion(&s->container, 0x100,
                                sysbus_mmio_get_region(gicbusdev, 1));
+    memory_region_add_subregion(&s->container, 0x200,
+                                sysbus_mmio_get_region(gtimerbusdev, 0));
    /* Note that the A9 exposes only the "timer/watchdog for this core"
     * memory region, not the "timer/watchdog for core X" ones 11MPcore has.
     */
    memory_region_add_subregion(&s->container, 0x600,
-                                sysbus_mmio_get_region(timerbusdev, 0));
+                                sysbus_mmio_get_region(mptimerbusdev, 0));
    memory_region_add_subregion(&s->container, 0x620,
                                sysbus_mmio_get_region(wdtbusdev, 0));
    memory_region_add_subregion(&s->container, 0x1000,
                                sysbus_mmio_get_region(gicbusdev, 0));

    /* Wire up the interrupt from each watchdog and timer.
-     * For each core the timer is PPI 29 and the watchdog PPI 30.
+     * For each core the global timer is PPI 27, the private
+     * timer is PPI 29 and the watchdog PPI 30.
     */
    for (i = 0; i < s->num_cpu; i++) {
        int ppibase = (s->num_irq - 32) + i * 32;
-        sysbus_connect_irq(timerbusdev, i,
+        sysbus_connect_irq(gtimerbusdev, i,
+                           qdev_get_gpio_in(gicdev, ppibase + 27));
+        sysbus_connect_irq(mptimerbusdev, i,
                           qdev_get_gpio_in(gicdev, ppibase + 29));
        sysbus_connect_irq(wdtbusdev, i,
                           qdev_get_gpio_in(gicdev, ppibase + 30));
--- a/hw/display/qxl-render.c
+++ b/hw/display/qxl-render.c
@@ -20,6 +20,7 @@
 */

 #include "qxl.h"
+#include "trace.h"

 static void qxl_blit(PCIQXLDevice *qxl, QXLRect *rect)
 {
--- a/hw/display/qxl.c
+++ b/hw/display/qxl.c
@@ -1144,8 +1144,14 @@ static void qxl_soft_reset(PCIQXLDevice *d)

 static void qxl_hard_reset(PCIQXLDevice *d, int loadvm)
 {
+    bool startstop = qemu_spice_display_is_running(&d->ssd);
+
    trace_qxl_hard_reset(d->id, loadvm);

+    if (startstop) {
+        qemu_spice_display_stop();
+    }
+
    qxl_spice_reset_cursor(d);
    qxl_spice_reset_image_cache(d);
    qxl_reset_surfaces(d);
@@ -1159,6 +1165,10 @@ static void qxl_hard_reset(PCIQXLDevice *d, int loadvm)
    }
    qemu_spice_create_host_memslot(&d->ssd);
    qxl_soft_reset(d);
+
+    if (startstop) {
+        qemu_spice_display_start();
+    }
 }

 static void qxl_reset_handler(DeviceState *dev)
--- a/hw/display/ssd0323.c
+++ b/hw/display/ssd0323.c
@@ -312,42 +312,18 @@ static int ssd0323_load(QEMUFile *f, void *opaque, int version_id)
        return -EINVAL;

    s->cmd_len = qemu_get_be32(f);
-    if (s->cmd_len < 0 || s->cmd_len > ARRAY_SIZE(s->cmd_data)) {
-        return -EINVAL;
-    }
    s->cmd = qemu_get_be32(f);
    for (i = 0; i < 8; i++)
        s->cmd_data[i] = qemu_get_be32(f);
    s->row = qemu_get_be32(f);
-    if (s->row < 0 || s->row >= 80) {
-        return -EINVAL;
-    }
    s->row_start = qemu_get_be32(f);
-    if (s->row_start < 0 || s->row_start >= 80) {
-        return -EINVAL;
-    }
    s->row_end = qemu_get_be32(f);
-    if (s->row_end < 0 || s->row_end >= 80) {
-        return -EINVAL;
-    }
    s->col = qemu_get_be32(f);
-    if (s->col < 0 || s->col >= 64) {
-        return -EINVAL;
-    }
    s->col_start = qemu_get_be32(f);
-    if (s->col_start < 0 || s->col_start >= 64) {
-        return -EINVAL;
-    }
    s->col_end = qemu_get_be32(f);
-    if (s->col_end < 0 || s->col_end >= 64) {
-        return -EINVAL;
-    }
    s->redraw = qemu_get_be32(f);
    s->remap = qemu_get_be32(f);
    s->mode = qemu_get_be32(f);
-    if (s->mode != SSD0323_CMD && s->mode != SSD0323_DATA) {
-        return -EINVAL;
-    }
    qemu_get_buffer(f, s->framebuffer, sizeof(s->framebuffer));

    ss->cs = qemu_get_be32(f);
--- a/hw/display/vmware_vga.c
+++ b/hw/display/vmware_vga.c
@@ -23,6 +23,7 @@
 */
 #include "hw/hw.h"
 #include "hw/loader.h"
+#include "trace.h"
 #include "ui/console.h"
 #include "hw/pci/pci.h"

--- a/hw/dma/omap_dma.c
+++ b/hw/dma/omap_dma.c
@@ -1660,7 +1660,7 @@ struct soc_dma_s *omap_dma_init(hwaddr base, qemu_irq *irqs,
    }

    omap_dma_setcaps(s);
-    omap_clk_adduser(s->clk, qemu_allocate_irq(omap_dma_clk_update, s, 0));
+    omap_clk_adduser(s->clk, qemu_allocate_irqs(omap_dma_clk_update, s, 1)[0]);
    omap_dma_reset(s->dma);
    omap_dma_clk_update(s, 0, 1);

@@ -2082,7 +2082,7 @@ struct soc_dma_s *omap_dma4_init(hwaddr base, qemu_irq *irqs,
    s->intr_update = omap_dma_interrupts_4_update;

    omap_dma_setcaps(s);
-    omap_clk_adduser(s->clk, qemu_allocate_irq(omap_dma_clk_update, s, 0));
+    omap_clk_adduser(s->clk, qemu_allocate_irqs(omap_dma_clk_update, s, 1)[0]);
    omap_dma_reset(s->dma);
    omap_dma_clk_update(s, 0, !!s->dma->freq);

--- a/hw/gpio/zaurus.c
+++ b/hw/gpio/zaurus.c
@@ -203,15 +203,6 @@ static bool is_version_0 (void *opaque, int version_id)
    return version_id == 0;
 }

-static bool vmstate_scoop_validate(void *opaque, int version_id)
-{
-    ScoopInfo *s = opaque;
-
-    return !(s->prev_level & 0xffff0000) &&
-        !(s->gpio_level & 0xffff0000) &&
-        !(s->gpio_dir & 0xffff0000);
-}
-
 static const VMStateDescription vmstate_scoop_regs = {
    .name = "scoop",
    .version_id = 1,
@@ -224,7 +215,6 @@ static const VMStateDescription vmstate_scoop_regs = {
        VMSTATE_UINT32(gpio_level, ScoopInfo),
        VMSTATE_UINT32(gpio_dir, ScoopInfo),
        VMSTATE_UINT32(prev_level, ScoopInfo),
-        VMSTATE_VALIDATE("irq levels are 16 bit", vmstate_scoop_validate),
        VMSTATE_UINT16(mcr, ScoopInfo),
        VMSTATE_UINT16(cdr, ScoopInfo),
        VMSTATE_UINT16(ccr, ScoopInfo),
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -285,7 +285,8 @@ static inline void build_append_array(GArray *array, GArray *val)
    g_array_append_vals(array, val->data, val->len);
 }

-static void build_append_nameseg(GArray *array, const char *format, ...)
+static void GCC_FMT_ATTR(2, 3)
+build_append_nameseg(GArray *array, const char *format, ...)
 {
    /* It would be nicer to use g_string_vprintf but it's only there in 2.22 */
    char s[] = "XXXX";
@@ -630,7 +631,7 @@ build_append_notify(GArray *device, const char *name,
    GArray *method = build_alloc_array();
    uint8_t op = 0x14; /* MethodOp */

-    build_append_nameseg(method, name);
+    build_append_nameseg(method, "%s", name);
    build_append_byte(method, 0x02); /* MethodFlags: ArgCount */
    for (i = skip; i < count; i++) {
        GArray *target = build_alloc_array();
@@ -923,10 +924,16 @@ build_mcfg_q35(GArray *table_data, GArray *linker, AcpiMcfgInfo *info)
 static void
 build_dsdt(GArray *table_data, GArray *linker, AcpiMiscInfo *misc)
 {
-    void *dsdt;
+    AcpiTableHeader *dsdt;
+
    assert(misc->dsdt_code && misc->dsdt_size);
+
    dsdt = acpi_data_push(table_data, misc->dsdt_size);
    memcpy(dsdt, misc->dsdt_code, misc->dsdt_size);
+
+    memset(dsdt, 0, sizeof *dsdt);
+    build_header(linker, table_data, dsdt, ACPI_DSDT_SIGNATURE,
+                 misc->dsdt_size, 1);
 }

 /* Build final rsdt table */
@@ -1075,16 +1082,15 @@ void acpi_build(PcGuestInfo *guest_info, AcpiBuildTables *tables)
    /* ACPI tables pointed to by RSDT */
    acpi_add_table(table_offsets, tables->table_data);
    build_fadt(tables->table_data, tables->linker, &pm, facs, dsdt);
-
    acpi_add_table(table_offsets, tables->table_data);
+
    build_ssdt(tables->table_data, tables->linker, &cpu, &pm, &misc, &pci,
               guest_info);
-
    acpi_add_table(table_offsets, tables->table_data);
-    build_madt(tables->table_data, tables->linker, &cpu, guest_info);

+    build_madt(tables->table_data, tables->linker, &cpu, guest_info);
+    acpi_add_table(table_offsets, tables->table_data);
    if (misc.has_hpet) {
-        acpi_add_table(table_offsets, tables->table_data);
        build_hpet(tables->table_data, tables->linker);
    }
    if (guest_info->numa_nodes) {
--- a/hw/i386/acpi-dsdt.dsl
+++ b/hw/i386/acpi-dsdt.dsl
@@ -235,7 +235,7 @@ DefinitionBlock (
            }
            Return (0x0B)
        }
-        Method(IQCR, 1, NotSerialized) {
+        Method(IQCR, 1, Serialized) {
            // _CRS method - get current settings
            Name(PRR0, ResourceTemplate() {
                Interrupt(, Level, ActiveHigh, Shared) { 0 }
--- a/hw/i386/acpi-dsdt.hex.generated
+++ b/hw/i386/acpi-dsdt.hex.generated
@@ -8,7 +8,7 @@ static unsigned char AcpiDsdtAmlCode[] = {
 0x0,
 0x0,
 0x1,
-0xe0,
+0xd8,
 0x42,
 0x58,
 0x50,
@@ -3379,7 +3379,7 @@ static unsigned char AcpiDsdtAmlCode[] = {
 0x51,
 0x43,
 0x52,
-0x1,
+0x9,
 0x8,
 0x50,
 0x52,
--- a/hw/i386/bios-linker-loader.c
+++ b/hw/i386/bios-linker-loader.c
@@ -18,10 +18,11 @@
 * with this program; if not, see <http://www.gnu.org/licenses/>.
 */

-#include "qemu-common.h"
 #include "bios-linker-loader.h"
 #include "hw/nvram/fw_cfg.h"

+#include <string.h>
+#include <assert.h>
 #include "qemu/bswap.h"

 #define BIOS_LINKER_LOADER_FILESZ FW_CFG_MAX_FILE_PATH
--- a/hw/i386/kvm/pci-assign.c
+++ b/hw/i386/kvm/pci-assign.c
@@ -1257,7 +1257,6 @@ static int assigned_device_pci_cap_init(PCIDevice *pci_dev)
    if (pos != 0 && kvm_device_msix_supported(kvm_state)) {
        int bar_nr;
        uint32_t msix_table_entry;
-        uint16_t msix_max;

        if (!check_irqchip_in_kernel()) {
            return -ENOTSUP;
@@ -1269,10 +1268,9 @@ static int assigned_device_pci_cap_init(PCIDevice *pci_dev)
        }
        pci_dev->msix_cap = pos;

-        msix_max = (pci_get_word(pci_dev->config + pos + PCI_MSIX_FLAGS) &
-                    PCI_MSIX_FLAGS_QSIZE) + 1;
-        msix_max = MIN(msix_max, KVM_MAX_MSIX_PER_DEV);
-        pci_set_word(pci_dev->config + pos + PCI_MSIX_FLAGS, msix_max - 1);
+        pci_set_word(pci_dev->config + pos + PCI_MSIX_FLAGS,
+                     pci_get_word(pci_dev->config + pos + PCI_MSIX_FLAGS) &
+                     PCI_MSIX_FLAGS_QSIZE);

        /* Only enable and function mask bits are writable */
        pci_set_word(pci_dev->wmask + pos + PCI_MSIX_FLAGS,
@@ -1282,7 +1280,9 @@ static int assigned_device_pci_cap_init(PCIDevice *pci_dev)
        bar_nr = msix_table_entry & PCI_MSIX_FLAGS_BIRMASK;
        msix_table_entry &= ~PCI_MSIX_FLAGS_BIRMASK;
        dev->msix_table_addr = pci_region[bar_nr].base_addr + msix_table_entry;
-        dev->msix_max = msix_max;
+        dev->msix_max = pci_get_word(pci_dev->config + pos + PCI_MSIX_FLAGS);
+        dev->msix_max &= PCI_MSIX_FLAGS_QSIZE;
+        dev->msix_max += 1;
    }

    /* Minimal PM support, nothing writable, device appears to NAK changes */
--- a/Show More
+++ b/Show More
@@ -1 +1 @@
 .7.2
 .7.50