Change qemu_config_parse() to return the number of config groups
on success and -EINVAL on error. This will allow callers of
qemu_config_parse() to check if something was really loaded from
the config file.
All existing callers of qemu_config_parse() and
qemu_read_config_file() only check if the return value was
negative, so the change shouldn't affect them.
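As a hedged illustration (not part of this patch), a caller could now tell "parse error" apart from "file parsed but empty"; the variable name and the warn_report() wording below are assumptions:

    int ret = qemu_read_config_file(fname);
    if (ret < 0) {
        /* -EINVAL (or another negative errno): nothing usable was parsed */
        error_report("failed to read config file %s", fname);
        exit(1);
    } else if (ret == 0) {
        /* assumption: the caller wants to know the file had no groups */
        warn_report("config file %s contained no configuration groups", fname);
    }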
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Message-Id: <20171004025043.3788-2-ehabkost@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
This patch adds a MachineClass element that can be set in the machine C
code to specify a list of supported CPU types. If the supported CPU
types are specified, the CPU entered by the user (via -cpu at runtime) is
checked against the supported types and QEMU exits if it isn't supported.
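A minimal sketch of how a board might use this, assuming the MachineClass field is called valid_cpu_types; the field name, function name and CPU type strings here are illustrative:

    static const char *board_valid_cpu_types[] = {
        ARM_CPU_TYPE_NAME("cortex-m3"),
        ARM_CPU_TYPE_NAME("cortex-m4"),
        NULL
    };

    static void board_machine_class_init(ObjectClass *oc, void *data)
    {
        MachineClass *mc = MACHINE_CLASS(oc);

        /* -cpu values not in this NULL-terminated list make QEMU exit */
        mc->valid_cpu_types = board_valid_cpu_types;
    }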
Signed-off-by: Alistair Francis <alistair.francis@xilinx.com>
Message-Id: <b8474e9d2e0a219d9bac901342f983b13d009301.1507059418.git.alistair.francis@xilinx.com>
[ehabkost: removed assert(), rewrote comment]
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Block layer patches
# gpg: Signature made Fri 06 Oct 2017 16:52:59 BST
# gpg: using RSA key 0x7F09B272C88F2FD6
# gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>"
# Primary key fingerprint: DC3D EB15 9A9A F95D 3D74 56FE 7F09 B272 C88F 2FD6
* remotes/kevin/tags/for-upstream: (54 commits)
block/mirror: check backing in bdrv_mirror_top_flush
qcow2: truncate the tail of the image file after shrinking the image
qcow2: fix return error code in qcow2_truncate()
iotests: Fix 195 if IMGFMT is part of TEST_DIR
block/mirror: check backing in bdrv_mirror_top_refresh_filename
block: support passthrough of BDRV_REQ_FUA in crypto driver
block: convert qcrypto_block_encrypt|decrypt to take bytes offset
block: convert crypto driver to bdrv_co_preadv|pwritev
block: fix data type casting for crypto payload offset
crypto: expose encryption sector size in APIs
block: use 1 MB bounce buffers for crypto instead of 16KB
iotests: Add test 197 for covering copy-on-read
block: Perform copy-on-read in loop
block: Add blkdebug hook for copy-on-read
iotests: Restore stty settings on completion
block: Uniform handling of 0-length bdrv_get_block_status()
qemu-io: Add -C for opening with copy-on-read
commit: Remove overlay_bs
qemu-iotests: Test commit block job where top has two parents
qemu-iotests: Allow QMP pretty printing in common.qemu
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
target-arm:
* v8M: more preparatory work
* nvic: reset properly rather than leaving the nvic in a weird state
* xlnx-zynqmp: Mark the "xlnx, zynqmp" device with user_creatable = false
* sd: fix out-of-bounds check for multi block reads
* arm: Fix SMC reporting to EL2 when QEMU provides PSCI
# gpg: Signature made Fri 06 Oct 2017 16:58:15 BST
# gpg: using RSA key 0x3C2525ED14360CDE
# gpg: Good signature from "Peter Maydell <peter.maydell@linaro.org>"
# gpg: aka "Peter Maydell <pmaydell@gmail.com>"
# gpg: aka "Peter Maydell <pmaydell@chiark.greenend.org.uk>"
# Primary key fingerprint: E1A5 C593 CD41 9DE2 8E83 15CF 3C25 25ED 1436 0CDE
* remotes/pmaydell/tags/pull-target-arm-20171006:
nvic: Add missing code for writing SHCSR.HARDFAULTPENDED bit
target/arm: Factor out "get mmuidx for specified security state"
target/arm: Fix calculation of secure mm_idx values
target/arm: Implement security attribute lookups for memory accesses
nvic: Implement Security Attribution Unit registers
target/arm: Add v8M support to exception entry code
target/arm: Add support for restoring v8M additional state context
target/arm: Update excret sanity checks for v8M
target/arm: Add new-in-v8M SFSR and SFAR
target/arm: Don't warn about exception return with PC low bit set for v8M
target/arm: Warn about restoring to unaligned stack
target/arm: Check for xPSR mismatch usage faults earlier for v8M
target/arm: Restore SPSEL to correct CONTROL register on exception return
target/arm: Restore security state on exception return
target/arm: Prepare for CONTROL.SPSEL being nonzero in Handler mode
target/arm: Don't switch to target stack early in v7M exception return
nvic: Clear the vector arrays and prigroup on reset
hw/arm/xlnx-zynqmp: Mark the "xlnx, zynqmp" device with user_creatable = false
hw/sd: fix out-of-bounds check for multi block reads
arm: Fix SMC reporting to EL2 when QEMU provides PSCI
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
For the SG instruction and secure function return we are going
to want to do memory accesses using the MMU index of the CPU
in secure state, even though the CPU is currently in non-secure
state. Write arm_v7m_mmu_idx_for_secstate() to do this job,
and use it in cpu_mmu_index().
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1506092407-26985-17-git-send-email-peter.maydell@linaro.org
In cpu_mmu_index() we try to do this:
    if (env->v7m.secure) {
        mmu_idx += ARMMMUIdx_MSUser;
    }
but it will give the wrong answer, because ARMMMUIdx_MSUser
includes the 0x40 ARM_MMU_IDX_M field, and so does the
mmu_idx we're adding to, and we'll end up with 0x8n rather
than 0x4n. This error is then nullified by the call to
arm_to_core_mmu_idx() which masks out the high part, but
we're about to factor out the code that calculates the
ARMMMUIdx values so it can be used without passing it through
arm_to_core_mmu_idx(), so fix this bug first.
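A standalone illustration of the arithmetic (the 0x40 flag value is ARM_MMU_IDX_M as described above; the enum values are simplified, not the exact cpu.h definitions):

    #include <stdio.h>

    #define ARM_MMU_IDX_M 0x40          /* "this is an M-profile index" flag */

    enum { MMUIdx_MUser  = 0 | ARM_MMU_IDX_M,
           MMUIdx_MSUser = 3 | ARM_MMU_IDX_M };   /* simplified numbering */

    int main(void)
    {
        int mmu_idx = MMUIdx_MUser;     /* already carries the 0x40 flag */

        /* buggy: both operands carry the flag, so it is added twice */
        printf("buggy: 0x%x\n", mmu_idx + MMUIdx_MSUser);                     /* 0x83 */
        /* fixed: add only the index part, keeping a single 0x40 flag */
        printf("fixed: 0x%x\n", mmu_idx + (MMUIdx_MSUser & ~ARM_MMU_IDX_M));  /* 0x43 */
        return 0;
    }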
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1506092407-26985-16-git-send-email-peter.maydell@linaro.org
Implement the security attribute lookups for memory accesses
in the get_phys_addr() functions, causing these to generate
various kinds of SecureFault for bad accesses.
The major subtlety in this code relates to handling of the
case when the security attributes the SAU assigns to the
address don't match the current security state of the CPU.
In the ARM ARM pseudocode for validating instruction
accesses, the security attributes of the address determine
whether the Secure or NonSecure MPU state is used. At face
value, handling this would require us to encode the relevant
bits of state into mmu_idx for both S and NS at once, which
would result in our needing 16 mmu indexes. Fortunately we
don't actually need to do this because a mismatch between
address attributes and CPU state means either:
* some kind of fault (usually a SecureFault, but in theory
perhaps a UserFault for unaligned access to Device memory)
* execution of the SG instruction in NS state from a
Secure & NonSecure code region
The purpose of SG is simply to flip the CPU into Secure
state, so we can handle it by emulating execution of that
instruction directly in arm_v7m_cpu_do_interrupt(), which
means we can treat all the mismatch cases as "throw an
exception" and we don't need to encode the state of the
other MPU bank into our mmu_idx values.
This commit doesn't include the actual emulation of SG;
it also doesn't include implementation of the IDAU, which
is a per-board way to specify hard-coded memory attributes
for addresses, which override the CPU-internal SAU if they
specify a more secure setting than the SAU is programmed to.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1506092407-26985-15-git-send-email-peter.maydell@linaro.org
Implement the register interface for the SAU: SAU_CTRL,
SAU_TYPE, SAU_RNR, SAU_RBAR and SAU_RLAR. None of the
actual behaviour is implemented here; registers just
read back as written.
When the CPU definition for Cortex-M33 is eventually
added, its initfn will set cpu->sau_sregion, in the same
way that we currently set cpu->pmsav7_dregion for the
M3 and M4.
Number of SAU regions is typically a configurable
CPU parameter, but this patch doesn't provide a
QEMU CPU property for it. We can easily add one when
we have a board that requires it.
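A hedged sketch of what "read back as written" means for this register block; the offsets follow the ARMv8-M SCB layout, but the switch shape and field names (cpu->sau_ctrl and friends) are illustrative rather than the exact nvic.c code:

    switch (offset) {
    case 0xdd0: /* SAU_CTRL: only ENABLE and ALLNS are writable */
        cpu->sau_ctrl = value & 3;
        break;
    case 0xdd8: /* SAU_RNR: selects which region RBAR/RLAR refer to */
        cpu->sau_rnr = value;   /* a real version bounds this by sau_sregion */
        break;
    case 0xddc: /* SAU_RBAR: stored only, no behavioural effect yet */
        cpu->sau_rbar[cpu->sau_rnr] = value & ~0x1f;
        break;
    }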
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1506092407-26985-14-git-send-email-peter.maydell@linaro.org
Add support for v8M and in particular the security extension
to the exception entry code. This requires changes to:
* calculation of the exception-return magic LR value
* push the callee-saves registers in certain cases
* clear registers when taking non-secure exceptions to avoid
leaking information from the interrupted secure code
* switch to the correct security state on entry
* use the vector table for the security state we're targeting
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1506092407-26985-13-git-send-email-peter.maydell@linaro.org
Attempting to do an exception return with an exception frame that
is not 8-aligned is UNPREDICTABLE in v8M; warn about this.
(It is not UNPREDICTABLE in v7M, and our implementation can
handle the merely-4-aligned case fine, so we don't need to
do anything except warn.)
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1506092407-26985-8-git-send-email-peter.maydell@linaro.org
ARM v8M specifies that the INVPC usage fault for mismatched
xPSR exception field and handler mode bit should be checked
before updating the PSR and SP, so that the fault is taken
with the existing stack frame rather than by pushing a new one.
Perform this check in the right place for v8M.
Since v7M specifies in its pseudocode that this usage fault
check should happen later, we have to retain the original
code for that check rather than being able to merge the two.
(The distinction is architecturally visible but only in
very obscure corner cases like attempting an invalid exception
return with an exception frame in read only memory.)
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1506092407-26985-7-git-send-email-peter.maydell@linaro.org
On exception return for v8M, the SPSEL bit in the EXC_RETURN magic
value should be restored to the SPSEL bit in the CONTROL register
banked specified by the EXC_RETURN.ES bit.
Add write_v7m_control_spsel_for_secstate() which behaves like
write_v7m_control_spsel() but allows the caller to specify which
CONTROL bank to use, reimplement write_v7m_control_spsel() in
terms of it, and use it in exception return.
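A hedged sketch of the refactoring shape described above (bodies elided; only the relationship between the two helpers is shown):

    static void write_v7m_control_spsel_for_secstate(CPUARMState *env,
                                                     bool new_spsel,
                                                     bool secstate)
    {
        /* update CONTROL.SPSEL in the requested bank, and only switch the
         * live stack pointer if that bank matches the current security state
         */
    }

    static void write_v7m_control_spsel(CPUARMState *env, bool new_spsel)
    {
        /* the old entry point becomes a thin wrapper over the new helper */
        write_v7m_control_spsel_for_secstate(env, new_spsel, env->v7m.secure);
    }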
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1506092407-26985-6-git-send-email-peter.maydell@linaro.org
Now that we can handle the CONTROL.SPSEL bit not necessarily being
in sync with the current stack pointer, we can restore the correct
security state on exception return. This happens before we start
to read registers off the stack frame, but after we have taken
possible usage faults for bad exception return magic values and
updated CONTROL.SPSEL.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1506092407-26985-5-git-send-email-peter.maydell@linaro.org
In the v7M architecture, there is an invariant that if the CPU is
in Handler mode then the CONTROL.SPSEL bit cannot be nonzero.
This in turn means that the current stack pointer is always
indicated by CONTROL.SPSEL, even though Handler mode always uses
the Main stack pointer.
In v8M, this invariant is removed, and CONTROL.SPSEL may now
be nonzero in Handler mode (though Handler mode still always
uses the Main stack pointer). In preparation for this change,
change how we handle this bit: rename switch_v7m_sp() to
the now more accurate write_v7m_control_spsel(), and make it
check both the handler mode state and the SPSEL bit.
Note that this implicitly changes the point at which we switch
active SP on exception exit from before we pop the exception
frame to after it.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1506092407-26985-4-git-send-email-peter.maydell@linaro.org
Currently our M profile exception return code switches to the
target stack pointer relatively early in the process, before
it tries to pop the exception frame off the stack. This is
awkward for v8M for two reasons:
* in v8M the process vs main stack pointer is not selected
purely by the value of CONTROL.SPSEL, so updating SPSEL
and relying on that to switch to the right stack pointer
won't work
* the stack we should be reading the stack frame from and
the stack we will eventually switch to might not be the
same if the guest is doing strange things
Change our exception return code to use a 'frame pointer'
to read the exception frame rather than assuming that we
can switch the live stack pointer this early.
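A hedged sketch of the "frame pointer" shape; the variable names are illustrative and this is not the literal helper.c diff:

    uint32_t *frame_sp_p = &env->regs[13];  /* placeholder: in the real code this
                                             * points at whichever banked SP the
                                             * frame should be read from */
    uint32_t frameptr = *frame_sp_p;

    /* ... pop r0-r3, r12, lr, pc and xPSR through frameptr ... */

    /* only now commit the change by advancing the stored stack pointer */
    *frame_sp_p = frameptr + 0x20;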
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1506092407-26985-3-git-send-email-peter.maydell@linaro.org
Reset for devices does not include an automatic clear of the
device state (unlike CPU state, where most of the state
structure is cleared to zero). Add some missing initialization
of NVIC state that meant that the device was left in the wrong
state if the guest did a warm reset.
(In particular, since we were resetting the computed state like
s->exception_prio but not all the state it was computed
from like s->vectors[x].active, the NVIC wound up in an
inconsistent state that could later trigger assertion failures.)
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-id: 1506092407-26985-2-git-send-email-peter.maydell@linaro.org
The device uses serial_hds in its realize function and thus can't be
used twice. Apart from that, the comma in its name makes it quite hard
to use for the user anyway, since a comma is normally used to separate
the device name from its properties when using the "-device" parameter
or the "device_add" HMP command.
Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Alistair Francis <alistair.francis@xilinx.com>
Message-id: 1506441116-16627-1-git-send-email-thuth@redhat.com
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Block patches
# gpg: Signature made Fri Oct 6 16:30:57 2017 CEST
# gpg: using RSA key F407DB0061D5CF40
# gpg: Good signature from "Max Reitz <mreitz@redhat.com>"
# Primary key fingerprint: 91BE B60A 30DB 3E88 57D1 1829 F407 DB00 61D5 CF40
* mreitz/tags/pull-block-2017-10-06:
block/mirror: check backing in bdrv_mirror_top_flush
qcow2: truncate the tail of the image file after shrinking the image
qcow2: fix return error code in qcow2_truncate()
iotests: Fix 195 if IMGFMT is part of TEST_DIR
block/mirror: check backing in bdrv_mirror_top_refresh_filename
block: support passthrough of BDRV_REQ_FUA in crypto driver
block: convert qcrypto_block_encrypt|decrypt to take bytes offset
block: convert crypto driver to bdrv_co_preadv|pwritev
block: fix data type casting for crypto payload offset
crypto: expose encryption sector size in APIs
block: use 1 MB bounce buffers for crypto instead of 16KB
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
do_run_qemu() in iotest 195 first applies _filter_imgfmt when printing
qemu's command line and _filter_testdir only afterwards. Therefore, if
the image format is part of the test directory path, _filter_testdir
will no longer apply and the actual output will differ from the
reference output even in case of success.
For example, TEST_DIR might be "/tmp/test-qcow2", in which case
_filter_imgfmt first transforms this to "/tmp/test-IMGFMT" which is no
longer recognized as the TEST_DIR by _filter_testdir.
Fix this by not applying _filter_imgfmt in do_run_qemu() but in
run_qemu() instead, and only after _filter_testdir.
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20170927211334.3988-1-mreitz@redhat.com
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
Make the crypto driver implement the bdrv_co_preadv|pwritev
callbacks, and also use bdrv_co_preadv|pwritev for I/O
with the protocol driver beneath. This replaces sector based
I/O with byte based I/O, and allows us to stop assuming the
physical sector size matches the encryption sector size.
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170927125340.12360-5-berrange@redhat.com
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
The crypto APIs report the offset of the data payload as a uint64_t
type, but the block driver is casting to size_t or ssize_t, which will
potentially truncate.
Meanwhile, most of the block APIs use int64_t for offsets, so even when
using uint64_t in the crypto block driver we are still at risk of
truncation.
Change the block crypto driver to use uint64_t, but add asserts that
the value is less than INT64_MAX.
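A hedged sketch of the resulting pattern in the block crypto driver; the accessor name and the surrounding variables are assumptions for illustration:

    uint64_t payload_offset = qcrypto_block_get_payload_offset(crypto->block);

    /* the crypto layer reports bytes as uint64_t; the block layer uses
     * int64_t, so guard against values that would not fit */
    assert(payload_offset < INT64_MAX);

    int64_t file_offset = offset + payload_offset;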
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170927125340.12360-4-berrange@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
While current encryption schemes all have a fixed sector size of
512 bytes, this is not guaranteed to be the case in future. Expose
the sector size in the APIs so the block layer can remove assumptions
about fixed 512 byte sectors.
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170927125340.12360-3-berrange@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
Using 16KB bounce buffers creates a significant performance
penalty for I/O to encrypted volumes on storage with high
I/O latency (rotating rust & network drives), because it
triggers lots of fairly small I/O operations.
On tests with rotating rust, and cache=none|directsync,
write speed increased from 2MiB/s to 32MiB/s, on a par
with that achieved by the in-kernel luks driver. With
other cache modes the in-kernel driver is still notably
faster because it is able to report completion of the
I/O request before any encryption is done, while the
in-QEMU driver must encrypt the data before completion.
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170927125340.12360-2-berrange@redhat.com
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
Add a test for qcow2 copy-on-read behavior, including exposure
for the just-fixed bugs.
The copy-on-read behavior is always to a qcow2 image, but the
test is careful to allow running with most image protocol/format
combos as the backing file being copied from (luks being the
exception, as it is harder to pass the right secret to all the
right places). In fact, for './check nbd', this appears to be
the first time we've had a qcow2 image wrapping NBD, requiring
an additional line in _filter_img_create to match the similar
line in _filter_img_info.
Invoking blkdebug to prove we don't write too much took some
effort to get working; and it requires that $TEST_WRAP (based
on $TEST_DIR) not be subject to word splitting. We may decide
later to have the entire iotests suite use relative rather than
absolute names, to avoid problems inherited by the absolute
name of $PWD or $TEST_DIR, at which point the sanity check in
this commit could be simplified.
This test requires at least 2G of consecutive memory to succeed;
as such, it is prone to spurious failures, particularly on
32-bit machines under load. This situation is detected and
triggers an early exit to skip the test, rather than a failure.
To manually provoke this setup on a beefier machine, I used:
$ (ulimit -S -v 1000000; ./check -qcow2 197)
Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Improve our braindead copy-on-read implementation. Pre-patch,
we have multiple issues:
- we create a bounce buffer and perform a write for the entire
request, even if the active image already has 99% of the
clusters occupied, and really only needs to copy-on-read the
remaining 1% of the clusters
- our bounce buffer is as large as the read request, and can
needlessly exhaust our memory by using double the memory of
the request size (the original request plus our bounce buffer),
rather than a capped maximum overhead beyond the original
- if a driver has a max_transfer limit, we are bypassing the
normal code in bdrv_aligned_preadv() that fragments to that
limit, and instead attempt to read the entire buffer from the
driver in one go, which some drivers may assert on
- a client can request a large request of nearly 2G such that
rounding the request out to cluster boundaries results in a
byte count larger than 2G. While this cannot exceed 32 bits,
it DOES have some follow-on problems:
-- the call to bdrv_driver_pread() can assert for exceeding
BDRV_REQUEST_MAX_BYTES, if the driver is old and lacks
.bdrv_co_preadv
-- if the buffer is all zeroes, the subsequent call to
bdrv_co_do_pwrite_zeroes is a no-op due to a negative size,
which means we did not actually copy on read
Fix all of these issues by breaking up the action into a loop,
where each iteration is capped to sane limits. Also, querying
the allocation status allows us to optimize: when data is
already present in the active layer, we don't need to bounce.
Note that the code has a telling comment that copy-on-read
should probably be a filter driver rather than a bolt-on hack
in io.c; but that remains a task for another day.
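A hedged sketch of the loop shape described above; MAX_BOUNCE_BUFFER and is_range_allocated() are illustrative placeholders rather than the actual io.c symbols:

    int64_t cluster_offset = QEMU_ALIGN_DOWN(offset, cluster_size);
    int64_t cluster_bytes = QEMU_ALIGN_UP(offset + bytes, cluster_size)
                            - cluster_offset;

    while (cluster_bytes > 0) {
        int64_t pnum;
        int64_t chunk = MIN(cluster_bytes, MAX_BOUNCE_BUFFER);

        if (is_range_allocated(bs, cluster_offset, chunk, &pnum)) {
            /* already present in the top layer: nothing to copy */
        } else {
            /* read at most pnum bytes into a bounce buffer capped at
             * MAX_BOUNCE_BUFFER, then write it back (or write zeroes)
             * to complete the copy-on-read for this chunk */
        }
        cluster_offset += pnum;
        cluster_bytes -= pnum;
    }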
CC: qemu-stable@nongnu.org
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Executing qemu with a terminal as stdin will temporarily alter stty
settings on that terminal (for example, disabling echo), because of
how we run both the monitor and any multiplexing with guest input.
Normally, qemu restores the original settings on exit; but if an
iotest triggers qemu to abort in the middle, we can be left with
the altered terminal setup. This can make life very annoying when
debugging an iotest failure (not everyone remembers the trick of
blind-typing 'stty sane' without echo, and some people prefer
terminal settings that are slightly different than the defaults
picked by 'stty sane').
It is possible to avoid qemu corrupting the terminal by not passing
a terminal to qemu's stdin in the first place (as in, use
'./check ... </dev/null'), but that's extra typing to have to
remember. But running 'exec </dev/null' in the harness seems like
it might be too heavy a hammer. So I instead went with the
solution of saving and restoring the stty settings, only when the
harness detects that it is run interactively.
I tested this patch by forcing an allocation failure (I can't
guarantee that this particular limit will work on all setups, but
it shows the idea):
$ (ulimit -S -v 500000; ./check -qcow2 1)
Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Handle a 0-length block status request up front, with a uniform
return value claiming the area is not allocated.
Most callers don't pass a length of 0 to bdrv_get_block_status()
and friends; but it definitely happens with a 0-length read when
copy-on-read is enabled. While we could audit all callers to
ensure that they never make a 0-length request, and then assert
that fact, it was just as easy to fix things to always report
success (as long as the callers are careful to not go into an
infinite loop). However, we had inconsistent behavior on whether
the status is reported as allocated or defers to the backing
layer, depending on what callbacks the driver implements, and
possibly wasting quite a few CPU cycles to get to that answer.
Consistently reporting unallocated up front doesn't really hurt
anything, and makes it easier both for callers (0-length requests
now have well-defined behavior) and for drivers (drivers don't
have to deal with 0-length requests).
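A hedged sketch of the up-front check (still sector-based at this point in the series; the exact return flags are simplified):

    if (!nb_sectors) {
        /* uniform answer for empty requests: nothing allocated here,
         * and nothing to defer to the backing layer */
        *pnum = 0;
        return 0;
    }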
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
We don't need to make any assumptions about the graph layout above the
top node of the commit operation any more. Remove the use of
bdrv_find_overlay() and related variables from the commit job code.
bdrv_drop_intermediate() doesn't use the 'active' parameter any more, so
we can just drop it.
The overlay node was previously added to the block job to get a
BLK_PERM_GRAPH_MOD. We really need to respect those permissions in
bdrv_drop_intermediate() now, but as long as we haven't figured out yet
how BLK_PERM_GRAPH_MOD is actually supposed to work, just leave a TODO
comment there.
With this change, it is now possible to perform another block job on an
overlay node without conflicts. qemu-iotests 030 is changed accordingly.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
QMP responses to certain commands can become quite long, which not only
makes them hard to read, but also means that the maximum line length
in patch emails can be exceeded. Allow tests to switch to QMP pretty
printing, which results in more, but shorter lines.
We also need to make sure to keep indentation in the response for this
to work as expected.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
This changes the commit block job to support operation in a graph where
there is more than a single active layer that references the top node.
This involves inserting the commit filter node not only on the path
between the given active node and the top node, but between the top node
and all of its parents.
On completion, bdrv_drop_intermediate() must consider all parents for
updating the backing file link. These parents may be backing files
themselves and as such read-only; reopen them temporarily if necessary.
Previously this was achieved by the bdrv_reopen() calls in the commit
block job that made overlay_bs read-write for the whole duration of the
block job, even though write access is only needed on completion.
Now that we consider all parents, overlay_bs is meaningless. It is left
in place in this commit, but we'll remove it soon.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
There is no good reason for bdrv_drop_intermediate() to know the active
layer above the subchain it is operating on - even more so, because
the assumption that there is a single active layer above it is not
generally true.
In order to prepare removal of the active parameter, use a BdrvChildRole
callback to update the backing file string in the overlay image instead
of directly calling bdrv_change_backing_file().
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
"check" is full of qemu-iotests--specific details. Separating it
from "common" does not make much sense anymore.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
The variable is almost unused, and one of the two uses is actually
uninitialized.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
The variable is used in "common" but defined only after the file
is sourced.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Split "check" parts from tests part.
For the directory setup, the actual computation of directories goes
in "check", while the sanity checks go in the tests.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
These are never used by "check", with one exception that does not need
$QEMU_OPTIONS. Keep them in common.rc, which will be soon included only
by the tests.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Instead of ./check failing when a binary is missing, we try each test
case now and each one fails with tons of test case diffs. Also, all the
variables were initialized by "check" prior to "common" being sourced,
and then (uselessly) checked for emptiness again in "check".
Centralize the search for programs in "common" (which will soon be
one with "check"), including the "realpath" invocation which can be done
just once in "check" rather than in the tests.
For qnio_server, move the detection to "common", simplifying
set_prog_path to stop handling the unused second argument, and
embedding the "realpath" pass.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Some functions in common.rc are never used by the tests. Move
them out of that file and into common, which is already included
only by "check".
Code that actually *is* common to "check" and tests can be placed in
common.config.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
This includes shell functions, shell variables and command line options
(randomize.awk does not exist).
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Now that all callers are using byte-based interfaces, there's no
reason for our internal hbitmap to remain with sector-based
granularity. It also simplifies our internal scaling, since we
already know that hbitmap widens requests out to granularity
boundaries.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Both callers already had bytes available, but were scaling to
sectors. Move the scaling to internal code. In the case of
bdrv_aligned_pwritev(), we are now passing the exact offset
rather than a rounded sector-aligned value, but that's okay
as long as dirty bitmap widens start/bytes to granularity
boundaries.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Now that we have adjusted the majority of the calls this function
makes to be byte-based, it is easier to read the code if it makes
passes over the image using bytes rather than sectors.
iotests 165 was rather weak - on a default 64k-cluster image, where
bitmap granularity also defaults to 64k bytes, a single cluster of
the bitmap table thus covers (64*1024*8) bits which each cover 64k
bytes, or 32G of image space. But the test only uses a 1G image,
so it cannot trigger any more than one loop of the code in
store_bitmap_data(); and it was writing to the first cluster. In
order to test that we are properly aligning which portions of the
bitmap are being written to the file, we really want to test a case
where the first dirty bit returned by bdrv_dirty_iter_next() is not
aligned to the start of a cluster, which we can do by modifying the
test to write data that doesn't happen to fall in the first cluster
of the image.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Now that we have adjusted the majority of the calls this function
makes to be byte-based, it is easier to read the code if it makes
passes over the image using bytes rather than sectors.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
This is new code, but it is easier to read if it makes passes over
the image using bytes rather than sectors (and will get easier in
the future when bdrv_get_block_status is converted to byte-based).
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Now that we have adjusted the majority of the calls this function
makes to be byte-based, it is easier to read the code if it makes
passes over the image using bytes rather than sectors.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Some of the callers were already scaling bytes to sectors; others
can be easily converted to pass byte offsets, all in our shift
towards a consistent byte interface everywhere. Making the change
will also make it easier to convert the hold-out callers to use bytes
rather than sectors for their iterations; it also makes it easier
for a future dirty-bitmap patch to offload scaling over to the
internal hbitmap. Although all callers happen to pass
sector-aligned values, make the internal scaling robust to any
sub-sector requests.
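A hedged sketch of the internal scaling, widening any sub-sector byte range to full sectors before touching the still sector-based hbitmap (field names simplified):

    int64_t start_sector = offset >> BDRV_SECTOR_BITS;
    int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);

    hbitmap_set(bitmap->bitmap, start_sector, end_sector - start_sector);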
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Half the callers were already scaling bytes to sectors; the other
half can eventually be simplified to use byte iteration. Both
callers were already using the result as a bool, so make that
explicit. Making the change also makes it easier for a future
dirty-bitmap patch to offload scaling over to the internal hbitmap.
Remember, asking whether a byte is dirty is effectively asking
whether the entire granularity containing the byte is dirty, since
we only track dirtiness by granularity.
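A hedged sketch of the bool-returning, byte-based query; the prototype and field names are simplified, and the whole granularity chunk containing the byte determines the answer:

    bool bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
                        int64_t offset)
    {
        /* the underlying hbitmap is still sector-scaled here, so convert
         * the byte offset; hbitmap_get() already reports per granularity */
        return hbitmap_get(bitmap->bitmap, offset >> BDRV_SECTOR_BITS);
    }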
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Thanks to recent cleanups, all callers were scaling a return value
of sectors into bytes; do the scaling internally instead.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Thanks to recent cleanups, most callers were scaling a return value
of sectors into bytes (the exception, in qcow2-bitmap, will be
converted to byte-based iteration later). Update the interface to
do the scaling internally instead.
In qcow2-bitmap, the code was specifically checking for an error
return of -1. To avoid a regression, we either have to make sure
we continue to return -1 (rather than a scaled -512) on error, or
we have to fix the caller to treat all negative values as error
rather than just one magic value. It's easy enough to make both
changes at the same time, even though either one in isolation
would work.
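A hedged sketch of the scaling with the error value preserved, so qcow2-bitmap's caller can keep treating any negative value as failure (field names simplified):

    int64_t ret = hbitmap_iter_next(&iter->hbi);

    /* scale a valid sector number to bytes, but pass errors through
     * unscaled so they stay negative and recognisable */
    return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;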
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
All callers to bdrv_dirty_iter_new() passed 0 for their initial
starting point, drop that parameter.
Most callers to bdrv_set_dirty_iter() were scaling a byte offset to
a sector number; the exception qcow2-bitmap will be converted later
to use byte rather than sector iteration. Move the scaling to occur
internally to dirty bitmap code instead, so that callers now pass
in bytes.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
We are gradually converting to byte-based interfaces, as they are
easier to reason about than sector-based. Change the qcow2 bitmap
helper function sectors_covered_by_bitmap_cluster(), renaming it
to bytes_covered_by_bitmap_cluster() in the process.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Right now, the dirty-bitmap code exposes the fact that we use
a scale of sector granularity in the underlying hbitmap to anything
that wants to serialize a dirty bitmap. It's nicer to uniformly
expose bytes as our dirty-bitmap interface, matching the previous
change to bitmap size. The only caller to serialization is currently
qcow2-cluster.c, which becomes a bit more verbose because it is still
tracking sectors for other reasons, but a later patch will fix that
to more uniformly use byte offsets everywhere. Likewise, within
dirty-bitmap, we have to add more assertions that we are not
truncating incorrectly, which can go away once the internal hbitmap
is byte-based rather than sector-based.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
We are still using an internal hbitmap that tracks a size in sectors,
with the granularity scaled down accordingly, because it lets us
use a shortcut for our iterators which are currently sector-based.
But there's no reason we can't track the dirty bitmap size in bytes,
since it is (mostly) an internal-only variable (remember, the size
is how many bytes are covered by the bitmap, not how many bytes the
bitmap occupies). A later cleanup will convert dirty bitmap
internals to be entirely byte-based, eliminating the intermediate
sector rounding added here; and technically, since bdrv_getlength()
already rounds up to sectors, our use of DIV_ROUND_UP is more for
theoretical completeness than for any actual rounding.
Use is_power_of_2() while at it, instead of open-coding that.
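A hedged sketch of the bookkeeping described above (field and parameter names simplified):

    assert(is_power_of_2(granularity) && granularity >= BDRV_SECTOR_SIZE);

    bitmap->size = bitmap_size;                     /* now kept in bytes */

    /* the hbitmap itself still counts sectors, so round the byte size up;
     * bdrv_getlength() already rounds to sectors, making this theoretical */
    bitmap->bitmap = hbitmap_alloc(DIV_ROUND_UP(bitmap_size, BDRV_SECTOR_SIZE),
                                   ctz32(granularity >> BDRV_SECTOR_BITS));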
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
We're already reporting bytes for bdrv_dirty_bitmap_granularity();
mixing bytes and sectors in our return values is a recipe for
confusion. A later cleanup will convert dirty bitmap internals
to be entirely byte-based, but in the meantime, we should report
the bitmap size in bytes.
The only external caller in qcow2-bitmap.c is temporarily more verbose
(because it is still using sector-based math), but will later be
switched to track progress by bytes instead of sectors.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
We've previously fixed several places where we failed to account
for possible errors from bdrv_nb_sectors(). Fix another one by
making bdrv_dirty_bitmap_truncate() take the new size from the
caller instead of querying itself; then adjust the sole caller
bdrv_truncate() to pass the size just determined by a successful
resize, or to reuse the size given to the original truncate
operation when refresh_total_sectors() was not able to confirm the
actual size (the two sizes can potentially differ according to
rounding constraints), thus avoiding sizing the bitmaps to -1.
This also fixes a bug where not all failure paths in
bdrv_truncate() would set errp.
Note that bdrv_truncate() is still a bit awkward. We may want
to revisit it later and clean up things to better guarantee that
a resize attempt either fails cleanly up front, or cannot fail
after guest-visible changes have been made (if temporary changes
are made, then they need to be cleanly rolled back). But that
is a task for another day; for now, the goal is the bare minimum
fix to ensure that just bdrv_dirty_bitmap_truncate() cannot fail.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
We had several functions that no one is currently using, and which
use sector-based interfaces. I'm trying to convert towards byte-based
interfaces, so it's easier to just drop the unused functions:
bdrv_dirty_bitmap_get_meta
bdrv_dirty_bitmap_get_meta_locked
bdrv_dirty_bitmap_reset_meta
bdrv_dirty_bitmap_meta_granularity
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
When subdividing a bitmap serialization, the code in hbitmap.c
enforces that start/count parameters are aligned (except that
count can end early at end-of-bitmap). We exposed this required
alignment through bdrv_dirty_bitmap_serialization_align(), but
forgot to actually check that we comply with it.
Fortunately, qcow2 is never dividing bitmap serialization smaller
than one cluster (which is a minimum of 512 bytes); so we are
always compliant with the serialization alignment (which insists
that we partition at least 64 bits per chunk) because we are doing
at least 4k bits per chunk.
Still, it's safer to add an assertion (for the unlikely case that
we'd ever support a cluster smaller than 512 bytes, or if the
hbitmap implementation changes what it considers to be aligned),
rather than leaving bdrv_dirty_bitmap_serialization_align()
without a caller.
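A hedged sketch of the added guard, using the names from the surrounding series; the bitmap-size escape hatch mirrors the "count can end early at end-of-bitmap" rule:

    uint64_t align = bdrv_dirty_bitmap_serialization_align(bitmap);

    assert(QEMU_IS_ALIGNED(start, align));
    assert(QEMU_IS_ALIGNED(count, align) ||
           start + count == bdrv_dirty_bitmap_size(bitmap));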
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
The only client of hbitmap_serialization_granularity() is dirty-bitmap's
bdrv_dirty_bitmap_serialization_align(). Keeping the two names consistent
is worthwhile, and the shorter name is more representative of what the
function returns (the required alignment to be used for start/count of
other serialization functions, where violating the alignment causes
assertion failures).
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
All callers of bdrv_img_create() pass in a size, or -1 to read the
size from the backing file. We then set that size as the QemuOpt
default, which means we will reuse that default rather than the
final parameter to qemu_opt_get_size() several lines later. But
it is rather confusing to read subsequent checks of 'size == -1'
when it looks (without seeing the full context) like size defaults
to 0; it also doesn't help that a size of 0 is valid (for some
formats).
Rework the logic to make things more legible.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
s390x changes:
- support for IDA (indirect addressing in ccws) via ccw data stream
- support for extended TOD-Clock (z14 feature)
- various fixes and improvements all over the place
# gpg: Signature made Fri 06 Oct 2017 10:52:22 BST
# gpg: using RSA key 0xDECF6B93C6F02FAF
# gpg: Good signature from "Cornelia Huck <conny@cornelia-huck.de>"
# gpg: aka "Cornelia Huck <huckc@linux.vnet.ibm.com>"
# gpg: aka "Cornelia Huck <cornelia.huck@de.ibm.com>"
# gpg: aka "Cornelia Huck <cohuck@kernel.org>"
# gpg: aka "Cornelia Huck <cohuck@redhat.com>"
# Primary key fingerprint: C3D0 D66D C362 4FF6 A8C0 18CE DECF 6B93 C6F0 2FAF
* remotes/cohuck/tags/s390x-20171006: (33 commits)
hw/s390x: Mark the "sclpquiesce" device with user_creatable = false
s390x/tcg: initialize machine check queue
s390x/sclp: mark sclp-cpu-hotplug as non-usercreatable
s390x/sclp: Mark the sclp device with user_creatable = false
s390/kvm: make TOD setting failures fatal for migration
s390/kvm: Support for get/set of extended TOD-Clock for guest
s390x/css: fix css migration compat handling
s390x: sort some devices into categories
s390x/tcg: make STFL store into the lowcore
s390x: introduce and use S390_MAX_CPUS
target/s390x: get rid of next_core_id
s390x/cpumodel: fix max STFL(E) bit number
s390x: raise CPU hotplug irq after really hotplugged
MAINTAINERS: use KVM s390x maintainers for kvm-stubs.c and kvm_s390x.h
s390x/3270: handle writes of arbitrary length
s390x/3270: IDA support for 3270 via CcwDataStream
Revert "s390x/ccw: create s390 phb conditionally"
s390x/tcg: make idte/ipte use the new _real mmu
s390x/tcg: make testblock use the new _real mmu
s390x/tcg: make stora(g) use the new _real mmu
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
The "sclpquiesce" device is just an internal device that should not be
created by the user directly. Though it currently does not seem to cause
any obvious trouble when the user instantiates an additional device, let's
mark it with user_creatable = false to avoid unexpected behavior,
e.g. because the quiesce notifier gets registered multiple times.
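A hedged sketch of the usual pattern for such internal-only devices (the class_init function name here is illustrative):

    static void sclp_quiesce_class_init(ObjectClass *klass, void *data)
    {
        DeviceClass *dc = DEVICE_CLASS(klass);

        /* internal device: keep -device/device_add from creating a second
         * instance and registering the quiesce notifier twice */
        dc->user_creatable = false;
    }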
Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-Id: <1507193105-15627-1-git-send-email-thuth@redhat.com>
Reviewed-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Reviewed-by: Claudio Imbrenda <imbrenda@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
Just as for external interrupts and I/O interrupts, we need to
initialize mchk_index during cpu reset.
Reviewed-by: Richard Henderson <rth@twiddle.net>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
A TYPE_SCLP_CPU_HOTPLUG device for handling cpu hotplug events
is already created by the sclp event facility. Adding a second
TYPE_SCLP_CPU_HOTPLUG device via -device sclp-cpu-hotplug creates
an ambiguity in raise_irq_cpu_hotplug(), leading to a crash once
a cpu is hotplugged.
To fix this, disallow creating a sclp-cpu-hotplug device manually.
Reviewed-by: Thomas Huth <thuth@redhat.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
The "sclp" device is just an internal device that can not be instantiated
by the users. If they try to use it, they only get a simple error message:
$ qemu-system-s390x -nographic -device sclp
qemu-system-s390x: Option '-device s390-sclp-event-facility' cannot be
handled by this machine
Since sclp_init() tries to create a TYPE_SCLP_EVENT_FACILITY which is
a non-pluggable sysbus device, there is really no way that the "sclp"
device can be used by the user, so let's set user_creatable = false
accordingly.
Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-Id: <1507125199-22562-1-git-send-email-thuth@redhat.com>
Reviewed-by: Claudio Imbrenda <imbrenda@linux.vnet.ibm.com>
Reviewed-by: Farhan Ali <alifm@linux.vnet.ibm.com>
Acked-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
If we fail to set a proper TOD clock on the target system, this can
already result in some problematic cases. We print several warn messages
on source and target in that case.
If kvm fails to set a nonzero epoch index, then we must ultimately fail
the migration as this will result in a giant time leap backwards. This
patch lets the migration fail if we can not set the guest time on the
target.
On failure the guest will resume normally on the original host machine.
Signed-off-by: Collin L. Walling <walling@linux.vnet.ibm.com>
Reviewed-by: Eric Farman <farman@linux.vnet.ibm.com>
Reviewed-by: Claudio Imbrenda <imbrenda@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
[split failure change from epoch index change, minor fixups]
Message-Id: <20171004105751.24655-3-borntraeger@de.ibm.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
Commit e996583eb3 ("s390x/css: activate ChannelSubSys migration",
2017-07-11) was supposed to enable css migration for virtio-ccw
machines starting 2.10, but it ended up effectively enabling it
only for 2.10 as the registration of the appropriate VMStateDescription
happens in ccw_machine_2_10_instance_options which does not get
called for machines more recent than 2_10.
Let us move the corresponding chunk of code (which conditionally enables
the migration based on the value of the corresponding class property) to
ccw_init, which is called for each virtio-ccw machine instance.
Signed-off-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Reported-by: Thomas Huth <thuth@redhat.com>
Message-Id: <20171004110109.16525-1-pasic@linux.vnet.ibm.com>
Tested-by: Christian Borntraeger <borntraeger@de.ibm.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
Using virtual memory access is wrong and will soon include low-address
protection checks, which is to be bypassed for STFL.
STFL is a privileged instruction and using LowCore requires
!CONFIG_USER_ONLY, so add the ifdef and move the declaration to the
right place.
This was originally part of a bigger STFL(E) refactoring.
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20170927170027.8539-4-david@redhat.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
core_id is not needed by linux-user, as the core_id a.k.a. CPU address
is only accessible from kernel space.
Therefore, drop next_core_id and make cpu_index get autoassigned again
for linux-user.
While at it, shield core_id and cpuid completely from linux-user. cpuid
can also only be queried from kernel space.
Suggested-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20170928134609.16985-5-david@redhat.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
Let's move it into the machine, so we trigger the IRQ after setting
ms->possible_cpus (which SCLP uses to construct the list of
online CPUs).
This also fixes a problem reported by Thomas Huth, whereby qemu can be
crashed using the none machine:
qemu-s390x-softmmu -M none -monitor stdio
-> device_add qemu-s390-cpu
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20170928134609.16985-3-david@redhat.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
The problem is that the current implementation places unrealistic and
arbitrary constraints on the length of writes to the device (that is, the
outbound requests), by asserting that ccw.count is such that even the
worst-case escaped payload will fit into a more or less arbitrarily sized
buffer. At the protocol level there is nothing to justify such
a limitation.
Another strange thing is the return value, which more or less reflects
the size written after escaping instead of before escaping. This
is strange, because this return value is used to calculate SCSW.count.
Let us teach 3270 how to deal with arbitrarily long writes.
Signed-off-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Reported-by: Jason J. Herne <jjherne@linux.vnet.ibm.com>
Tested-by: Jason J. Herne <jjherne@linux.vnet.ibm.com>
Message-Id: <20170920172314.102710-3-pasic@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
Let us convert the 3270 code so it uses the recently introduced
CcwDataStream abstraction instead of blindly assuming direct data access.
This patch does not change behavior beyond introducing IDA support: for
direct data access CCWs everything stays as-is. (If there are bugs, they
are also preserved).
Signed-off-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Message-Id: <20170920172314.102710-2-pasic@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
This reverts commit d32bd032d8.
It turns out that old QEMUs always created a PCI host bridge,
and for many CPU models migration from old QEMUs to new
QEMUs will fail with:
qemu-system-s390x: Unknown savevm section or instance 'PCIBUS' 0
qemu-system-s390x: load of migration failed: Invalid argument
As a quick fix we will revert the commit and always create the
pci host bridge.
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
[fixed revert to keep the comment fixup, added a comment in the code]
Cc: Cornelia Huck <cohuck@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Message-Id: <20170928131831.81393-1-borntraeger@de.ibm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
This makes it easy to access real addresses (prefix) and in addition
checks for valid memory addresses, which is missing when using e.g.
stl_phys().
We can later reuse it to implement low address protection checks (then
we might even decide to introduce yet another MMU for absolute
addresses, just for handling storage keys and low address protection).
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20170926183318.12995-3-david@redhat.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
The architecture mandates that the addresses accessed at the first
indirection level (that is, the data addresses without IDA, and the
(M)IDAW addresses with (M)IDA) be checked against a CCW-format-dependent
maximum address. If a violation is detected, the storage
access is not to be performed and a channel program check needs to be
generated. As of today, we fail to do this check.
Let us stick even closer to the architecture specification.
Signed-off-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Message-Id: <20170921180841.24490-5-pasic@linux.vnet.ibm.com>
Reviewed-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Reviewed-by: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
env->psa is a 64-bit value, but we only copy 4 bytes into the save area,
which results in 0 always getting stored.
Let's try to reduce such errors by using a proper structure. While at
it, use correct cpu->be conversion (and get_psw_mask()), as we will be
reusing this code for TCG soon.
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20170922140338.6068-1-david@redhat.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
The STFLE bits for the MSA (extension) facilities simply indicate that
the respective instructions can be executed. The QUERY subfunction can then
be used to identify which features exactly are available.
Availability of subfunctions can also vary on real hardware. For now, we
simply implement a CPU model without any available subfunctions except
QUERY (which is always around).
As all MSA functions behave quite similarly, we can use one translation
handler for now. Prepare the code for implementation of actual subfunctions.
At least MSA is helpful for now, as older Linux kernels require this
facility when compiled for a z9 model. Allow enabling the facilities
for the qemu cpu model.
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20170920153016.3858-4-david@redhat.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
HMP pull 2017-10-05
# gpg: Signature made Thu 05 Oct 2017 11:50:13 BST
# gpg: using RSA key 0x0516331EBC5BFDE7
# gpg: Good signature from "Dr. David Alan Gilbert (RH2) <dgilbert@redhat.com>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg: It is not certain that the signature belongs to the owner.
# Primary key fingerprint: 45F5 C71B 4A0C B7FB 977A 9FA9 0516 331E BC5B FDE7
* remotes/dgilbert/tags/pull-hmp-20171005:
hmp-commands-info: Change "@findex FOO" to "@findex info FOO"
hmp-commands-info: Move Texinfo stanzas to conventional place
hmp-commands-info: Fix "info rocker-FOO" misspellings
hmp: Fix unknown command for subtable
hmp: Missing handle_errors
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Merge qio 2017/10/04 v1
# gpg: Signature made Wed 04 Oct 2017 13:23:04 BST
# gpg: using RSA key 0xBE86EBB415104FDF
# gpg: Good signature from "Daniel P. Berrange <dan@berrange.com>"
# gpg: aka "Daniel P. Berrange <berrange@redhat.com>"
# Primary key fingerprint: DAF3 A6FD B26B 6291 2D0E 8E3F BE86 EBB4 1510 4FDF
* remotes/berrange/tags/pull-qio-2017-10-04-1:
io: add trace events for websockets frame handling
io: Attempt to send websocket close messages to client
io: Reply to ping frames
io: Ignore websocket PING and PONG frames
io: Allow empty websocket payload
io: Add support for fragmented websocket binary frames
io: Small updates in preparation for websocket changes
ui: Always remove an old VNC channel watch before adding a new one
io: use case insensitive check for Connection & Upgrade websock headers
io: include full error message in websocket handshake trace
io: send proper HTTP response for websocket errors
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
It is useful to trace websockets frame encoding/decoding when debugging
problems.
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Make a best effort attempt to close websocket connections according to
the RFC. Sends the close message, as room permits in the socket buffer,
and immediately closes the socket.
Signed-off-by: Brandon Carpenter <brandon.carpenter@cypherpath.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Add an immediate ping reply (pong) to the outgoing stream when a ping
is received. Unsolicited pongs are ignored.
Signed-off-by: Brandon Carpenter <brandon.carpenter@cypherpath.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Keep pings and gratuitous pongs generated by web browsers from killing
websocket connections.
Signed-off-by: Brandon Carpenter <brandon.carpenter@cypherpath.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Some browsers send pings/pongs with no payload, so allow empty payloads
instead of closing the connection.
Signed-off-by: Brandon Carpenter <brandon.carpenter@cypherpath.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Allows fragmented binary frames by saving the previous opcode. Handles
the case where an intermediary (i.e., web proxy) fragments frames
originally sent unfragmented by the client.
Signed-off-by: Brandon Carpenter <brandon.carpenter@cypherpath.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Gets rid of unnecessary bit shifting and performs proper EOF checking to
avoid a large number of repeated calls to recvmsg() when a client
abruptly terminates a connection (bug fix).
Signed-off-by: Brandon Carpenter <brandon.carpenter@cypherpath.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
When checking the value of the Connection and Upgrade HTTP headers,
the websocket RFC (6455) requires the comparison to be case insensitive.
The Connection value should be an exact match, not a substring.
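For illustration, the comparison then boils down to something like the
following (the variable names are made up; g_ascii_strcasecmp() is
glib's case-insensitive string compare):

    /* Case-insensitive exact match, per RFC 6455. */
    if (g_ascii_strcasecmp(connection, "upgrade") != 0 ||
        g_ascii_strcasecmp(upgrade, "websocket") != 0) {
        return -1;   /* reject the handshake */
    }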
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
When the websocket handshake fails it is useful to log the real
error message via the trace points for debugging purposes.
Fixes bug: #1715186
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
When any error occurs while processing the websockets handshake,
QEMU just terminates the connection abruptly. This is in violation
of the HTTP specs and does not help the client understand what they
did wrong. This is particularly bad when the client gives the wrong
path, as a "404 Not Found" would be very helpful.
Refactor the handshake code so that it always sends a response to
the client unless there was an I/O error.
Fixes bug: #1715186
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
NVIDIA has defined a specification for creating GPUDirect "cliques",
where devices with the same clique ID support direct peer-to-peer DMA.
When running on bare-metal, tools like NVIDIA's p2pBandwidthLatencyTest
(part of cuda-samples) determine which GPUs can support peer-to-peer
based on chipset and topology. When running in a VM, these tools have
no visibility to the physical hardware support or topology. This
option allows the user to specify hints via a vendor defined
capability. For instance:
<qemu:commandline>
<qemu:arg value='-set'/>
<qemu:arg value='device.hostdev0.x-nv-gpudirect-clique=0'/>
<qemu:arg value='-set'/>
<qemu:arg value='device.hostdev1.x-nv-gpudirect-clique=1'/>
<qemu:arg value='-set'/>
<qemu:arg value='device.hostdev2.x-nv-gpudirect-clique=1'/>
</qemu:commandline>
This enables two cliques. The first is a singleton clique with ID 0,
for the first hostdev defined in the XML (note that since cliques
define peer-to-peer sets, singleton cliques offer no benefit). The
subsequent two hostdevs are both added to clique ID 1, indicating
peer-to-peer is possible between these devices.
QEMU only validates that the clique ID is valid and applied to an
NVIDIA graphics device; any validation that the resulting cliques are
functional and valid is the user's responsibility. The NVIDIA
specification allows a 4-bit clique ID, thus valid values are
0-15.
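The same hint can presumably also be given directly on a plain QEMU
command line instead of through the libvirt passthrough shown above,
along these lines (the host address is an example):

    -device vfio-pci,host=04:00.0,x-nv-gpudirect-clique=0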
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
If the hypervisor needs to add purely virtual capabilities, give us a
hook through quirks to do that. Note that we determine the maximum
size for a capability based on the physical device; if we insert a
virtual capability, that can change. Therefore, if the maximum size is
smaller after adding virtual capabilities, use that.
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
If vfio_add_std_cap() fails, going to 'out' prepends irrelevant
errors for capabilities we haven't attempted to add as we unwind our
recursive stack. Just return the error.
Fixes: 7ef165b9a8 ("vfio/pci: Pass an error object to vfio_add_capabilities")
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
After iothread is enabled internally inside QEMU with GMainContext, we
may encounter this warning when destroying the iothread:
(qemu-system-x86_64:19925): GLib-CRITICAL **: g_source_remove_poll:
assertion '!SOURCE_DESTROYED (source)' failed
The problem is that g_source_remove_poll() does not allow removing a
source from the array if the source has been detached from its owner
context. (peterx: which IMHO does not make much sense)
Fix it on the QEMU side by avoiding the g_source_remove_poll() call if
we know the object is being destroyed; we won't leak anything, since
the array will be cleaned up soon anyway even with that fd.
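One possible shape of that guard; whether the real patch uses a
QEMU-side flag like the one below or glib's own g_source_is_destroyed()
check, the idea is the same ('finalizing' is an illustrative name):

    static void remove_fd_poll(AioContext *ctx, GPollFD *pfd)
    {
        /* Skip g_source_remove_poll() while the GSource is being torn
         * down; the poll fd array is freed together with the source,
         * so nothing is leaked by skipping the removal. */
        if (!ctx->finalizing) {
            g_source_remove_poll(&ctx->source, pfd);
        }
    }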
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Message-id: 20170928025958.1420-6-peterx@redhat.com
[peterx: write the commit message]
Signed-off-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
When a gcontext is used with an iothread, the context is destroyed
during iothread_stop(). That's not good, since sometimes we would like
to keep the resources until the iothread is destroyed, but we may want
to stop the thread before that point.
Delay the destruction of gcontext to iothread finalize. Then we can do:
iothread_stop(thread);
some_cleanup_on_resources();
iothread_destroy(thread);
We may need this patch if we want to run chardev IOs in iothreads and
hopefully clean them up correctly. For more specific information,
please see 2b316774f6 ("qemu-char: do not operate on sources from
finalize callbacks").
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Message-id: 20170928025958.1420-5-peterx@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
So that internal iothread users can explicitly stop one iothread without
destroying it.
While at it, fix iothread_stop() to allow it to be called multiple
times. Before this patch we might call iothread_stop() more than once on
a single iothread, which is not correct since qemu_thread_join()
is not allowed to run twice. From the manual of pthread_join():
Joining with a thread that has previously been joined results in
undefined behavior.
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Message-id: 20170928025958.1420-4-peterx@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
IOThread is a general framework that contains an IO loop environment and
a real thread behind it. It is also useful internally inside qemu.
Provide some helpers so that iothreads can be created for internal use.
Put all internally used iothreads into the internal object container.
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Message-id: 20170928025958.1420-3-peterx@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
# gpg: Signature made Fri 29 Sep 2017 04:17:37 BST
# gpg: using RSA key 0xCA35624C6A9171C6
# gpg: Good signature from "Fam Zheng <famz@redhat.com>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg: It is not certain that the signature belongs to the owner.
# Primary key fingerprint: 5003 7CB7 9706 0F76 F021 AD56 CA35 624C 6A91 71C6
* remotes/famz/tags/docker-testing-pull-request:
docker: Don't mount ccache db if NOUSER=1
docker: test-block: Don't continue if build fails
tests/docker/run: don't source /etc/profile
docker: Fix test-mingw
docker: add installation to build tests
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
On a modern server-class ppc host with the following CPU topology:
Architecture: ppc64le
Byte Order: Little Endian
CPU(s): 32
On-line CPU(s) list: 0,8,16,24
Off-line CPU(s) list: 1-7,9-15,17-23,25-31
Thread(s) per core: 1
If both KVM PR and KVM HV loaded and we pass:
-machine pseries,accel=kvm,kvm-type=PR -smp 8
We expect QEMU to warn that this exceeds the number of online CPUs:
Warning: Number of SMP cpus requested (8) exceeds the recommended
cpus supported by KVM (4)
Warning: Number of hotpluggable cpus requested (8) exceeds the
recommended cpus supported by KVM (4)
but nothing is printed...
This happens because on ppc the KVM_CAP_NR_VCPUS capability is VM
specific and really depends on the KVM type, but we currently use it
as a global capability. And KVM returns a fallback value based on
KVM HV being present. Maybe KVM on POWER shouldn't presume anything
as long as it doesn't have a VM, but in all cases we should call
KVM_CREATE_VM first and use KVM_CAP_NR_VCPUS as a VM capability.
This patch hence changes kvm_recommended_vcpus() accordingly and
moves the sanity checking of smp_cpus after the VM creation.
It is okay for the other archs that also implement KVM_CAP_NR_VCPUS,
ie, mips, s390, x86 and arm, because they don't depend on the VM
being created or not.
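For reference, the difference is just which fd answers the query; both
helpers exist in QEMU's KVM support code, and the snippet is simplified:

    int soft_limit;

    /* Global query on /dev/kvm: on ppc this may reflect KVM HV even
     * though the VM will actually run under KVM PR. */
    soft_limit = kvm_check_extension(s, KVM_CAP_NR_VCPUS);

    /* VM-scoped query: issued after KVM_CREATE_VM, so the answer
     * matches the KVM type actually selected for this VM. */
    soft_limit = kvm_vm_check_extension(s, KVM_CAP_NR_VCPUS);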
Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Message-Id: <150600966286.30533.10909862523552370889.stgit@bahia.lan>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
On a server-class ppc host, this capability depends on the KVM type,
ie, HV or PR. If both KVM variants are present in the kernel, we will
always get the HV-specific value, even if we explicitly requested PR on
the command line.
This can have an impact if we're using hugepages or a balloon device.
Since we've already created the VM at the time any user calls
kvm_has_sync_mmu(), switching to kvm_vm_check_extension() is
enough to fix any potential issue.
It is okay for the other archs that also implement KVM_CAP_SYNC_MMU,
ie, mips, s390, x86 and arm, because they don't depend on the VM being
created or not.
While here, let's cache the state of this extension in a bool variable,
since it has several users in the code, as suggested by Thomas Huth.
Signed-off-by: Greg Kurz <groug@kaod.org>
Message-Id: <150600965332.30533.14702405809647835716.stgit@bahia.lan>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
This is a library header, so angle brackets are more appropriate; also
move the line to before QEMU headers, as is recommended in HACKING.
Signed-off-by: Fam Zheng <famz@redhat.com>
Message-id: 20170920085952.3872-1-famz@redhat.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Valgrind detects an invalid read operation when hot-plugging of a
USB device fails:
$ valgrind x86_64-softmmu/qemu-system-x86_64 -device usb-ehci -nographic -S
==30598== Memcheck, a memory error detector
==30598== Copyright (C) 2002-2015, and GNU GPL'd, by Julian Seward et al.
==30598== Using Valgrind-3.12.0 and LibVEX; rerun with -h for copyright info
==30598== Command: x86_64-softmmu/qemu-system-x86_64 -device usb-ehci -nographic -S
==30598==
QEMU 2.10.50 monitor - type 'help' for more information
(qemu) device_add usb-tablet
(qemu) device_add usb-tablet
(qemu) device_add usb-tablet
(qemu) device_add usb-tablet
(qemu) device_add usb-tablet
(qemu) device_add usb-tablet
==30598== Invalid read of size 8
==30598== at 0x60EF50: object_unparent (object.c:445)
==30598== by 0x580F0D: usb_try_create_simple (bus.c:346)
==30598== by 0x581BEB: usb_claim_port (bus.c:451)
==30598== by 0x582310: usb_qdev_realize (bus.c:257)
==30598== by 0x4CB399: device_set_realized (qdev.c:914)
==30598== by 0x60E26D: property_set_bool (object.c:1886)
==30598== by 0x61235E: object_property_set_qobject (qom-qobject.c:27)
==30598== by 0x61000F: object_property_set_bool (object.c:1162)
==30598== by 0x4567C3: qdev_device_add (qdev-monitor.c:630)
==30598== by 0x456D52: qmp_device_add (qdev-monitor.c:807)
==30598== by 0x470A99: hmp_device_add (hmp.c:1933)
==30598== by 0x3679C3: handle_hmp_command (monitor.c:3123)
The object_unparent() here is not necessary anymore since commit
69382d8b3e ("qdev: Fix object reference leak in case device.realize()
fails"), so let's remove it now.
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-id: 1506526106-30971-1-git-send-email-thuth@redhat.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Currently, iothread_stop_all() makes all iothread objects unsafe
to be destroyed, because qemu_thread_join() ends up being called
twice.
To fix this, make iothread_stop() idempotent by checking
thread->stopped.
Fixes the following crash:
qemu-system-x86_64 -object iothread,id=iothread0 -monitor stdio -display none
QEMU 2.10.50 monitor - type 'help' for more information
(qemu) quit
qemu: qemu_thread_join: No such process
Aborted (core dumped)
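The guard is essentially the following (a simplified sketch; only the
fields mentioned in this and the related iothread patches are shown):

    void iothread_stop(IOThread *iothread)
    {
        if (!iothread->ctx || iothread->stopped) {
            return;                    /* already stopped: joining twice is UB */
        }
        iothread->stopping = true;
        aio_notify(iothread->ctx);     /* wake the loop so it can exit */
        qemu_thread_join(&iothread->thread);
        iothread->stopped = true;
    }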
Reported-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Message-Id: <20170926130028.12471-1-ehabkost@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
qemu uses wheel-up/down button events for mouse wheel input; however,
linux applications typically want REL_WHEEL events.
This fixes the wheel for linux guests. Tested with X11/wayland, and the
windows virtio-input driver.
Based on a patch from Marc.
Added a property to enable/disable the wheel axis.
Cc: Marc-André Lureau <marcandre.lureau@redhat.com>
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Message-id: 20170926113243.26081-1-kraxel@redhat.com
Rename the functions to to say "setup" instead of "create" because they
support being called multiple times on the same egl framebuffer.
Properly delete unused textures, update function interfaces to support
this.
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Message-id: 20170927115031.12063-1-kraxel@redhat.com
Handle the translation from vga chars to curses chars in curses_update()
instead of console_write_ch(). Purge any curses support bits from
ui/console.h include file.
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Message-id: 20170927103811.19249-1-kraxel@redhat.com
The usual behaviour of /etc/profile is to set the default PATH for
users. This runs into problems when we have updated PATH in our
dockerfile e.g. to access a cross-compiler in a non-standard
location. It shouldn't be needed anyway, as we inherit the env from the
image when it was set up.
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
CC: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-Id: <20170926133622.14991-1-alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Fam Zheng <famz@redhat.com>
Feature "dtc" is explicitly required by test-mingw, but is not detected
by the run script since we switched to archive-source.sh in b7f404201e.
Since it isn't available in the Fedora image which runs this test on
patchew, the way we get dtc is still from submodule.
archive-source.sh takes care of bundling the submodule files already, so
what we need to do is just checking if files are there. Makefile is
chosen because it is one that is unlikely to get renamed in the future.
Signed-off-by: Fam Zheng <famz@redhat.com>
Message-Id: <20170925082913.22089-1-famz@redhat.com>
Reviewed-by: Alistair Francis <alistair.francis@xilinx.com>
Signed-off-by: Fam Zheng <famz@redhat.com>
Migration pull 2017-09-27
# gpg: Signature made Wed 27 Sep 2017 14:56:23 BST
# gpg: using RSA key 0x0516331EBC5BFDE7
# gpg: Good signature from "Dr. David Alan Gilbert (RH2) <dgilbert@redhat.com>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg: It is not certain that the signature belongs to the owner.
# Primary key fingerprint: 45F5 C71B 4A0C B7FB 977A 9FA9 0516 331E BC5B FDE7
* remotes/dgilbert/tags/pull-migration-20170927a:
migration: Route more error paths
migration: Route errors up through vmstate_save
migration: wire vmstate_save_state errors up to vmstate_subsection_save
migration: Check field save returns
migration: check pre_save return in vmstate_save_state
migration: pre_save return int
migration: disable auto-converge during bulk block migration
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
ppc patch queue 2017-09-27
Contains
* a number of Mac machine type fixes
* a number of embedded machine type fixes (preliminary to adding the
Sam460ex board)
* an important fix for handling of migration with KVM PR
* assorted other minor fixes and cleanups
# gpg: Signature made Wed 27 Sep 2017 08:40:48 BST
# gpg: using RSA key 0x6C38CACA20D9B392
# gpg: Good signature from "David Gibson <david@gibson.dropbear.id.au>"
# gpg: aka "David Gibson (Red Hat) <dgibson@redhat.com>"
# gpg: aka "David Gibson (ozlabs.org) <dgibson@ozlabs.org>"
# gpg: aka "David Gibson (kernel.org) <dwg@kernel.org>"
# Primary key fingerprint: 75F4 6586 AE61 A66C C44E 87DC 6C38 CACA 20D9 B392
* remotes/dgibson/tags/ppc-for-2.11-20170927: (26 commits)
macio: use object link between MACIO_IDE and MAC_DBDMA object
macio: pass channel into MACIOIDEState via qdev property
mac_dbdma: remove DBDMA_init() function
mac_dbdma: QOMify
mac_dbdma: remove unused IO fields from DBDMAState
spapr: fix the value of SDR1 in kvmppc_put_books_sregs()
ppc/pnv: check for OPAL firmware file presence
ppc: remove all unused CPU definitions
ppc: remove unused CPU definitions
spapr_pci: make index property mandatory
macio: convert pmac_ide_ops from old_mmio
ppc/pnv: Improve macro parenthesization
spapr: introduce helpers to migrate HPT chunks and the end marker
ppc/kvm: generalize the use of kvmppc_get_htab_fd()
ppc/kvm: change kvmppc_get_htab_fd() to return -errno on error
ppc: Fix OpenPIC model
ppc/ide/macio: Add missing registers
ppc/mac: More rework of the DBDMA emulation
ppc/mac: Advertise a high clock frequency for NewWorld Macs
ppc: QOMify g3beige machine
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Block layer patches
# gpg: Signature made Tue 26 Sep 2017 14:52:32 BST
# gpg: using RSA key 0x7F09B272C88F2FD6
# gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>"
# Primary key fingerprint: DC3D EB15 9A9A F95D 3D74 56FE 7F09 B272 C88F 2FD6
* remotes/kevin/tags/for-upstream: (24 commits)
block/qcow2-bitmap: fix use of uninitialized pointer
qemu-iotests: add shrinking image test
qcow2: add shrink image support
qcow2: add qcow2_cache_discard
qemu-img: add --shrink flag for resize
iotests: fix 181: enable postcopy-ram capability on target
qemu-iotests: Test change-backing-file command
block: Fix permissions after bdrv_reopen()
block: reopen: Queue children after their parents
block: Base permissions on rw state after reopen
block: Add reopen queue to bdrv_check_perm()
block: Add reopen_queue to bdrv_child_perm()
qemu-io: Drop write permissions before read-only reopen
block: Clean up some bad code in the vvfat driver
block/throttle-groups.c: allocate RestartData on the heap
throttle: Assert that bkt->max is valid in throttle_compute_wait()
iotests: Print full path of bad output if mismatch
iotests: use virtio aliases for 067
iotests: use -ccw on s390x for 051
iotests: use -ccw on s390x for 040, 139, and 182
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Modify the pre_save method on VMStateDescription to return an int
rather than void so that it can potentially fail.
Changed zillions of devices to make them return 0; the only
case where it returns non-0 is hw/intc/s390_flic_kvm.c, which already
had an error_report/return case.
Note: If you add an error exit in your pre_save you must emit
an error_report to say why.
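For illustration, a pre_save hook under the new contract might look like
this; the device, its state type and the failing condition are invented:

    static int my_device_pre_save(void *opaque)
    {
        MyDeviceState *s = opaque;      /* hypothetical device state */

        if (s->busy) {                  /* hypothetical condition */
            error_report("my-device: cannot migrate while busy");
            return -EBUSY;              /* non-0 fails the migration */
        }
        return 0;
    }

    static const VMStateDescription vmstate_my_device = {
        .name = "my-device",
        .version_id = 1,
        .pre_save = my_device_pre_save,
        .fields = (VMStateField[]) {
            VMSTATE_END_OF_LIST()
        },
    };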
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Message-Id: <20170925112917.21340-2-dgilbert@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
auto-converge and block migration currently do not play well together.
During block migration the auto-converge logic detects that ram
migration makes no progress and thus throttles down the vm until
it nearly stalls completely. Avoid this by disabling the throttling
logic during the bulk phase of the block migration.
Cc: qemu-stable@nongnu.org
Signed-off-by: Peter Lieven <pl@kamp.de>
Message-Id: <1506421996-12513-1-git-send-email-pl@kamp.de>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Using a standard QOM object link we can pass a reference to the MAC_DBDMA
controller to the MACIO_IDE object which removes the last external parameter
to macio_ide_register_dma().
Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
One of the reasons macio_ide_register_dma() needs to exist is because the
channel id isn't passed into the MACIO_IDE object. Pass in the channel id
using a qdev property to remove this requirement.
Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Instead we can now instantiate the MAC_DBDMA object directly within the
macio device. We also add the DBDMA device as a child property so that
it is possible to retrieve later.
Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
These fields were used to manually handle IO requests that weren't aligned
to a sector boundary before this feature was supported by the block API.
Once the block API changed to support byte-aligned IO requests, the macio
controller was switched over to use it in commit be1e343 but these fields
were accidentally left behind. Remove them, including the initialisation
in DBDMA_init().
Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
When running with KVM PR, if a new HPT is allocated we need to inform
KVM about the HPT address and size. This is currently done by hacking
the value of SDR1 and pushing it to KVM in several places.
Also, migration breaks the guest since it is very unlikely the HPT has
the same address in source and destination, but we push the incoming
value of SDR1 to KVM anyway.
This patch introduces a new virtual hypervisor hook so that the spapr
code can provide the correct value of SDR1 to be pushed to KVM each
time kvmppc_put_books_sregs() is called.
It allows us to get rid of all the hacking in the spapr/kvmppc code and
it fixes migration of nested KVM PR.
Suggested-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
and exit before uselessly trying to load it if the file does not
exist.
Issue discovered by Coverity Scan.
Signed-off-by: Cédric Le Goater <clg@kaod.org>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Remove *all* unused CPU definitions as indicated by compile-time
`#if 0` constructs.
Signed-off-by: John Snow <jsnow@redhat.com>
[dwg: Removed some additional now-useless comments]
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
PHBs can be created with an index property, in which case the machine
code automatically sets all the MMIO windows at addresses derived from
the index. Alternatively, they can be manually created without index,
but the user has to provide addresses for all MMIO windows.
The non-index way happens to be more trouble than it's worth: it's
difficult to use, keeps requiring (potentially incompatible) changes
when some new parameter needs adding, and is awkward to check for
collisions. It currently even has a bug that prevents the use of two
non-index PHBs, because their child DRCs are all derived from the
same index == -1 value and thus collide.
This patch hence makes the index property mandatory. As a consequence,
the PHB's memory regions and BUID are now always configured according
to the index, and it is no longer possible to set them from the command
line.
This DOES BREAK backwards compat, but we don't think the non-index
PHB feature was used in practice (at least libvirt doesn't use it), and
the simplification is worth it.
Signed-off-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Although none of the existing macro call-sites were broken,
it's always better to write macros that properly parenthesize
arguments that can be complex expressions, so that the intended
order of operations is not broken.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
The use of KVM_PPC_GET_HTAB_FD is open-coded in kvmppc_read_hptes()
and kvmppc_write_hpte().
This patch modifies kvmppc_get_htab_fd() so that it can be used
everywhere we need to access the in-kernel htab:
- add an index argument
=> only kvmppc_read_hptes() passes an actual index, all other users
pass 0
- add an errp argument to propagate error messages to the caller.
=> spapr migration code prints the error
=> hpte helpers pass &error_abort to keep the current behavior
of hw_error()
While here, this also fixes a bug in kvmppc_write_hpte() so that it
opens the htab fd for writing instead of reading as it currently does.
This never broke anything because we currently never call this code,
as explained in the changelog of commit c138593380:
"This support updating htab managed by the hypervisor. Currently
we don't have any user for this feature. This actually bring the
store_hpte interface in-line with the load_hpte one. We may want
to use this when we want to emulate henter hcall in qemu for HV
kvm."
The above is still true today.
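The resulting helper signature is presumably along these lines (a
sketch; the exact prototype is not quoted in the message):

    /* write: open the htab fd for writing (restore) or reading (dump)
     * index: starting HPTE index; only kvmppc_read_hptes() passes non-0
     * errp:  migration code reports it, hpte helpers pass &error_abort */
    int kvmppc_get_htab_fd(bool write, uint64_t index, Error **errp);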
Signed-off-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
When kvmppc_get_htab_fd() fails, its return value is propagated up to
qemu_savevm_state_iterate() or to qemu_savevm_state_complete_precopy().
All savevm handlers expect to receive a negative errno on error.
Let's patch kvmppc_get_htab_fd() accordingly.
While here, let's change htab_load() in the spapr code to also
propagate the error, since it doesn't make sense to abort() if
we couldn't get the htab fd from KVM.
Signed-off-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
The timing register exists on all variants of MacIO IDE, we just
store and return its value.
The interrupts register only exists on KeyLargo, but it doesn't
hurt to have it. The lack of this register causes MacOS X to
hang under some circumstances.
Both are 32-bit only. The HW might support smaller access sizes
but no known OS uses them.
Because the core IDE subsystem doesn't provide us with a way
to query the main (level) interrupt state, nor do we have a way
to know that DBDMA issued an (edge) interrupt, we reflect both
through a private pair of qirq's in order to maintain the
register state.
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
This completely reworks the handling of the control register
according to my understanding of the HW and the spec.
It should (hopefully ... still testing) fix a number of issues
most notably cases of MacOS hanging.
Also update dbdma_unassigned_rw() and dbdma_unassigned_flush() to
have the expected behaviour now that flush is handled slightly
differently.
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
These registers are present in 440 SoCs (and maybe in others too) and
U-Boot accesses them when printing register info. We don't emulate
these but add them to avoid crashing when they are read or written.
Signed-off-by: BALATON Zoltan <balaton@eik.bme.hu>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
The following capabilities are VM specific:
- KVM_CAP_PPC_SMT_POSSIBLE
- KVM_CAP_PPC_HTAB_FD
- KVM_CAP_PPC_ALLOC_HTAB
If both KVM HV and KVM PR are present, checking them always returns
the HV value, even if we explicitly requested to use PR.
This has no visible effect for KVM_CAP_PPC_ALLOC_HTAB, because we also
try the KVM_PPC_ALLOCATE_HTAB ioctl, which is only supported by HV. As
a consequence, the spapr code doesn't even check KVM_CAP_PPC_HTAB_FD.
However, this will cause kvmppc_hint_smt_possible(), introduced by
commit fa98fbfcdf, to report several VSMT modes (eg, Available
VSMT modes: 8 4 2 1) whereas PR only supports mode 1.
This patch fixes all three anyway to use kvm_vm_check_extension(). It
is okay since the VM is already created at the time kvm_arch_init() or
kvmppc_reset_htab() is called.
Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
BQL bug fix
# gpg: Signature made Mon 25 Sep 2017 23:14:48 BST
# gpg: using RSA key 0x64DF38E8AF7E215F
# gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg: It is not certain that the signature belongs to the owner.
# Primary key fingerprint: 7A48 1E78 868B 4DB6 A85A 05C0 64DF 38E8 AF7E 215F
* remotes/rth/tags/pull-tcg-20170925:
accel/tcg/cputlb: avoid recursive BQL (fixes#1706296)
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This patch adds shrinking of the image file for qcow2. As a result, this
allows us to reduce the virtual image size and free up space on the disk
without copying the image. The image can be fragmented, and shrinking is
done by punching holes in the image file.
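The hole punching itself is plain Linux file-level functionality;
conceptually it boils down to something like the snippet below at the
file level (a simplified illustration, not the qcow2/file-posix code
path):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <linux/falloc.h>

    /* Deallocate [offset, offset + length) without changing the file
     * size, so the freed clusters stop occupying disk space. */
    static int punch_hole(int fd, off_t offset, off_t length)
    {
        return fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                         offset, length);
    }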
Signed-off-by: Pavel Butsykin <pbutsykin@virtuozzo.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Message-id: 20170918124230.8152-4-pbutsykin@virtuozzo.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
Whenever l2/refcount table clusters are discarded from the file, we can
automatically drop the unnecessary content of the cache tables. This
reduces the chance of evicting useful cache data and eliminates
inconsistencies between the cache and the data in the file.
Signed-off-by: Pavel Butsykin <pbutsykin@virtuozzo.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Message-id: 20170918124230.8152-3-pbutsykin@virtuozzo.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
The flag is an additional precaution against data loss. Perhaps in the
future shrinking without this flag will be blocked for all formats, but
for now we need to maintain compatibility with raw.
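Usage then looks along these lines (image name and size are examples):

    qemu-img resize --shrink disk.qcow2 10G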
Signed-off-by: Pavel Butsykin <pbutsykin@virtuozzo.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Message-id: 20170918124230.8152-2-pbutsykin@virtuozzo.com
[mreitz: Added a missing space to a warning]
Signed-off-by: Max Reitz <mreitz@redhat.com>
Migration capabilities should be enabled on both source and
destination qemu processes.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
This involves a temporary read-write reopen if the backing file link in
the middle of a backing file chain should be changed and is therefore a
good test for the latest bdrv_reopen() vs. op blockers fixes.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
If we switch between read-only and read-write, the permissions that
image format drivers need on bs->file change, too. Make sure to update
the permissions during bdrv_reopen().
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
We will calculate the required new permissions in the prepare stage of a
reopen. Required permissions of children can be influenced by the
changes made to their parents, but parents are independent from their
children. This means that permissions need to be calculated top-down. In
order to achieve this, queue parents before their children rather than
queuing the children first.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
When new permissions are calculated during bdrv_reopen(), they need to
be based on the state of the graph as it will be after the reopen has
completed, not on the current state of the involved nodes.
This patch makes bdrv_is_writable() optionally accept a BlockReopenQueue
from which the new flags are taken. This is then used for determining
the new bs->file permissions of format drivers as soon as we add the
code to actually pass a non-NULL reopen queue to the .bdrv_child_perm
callbacks.
While moving bdrv_is_writable(), make it static. It isn't used outside
block.c.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
In the context of bdrv_reopen(), we'll have to look at the state of the
graph as it will be after the reopen. This interface addition is in
preparation for the change.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
In the context of bdrv_reopen(), we'll have to look at the state of the
graph as it will be after the reopen. This interface addition is in
preparation for the change.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
qemu-io provides a 'reopen' command that allows switching from writable
to read-only access. We need to make sure that we don't try to keep
write permissions to a BlockBackend that becomes read-only, otherwise
things are going to fail.
This requires a bdrv_drain() call because otherwise in-flight AIO
write requests could issue new internal requests while the permission
has already gone away, which would cause assertion failures. Draining
the queue doesn't break AIO requests in any new way; bdrv_reopen() would
drain it anyway only a few lines later.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Remove the unnecessary home-grown redefinition of the assert() macro here,
and remove the unusable debug code at the end of the checkpoint() function.
The code there uses assert() with side effects (assignment to the "mapping"
variable), which should be avoided. Looking more closely, it seems it is
apparently also only usable for one particular directory layout (with a file
named USB.H in it) and thus is of no use for the rest of the world.
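As a generic illustration of why asserts with side effects are a
problem (the lookup function below stands in for whatever the vvfat
code was doing): with NDEBUG defined, the whole assert() expression
disappears, and the assignment disappears with it.

    /* Bad: the assignment vanishes in an NDEBUG build. */
    assert((mapping = find_mapping(s, cluster)) != NULL);

    /* Good: keep the side effect outside the assertion. */
    mapping = find_mapping(s, cluster);
    assert(mapping != NULL);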
Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
RestartData is the opaque data of the throttle_group_restart_queue_entry
coroutine. By being stack allocated, it isn't available anymore if
aio_co_enter schedules the coroutine with a bottom half and runs after
throttle_group_restart_queue returns.
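Illustrative before/after shape of such a fix; the field contents are
omitted and the snippet is not the actual throttle-groups diff:

    /* Before: rd lives on the caller's stack and may already be gone
     * when aio_co_enter() runs the coroutine from a bottom half. */
    RestartData rd = { /* ... */ };
    co = qemu_coroutine_create(throttle_group_restart_queue_entry, &rd);
    aio_co_enter(ctx, co);

    /* After: allocate on the heap; the coroutine frees it when done. */
    RestartData *rd = g_new(RestartData, 1);
    /* ... fill in fields ... */
    co = qemu_coroutine_create(throttle_group_restart_queue_entry, rd);
    aio_co_enter(ctx, co);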
Cc: qemu-stable@nongnu.org
Signed-off-by: Manos Pitsidianakis <el13635@mail.ntua.gr>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
If bkt->max == 0 and bkt->burst_length > 1 then we could have a
division by 0 in throttle_do_compute_wait(). That configuration is
however not permitted and is already detected by throttle_is_valid(),
but let's assert it in throttle_compute_wait() to make it explicit.
Found by Coverity (CID: 1381016).
Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
The default cpu model on s390x does not provide zPCI, which is
not yet wired up on tcg. Moreover, virtio-ccw is the standard
on s390x.
Using virtio-scsi will implicitly pick the right device, so just
switch to that for simplicity.
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: QingFeng Hao <haoqf@linux.vnet.ibm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
The default cpu model on s390x does not provide zPCI, which is
not yet wired up on tcg. Moreover, virtio-ccw is the standard
on s390x, so use the -ccw instead of the -pci versions of virtio
devices on s390x.
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: QingFeng Hao <haoqf@linux.vnet.ibm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
The default cpu model on s390x does not provide zPCI, which is
not yet wired up on tcg. Moreover, virtio-ccw is the standard
on s390x, so use the -ccw instead of the -pci versions of virtio
devices on s390x.
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: QingFeng Hao <haoqf@linux.vnet.ibm.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Block driver documentation is available in qemu-doc.html. It would be
convenient to have documentation for formats, protocols, and filter
drivers in a man page.
Extract the relevant part of qemu-doc.html into a new file called
docs/qemu-block-drivers.texi. This file can also be built as a
stand-alone document (man, html, etc).
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
People get surprised when, after "qemu-img create -f raw /dev/sdX", they
still see qcow2 with "qemu-img info", if previously the bdev had a qcow2
header. While this is natural because raw doesn't need to write any
magic bytes during creation, hdev_create is free to clear out the first
sector to make sure the stale qcow2 header doesn't cause such confusion.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
It's not too surprising when a user specifies the backing file relative
to the current working directory instead of the top layer image. This
causes an error when they differ. Though the error message has enough
information to infer the misunderstanding, it is better if we document
this explicitly, so that users don't have to learn from mistakes.
Signed-off-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
A basic set of qemu options is initialised in ./common:
export QEMU_OPTIONS="-nodefaults -machine accel=qtest"
However, two test cases (172 and 186) overwrite QEMU_OPTIONS and neglect
to manually set '-machine accel=qtest'. Add the missing option for 172.
186 probably only copied the code from 172; it doesn't actually need to
overwrite QEMU_OPTIONS, so remove that in 186.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Tested-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Add a firmware path config option to configure. Multiple directories
are accepted, with the usual colon as separator. Default value is
${prefix}/share/qemu-firmware. The path is searched in addition to the
current search path (typically ${prefix}/share/qemu).
This prepares qemu for the planned split of the prebuilt firmware blobs
into a separate project.
Distributions can also use this to get rid of the firmware symlink farm
and add -- for example -- /usr/share/seabios to the firmware path
instead.
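Presumably this is exposed as a configure option, so a distribution
build would do something like the following (the paths are examples):

    ./configure --firmwarepath=/usr/share/qemu-firmware:/usr/share/seabios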
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Message-id: 20170914114236.25343-3-kraxel@redhat.com
Add helper function to add a directory to the qemu search path, so we
don't duplicate the checks. Add a check for duplicate entries, so we
stop trying to open files twice.
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Message-id: 20170914114236.25343-2-kraxel@redhat.com
QEMU currently aborts if you try to use the device at the command
line:
$ ppc64-softmmu/qemu-system-ppc64 -S -machine prep -device pc87312
Unexpected error in qemu_chr_fe_init() at chardev/char-fe.c:222:
qemu-system-ppc64: -device pc87312: Device 'parallel0' is in use
Aborted (core dumped)
It uses parallel_hds in its realize function, so it cannot be
instantiated by the user again.
Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Hervé Poussineau <hpoussin@reactos.org>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
This is required to be removed on SmartOS (Illumos).
As of now there are no alternative supported SunOS distributions.
Signed-off-by: Kamil Rytarowski <n54@gmx.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
If QEMU has been compiled with the flags --enable-tcg-interpreter and
--enable-debug, the guest is running incredibly slow. The pxe boot test
can take up to 400 seconds when testing the pseries ppc64 machine. While
we should still look for ways to speed up the test on the pseries machine,
it's better to increase the timeout in this test to 600 seconds anyway,
to allow the test to pass successfully with this unusual configuration.
Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Stefan Weil <sw@weilnetz.de>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
If 'bs' is a complex expression, we were only casting the front half
rather than the full expression. Luckily, none of the callers were
passing bad arguments, but it's better to be robust up front.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
The virtio-gpu-pci device is already in the display category, so the
virtio-gpu-device should be there, too.
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
When using bit-wise operations that exploit the power-of-two
nature of the second argument of ROUND_UP(), we still need to
ensure that the mask is as wide as the first argument (done
by using a ternary to force proper arithmetic promotion).
Unpatched, ROUND_UP(2ULL*1024*1024*1024*1024, 512U) produces 0,
instead of the intended 2TiB, because negation of an unsigned
32-bit quantity followed by widening to 64-bits does not
sign-extend the mask.
Broken since its introduction in commit 292c8e50 (v1.5.0).
Callers that passed the same width type to both macro parameters,
or that had other code to ensure the first parameter's maximum
runtime value did not exceed the second parameter's width, are
unaffected, but I did not audit to see which (if any) existing
clients of the macro could trigger incorrect behavior (I found
the bug while adding a new use of the macro).
While preparing the patch, checkpatch complained about poor
spacing, so I also fixed that here and in the nearby DIV_ROUND_UP.
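To make the arithmetic concrete, here is a self-contained illustration
of the failure mode and of the kind of fix described (forcing the mask
to be promoted to the first argument's width via a never-taken ternary);
the exact macro text in QEMU may differ:

    #include <assert.h>
    #include <stdint.h>

    /* Broken shape: -(512U) is computed in 32 bits, and widening the
     * result to 64 bits zero-extends, so the mask loses its high bits. */
    #define ROUND_UP_BROKEN(n, d)  (((n) + (d) - 1) & -(d))

    /* Fixed shape: the ternary never evaluates (n), but promotes (d) to
     * the common type of (n) and (d) before the negation. */
    #define ROUND_UP_FIXED(n, d)   (((n) + (d) - 1) & -(0 ? (n) : (d)))

    int main(void)
    {
        uint64_t two_tib = 2ULL * 1024 * 1024 * 1024 * 1024;

        assert(ROUND_UP_BROKEN(two_tib, 512U) == 0);        /* the bug   */
        assert(ROUND_UP_FIXED(two_tib, 512U) == two_tib);   /* as wanted */
        return 0;
    }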
CC: qemu-trivial@nongnu.org
CC: qemu-stable@nongnu.org
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Laszlo Ersek <lersek@redhat.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
Instead of using the hardcoded (MemTxAttrs){0} for no memory attributes,
let's use the already defined MEMTXATTRS_UNSPECIFIED macro.
This is technically a change of behaviour as MEMTXATTRS_UNSPECIFIED sets
the unspecified field to 1, but it doesn't look like anything is
checking this field.
Signed-off-by: Alistair Francis <alistair.francis@xilinx.com>
Acked-by: Max Filippov <jcmvbkbc@gmail.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
The error path of baum_chr_open() needs to set brlapi to NULL, so it
won't get released twice in char_braille_finalize(), which would cause
"/usr/bin/qemu-system-x86_64: double free or corruption (!prev)"
Signed-off-by: Liang Yan <lyan@suse.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
Remove trailing whitespace in qemu-options documentation, as it causes
reproducibility issues depending on the echo implementation used by
the Makefile.
Reported-By: Vagrant Cascadian <vagrant@debian.org>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
It may be better to add a trace event to monitor the last moment of
a key event on its way from QEMU to the guest VM.
Signed-off-by: Liang Yan <lyan@suse.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
This device is private and is created once per aux-bus, so don't allow
the user to create one from the command line.
Reported-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: KONRAD Frederic <frederic.konrad@adacore.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
In qemu-thread-posix.c we have two implementations of the
various qemu_sem_* functions, one of which uses native POSIX
sem_* and the other of which emulates them with pthread conditions.
This is necessary because not all our host OSes support
sem_timedwait().
Instead of a hard-coded list of OSes which don't implement
sem_timedwait(), which gets out of date, make configure
test for the presence of the function and set a new
CONFIG_HAVE_SEM_TIMEDWAIT appropriately.
In particular, newer NetBSDs have sem_timedwait(), so this
commit will switch them over to using it. OSX still does
not have an implementation.
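Configure probes of this kind are typically just a tiny program that
only compiles and links if the function is available; something along
these lines (a sketch of the idea, not the exact snippet added to
configure):

    #include <semaphore.h>
    #include <time.h>

    int main(void)
    {
        sem_t s;
        struct timespec ts = { 0, 0 };

        sem_init(&s, 0, 0);
        sem_timedwait(&s, &ts);   /* fails to build where unsupported */
        return 0;
    }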
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Kamil Rytarowski <n54@gmx.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
This change fixes conflict with the DragonFly BSD headers.
Signed-off-by: Kamil Rytarowski <n54@gmx.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
If we are woken up from the while() loop in nbd_read_reply_entry,
the handles must be equal. If we are woken up from
nbd_recv_coroutines_wake_all, s->quit must be true, so we do
not need to check handle equality.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-Id: <20170920124507.18841-3-vsementsov@virtuozzo.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
If 'bs' is a complex expression, we were only casting the front half
rather than the full expression. Luckily, none of the callers were
passing bad arguments, but it's better to be robust up front.
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20170918214649.17550-1-eblake@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
NULL sockets are used for NDP, BOOTP, and other critical operations.
If the topmost mbuf in a NULL session is blocked pending resolution,
it may cause problems if it blocks other packets with a NULL socket.
So do not add mbufs with a NULL socket field to the same session.
Signed-off-by: Kevin Cernekee <cernekee@chromium.org>
Signed-off-by: Samuel Thibault <samuel.thibault@ens-lyon.org>
if_output() originally sent one mbuf per call and used the slirp->next_m
variable to keep track of where it left off. But nowadays it tries to
send all of the mbufs from the fastq, and one mbuf from each session on
the batchq. The next_m variable is both redundant and harmful: there is
a case[0] involving delayed packets in which next_m ends up pointing
to &slirp->if_batchq when an active session still exists, and this
blocks all traffic for that session until qemu is restarted.
The test case was created to reproduce a problem that was seen on
long-running Chromium OS VM tests[1] which rapidly create and
destroy ssh connections through hostfwd.
[0] https://pastebin.com/NNy6LreF
[1] https://bugs.chromium.org/p/chromium/issues/detail?id=766323
Signed-off-by: Kevin Cernekee <cernekee@chromium.org>
Signed-off-by: Samuel Thibault <samuel.thibault@ens-lyon.org>
e.g.
./x86_64-softmmu/qemu-system-x86_64 -nographic -netdev 'user,id=vnet,hostfwd=:555.0.0.0:0-:22'
qemu-system-x86_64: -netdev user,id=vnet,hostfwd=:555.0.0.0:0-:22: Invalid host forwarding rule ':555.0.0.0:0-:22' (Bad host address)
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Samuel Thibault <samuel.thibault@ens-lyon.org>
We have a per-chardev cache for the context, so we don't need this
parameter to be passed in every time chr_update_read_handler() is
called. As long as we call chr_update_read_handler() via
qemu_chr_be_update_read_handlers(), we'll be fine.
Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <1505975754-21555-5-git-send-email-peterx@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
It was only passed in by chr_update_read_handlers(). However, on
reconnect, we'll lose that context information. So if a chardev was
running on another context (rather than the default context, the NULL
pointer), it would switch back to the default context when reconnection
happens. But it should really stick to the old context.
Convert all the callers of io_add_watch_poll() to use the internally
cached gcontext. Then the context will be able to survive even across
reconnections.
Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <1505975754-21555-4-git-send-email-peterx@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
It caches the gcontext that is used to poll the chardev IO. Before this
patch, we only passed it in via chr_update_read_handlers(). However,
that may not be enough if the char backend is disconnected and
reconnected afterward. There is still chardev code that assumes the
context is NULL (which is the main context). That will be fixed up in
follow-up patches.
Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <1505975754-21555-3-git-send-email-peterx@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Proper support of persistent reservation for multipath devices requires
communication with the multipath daemon, so that the reservation is
registered and applied when a path comes up. The device mapper
utilities provide a library to do so; this patch makes qemu-pr-helper.c
detect multipath devices and, when one is found, delegate the operation
to libmpathpersist.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Introduce a privileged helper to run persistent reservation commands.
This lets virtual machines send persistent reservations without using
CAP_SYS_RAWIO or out-of-tree patches. The helper uses Unix permissions
and SCM_RIGHTS to restrict access to processes that can access its socket
and prove that they have an open file descriptor for a raw SCSI device.
The next patch will also correct the usage of persistent reservations
with multipath devices.
It would also be possible to support for Linux's IOC_PR_* ioctls in
the future, to support NVMe devices. For now, however, only SCSI is
supported.
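The SCM_RIGHTS part is standard socket API; receiving a file descriptor
from a client looks roughly like this (simplified, minimal error
handling, not the helper's actual code):

    #include <string.h>
    #include <sys/socket.h>
    #include <sys/uio.h>

    /* Receive one fd passed over a Unix socket via SCM_RIGHTS. */
    static int recv_fd(int sock)
    {
        char byte, ctl[CMSG_SPACE(sizeof(int))];
        struct iovec iov = { .iov_base = &byte, .iov_len = 1 };
        struct msghdr msg = {
            .msg_iov = &iov, .msg_iovlen = 1,
            .msg_control = ctl, .msg_controllen = sizeof(ctl),
        };
        struct cmsghdr *cmsg;
        int fd = -1;

        if (recvmsg(sock, &msg, 0) <= 0) {
            return -1;
        }
        for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
            if (cmsg->cmsg_level == SOL_SOCKET &&
                cmsg->cmsg_type == SCM_RIGHTS) {
                memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
            }
        }
        return fd;
    }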
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
This modification is necessary for userfault fd features which have to
be requested from userspace.
UFFD_FEATURE_THREAD_ID is one such "on demand" feature, which will
be introduced in the next patch.
QEMU has to use a separate userfault file descriptor because the
userfault context has internal state: after the first UFFD_API ioctl it
changes its state to UFFD_STATE_RUNNING (in case of success), but while
handling the UFFD_API ioctl the kernel expects UFFD_STATE_WAIT_API.
So only one UFFD_API ioctl is possible per ufd.
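In terms of the raw kernel API, the handshake the text refers to looks
roughly like this (simplified; real code checks errors and the returned
feature mask):

    #include <stdint.h>
    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <linux/userfaultfd.h>

    /* Each ufd can go through the UFFD_API handshake exactly once. */
    static int open_userfaultfd(uint64_t requested_features)
    {
        struct uffdio_api api = {
            .api = UFFD_API,
            .features = requested_features, /* e.g. UFFD_FEATURE_THREAD_ID */
        };
        int ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

        if (ufd < 0) {
            return -1;
        }
        if (ioctl(ufd, UFFDIO_API, &api) < 0) {
            close(ufd); /* unsupported features, or handshake already done */
            return -1;
        }
        return ufd;     /* api.features now holds what the kernel supports */
    }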
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Alexey Perevalov <a.perevalov@samsung.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
That tiny refactoring is necessary to be able to set
UFFD_FEATURE_THREAD_ID while requesting features, and then
to create the downtime context in case the kernel supports it.
Signed-off-by: Alexey Perevalov <a.perevalov@samsung.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
Fill the postcopy-able pending only if ram postcopy is enabled.
This is necessary because there will be other postcopy-able states,
and when ram postcopy is disabled, it should not spoil the common
postcopy-related pending.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
Currently, postcopy-able states are recognized by a non-NULL
save_live_complete_postcopy handler. But when we have several different
postcopy-able states, that is not convenient: ram postcopy may be
disabled while some other postcopy is enabled, and in that case the ram
state should behave as if it is not postcopy-able.
This patch adds a separate has_postcopy handler to specify the behaviour
of a savevm state.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
Provide helpers to convert bitmaps to little endian format. They can be
used when we want to send a bitmap over the network to other hosts.
One thing to mention is that these helpers only solve the problem of
endianness; they do not solve the problem of different word sizes on
different machines (bitmaps managing the same number of bits may have
different malloced sizes). So for now we need to take care of the size
alignment issue in the callers.
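A sketch of what such a conversion amounts to for the usual
unsigned-long-backed bitmaps; the helper name here is illustrative and
not necessarily the one added by the patch:

    /* Byte-swap an unsigned-long-backed bitmap into little endian.
     * Only endianness is handled; callers still agree on the size. */
    static void bitmap_to_le_sketch(unsigned long *dst,
                                    const unsigned long *src, long nbits)
    {
        long i;

        for (i = 0; i < BITS_TO_LONGS(nbits); i++) {
    #if HOST_LONG_BITS == 64
            dst[i] = cpu_to_le64(src[i]);
    #else
            dst[i] = cpu_to_le32(src[i]);
    #endif
        }
    }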
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
Creation of the threads, nothing inside yet.
Signed-off-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
--
Use pointers instead of long array names
Move to use semaphores instead of conditions, as Paolo suggested
Put all the state inside one struct.
Use a counter for the number of threads created. Needed during cancellation.
Add error return to thread creation
Add id field
Rename functions to multifd_save/load_setup/cleanup
Change recv parameters to a pointer to struct
Change back to a struct
Use Error * for _cleanup
Indicates how many pages we are going to send in each batch to a multifd
thread.
Signed-off-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
--
Be consistent with defaults and documentation
Use new DEFINE_PROP_*
Rename x-multifd-group to x-multifd-page-count
Indicates the number of channels that we will create. By default we
create 2 channels.
Signed-off-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
--
Catch inconsistent defaults (Eric).
Improve the comment stating that the number of threads is the same as
the number of sockets
Use new DEFINE_PROP_*
Rename x-multifd-threads to x-multifd-channels
This function allows us to decide when to close the listener socket.
For now, we only need one connection.
Signed-off-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
As this is only defined in glib 2.32, add compatibility macros for older glib versions.
Signed-off-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
We pass the ioc instead of the fd. This will allow us to have more
than one channel open. We also make sure that we set the
from_src_file sooner, so we don't need to pass it as a parameter.
Signed-off-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
--
Do not assign mis->from_src_file (peterxu)
# gpg: Signature made Fri 22 Sep 2017 08:28:38 BST
# gpg: using RSA key 0xCA35624C6A9171C6
# gpg: Good signature from "Fam Zheng <famz@redhat.com>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg: It is not certain that the signature belongs to the owner.
# Primary key fingerprint: 5003 7CB7 9706 0F76 F021 AD56 CA35 624C 6A91 71C6
* remotes/famz/tags/build-and-test-automation-pull-request: (36 commits)
docker: Drop 'set -e' from run script
docker: Use archive-source.py
tests: Add README for vm tests
MAINTAINERS: Add tests/vm entry
Makefile: Add rules to run vm tests
tests: Add OpenBSD image
tests: Add NetBSD image
tests: Add FreeBSD image
tests: Add ubuntu.i386 image
tests: Add vm test lib
tests: Add a test key pair
scripts: Add archive-source.sh
qemu.py: Add "wait()" method
gitignore: Ignore vm test images
MAINTAINERS: Fix subsystem name for "Build and test automation"
buildsys: Move rdma libs to per object
buildsys: Move brlapi libs to per object
buildsys: Move usb redir cflags/libs to per object
buildsys: Move libusb cflags/libs to per object
buildsys: Move libcacard cflags/libs to per object
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
The only prototype doesn't need anything from the lib header, and not
including it here allows files that include this header, for example
vl.c, to compile without the libseccomp cflags.
The breakage has existed since c3883e1f93 for environments where
"pkg-config --cflags libseccomp" is non-empty.
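The shape of the fix, sketched with hypothetical names (the real QEMU
header and prototype may differ): keep only the prototype in the public
header and confine <seccomp.h> to the one .c file that implements it.

    /* sysemu/seccomp-sketch.h -- hypothetical minimal header */
    #ifndef QEMU_SECCOMP_SKETCH_H
    #define QEMU_SECCOMP_SKETCH_H

    /* Deliberately no #include <seccomp.h> here, so files like vl.c that
     * include this header build without the libseccomp cflags. */
    int seccomp_start_sketch(void);

    #endif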
Reported-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Fam Zheng <famz@redhat.com>
Acked-by: Eduardo Otubo <otubo@redhat.com>
Message-id: 20170920083647.14599-1-famz@redhat.com
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
The migration interface for ais was introduced with kernel 4.13,
but the capability itself had been active since 4.12. As migration
support is considered necessary, let's disable ais in the 2.10
stable version. A proper fix and re-enablement will be done
for qemu 2.11.
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Message-Id: <20170921140834.14233-2-borntraeger@de.ibm.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
This is the common code to implement a "VM test", which will:
1) Download and initialize a pre-defined VM that has the necessary
dependencies to build QEMU, plus SSH access.
2) Archive $SRC_PATH to a .tar file.
3) Boot the VM, and pass the source tar file to the guest.
4) SSH into the VM, untar the source tarball, build from the source.
Signed-off-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
The subsystem name for the "Build and test automation" section is
"-------------------------", because an actual subsystem name
line is missing:
$ ./scripts/get_maintainer.pl -f tests/docker/docker.py
"Alex Bennée" <alex.bennee@linaro.org> (maintainer:-----------------...)
Fam Zheng <famz@redhat.com> (maintainer:-----------------...)
"Philippe Mathieu-Daudé" <f4bug@amsat.org> (reviewer:-----------------...)
qemu-devel@nongnu.org (open list:-----------------...)
Fix the issue by inserting a subsystem name line where
get_maintainer.pl expects it.
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Message-Id: <20170921170209.9101-1-ehabkost@redhat.com>
Signed-off-by: Fam Zheng <famz@redhat.com>
The 'run' script already creates the src, build and install directories
under $TEST_DIR; use them in common.rc.
Also, the tests always run from $QEMU_SRC/tests/docker, so use a relative
$CMD string.
Message-Id: <20170817035721.11064-1-famz@redhat.com>
Signed-off-by: Fam Zheng <famz@redhat.com>
If you invoke the build with NOCACHE=1, we pass --no-cache in the argv
to docker.py but may still not force a rebuild if the dockerfile checksum
hasn't changed. By testing for the flag's presence we can force builds
without having to manually remove the docker image.
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Message-Id: <20170725133425.436-5-alex.bennee@linaro.org>
Signed-off-by: Fam Zheng <famz@redhat.com>
It is a common requirement for a virtual machine to send persistent
reservations, but this currently requires either running QEMU with
CAP_SYS_RAWIO, or using out-of-tree patches that let an unprivileged
QEMU bypass Linux's filter on SG_IO commands.
As an alternative mechanism, the next patches will introduce a
privileged helper to run persistent reservation commands without
expanding QEMU's attack surface unnecessarily.
The helper is invoked through a "pr-manager" QOM object, to which
file-posix.c passes SG_IO requests for PERSISTENT RESERVE OUT and
PERSISTENT RESERVE IN commands. For example:
$ qemu-system-x86_64 \
    -device virtio-scsi \
    -object pr-manager-helper,id=helper0,path=/var/run/qemu-pr-helper.sock \
    -drive if=none,id=hd,driver=raw,file.filename=/dev/sdb,file.pr-manager=helper0 \
    -device scsi-block,drive=hd
or:
$ qemu-system-x86_64 \
    -device virtio-scsi \
    -object pr-manager-helper,id=helper0,path=/var/run/qemu-pr-helper.sock \
    -blockdev node-name=hd,driver=raw,file.driver=host_device,file.filename=/dev/sdb,file.pr-manager=helper0 \
    -device scsi-block,drive=hd
Multiple pr-manager implementations are conceivable and possible, though
only one is implemented right now. For example, a pr-manager could:
- talk directly to the multipath daemon from a privileged QEMU
(i.e. QEMU links to libmpathpersist); this makes reservation work
properly with multipath, but still requires CAP_SYS_RAWIO
- use the Linux IOC_PR_* ioctls (they require CAP_SYS_ADMIN though)
- more interestingly, implement reservations directly in QEMU
through file system locks or a shared database (e.g. sqlite)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
This shares a cached empty FlatView among address spaces. The empty
FV is used every time a root MR renders into a FV with no memory
sections, which happens when the MR or its children are not enabled or
are zero-sized. The empty_view is not NULL in order to keep the rest of
the memory API intact; it also has a dispatch tree for the same reason.
On a POWER8 guest with 255 CPUs, 255 virtio-net devices and 40 PCI
bridges, this halves the number of FlatViews in use (557 -> 260) and of
dispatch tables (~800000 -> ~370000). In an unrelated experiment with
112 non-virtio devices on x86 ("-M pc"), only 4 FlatViews are alive,
and about 2000 are created at startup.
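A very rough sketch of the caching idea (the helper name render_flatview
and the flow below are illustrative, not the actual memory.c code):

    /* Shared, permanently referenced empty view, created once at startup. */
    static FlatView *empty_view;

    static FlatView *flatview_for_root(MemoryRegion *root)
    {
        FlatView *fv = render_flatview(root);   /* hypothetical render helper */

        if (fv->nr == 0) {       /* root or its children disabled/zero-sized */
            flatview_unref(fv);
            fv = empty_view;     /* hand out the cached empty view instead... */
            flatview_ref(fv);    /* ...with normal refcounting kept intact */
        }
        return fv;
    }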
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Message-Id: <20170921085110.25598-16-aik@ozlabs.ru>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
A container can be used instead of an alias to allow switching between
multiple subregions. In this case we cannot directly share the
subregions (since they only belong to a single parent), but if the
subregions are aliases we can in turn walk those.
This is not enough to remove all sources of quadratic FlatView creation,
but it enables sharing of the PCI bus master FlatViews (and their
AddressSpaceDispatch structures) across all PCI devices. For 112
virtio-net-pci devices, boot time is reduced from 25 to 10 seconds and
memory consumption from 1.4 to 1 G.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
This avoids the usual memory_region_transaction_commit(), which rebuilds
all FVs.
On a POWER8 guest with 255 CPUs, 255 virtio-net devices and 40 PCI
bridges, this brings the boot time down from 25s to 20s and reduces the
number of temporary FVs allocated during machine construction
(~800000 -> ~640000) and of temporary dispatch trees (~370000 -> ~300000);
the total memory footprint goes down (18G -> 17G).
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Message-Id: <20170921085110.25598-18-aik@ozlabs.ru>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Since FlatViews are now shared and ASes are not, this gets rid of
address_space_init_shareable().
This should cause no behavioural change.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Message-Id: <20170921085110.25598-17-aik@ozlabs.ru>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
This adds a new "-d" switch to "info mtree" to print dispatch tree
internals.
This also changes the way "-f" is handled: it now prints flat views and
their associated address spaces.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Message-Id: <20170921085110.25598-15-aik@ozlabs.ru>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
This allows sharing flat views between address spaces (AS) when the
same root memory region is used when creating a new address space.
This is done by walking through all ASes and caching one FlatView per
physical root MR (i.e. not aliased).
This removes the search for duplicates from address_space_init_shareable(),
as FlatViews are shared elsewhere and keeping as::ref_count correct seems
an unnecessary and useless complication.
This should cause no change in memory use or boot time yet.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Message-Id: <20170921085110.25598-13-aik@ozlabs.ru>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Address spaces get to keep a root MR (alias or not), but FlatView stores
the actual MR, as this is going to be used later on to decide whether to
share a particular FlatView or not.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Message-Id: <20170921085110.25598-10-aik@ozlabs.ru>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
We store AddressSpaceDispatch* in FlatView anyway so there is no need
to carry it from mem_add() to register_subpage/register_multipage.
This should cause no behavioural change.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Message-Id: <20170921085110.25598-8-aik@ozlabs.ru>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
The AS in ASD is only used to pass the AS from mem_begin() to
register_subpage() to store it in MemoryRegionSection; we can do this
directly now.
This should cause no behavioural change.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Message-Id: <20170921085110.25598-6-aik@ozlabs.ru>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
As we are going to share FlatViews between AddressSpaces, and
AddressSpaceDispatch is a structure to perform quick lookups in a
FlatView, this moves the ASD to the FlatView.
Since the ASD rendering was open-coded previously, we can also remove
as->next_dispatch, as the new FlatView pointer is stored on the stack
and set on an AS atomically.
flatview_destroy() is now executed under RCU instead of
address_space_dispatch_free().
This makes mem_begin/mem_commit work with the ASD and mem_add with the
FV, as later on mem_add will be taking an FV as an argument anyway.
This should cause no behavioural change.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Message-Id: <20170921085110.25598-5-aik@ozlabs.ru>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
This moves the FlatView allocation and initialization to a helper.
While we are here, replace g_new with g_new0 so we do not have to bother
if we add new fields in the future.
This should cause no behavioural change.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Message-Id: <20170921085110.25598-4-aik@ozlabs.ru>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
We are going to share FlatViews between AddressSpaces, and per-AS
memory listeners won't suit the purpose anymore, so open-code the
dispatch tree rendering.
Since there is a good chance that dispatch_listener was the only
listener, this avoids address_space_update_topology_pass() if there are
no registered listeners; this should improve startup time.
This should cause no behavioural change.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Message-Id: <20170921085110.25598-3-aik@ozlabs.ru>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
This adds an AS** parameter to address_space_do_translate()
to make it easier for the next patch to share FlatViews.
This should cause no behavioural change.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Message-Id: <20170921085110.25598-2-aik@ozlabs.ru>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
It's possible for address_space_get_flatview() as it currently stands
to cause a use-after-free for the returned FlatView, if the reference
count is incremented after the FlatView has been replaced by a writer:
   thread 1              thread 2              RCU thread
   --------------------------------------------------------------
   rcu_read_lock
   read as->current_map
                         set as->current_map
                         flatview_unref
                            '--> call_rcu
   flatview_ref
     [ref=1]
   rcu_read_unlock
                                               flatview_destroy
   <badness>
Since FlatViews are not updated very often, we can just detect the
situation using a new atomic op atomic_fetch_inc_nonzero, similar to
Linux's atomic_inc_not_zero, which performs the refcount increment only if
it hasn't already hit zero. This is similar to Linux commit de09a9771a53
("CRED: Fix get_task_cred() and task_state() to not resurrect dead
credentials", 2010-07-29).
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
target-arm queue:
* more preparatory work for v8M support
* convert some omap devices away from old_mmio
* remove out of date ARM ARM section references in comments
* add the Smartfusion2 board
# gpg: Signature made Thu 21 Sep 2017 17:40:40 BST
# gpg: using RSA key 0x3C2525ED14360CDE
# gpg: Good signature from "Peter Maydell <peter.maydell@linaro.org>"
# gpg: aka "Peter Maydell <pmaydell@gmail.com>"
# gpg: aka "Peter Maydell <pmaydell@chiark.greenend.org.uk>"
# Primary key fingerprint: E1A5 C593 CD41 9DE2 8E83 15CF 3C25 25ED 1436 0CDE
* remotes/pmaydell/tags/pull-target-arm-20170921: (31 commits)
msf2: Add Emcraft's Smartfusion2 SOM kit
msf2: Add Smartfusion2 SoC
msf2: Add Smartfusion2 SPI controller
msf2: Microsemi Smartfusion2 System Register block
msf2: Add Smartfusion2 System timer
hw/arm/omap2.c: Don't use old_mmio
hw/i2c/omap_i2c.c: Don't use old_mmio
hw/timer/omap_gptimer: Don't use old_mmio
hw/timer/omap_synctimer.c: Don't use old_mmio
hw/gpio/omap_gpio.c: Don't use old_mmio
hw/arm/palm.c: Don't use old_mmio for static_ops
target/arm: Remove out of date ARM ARM section references in A64 decoder
nvic: Support banked exceptions in acknowledge and complete
nvic: Make SHCSR banked for v8M
nvic: Make ICSR banked for v8M
target/arm: Handle banking in negative-execution-priority check in cpu_mmu_index()
nvic: Handle v8M changes in nvic_exec_prio()
nvic: Disable the non-secure HardFault if AIRCR.BFHFNMINS is clear
nvic: Implement v8M changes to fixed priority exceptions
nvic: In escalation to HardFault, support HF not being priority -1
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
In the A64 decoder, we have a lot of references to section numbers
from version A.a of the v8A ARM ARM (DDI0487). This version of the
document is now long obsolete (we are currently on revision B.a),
and various intervening versions renumbered all the sections.
The most recent B.a version of the document doesn't assign
section numbers at all to the individual instruction classes
in the way that the various A.x versions did. The simplest thing
to do is just to delete all the out of date C.x.x references.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Message-id: 20170915150849.23557-1-peter.maydell@linaro.org
Update armv7m_nvic_acknowledge_irq() and armv7m_nvic_complete_irq()
to handle banked exceptions:
* acknowledge needs to use the correct vector, which may be
in sec_vectors[]
* acknowledge needs to return to its caller whether the
exception should be taken to secure or non-secure state
* complete needs its caller to tell it whether the exception
being completed is a secure one or not
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1505240046-11454-20-git-send-email-peter.maydell@linaro.org
Now that we have a banked FAULTMASK register and banked exceptions,
we can implement the correct check in cpu_mmu_index() for whether
the MPU_CTRL.HFNMIENA bit's effect should apply. This bit causes
handlers which have requested a negative execution priority to run
with the MPU disabled. In v8M the test has to check this for the
current security state and so takes account of banking.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1505240046-11454-17-git-send-email-peter.maydell@linaro.org
Update nvic_exec_prio() to support the v8M changes:
* BASEPRI, FAULTMASK and PRIMASK are all banked
* AIRCR.PRIS can affect NS priorities
* AIRCR.BFHFNMINS affects FAULTMASK behaviour
These changes mean that it's no longer possible to
definitely say that if FAULTMASK is set it overrides
PRIMASK, and if PRIMASK is set it overrides BASEPRI
(since if PRIMASK_NS is set and AIRCR.PRIS is set then
whether that 0x80 priority should take effect or the
priority in BASEPRI_S depends on the value of BASEPRI_S,
for instance). So we switch to the same approach used
by the pseudocode of working through BASEPRI, PRIMASK
and FAULTMASK and overriding the previous values if
needed.
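Sketched very loosely (a hypothetical helper that ignores AIRCR.PRIS,
BFHFNMINS and banking, i.e. precisely the complications described
above), the "work through each register and override" shape is:

    #include <stdbool.h>

    /* Illustrative only: one security state, no PRIS/BFHFNMINS effects. */
    static int exec_prio_shape(unsigned basepri, bool primask, bool faultmask,
                               unsigned prigroup)
    {
        int running = 0x100;                 /* sentinel: no boosting in effect */

        if (basepri) {                       /* BASEPRI == 0 means "disabled" */
            running = basepri & ~((1u << (prigroup + 1)) - 1);
        }
        if (primask) {
            running = 0;
        }
        if (faultmask) {
            running = -1;
        }
        return running;
    }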
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1505240046-11454-16-git-send-email-peter.maydell@linaro.org
If AIRCR.BFHFNMINS is clear, then although NonSecure HardFault
can still be pended via SHCSR.HARDFAULTPENDED it mustn't actually
preempt execution. The simple way to achieve this is to clear the
enable bit for it, since the enable bit isn't guest visible.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1505240046-11454-15-git-send-email-peter.maydell@linaro.org
In v7M, the fixed-priority exceptions are:
Reset: -3
NMI: -2
HardFault: -1
In v8M, this changes because Secure HardFault may need
to be prioritised above NMI:
Reset: -4
Secure HardFault if AIRCR.BFHFNMINS == 1: -3
NMI: -2
Secure HardFault if AIRCR.BFHFNMINS == 0: -1
NonSecure HardFault: -1
Make these changes, including support for changing the
priority of Secure HardFault as AIRCR.BFHFNMINS changes.
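As a purely illustrative helper (not QEMU's code), the rules above
amount to:

    #include <stdbool.h>

    /* Priority of a HardFault exception under the v8M rules listed above;
     * bfhfnmins is the AIRCR.BFHFNMINS bit. */
    static int hardfault_prio_v8m(bool secure, bool bfhfnmins)
    {
        if (secure) {
            return bfhfnmins ? -3 : -1;   /* above NMI only when BFHFNMINS == 1 */
        }
        return -1;                        /* NonSecure HardFault */
    }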
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1505240046-11454-14-git-send-email-peter.maydell@linaro.org
When escalating to HardFault, we must go into Lockup if we
can't take the synchronous HardFault because the current
execution priority is already at or below the priority of
HardFault. In v7M HF is always priority -1 so a simple < 0
comparison sufficed; in v8M the priority of HardFault can
vary depending on whether it is a Secure or NonSecure
HardFault, so we must check against the priority of the
HardFault exception vector we're about to use.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1505240046-11454-13-git-send-email-peter.maydell@linaro.org
In armv7m_nvic_set_pending() we have to compare the
priority of an exception against the execution priority
to decide whether it needs to be escalated to HardFault.
In the specification this is a comparison against the
exception's group priority; for v7M we implemented it
as a comparison against the raw exception priority
because the two comparisons will always give the
same answer. For v8M the existence of AIRCR.PRIS and
the possibility of different PRIGROUP values for secure
and nonsecure exceptions means we need to explicitly
calculate the vector's group priority for this check.
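The group priority itself is just the raw priority with the subpriority
bits masked off according to the (in v8M, banked) PRIGROUP field;
roughly, as an illustrative helper rather than the exact QEMU code:

    /* Group priority of a configurable exception: PRIGROUP selects how many
     * of the low bits are subpriority, which is ignored for preemption. */
    static int exc_group_prio(int rawprio, unsigned prigroup)
    {
        if (rawprio < 0) {
            return rawprio;       /* fixed negative priorities are not grouped */
        }
        return rawprio & ~((1u << (prigroup + 1)) - 1);
    }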
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1505240046-11454-12-git-send-email-peter.maydell@linaro.org
Make the armv7m_nvic_set_pending() and armv7m_nvic_clear_pending()
functions take a bool indicating whether to pend the secure
or non-secure version of a banked interrupt, and update the
callsites accordingly.
In most callsites we can simply pass the correct security
state in; in a couple of cases we use TODO comments to indicate
that we will return to the code in a subsequent commit.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1505240046-11454-10-git-send-email-peter.maydell@linaro.org
For v8M, the NVIC has a new set of registers per interrupt,
NVIC_ITNS<n>. These determine whether the interrupt targets Secure
or Non-secure state. Implement the register read/write code for
these, and make them cause NVIC_IABR, NVIC_ICER, NVIC_ISER,
NVIC_ICPR, NVIC_IPR and NVIC_ISPR to RAZ/WI for non-secure
accesses to fields corresponding to interrupts which are
configured to target secure state.
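A cut-down sketch of the gating (types and field names here are
assumptions made for illustration, not the real armv7m_nvic.c
structures):

    #include <stdbool.h>
    #include <stdint.h>

    #define SKETCH_NUM_IRQ 512

    typedef struct {
        bool itns[SKETCH_NUM_IRQ];     /* true: interrupt targets Non-secure */
        bool enabled[SKETCH_NUM_IRQ];
    } NVICSketch;

    /* Read one NVIC_ISER-style 32-bit word: bits for interrupts that target
     * Secure state read as zero for a Non-secure access (RAZ/WI). */
    static uint32_t read_enable_word(const NVICSketch *s, int base_irq,
                                     bool access_is_secure)
    {
        uint32_t val = 0;

        for (int i = 0; i < 32 && base_irq + i < SKETCH_NUM_IRQ; i++) {
            int irq = base_irq + i;

            if (!access_is_secure && !s->itns[irq]) {
                continue;              /* RAZ for NS access to a Secure IRQ */
            }
            if (s->enabled[irq]) {
                val |= 1u << i;
            }
        }
        return val;
    }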
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1505240046-11454-8-git-send-email-peter.maydell@linaro.org
The Application Interrupt and Reset Control Register has some changes
for v8M:
* new bits SYSRESETREQS, BFHFNMINS and PRIS: these all have
real state if the security extension is implemented and otherwise
are constant
* the PRIGROUP field is banked between security states
* non-secure code can be blocked from using the SYSRESET bit
to reset the system if SYSRESETREQS is set
Implement the new state and the changes to register read and write.
For the moment we ignore the effects of the secure PRIGROUP.
We will implement the effects of PRIS and BFHFNMINS later.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1505240046-11454-6-git-send-email-peter.maydell@linaro.org
Instead of looking up the pending priority
in nvic_pending_prio(), cache it in a new state struct
field. The calculation of the pending priority given
the interrupt number is more complicated in v8M with
the security extension, so the caching will be worthwhile.
This changes nvic_pending_prio() from returning a full
(group + subpriority) priority value to returning a group
priority. This doesn't require changes to its callsites
because we use it only in comparisons of the form
execution_prio > nvic_pending_prio()
and execution priority is always a group priority, so
a test (exec prio > full prio) is true if and only if
(exec prio > group prio).
(Architecturally the expected comparison is with the
group priority for this sort of "would we preempt" test;
we were only doing a test with a full priority as an
optimisation to avoid the mask, which is possible
precisely because the two comparisons always give the
same answer.)
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1505240046-11454-5-git-send-email-peter.maydell@linaro.org
With banked exceptions, just the exception number in
s->vectpending is no longer sufficient to uniquely identify
the pending exception. Add a vectpending_is_s_banked bool
which is true if the exception is using the sec_vectors[]
array.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Message-id: 1505240046-11454-4-git-send-email-peter.maydell@linaro.org
For the v8M security extension, some exceptions must be banked
between security states. Add the new vecinfo array which holds
the state for the banked exceptions and migrate it if the
CPU the NVIC is attached to implements the security extension.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
In v8M the MSR and MRS instructions have extra register value
encodings to allow secure code to access the non-secure banked
version of various special registers.
(We don't implement the MSPLIM_NS or PSPLIM_NS aliases, because
we don't currently implement the stack limit registers at all.)
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1505240046-11454-2-git-send-email-peter.maydell@linaro.org
This avoids a name clash with the access macro on Windows 64:
make
CHK version_gen.h
CC aarch64-softmmu/memory.o
/home/konrad/qemu/memory.c: In function 'access_with_adjusted_size':
/home/konrad/qemu/memory.c:591:73: error: macro "access" passed 7 arguments, \
but takes just 2
(size - access_size - i) * 8, access_mask, attrs);
^
Signed-off-by: KONRAD Frederic <frederic.konrad@adacore.com>
Message-Id: <1505988260-8483-1-git-send-email-frederic.konrad@adacore.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
pflash toggles mr->romd_mode, so this assert does not always hold:
1) a device was added with !mr->romd_mode, therefore effectively not
creating a kvm slot as we want to trap every access (add = false).
2) mr->romd_mode was toggled on before removing it. There is now
actually no slot to remove and the assert is wrong.
So let's just drop the assert.
Reported-by: Gerd Hoffmann <kraxel@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20170920145025.19403-1-david@redhat.com>
Tested-by: Gerd Hoffmann <kraxel@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
We should guarantee that RAM will not be modified while the VM is in a
stopped state, otherwise it can lead to negative consequences during
postcopy migration. In the RUN_STATE_FINISH_MIGRATE step, it's expected
that RAM on the source side will not be modified, as that could lead to
an inconsistent vm state on the destination side. RAM access during
postcopy-ram migration with the release-ram capability enabled can also
lead to sad consequences.
Let's add an enable_backend() callback to avoid undesirable virtqueue
changes in the guest memory.
Signed-off-by: Pavel Butsykin <pbutsykin@virtuozzo.com>
Message-Id: <20170919120733.22020-1-pbutsykin@virtuozzo.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
@@ -167,16 +167,17 @@ gicv3_redist_set_irq(uint32_t cpu, int irq, int level) "GICv3 redistributor 0x%x
 gicv3_redist_send_sgi(uint32_t cpu, int irq) "GICv3 redistributor 0x%x pending SGI %d"
 # hw/intc/armv7m_nvic.c
-nvic_recompute_state(int vectpending, int exception_prio) "NVIC state recomputed: vectpending %d exception_prio %d"
 nvic_set_prio(int irq, uint8_t prio) "NVIC set irq %d priority %d"
+nvic_recompute_state(int vectpending, int vectpending_prio, int exception_prio) "NVIC state recomputed: vectpending %d vectpending_prio %d exception_prio %d"
+nvic_recompute_state_secure(int vectpending, bool vectpending_is_s_banked, int vectpending_prio, int exception_prio) "NVIC state recomputed: vectpending %d is_s_banked %d vectpending_prio %d exception_prio %d"
 nvic_irq_update(int vectpending, int pendprio, int exception_prio, int level) "NVIC vectpending %d pending prio %d exception_prio %d: setting irq line to %d"
 nvic_escalate_prio(int irq, int irqprio, int runprio) "NVIC escalating irq %d to HardFault: insufficient priority %d >= %d"
 nvic_escalate_disabled(int irq) "NVIC escalating irq %d to HardFault: disabled"
-nvic_set_pending(int irq, int en, int prio) "NVIC set pending irq %d (enabled: %d priority %d)"
-nvic_clear_pending(int irq, int en, int prio) "NVIC clear pending irq %d (enabled: %d priority %d)"
+nvic_set_pending(int irq, bool secure, int en, int prio) "NVIC set pending irq %d secure-bank %d (enabled: %d priority %d)"
+nvic_clear_pending(int irq, bool secure, int en, int prio) "NVIC clear pending irq %d secure-bank %d (enabled: %d priority %d)"
 nvic_set_pending_level(int irq) "NVIC set pending: irq %d higher prio than vectpending: setting irq line to 1"
-nvic_acknowledge_irq(int irq, int prio) "NVIC acknowledge IRQ: %d now active (prio %d)"
-nvic_complete_irq(int irq) "NVIC complete IRQ %d"
+nvic_acknowledge_irq(int irq, int prio, bool targets_secure) "NVIC acknowledge IRQ: %d now active (prio %d targets_secure %d)"