input: Add trace event for empty keyboard queue

When driving QEMU from the outside, we have basically no chance to determine how quickly the guest OS picks up key events, so we usually have to limit ourselves to very slow keyboard presses to make sure the guest always has enough chance to pick them up. This patch adds a trace events when the keyboarde queue is drained. An external driver can use that as hint that new keys can be pressed. Signed-off-by: Alexander Graf <agraf@suse.de> Message-id: 1490883775-94658-1-git-send-email-agraf@suse.de Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
input: don't queue delay if paused
2017-05-03 14:20:12 +02:00 · 2017-05-03 14:19:40 +02:00 · 2017-05-03 14:18:21 +02:00 · 2017-05-02 15:16:29 +01:00 · 2017-04-29 18:44:16 +02:00 · 2017-04-29 18:44:16 +02:00
1347 changed files with 50898 additions and 20799 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -99,14 +99,15 @@
 /pc-bios/optionrom/kvmvapic.img
 /pc-bios/s390-ccw/s390-ccw.elf
 /pc-bios/s390-ccw/s390-ccw.img
+/docs/qemu-ga-qapi.texi
 /docs/qemu-ga-ref.html
+/docs/qemu-ga-ref.info*
 /docs/qemu-ga-ref.txt
+/docs/qemu-qmp-qapi.texi
 /docs/qemu-qmp-ref.html
+/docs/qemu-qmp-ref.info*
 /docs/qemu-qmp-ref.txt
-docs/qemu-ga-ref.info*
-docs/qemu-qmp-ref.info*
-/qemu-ga-qapi.texi
-/qemu-qapi.texi
+/docs/version.texi
 *.tps
 .stgit-*
 cscope.*
--- a/.shippable.yml
+++ b/.shippable.yml
@@ -0,0 +1,21 @@
+language: c
+env:
+  matrix:
+    - IMAGE=debian-armhf-cross
+      TARGET_LIST=arm-softmmu,arm-linux-user
+    - IMAGE=debian-arm64-cross
+      TARGET_LIST=aarch64-softmmu,aarch64-linux-user
+    - IMAGE=debian-s390x-cross
+      TARGET_LIST=s390x-softmmu,s390x-linux-user
+build:
+  pre_ci:
+    - make docker-image-${IMAGE}
+  pre_ci_boot:
+    image_name: qemu
+    image_tag: ${IMAGE}
+    pull: false
+    options: "-e HOME=/root"
+  ci:
+    - unset CC
+    - ./configure ${QEMU_CONFIGURE_OPTS} --target-list=${TARGET_LIST}
+    - make -j2
--- a/.travis.yml
+++ b/.travis.yml
@@ -92,8 +92,8 @@ matrix:
    - env: CONFIG=""
      os: osx
      compiler: clang
-    # Plain Trusty Build
-    - env: CONFIG=""
+    # Plain Trusty System Build
+    - env: CONFIG="--disable-linux-user"
      sudo: required
      addons:
      dist: trusty
@@ -103,16 +103,45 @@ matrix:
        - sudo apt-get build-dep -qq qemu
        - wget -O - http://people.linaro.org/~alex.bennee/qemu-submodule-git-seed.tar.xz | tar -xvJ
        - git submodule update --init --recursive
-    # Trusty build with latest stable clang
-    - env: CONFIG=""
+    # Plain Trusty Linux User Build
+    - env: CONFIG="--disable-system"
      sudo: required
+      addons:
+      dist: trusty
+      compiler: gcc
+      before_install:
+        - sudo apt-get update -qq
+        - sudo apt-get build-dep -qq qemu
+        - wget -O - http://people.linaro.org/~alex.bennee/qemu-submodule-git-seed.tar.xz | tar -xvJ
+        - git submodule update --init --recursive
+    # Trusty System build with latest stable clang
+    - sudo: required
      addons:
      dist: trusty
      language: generic
      compiler: none
      env:
        - COMPILER_NAME=clang CXX=clang++-3.9 CC=clang-3.9
-        - CONFIG="--cc=clang-3.9 --cxx=clang++-3.9"
+        - CONFIG="--disable-linux-user --cc=clang-3.9 --cxx=clang++-3.9"
+      before_install:
+        - wget -nv -O - http://llvm.org/apt/llvm-snapshot.gpg.key | sudo apt-key add -
+        - sudo apt-add-repository -y 'deb http://llvm.org/apt/trusty llvm-toolchain-trusty-3.9 main'
+        - sudo apt-get update -qq
+        - sudo apt-get install -qq -y clang-3.9
+        - sudo apt-get build-dep -qq qemu
+        - wget -O - http://people.linaro.org/~alex.bennee/qemu-submodule-git-seed.tar.xz | tar -xvJ
+        - git submodule update --init --recursive
+      before_script:
+        - ./configure ${CONFIG} || cat config.log
+    # Trusty Linux User build with latest stable clang
+    - sudo: required
+      addons:
+      dist: trusty
+      language: generic
+      compiler: none
+      env:
+        - COMPILER_NAME=clang CXX=clang++-3.9 CC=clang-3.9
+        - CONFIG="--disable-system --cc=clang-3.9 --cxx=clang++-3.9"
      before_install:
        - wget -nv -O - http://llvm.org/apt/llvm-snapshot.gpg.key | sudo apt-key add -
        - sudo apt-add-repository -y 'deb http://llvm.org/apt/trusty llvm-toolchain-trusty-3.9 main'
--- a/7
+++ b/7
@@ -116,3 +116,10 @@ if (a == 1) {
 Rationale: Yoda conditions (as in 'if (1 == a)') are awkward to read.
 Besides, good compilers already warn users when '==' is mis-typed as '=',
 even when the constant is on the right.
+
+7. Comment style
+
+We use traditional C-style /* */ comments and avoid // comments.
+
+Rationale: The // form is valid in C99, so this is purely a matter of
+consistency of style. The checkpatch script will warn you about this.
--- a/85
+++ b/85
@@ -327,6 +327,7 @@ L: xen-devel@lists.xenproject.org
 S: Supported
 F: xen-*
 F: */xen*
+F: hw/9pfs/xen-9p-backend.c
 F: hw/char/xen_console.c
 F: hw/display/xenfb.c
 F: hw/net/xen_nic.c
@@ -561,20 +562,19 @@ F: hw/lm32/milkymist.c
 M68K Machines
 -------------
 an5206
-S: Orphan
+M: Thomas Huth <huth@tuxfamily.org>
+S: Odd Fixes
 F: hw/m68k/an5206.c
 F: hw/m68k/mcf5206.c

-dummy_m68k
-S: Orphan
-F: hw/m68k/dummy_m68k.c
-
 mcf5208
-S: Orphan
+M: Thomas Huth <huth@tuxfamily.org>
+S: Odd Fixes
 F: hw/m68k/mcf5208.c
 F: hw/m68k/mcf_intc.c
 F: hw/char/mcf_uart.c
 F: hw/net/mcf_fec.c
+F: include/hw/m68k/mcf*.h

 MicroBlaze Machines
 -------------------
@@ -601,15 +601,28 @@ S: Maintained
 F: hw/mips/mips_malta.c

 Mipssim
-L: qemu-devel@nongnu.org
-S: Orphan
+M: Yongbok Kim <yongbok.kim@imgtec.com>
+S: Odd Fixes
 F: hw/mips/mips_mipssim.c
+F: hw/net/mipsnet.c

 R4000
 M: Aurelien Jarno <aurelien@aurel32.net>
 S: Maintained
 F: hw/mips/mips_r4k.c

+Fulong 2E
+M: Yongbok Kim <yongbok.kim@imgtec.com>
+S: Odd Fixes
+F: hw/mips/mips_fulong2e.c
+
+Boston
+M: Paul Burton <paul.burton@imgtec.com>
+S: Maintained
+F: hw/core/loader-fit.c
+F: hw/mips/boston.c
+F: hw/pci-host/xilinx-pcie.c
+
 OpenRISC Machines
 -----------------
 or1k-sim
@@ -633,7 +646,6 @@ F: hw/ppc/ppc440_bamboo.c

 e500
 M: Alexander Graf <agraf@suse.de>
-M: Scott Wood <scottwood@freescale.com>
 L: qemu-ppc@nongnu.org
 S: Supported
 F: hw/ppc/e500.[hc]
@@ -644,7 +656,6 @@ F: pc-bios/u-boot.e500

 mpc8544ds
 M: Alexander Graf <agraf@suse.de>
-M: Scott Wood <scottwood@freescale.com>
 L: qemu-ppc@nongnu.org
 S: Supported
 F: hw/ppc/mpc8544ds.c
@@ -909,6 +920,8 @@ F: hw/acpi/*
 F: hw/smbios/*
 F: hw/i386/acpi-build.[hc]
 F: hw/arm/virt-acpi-build.c
+F: tests/bios-tables-test.c
+F: tests/acpi-utils.[hc]

 ppc4xx
 M: Alexander Graf <agraf@suse.de>
@@ -919,7 +932,6 @@ F: include/hw/ppc/ppc4xx.h

 ppce500
 M: Alexander Graf <agraf@suse.de>
-M: Scott Wood <scottwood@freescale.com>
 L: qemu-ppc@nongnu.org
 S: Supported
 F: hw/ppc/e500*
@@ -1034,7 +1046,7 @@ F: hw/input/virtio-input*.c
 F: include/hw/virtio/virtio-input.h

 virtio-serial
-M: Amit Shah <amit.shah@redhat.com>
+M: Amit Shah <amit@kernel.org>
 S: Supported
 F: hw/char/virtio-serial-bus.c
 F: hw/char/virtio-console.c
@@ -1043,7 +1055,7 @@ F: tests/virtio-console-test.c
 F: tests/virtio-serial-test.c

 virtio-rng
-M: Amit Shah <amit.shah@redhat.com>
+M: Amit Shah <amit@kernel.org>
 S: Supported
 F: hw/virtio/virtio-rng.c
 F: include/hw/virtio/virtio-rng.h
@@ -1123,6 +1135,15 @@ F: hw/nvram/chrp_nvram.c
 F: include/hw/nvram/chrp_nvram.h
 F: tests/prom-env-test.c

+VM Generation ID
+M: Ben Warren <ben@skyportsystems.com>
+S: Maintained
+F: hw/acpi/vmgenid.c
+F: include/hw/acpi/vmgenid.h
+F: docs/specs/vmgenid.txt
+F: tests/vmgenid-test.c
+F: stubs/vmgenid.c
+
 Subsystems
 ----------
 Audio
@@ -1208,6 +1229,15 @@ M: Samuel Thibault <samuel.thibault@ens-lyon.org>
 S: Maintained
 F: backends/baum.c

+Command line option argument parsing
+M: Markus Armbruster <armbru@redhat.com>
+S: Supported
+F: include/qemu/option.h
+F: tests/test-keyval.c
+F: tests/test-qemu-opts.c
+F: util/keyval.c
+F: util/qemu-option.c
+
 Coverity model
 M: Markus Armbruster <armbru@redhat.com>
 S: Supported
@@ -1342,7 +1372,9 @@ X: include/qapi/qmp/
 F: include/qapi/qmp/dispatch.h
 F: tests/qapi-schema/
 F: tests/test-*-visitor.c
+F: tests/test-qapi-*.c
 F: tests/test-qmp-*.c
+F: tests/test-visitor-serialization.c
 F: scripts/qapi*
 F: docs/qapi*
 T: git git://repo.or.cz/qemu/armbru.git qapi-next
@@ -1394,6 +1426,7 @@ F: qmp.c
 F: monitor.c
 F: docs/*qmp-*
 F: scripts/qmp/
+F: tests/qmp-test.c
 T: git git://repo.or.cz/qemu/armbru.git qapi-next

 Register API
@@ -1431,7 +1464,6 @@ F: scripts/checkpatch.pl

 Migration
 M: Juan Quintela <quintela@redhat.com>
-M: Amit Shah <amit.shah@redhat.com>
 M: Dr. David Alan Gilbert <dgilbert@redhat.com>
 S: Maintained
 F: include/migration/
@@ -1664,14 +1696,6 @@ S: Supported
 F: block/ssh.c
 T: git git://github.com/codyprime/qemu-kvm-jtc.git block

-ARCHIPELAGO
-M: Chrysostomos Nanakos <chris@include.gr>
-M: Jeff Cody <jcody@redhat.com>
-L: qemu-block@nongnu.org
-S: Maintained
-F: block/archipelago.c
-T: git git://github.com/codyprime/qemu-kvm-jtc.git block
-
 CURL
 M: Jeff Cody <jcody@redhat.com>
 L: qemu-block@nongnu.org
@@ -1791,8 +1815,8 @@ S: Supported
 F: tests/image-fuzzer/

 Replication
-M: Wen Congyang <wency@cn.fujitsu.com>
-M: Changlong Xie <xiecl.fnst@cn.fujitsu.com>
+M: Wen Congyang <wencongyang2@huawei.com>
+M: Xie Changlong <xiechanglong.d@gmail.com>
 S: Supported
 F: replication*
 F: block/replication.c
@@ -1802,9 +1826,14 @@ F: docs/block-replication.txt
 Build and test automation
 -------------------------
 M: Alex Bennée <alex.bennee@linaro.org>
+M: Fam Zheng <famz@redhat.com>
 L: qemu-devel@nongnu.org
-S: Supported
+S: Maintained
 F: .travis.yml
+F: .shippable.yml
+F: tests/docker/
+W: https://travis-ci.org/qemu/qemu
+W: http://patchew.org/QEMU/

 Documentation
 -------------
@@ -1813,9 +1842,3 @@ M: Daniel P. Berrange <berrange@redhat.com>
 S: Odd Fixes
 F: docs/build-system.txt

-Docker testing
--------------
-Docker based testing framework and cases
-M: Fam Zheng <famz@redhat.com>
-S: Maintained
-F: tests/docker/
--- a/80
+++ b/80
@@ -26,6 +26,7 @@ endif

 CONFIG_SOFTMMU := $(if $(filter %-softmmu,$(TARGET_DIRS)),y)
 CONFIG_USER_ONLY := $(if $(filter %-user,$(TARGET_DIRS)),y)
+CONFIG_XEN := $(CONFIG_XEN_BACKEND)
 CONFIG_ALL=y
 -include config-all-devices.mak
 -include config-all-disas.mak
@@ -50,24 +51,24 @@ endif

 include $(SRC_PATH)/rules.mak

-GENERATED_HEADERS = qemu-version.h config-host.h qemu-options.def
-GENERATED_HEADERS += qmp-commands.h qapi-types.h qapi-visit.h qapi-event.h
-GENERATED_SOURCES += qmp-marshal.c qapi-types.c qapi-visit.c qapi-event.c
-GENERATED_HEADERS += qmp-introspect.h
-GENERATED_SOURCES += qmp-introspect.c
+GENERATED_FILES = qemu-version.h config-host.h qemu-options.def
+GENERATED_FILES += qmp-commands.h qapi-types.h qapi-visit.h qapi-event.h
+GENERATED_FILES += qmp-marshal.c qapi-types.c qapi-visit.c qapi-event.c
+GENERATED_FILES += qmp-introspect.h
+GENERATED_FILES += qmp-introspect.c

-GENERATED_HEADERS += trace/generated-tcg-tracers.h
+GENERATED_FILES += trace/generated-tcg-tracers.h

-GENERATED_HEADERS += trace/generated-helpers-wrappers.h
-GENERATED_HEADERS += trace/generated-helpers.h
-GENERATED_SOURCES += trace/generated-helpers.c
+GENERATED_FILES += trace/generated-helpers-wrappers.h
+GENERATED_FILES += trace/generated-helpers.h
+GENERATED_FILES += trace/generated-helpers.c

 ifdef CONFIG_TRACE_UST
-GENERATED_HEADERS += trace-ust-all.h
-GENERATED_SOURCES += trace-ust-all.c
+GENERATED_FILES += trace-ust-all.h
+GENERATED_FILES += trace-ust-all.c
 endif

-GENERATED_HEADERS += module_block.h
+GENERATED_FILES += module_block.h

 TRACE_HEADERS = trace-root.h $(trace-events-subdirs:%=%/trace.h)
 TRACE_SOURCES = trace-root.c $(trace-events-subdirs:%=%/trace.c)
@@ -80,11 +81,15 @@ ifdef CONFIG_TRACE_UST
 TRACE_HEADERS += trace-ust-root.h $(trace-events-subdirs:%=%/trace-ust.h)
 endif

-GENERATED_HEADERS += $(TRACE_HEADERS)
-GENERATED_SOURCES += $(TRACE_SOURCES)
+GENERATED_FILES += $(TRACE_HEADERS)
+GENERATED_FILES += $(TRACE_SOURCES)
+GENERATED_FILES += $(BUILD_DIR)/trace-events-all

 trace-group-name = $(shell dirname $1 | sed -e 's/[^a-zA-Z0-9]/_/g')

+tracetool-y = $(SRC_PATH)/scripts/tracetool.py
+tracetool-y += $(shell find $(SRC_PATH)/scripts/tracetool -name "*.py")
+
 %/trace.h: %/trace.h-timestamp
 	@cmp $< $@ >/dev/null 2>&1 || cp $< $@
 %/trace.h-timestamp: $(SRC_PATH)/%/trace-events $(tracetool-y)
@@ -299,7 +304,11 @@ qemu-version.h: FORCE
 				printf '""\n'; \
 			fi; \
 		fi) > $@.tmp)
-	$(call quiet-command, cmp -s $@ $@.tmp || mv $@.tmp $@)
+	$(call quiet-command, if ! cmp -s $@ $@.tmp; then \
+	  mv $@.tmp $@; \
+	 else \
+	  rm $@.tmp; \
+	 fi)

 config-host.h: config-host.h-timestamp
 config-host.h-timestamp: config-host.mak
@@ -337,7 +346,7 @@ dtc/%:
 	mkdir -p $@

 $(SUBDIR_RULES): libqemuutil.a libqemustub.a $(common-obj-y) $(chardev-obj-y) \
-	$(qom-obj-y) $(crypto-aes-obj-$(CONFIG_USER_ONLY)) $(trace-obj-y)
+	$(qom-obj-y) $(crypto-aes-obj-$(CONFIG_USER_ONLY))

 ROMSUBDIR_RULES=$(patsubst %,romsubdir-%, $(ROMS))
 # Only keep -O and -g cflags
@@ -357,11 +366,11 @@ Makefile: $(version-obj-y)
 # Build libraries

 libqemustub.a: $(stub-obj-y)
-libqemuutil.a: $(util-obj-y)
+libqemuutil.a: $(util-obj-y) $(trace-obj-y)

 ######################################################################

-COMMON_LDADDS = $(trace-obj-y) libqemuutil.a libqemustub.a
+COMMON_LDADDS = libqemuutil.a libqemustub.a

 qemu-img.o: qemu-img-cmds.h

@@ -383,7 +392,6 @@ qemu-ga$(EXESUF): QEMU_CFLAGS += -I qga/qapi-generated
 gen-out-type = $(subst .,-,$(suffix $@))

 qapi-py = $(SRC_PATH)/scripts/qapi.py $(SRC_PATH)/scripts/ordereddict.py
-qapi-py += $(SRC_PATH)/scripts/qapi2texi.py

 qga/qapi-generated/qga-qapi-types.c qga/qapi-generated/qga-qapi-types.h :\
 $(SRC_PATH)/qga/qapi-schema.json $(SRC_PATH)/scripts/qapi-types.py $(qapi-py)
@@ -481,11 +489,10 @@ clean:
 	rm -f fsdev/*.pod
 	rm -f qemu-img-cmds.h
 	rm -f ui/shader/*-vert.h ui/shader/*-frag.h
-	@# May not be present in GENERATED_HEADERS
+	@# May not be present in GENERATED_FILES
 	rm -f trace/generated-tracers-dtrace.dtrace*
 	rm -f trace/generated-tracers-dtrace.h*
-	rm -f $(foreach f,$(GENERATED_HEADERS),$(f) $(f)-timestamp)
-	rm -f $(foreach f,$(GENERATED_SOURCES),$(f) $(f)-timestamp)
+	rm -f $(foreach f,$(GENERATED_FILES),$(f) $(f)-timestamp)
 	rm -rf qapi-generated
 	rm -rf qga/qapi-generated
 	for d in $(ALL_SUBDIRS); do \
@@ -512,7 +519,7 @@ distclean: clean
 	rm -f qemu-doc.vr qemu-doc.txt
 	rm -f config.log
 	rm -f linux-headers/asm
-	rm -f qemu-ga-qapi.texi qemu-qapi.texi
+	rm -f docs/qemu-ga-qapi.texi docs/qemu-qmp-qapi.texi docs/version.texi
 	rm -f docs/qemu-qmp-ref.7 docs/qemu-ga-ref.7
 	rm -f docs/qemu-qmp-ref.txt docs/qemu-ga-ref.txt
 	rm -f docs/qemu-qmp-ref.pdf docs/qemu-ga-ref.pdf
@@ -589,8 +596,7 @@ endif
 endif


-install: all $(if $(BUILD_DOCS),install-doc) \
-install-datadir install-localstatedir
+install: all $(if $(BUILD_DOCS),install-doc) install-datadir install-localstatedir
 ifneq ($(TOOLS),)
 	$(call install-prog,$(subst qemu-ga,qemu-ga$(EXESUF),$(TOOLS)),$(DESTDIR)$(bindir))
 endif
@@ -659,8 +665,11 @@ ui/console-gl.o: $(SRC_PATH)/ui/console-gl.c \

 # documentation
 MAKEINFO=makeinfo
-MAKEINFOFLAGS=--no-split --number-sections -D 'VERSION $(VERSION)'
-TEXIFLAG=$(if $(V),,--quiet) --command='@set VERSION $(VERSION)'
+MAKEINFOFLAGS=--no-split --number-sections -I docs
+TEXIFLAG=$(if $(V),,--quiet)
+
+docs/version.texi: $(SRC_PATH)/VERSION
+	$(call quiet-command,echo "@set VERSION $(VERSION)" > $@,"GEN","$@")

 %.html: %.texi
 	$(call quiet-command,LC_ALL=C $(MAKEINFO) $(MAKEINFOFLAGS) --no-headers \
@@ -674,7 +683,10 @@ TEXIFLAG=$(if $(V),,--quiet) --command='@set VERSION $(VERSION)'
 	--plaintext $< -o $@,"GEN","$@")

 %.pdf: %.texi
-	$(call quiet-command,texi2pdf $(TEXIFLAG) -I $(SRC_PATH) -I . $< -o $@,"GEN","$@")
+	$(call quiet-command,texi2pdf $(TEXIFLAG) -I $(SRC_PATH) -I docs $< -o $@,"GEN","$@")
+
+docs/qemu-ga-ref.html docs/qemu-ga-ref.info docs/qemu-ga-ref.txt docs/qemu-ga-ref.pdf docs/qemu-ga-ref.7.pod: docs/version.texi
+docs/qemu-qmp-ref.html docs/qemu-qmp-ref.info docs/qemu-qmp-ref.txt docs/qemu-qmp-ref.pdf docs/qemu-qmp-ref.pod: docs/version.texi

 qemu-options.texi: $(SRC_PATH)/qemu-options.hx $(SRC_PATH)/scripts/hxtool
 	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -t < $< > $@,"GEN","$@")
@@ -688,10 +700,12 @@ qemu-monitor-info.texi: $(SRC_PATH)/hmp-commands-info.hx $(SRC_PATH)/scripts/hxt
 qemu-img-cmds.texi: $(SRC_PATH)/qemu-img-cmds.hx $(SRC_PATH)/scripts/hxtool
 	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -t < $< > $@,"GEN","$@")

-qemu-qapi.texi: $(qapi-modules) $(qapi-py)
+docs/qemu-qmp-qapi.texi docs/qemu-ga-qapi.texi: $(SRC_PATH)/scripts/qapi2texi.py $(qapi-py)
+
+docs/qemu-qmp-qapi.texi: $(qapi-modules)
 	$(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi2texi.py $< > $@,"GEN","$@")

-qemu-ga-qapi.texi: $(SRC_PATH)/qga/qapi-schema.json $(qapi-py)
+docs/qemu-ga-qapi.texi: $(SRC_PATH)/qga/qapi-schema.json
 	$(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi2texi.py $< > $@,"GEN","$@")

 qemu.1: qemu-doc.texi qemu-options.texi qemu-monitor.texi qemu-monitor-info.texi
@@ -712,10 +726,10 @@ qemu-doc.html qemu-doc.info qemu-doc.pdf qemu-doc.txt: \
 	qemu-monitor-info.texi

 docs/qemu-ga-ref.dvi docs/qemu-ga-ref.html docs/qemu-ga-ref.info docs/qemu-ga-ref.pdf docs/qemu-ga-ref.txt docs/qemu-ga-ref.7: \
-docs/qemu-ga-ref.texi qemu-ga-qapi.texi
+docs/qemu-ga-ref.texi docs/qemu-ga-qapi.texi

 docs/qemu-qmp-ref.dvi docs/qemu-qmp-ref.html docs/qemu-qmp-ref.info docs/qemu-qmp-ref.pdf docs/qemu-qmp-ref.txt docs/qemu-qmp-ref.7: \
-docs/qemu-qmp-ref.texi qemu-qapi.texi
+docs/qemu-qmp-ref.texi docs/qemu-qmp-qapi.texi


 ifdef CONFIG_WIN32
@@ -777,7 +791,7 @@ endif # CONFIG_WIN
 # Add a dependency on the generated files, so that they are always
 # rebuilt before other object files
 ifneq ($(filter-out $(UNCHECKED_GOALS),$(MAKECMDGOALS)),$(if $(MAKECMDGOALS),,fail))
-Makefile: $(GENERATED_HEADERS)
+Makefile: $(GENERATED_FILES)
 endif

 .SECONDARY: $(TRACE_HEADERS) $(TRACE_HEADERS:%=%-timestamp) \
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -9,12 +9,8 @@ chardev-obj-y = chardev/
 #######################################################################
 # block-obj-y is code used by both qemu system emulation and qemu-img

-block-obj-y = async.o thread-pool.o
 block-obj-y += nbd/
 block-obj-y += block.o blockjob.o
-block-obj-y += main-loop.o iohandler.o qemu-timer.o
-block-obj-$(CONFIG_POSIX) += aio-posix.o
-block-obj-$(CONFIG_WIN32) += aio-win32.o
 block-obj-y += block/
 block-obj-y += qemu-io-cmds.o
 block-obj-$(CONFIG_REPLICATION) += replication.o
@@ -125,6 +121,7 @@ trace-events-subdirs += crypto
 trace-events-subdirs += io
 trace-events-subdirs += migration
 trace-events-subdirs += block
+trace-events-subdirs += backends
 trace-events-subdirs += hw/block
 trace-events-subdirs += hw/block/dataplane
 trace-events-subdirs += hw/char
@@ -160,6 +157,7 @@ trace-events-subdirs += audio
 trace-events-subdirs += net
 trace-events-subdirs += target/arm
 trace-events-subdirs += target/i386
+trace-events-subdirs += target/mips
 trace-events-subdirs += target/sparc
 trace-events-subdirs += target/s390x
 trace-events-subdirs += target/ppc
--- a/Makefile.target
+++ b/Makefile.target
@@ -149,12 +149,6 @@ obj-y += dump.o
 obj-y += migration/ram.o migration/savevm.o
 LIBS := $(libs_softmmu) $(LIBS)

-# xen support
-obj-$(CONFIG_XEN) += xen-common.o
-obj-$(CONFIG_XEN_I386) += xen-hvm.o xen-mapcache.o
-obj-$(call lnot,$(CONFIG_XEN)) += xen-common-stub.o
-obj-$(call lnot,$(CONFIG_XEN_I386)) += xen-hvm-stub.o
-
 # Hardware support
 ifeq ($(TARGET_NAME), sparc64)
 obj-y += hw/sparc64/
@@ -162,7 +156,7 @@ else
 obj-y += hw/$(TARGET_BASE_ARCH)/
 endif

-GENERATED_HEADERS += hmp-commands.h hmp-commands-info.h
+GENERATED_FILES += hmp-commands.h hmp-commands-info.h

 endif # CONFIG_SOFTMMU

@@ -188,8 +182,7 @@ dummy := $(call unnest-vars,.., \
               qom-obj-y \
               io-obj-y \
               common-obj-y \
-               common-obj-m \
-               trace-obj-y)
+               common-obj-m)
 target-obj-y := $(target-obj-y-save)
 all-obj-y += $(common-obj-y)
 all-obj-y += $(target-obj-y)
@@ -201,7 +194,7 @@ all-obj-$(CONFIG_SOFTMMU) += $(io-obj-y)

 $(QEMU_PROG_BUILD): config-devices.mak

-COMMON_LDADDS = $(trace-obj-y) ../libqemuutil.a ../libqemustub.a
+COMMON_LDADDS = ../libqemuutil.a ../libqemustub.a

 # build either PROG or PROGW
 $(QEMU_PROG_BUILD): $(all-obj-y) $(COMMON_LDADDS)
@@ -238,5 +231,5 @@ ifdef CONFIG_TRACE_SYSTEMTAP
 	$(INSTALL_DATA) $(QEMU_PROG)-simpletrace.stp "$(DESTDIR)$(qemu_datadir)/../systemtap/tapset/$(QEMU_PROG)-simpletrace.stp"
 endif

-GENERATED_HEADERS += config-target.h
-Makefile: $(GENERATED_HEADERS)
+GENERATED_FILES += config-target.h
+Makefile: $(GENERATED_FILES)
--- a/2
+++ b/2
@@ -1 +1 @@
-2.8.50
+2.9.50
--- a/audio/audio.c
+++ b/audio/audio.c
@@ -28,6 +28,7 @@
 #include "qemu/timer.h"
 #include "sysemu/sysemu.h"
 #include "qemu/cutils.h"
+#include "sysemu/replay.h"

 #define AUDIO_CAP "audio"
 #include "audio_int.h"
@@ -1112,7 +1113,7 @@ static int audio_is_timer_needed (void)
 static void audio_reset_timer (AudioState *s)
 {
    if (audio_is_timer_needed ()) {
-        timer_mod (s->ts,
+        timer_mod_anticipate_ns(s->ts,
            qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + conf.period.ticks);
    }
    else {
@@ -1387,6 +1388,7 @@ static void audio_run_out (AudioState *s)

        prev_rpos = hw->rpos;
        played = hw->pcm_ops->run_out (hw, live);
+        replay_audio_out(&played);
        if (audio_bug (AUDIO_FUNC, hw->rpos >= hw->samples)) {
            dolog ("hw->rpos=%d hw->samples=%d played=%d\n",
                   hw->rpos, hw->samples, played);
@@ -1450,9 +1452,12 @@ static void audio_run_in (AudioState *s)

    while ((hw = audio_pcm_hw_find_any_enabled_in (hw))) {
        SWVoiceIn *sw;
-        int captured, min;
+        int captured = 0, min;

-        captured = hw->pcm_ops->run_in (hw);
+        if (replay_mode != REPLAY_MODE_PLAY) {
+            captured = hw->pcm_ops->run_in(hw);
+        }
+        replay_audio_in(&captured, hw->conv_buf, &hw->wpos, hw->samples);

        min = audio_pcm_hw_find_min_in (hw);
        hw->total_samples_captured += captured - min;
--- a/audio/audio.h
+++ b/audio/audio.h
@@ -166,4 +166,9 @@ int wav_start_capture (CaptureState *s, const char *path, int freq,
 bool audio_is_cleaning_up(void);
 void audio_cleanup(void);

+void audio_sample_to_uint64(void *samples, int pos,
+                            uint64_t *left, uint64_t *right);
+void audio_sample_from_uint64(void *samples, int pos,
+                            uint64_t left, uint64_t right);
+
 #endif /* QEMU_AUDIO_H */
--- a/audio/mixeng.c
+++ b/audio/mixeng.c
@@ -25,6 +25,7 @@
 #include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "qemu/bswap.h"
+#include "qemu/error-report.h"
 #include "audio.h"

 #define AUDIO_CAP "mixeng"
@@ -267,6 +268,37 @@ f_sample *mixeng_clip[2][2][2][3] = {
    }
 };

+
+void audio_sample_to_uint64(void *samples, int pos,
+                            uint64_t *left, uint64_t *right)
+{
+    struct st_sample *sample = samples;
+    sample += pos;
+#ifdef FLOAT_MIXENG
+    error_report(
+        "Coreaudio and floating point samples are not supported by replay yet");
+    abort();
+#else
+    *left = sample->l;
+    *right = sample->r;
+#endif
+}
+
+void audio_sample_from_uint64(void *samples, int pos,
+                            uint64_t left, uint64_t right)
+{
+    struct st_sample *sample = samples;
+    sample += pos;
+#ifdef FLOAT_MIXENG
+    error_report(
+        "Coreaudio and floating point samples are not supported by replay yet");
+    abort();
+#else
+    sample->l = left;
+    sample->r = right;
+#endif
+}
+
 /*
 * August 21, 1998
 * Copyright 1998 Fabrice Bellard.
--- a/audio/sdlaudio.c
+++ b/audio/sdlaudio.c
@@ -38,10 +38,14 @@
 #define AUDIO_CAP "sdl"
 #include "audio_int.h"

+#define USE_SEMAPHORE (SDL_MAJOR_VERSION < 2)
+
 typedef struct SDLVoiceOut {
    HWVoiceOut hw;
    int live;
+#if USE_SEMAPHORE
    int rpos;
+#endif
    int decr;
 } SDLVoiceOut;

@@ -53,8 +57,10 @@ static struct {

 static struct SDLAudioState {
    int exit;
+#if USE_SEMAPHORE
    SDL_mutex *mutex;
    SDL_sem *sem;
+#endif
    int initialized;
    bool driver_created;
 } glob_sdl;
@@ -73,31 +79,45 @@ static void GCC_FMT_ATTR (1, 2) sdl_logerr (const char *fmt, ...)

 static int sdl_lock (SDLAudioState *s, const char *forfn)
 {
+#if USE_SEMAPHORE
    if (SDL_LockMutex (s->mutex)) {
        sdl_logerr ("SDL_LockMutex for %s failed\n", forfn);
        return -1;
    }
+#else
+    SDL_LockAudio();
+#endif
+
    return 0;
 }

 static int sdl_unlock (SDLAudioState *s, const char *forfn)
 {
+#if USE_SEMAPHORE
    if (SDL_UnlockMutex (s->mutex)) {
        sdl_logerr ("SDL_UnlockMutex for %s failed\n", forfn);
        return -1;
    }
+#else
+    SDL_UnlockAudio();
+#endif
+
    return 0;
 }

 static int sdl_post (SDLAudioState *s, const char *forfn)
 {
+#if USE_SEMAPHORE
    if (SDL_SemPost (s->sem)) {
        sdl_logerr ("SDL_SemPost for %s failed\n", forfn);
        return -1;
    }
+#endif
+
    return 0;
 }

+#if USE_SEMAPHORE
 static int sdl_wait (SDLAudioState *s, const char *forfn)
 {
    if (SDL_SemWait (s->sem)) {
@@ -106,6 +126,7 @@ static int sdl_wait (SDLAudioState *s, const char *forfn)
    }
    return 0;
 }
+#endif

 static int sdl_unlock_and_post (SDLAudioState *s, const char *forfn)
 {
@@ -246,6 +267,7 @@ static void sdl_callback (void *opaque, Uint8 *buf, int len)
        int to_mix, decr;

        /* dolog ("in callback samples=%d\n", samples); */
+#if USE_SEMAPHORE
        sdl_wait (s, "sdl_callback");
        if (s->exit) {
            return;
@@ -264,6 +286,11 @@ static void sdl_callback (void *opaque, Uint8 *buf, int len)
        if (!sdl->live) {
            goto again;
        }
+#else
+        if (s->exit || !sdl->live) {
+            break;
+        }
+#endif

        /* dolog ("in callback live=%d\n", live); */
        to_mix = audio_MIN (samples, sdl->live);
@@ -274,7 +301,11 @@ static void sdl_callback (void *opaque, Uint8 *buf, int len)

            /* dolog ("in callback to_mix %d, chunk %d\n", to_mix, chunk); */
            hw->clip (buf, src, chunk);
+#if USE_SEMAPHORE
            sdl->rpos = (sdl->rpos + chunk) % hw->samples;
+#else
+            hw->rpos = (hw->rpos + chunk) % hw->samples;
+#endif
            to_mix -= chunk;
            buf += chunk << hw->info.shift;
        }
@@ -282,12 +313,21 @@ static void sdl_callback (void *opaque, Uint8 *buf, int len)
        sdl->live -= decr;
        sdl->decr += decr;

+#if USE_SEMAPHORE
    again:
        if (sdl_unlock (s, "sdl_callback")) {
            return;
        }
+#endif
    }
    /* dolog ("done len=%d\n", len); */
+
+#if (SDL_MAJOR_VERSION >= 2)
+    /* SDL2 does not clear the remaining buffer for us, so do it on our own */
+    if (samples) {
+        memset(buf, 0, samples << hw->info.shift);
+    }
+#endif
 }

 static int sdl_write_out (SWVoiceOut *sw, void *buf, int len)
@@ -315,8 +355,12 @@ static int sdl_run_out (HWVoiceOut *hw, int live)
    decr = audio_MIN (sdl->decr, live);
    sdl->decr -= decr;

+#if USE_SEMAPHORE
    sdl->live = live - decr;
    hw->rpos = sdl->rpos;
+#else
+    sdl->live = live;
+#endif

    if (sdl->live > 0) {
        sdl_unlock_and_post (s, "sdl_run_out");
@@ -405,6 +449,7 @@ static void *sdl_audio_init (void)
        return NULL;
    }

+#if USE_SEMAPHORE
    s->mutex = SDL_CreateMutex ();
    if (!s->mutex) {
        sdl_logerr ("Failed to create SDL mutex\n");
@@ -419,6 +464,7 @@ static void *sdl_audio_init (void)
        SDL_QuitSubSystem (SDL_INIT_AUDIO);
        return NULL;
    }
+#endif

    s->driver_created = true;
    return s;
@@ -428,8 +474,10 @@ static void sdl_audio_fini (void *opaque)
 {
    SDLAudioState *s = opaque;
    sdl_close (s);
+#if USE_SEMAPHORE
    SDL_DestroySemaphore (s->sem);
    SDL_DestroyMutex (s->mutex);
+#endif
    SDL_QuitSubSystem (SDL_INIT_AUDIO);
    s->driver_created = false;
 }
--- a/backends/Makefile.objs
+++ b/backends/Makefile.objs
@@ -1,7 +1,7 @@
 common-obj-y += rng.o rng-egd.o
 common-obj-$(CONFIG_POSIX) += rng-random.o

-common-obj-y += msmouse.o testdev.o
+common-obj-y += msmouse.o wctablet.o testdev.o
 common-obj-$(CONFIG_BRLAPI) += baum.o
 baum.o-cflags := $(SDL_CFLAGS)

--- a/backends/cryptodev-builtin.c
+++ b/backends/cryptodev-builtin.c
@@ -320,11 +320,13 @@ static int cryptodev_builtin_sym_operation(

    sess = builtin->sessions[op_info->session_id];

+    if (op_info->iv_len > 0) {
        ret = qcrypto_cipher_setiv(sess->cipher, op_info->iv,
                                   op_info->iv_len, errp);
        if (ret < 0) {
            return -VIRTIO_CRYPTO_ERR;
        }
+    }

    if (sess->direction == VIRTIO_CRYPTO_OP_ENCRYPT) {
        ret = qcrypto_cipher_encrypt(sess->cipher, op_info->src,
@@ -359,8 +361,6 @@ static void cryptodev_builtin_cleanup(
        }
    }

-    assert(queues == 1);
-
    for (i = 0; i < queues; i++) {
        cc = backend->conf.peers.ccs[i];
        if (cc) {
--- a/backends/hostmem-file.c
+++ b/backends/hostmem-file.c
@@ -51,7 +51,7 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
 #ifndef CONFIG_LINUX
    error_setg(errp, "-mem-path not supported on this host");
 #else
-    if (!memory_region_size(&backend->mr)) {
+    if (!host_memory_backend_mr_inited(backend)) {
        gchar *path;
        backend->force_prealloc = mem_prealloc;
        path = object_get_canonical_path(OBJECT(backend));
@@ -76,7 +76,7 @@ static void set_mem_path(Object *o, const char *str, Error **errp)
    HostMemoryBackend *backend = MEMORY_BACKEND(o);
    HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o);

-    if (memory_region_size(&backend->mr)) {
+    if (host_memory_backend_mr_inited(backend)) {
        error_setg(errp, "cannot change property value");
        return;
    }
@@ -96,7 +96,7 @@ static void file_memory_backend_set_share(Object *o, bool value, Error **errp)
    HostMemoryBackend *backend = MEMORY_BACKEND(o);
    HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o);

-    if (memory_region_size(&backend->mr)) {
+    if (host_memory_backend_mr_inited(backend)) {
        error_setg(errp, "cannot change property value");
        return;
    }
--- a/backends/hostmem.c
+++ b/backends/hostmem.c
@@ -45,7 +45,7 @@ host_memory_backend_set_size(Object *obj, Visitor *v, const char *name,
    Error *local_err = NULL;
    uint64_t value;

-    if (memory_region_size(&backend->mr)) {
+    if (host_memory_backend_mr_inited(backend)) {
        error_setg(&local_err, "cannot change property value");
        goto out;
    }
@@ -64,14 +64,6 @@ out:
    error_propagate(errp, local_err);
 }

-static uint16List **host_memory_append_node(uint16List **node,
-                                            unsigned long value)
-{
-     *node = g_malloc0(sizeof(**node));
-     (*node)->value = value;
-     return &(*node)->next;
-}
-
 static void
 host_memory_backend_get_host_nodes(Object *obj, Visitor *v, const char *name,
                                   void *opaque, Error **errp)
@@ -82,23 +74,25 @@ host_memory_backend_get_host_nodes(Object *obj, Visitor *v, const char *name,
    unsigned long value;

    value = find_first_bit(backend->host_nodes, MAX_NODES);
-
-    node = host_memory_append_node(node, value);
-
    if (value == MAX_NODES) {
-        goto out;
+        return;
    }

+    *node = g_malloc0(sizeof(**node));
+    (*node)->value = value;
+    node = &(*node)->next;
+
    do {
        value = find_next_bit(backend->host_nodes, MAX_NODES, value + 1);
        if (value == MAX_NODES) {
            break;
        }

-        node = host_memory_append_node(node, value);
+        *node = g_malloc0(sizeof(**node));
+        (*node)->value = value;
+        node = &(*node)->next;
    } while (true);

-out:
    visit_type_uint16List(v, name, &host_nodes, errp);
 }

@@ -152,7 +146,7 @@ static void host_memory_backend_set_merge(Object *obj, bool value, Error **errp)
 {
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);

-    if (!memory_region_size(&backend->mr)) {
+    if (!host_memory_backend_mr_inited(backend)) {
        backend->merge = value;
        return;
    }
@@ -178,7 +172,7 @@ static void host_memory_backend_set_dump(Object *obj, bool value, Error **errp)
 {
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);

-    if (!memory_region_size(&backend->mr)) {
+    if (!host_memory_backend_mr_inited(backend)) {
        backend->dump = value;
        return;
    }
@@ -214,7 +208,7 @@ static void host_memory_backend_set_prealloc(Object *obj, bool value,
        }
    }

-    if (!memory_region_size(&backend->mr)) {
+    if (!host_memory_backend_mr_inited(backend)) {
        backend->prealloc = value;
        return;
    }
@@ -224,7 +218,7 @@ static void host_memory_backend_set_prealloc(Object *obj, bool value,
        void *ptr = memory_region_get_ram_ptr(&backend->mr);
        uint64_t sz = memory_region_size(&backend->mr);

-        os_mem_prealloc(fd, ptr, sz, &local_err);
+        os_mem_prealloc(fd, ptr, sz, smp_cpus, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
@@ -243,10 +237,19 @@ static void host_memory_backend_init(Object *obj)
    backend->prealloc = mem_prealloc;
 }

+bool host_memory_backend_mr_inited(HostMemoryBackend *backend)
+{
+    /*
+     * NOTE: We forbid zero-length memory backend, so here zero means
+     * "we haven't inited the backend memory region yet".
+     */
+    return memory_region_size(&backend->mr) != 0;
+}
+
 MemoryRegion *
 host_memory_backend_get_memory(HostMemoryBackend *backend, Error **errp)
 {
-    return memory_region_size(&backend->mr) ? &backend->mr : NULL;
+    return host_memory_backend_mr_inited(backend) ? &backend->mr : NULL;
 }

 void host_memory_backend_set_mapped(HostMemoryBackend *backend, bool mapped)
@@ -328,7 +331,7 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
         */
        if (backend->prealloc) {
            os_mem_prealloc(memory_region_get_fd(&backend->mr), ptr, sz,
-                            &local_err);
+                            smp_cpus, &local_err);
            if (local_err) {
                goto out;
            }
--- a/backends/trace-events
+++ b/backends/trace-events
@@ -0,0 +1,10 @@
+# See docs/tracing.txt for syntax documentation.
+
+# backends/wctablet.c
+wct_init(void) ""
+wct_cmd_re(void) ""
+wct_cmd_st(void) ""
+wct_cmd_sp(void) ""
+wct_cmd_ts(int input) "0x%02x"
+wct_cmd_other(const char *cmd) "%s"
+wct_speed(int speed) "%d"
--- a/backends/wctablet.c
+++ b/backends/wctablet.c
@@ -0,0 +1,369 @@
+/*
+ * QEMU Wacom Penpartner serial tablet emulation
+ *
+ * some protocol details:
+ *   http://linuxwacom.sourceforge.net/wiki/index.php/Serial_Protocol_IV
+ *
+ * Copyright (c) 2016 Anatoli Huseu1
+ * Copyright (c) 2016,17 Gerd Hoffmann
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <time.h>
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "sysemu/char.h"
+#include "ui/console.h"
+#include "ui/input.h"
+#include "trace.h"
+
+
+#define WC_OUTPUT_BUF_MAX_LEN 512
+#define WC_COMMAND_MAX_LEN 60
+
+#define WC_L7(n) ((n) & 127)
+#define WC_M7(n) (((n) >> 7) & 127)
+#define WC_H2(n) ((n) >> 14)
+
+#define WC_L4(n) ((n) & 15)
+#define WC_H4(n) (((n) >> 4) & 15)
+
+/* Model string and config string */
+#define WC_MODEL_STRING_LENGTH 18
+uint8_t WC_MODEL_STRING[WC_MODEL_STRING_LENGTH + 1] = "~#CT-0045R,V1.3-5,";
+
+#define WC_CONFIG_STRING_LENGTH 8
+uint8_t WC_CONFIG_STRING[WC_CONFIG_STRING_LENGTH + 1] = "96,N,8,0";
+
+#define WC_FULL_CONFIG_STRING_LENGTH 61
+uint8_t WC_FULL_CONFIG_STRING[WC_FULL_CONFIG_STRING_LENGTH + 1] = {
+    0x5c, 0x39, 0x36, 0x2c, 0x4e, 0x2c, 0x38, 0x2c,
+    0x31, 0x28, 0x01, 0x24, 0x57, 0x41, 0x43, 0x30,
+    0x30, 0x34, 0x35, 0x5c, 0x5c, 0x50, 0x45, 0x4e, 0x5c,
+    0x57, 0x41, 0x43, 0x30, 0x30, 0x30, 0x30, 0x5c,
+    0x54, 0x61, 0x62, 0x6c, 0x65, 0x74, 0x0d, 0x0a,
+    0x43, 0x54, 0x2d, 0x30, 0x30, 0x34, 0x35, 0x52,
+    0x2c, 0x56, 0x31, 0x2e, 0x33, 0x2d, 0x35, 0x0d,
+    0x0a, 0x45, 0x37, 0x29
+};
+
+/* This structure is used to save private info for Wacom Tablet. */
+typedef struct {
+    Chardev parent;
+    QemuInputHandlerState *hs;
+
+    /* Query string from serial */
+    uint8_t query[100];
+    int query_index;
+
+    /* Command to be sent to serial port */
+    uint8_t outbuf[WC_OUTPUT_BUF_MAX_LEN];
+    int outlen;
+
+    int line_speed;
+    bool send_events;
+    int axis[INPUT_AXIS__MAX];
+    bool btns[INPUT_BUTTON__MAX];
+
+} TabletChardev;
+
+#define TYPE_CHARDEV_WCTABLET "chardev-wctablet"
+#define WCTABLET_CHARDEV(obj)                                      \
+    OBJECT_CHECK(TabletChardev, (obj), TYPE_CHARDEV_WCTABLET)
+
+
+static void wctablet_chr_accept_input(Chardev *chr);
+
+static void wctablet_shift_input(TabletChardev *tablet, int count)
+{
+    tablet->query_index -= count;
+    memmove(tablet->query, tablet->query + count, tablet->query_index);
+    tablet->query[tablet->query_index] = 0;
+}
+
+static void wctablet_queue_output(TabletChardev *tablet, uint8_t *buf, int count)
+{
+    if (tablet->outlen + count > sizeof(tablet->outbuf)) {
+        return;
+    }
+
+    memcpy(tablet->outbuf + tablet->outlen, buf, count);
+    tablet->outlen += count;
+    wctablet_chr_accept_input(CHARDEV(tablet));
+}
+
+static void wctablet_reset(TabletChardev *tablet)
+{
+    /* clear buffers */
+    tablet->query_index = 0;
+    tablet->outlen = 0;
+    /* reset state */
+    tablet->send_events = false;
+}
+
+static void wctablet_queue_event(TabletChardev *tablet)
+{
+    uint8_t codes[8] = { 0xe0, 0, 0, 0, 0, 0, 0 };
+
+    if (tablet->line_speed != 9600) {
+        return;
+    }
+
+    int newX = tablet->axis[INPUT_AXIS_X] * 0.1537;
+    int nexY = tablet->axis[INPUT_AXIS_Y] * 0.1152;
+
+    codes[0] = codes[0] | WC_H2(newX);
+    codes[1] = codes[1] | WC_M7(newX);
+    codes[2] = codes[2] | WC_L7(newX);
+
+    codes[3] = codes[3] | WC_H2(nexY);
+    codes[4] = codes[4] | WC_M7(nexY);
+    codes[5] = codes[5] | WC_L7(nexY);
+
+    if (tablet->btns[INPUT_BUTTON_LEFT]) {
+        codes[0] = 0xa0;
+    }
+
+    wctablet_queue_output(tablet, codes, 7);
+}
+
+static void wctablet_input_event(DeviceState *dev, QemuConsole *src,
+                                InputEvent *evt)
+{
+    TabletChardev *tablet = (TabletChardev *)dev;
+    InputMoveEvent *move;
+    InputBtnEvent *btn;
+
+    switch (evt->type) {
+    case INPUT_EVENT_KIND_ABS:
+        move = evt->u.abs.data;
+        tablet->axis[move->axis] = move->value;
+        break;
+
+    case INPUT_EVENT_KIND_BTN:
+        btn = evt->u.btn.data;
+        tablet->btns[btn->button] = btn->down;
+        break;
+
+    default:
+        /* keep gcc happy */
+        break;
+    }
+}
+
+static void wctablet_input_sync(DeviceState *dev)
+{
+    TabletChardev *tablet = (TabletChardev *)dev;
+
+    if (tablet->send_events) {
+        wctablet_queue_event(tablet);
+    }
+}
+
+static QemuInputHandler wctablet_handler = {
+    .name  = "QEMU Wacome Pen Tablet",
+    .mask  = INPUT_EVENT_MASK_BTN | INPUT_EVENT_MASK_ABS,
+    .event = wctablet_input_event,
+    .sync  = wctablet_input_sync,
+};
+
+static void wctablet_chr_accept_input(Chardev *chr)
+{
+    TabletChardev *tablet = WCTABLET_CHARDEV(chr);
+    int len, canWrite;
+
+    canWrite = qemu_chr_be_can_write(chr);
+    len = canWrite;
+    if (len > tablet->outlen) {
+        len = tablet->outlen;
+    }
+
+    if (len) {
+        qemu_chr_be_write(chr, tablet->outbuf, len);
+        tablet->outlen -= len;
+        if (tablet->outlen) {
+            memmove(tablet->outbuf, tablet->outbuf + len, tablet->outlen);
+        }
+    }
+}
+
+static int wctablet_chr_write(struct Chardev *chr,
+                              const uint8_t *buf, int len)
+{
+    TabletChardev *tablet = WCTABLET_CHARDEV(chr);
+    unsigned int i, clen;
+    char *pos;
+
+    if (tablet->line_speed != 9600) {
+        return len;
+    }
+    for (i = 0; i < len && tablet->query_index < sizeof(tablet->query) - 1; i++) {
+        tablet->query[tablet->query_index++] = buf[i];
+    }
+    tablet->query[tablet->query_index] = 0;
+
+    while (tablet->query_index > 0 && (tablet->query[0] == '@'  ||
+                                       tablet->query[0] == '\r' ||
+                                       tablet->query[0] == '\n')) {
+        wctablet_shift_input(tablet, 1);
+    }
+    if (!tablet->query_index) {
+        return len;
+    }
+
+    if (strncmp((char *)tablet->query, "~#", 2) == 0) {
+        /* init / detect sequence */
+        trace_wct_init();
+        wctablet_shift_input(tablet, 2);
+        wctablet_queue_output(tablet, WC_MODEL_STRING,
+                              WC_MODEL_STRING_LENGTH);
+        return len;
+    }
+
+    /* detect line */
+    pos = strchr((char *)tablet->query, '\r');
+    if (!pos) {
+        pos = strchr((char *)tablet->query, '\n');
+    }
+    if (!pos) {
+        return len;
+    }
+    clen = pos - (char *)tablet->query;
+
+    /* process commands */
+    if (strncmp((char *)tablet->query, "RE", 2) == 0 &&
+        clen == 2) {
+        trace_wct_cmd_re();
+        wctablet_shift_input(tablet, 3);
+        wctablet_queue_output(tablet, WC_CONFIG_STRING,
+                              WC_CONFIG_STRING_LENGTH);
+
+    } else if (strncmp((char *)tablet->query, "ST", 2) == 0 &&
+               clen == 2) {
+        trace_wct_cmd_st();
+        wctablet_shift_input(tablet, 3);
+        tablet->send_events = true;
+        wctablet_queue_event(tablet);
+
+    } else if (strncmp((char *)tablet->query, "SP", 2) == 0 &&
+               clen == 2) {
+        trace_wct_cmd_sp();
+        wctablet_shift_input(tablet, 3);
+        tablet->send_events = false;
+
+    } else if (strncmp((char *)tablet->query, "TS", 2) == 0 &&
+               clen == 3) {
+        unsigned int input = tablet->query[2];
+        uint8_t codes[7] = {
+            0xa3,
+            ((input & 0x80) == 0) ? 0x7e : 0x7f,
+            (((WC_H4(input) & 0x7) ^ 0x5) << 4) | (WC_L4(input) ^ 0x7),
+            0x03,
+            0x7f,
+            0x7f,
+            0x00,
+        };
+        trace_wct_cmd_ts(input);
+        wctablet_shift_input(tablet, 4);
+        wctablet_queue_output(tablet, codes, 7);
+
+    } else {
+        tablet->query[clen] = 0; /* terminate line for printing */
+        trace_wct_cmd_other((char *)tablet->query);
+        wctablet_shift_input(tablet, clen + 1);
+
+    }
+
+    return len;
+}
+
+static int wctablet_chr_ioctl(Chardev *chr, int cmd, void *arg)
+{
+    TabletChardev *tablet = WCTABLET_CHARDEV(chr);
+    QEMUSerialSetParams *ssp;
+
+    switch (cmd) {
+    case CHR_IOCTL_SERIAL_SET_PARAMS:
+        ssp = arg;
+        if (tablet->line_speed != ssp->speed) {
+            trace_wct_speed(ssp->speed);
+            wctablet_reset(tablet);
+            tablet->line_speed = ssp->speed;
+        }
+        break;
+    default:
+        return -ENOTSUP;
+    }
+    return 0;
+}
+
+static void wctablet_chr_finalize(Object *obj)
+{
+    TabletChardev *tablet = WCTABLET_CHARDEV(obj);
+
+    qemu_input_handler_unregister(tablet->hs);
+    g_free(tablet);
+}
+
+static void wctablet_chr_open(Chardev *chr,
+                              ChardevBackend *backend,
+                              bool *be_opened,
+                              Error **errp)
+{
+    TabletChardev *tablet = WCTABLET_CHARDEV(chr);
+
+    *be_opened = true;
+
+    /* init state machine */
+    memcpy(tablet->outbuf, WC_FULL_CONFIG_STRING, WC_FULL_CONFIG_STRING_LENGTH);
+    tablet->outlen = WC_FULL_CONFIG_STRING_LENGTH;
+    tablet->query_index = 0;
+
+    tablet->hs = qemu_input_handler_register((DeviceState *)tablet,
+                                             &wctablet_handler);
+}
+
+static void wctablet_chr_class_init(ObjectClass *oc, void *data)
+{
+    ChardevClass *cc = CHARDEV_CLASS(oc);
+
+    cc->open = wctablet_chr_open;
+    cc->chr_write = wctablet_chr_write;
+    cc->chr_ioctl = wctablet_chr_ioctl;
+    cc->chr_accept_input = wctablet_chr_accept_input;
+}
+
+static const TypeInfo wctablet_type_info = {
+    .name = TYPE_CHARDEV_WCTABLET,
+    .parent = TYPE_CHARDEV,
+    .instance_size = sizeof(TabletChardev),
+    .instance_finalize = wctablet_chr_finalize,
+    .class_init = wctablet_chr_class_init,
+};
+
+static void register_types(void)
+{
+     type_register_static(&wctablet_type_info);
+}
+
+type_init(register_types);
--- a/block.c
+++ b/block.c
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -19,7 +19,7 @@ block-obj-$(CONFIG_LIBNFS) += nfs.o
 block-obj-$(CONFIG_CURL) += curl.o
 block-obj-$(CONFIG_RBD) += rbd.o
 block-obj-$(CONFIG_GLUSTERFS) += gluster.o
-block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o
+block-obj-$(CONFIG_VXHS) += vxhs.o
 block-obj-$(CONFIG_LIBSSH2) += ssh.o
 block-obj-y += accounting.o dirty-bitmap.o
 block-obj-y += write-threshold.o
@@ -39,9 +39,9 @@ rbd.o-cflags       := $(RBD_CFLAGS)
 rbd.o-libs         := $(RBD_LIBS)
 gluster.o-cflags   := $(GLUSTERFS_CFLAGS)
 gluster.o-libs     := $(GLUSTERFS_LIBS)
+vxhs.o-libs        := $(VXHS_LIBS)
 ssh.o-cflags       := $(LIBSSH2_CFLAGS)
 ssh.o-libs         := $(LIBSSH2_LIBS)
-archipelago.o-libs := $(ARCHIPELAGO_LIBS)
 block-obj-$(if $(CONFIG_BZIP2),m,n) += dmg-bz2.o
 dmg-bz2.o-libs     := $(BZIP2_LIBS)
 qcow.o-libs        := -lz
--- a/block/archipelago.c
+++ b/block/archipelago.c
--- a/block/backup.c
+++ b/block/backup.c
@@ -24,6 +24,7 @@
 #include "qemu/cutils.h"
 #include "sysemu/block-backend.h"
 #include "qemu/bitmap.h"
+#include "qemu/error-report.h"

 #define BACKUP_CLUSTER_SIZE_DEFAULT (1 << 16)
 #define SLICE_TIME 100000000ULL /* ns */
@@ -64,7 +65,7 @@ static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
        retry = false;
        QLIST_FOREACH(req, &job->inflight_reqs, list) {
            if (end > req->start && start < req->end) {
-                qemu_co_queue_wait(&req->wait_queue);
+                qemu_co_queue_wait(&req->wait_queue, NULL);
                retry = true;
                break;
            }
@@ -467,13 +468,14 @@ static void coroutine_fn backup_run(void *opaque)
        /* Both FULL and TOP SYNC_MODE's require copying.. */
        for (; start < end; start++) {
            bool error_is_read;
+            int alloced = 0;
+
            if (yield_and_check(job)) {
                break;
            }

            if (job->sync_mode == MIRROR_SYNC_MODE_TOP) {
                int i, n;
-                int alloced = 0;

                /* Check to see if these blocks are already in the
                 * backing file. */
@@ -491,7 +493,7 @@ static void coroutine_fn backup_run(void *opaque)
                                sectors_per_cluster - i, &n);
                    i += n;

-                    if (alloced == 1 || n == 0) {
+                    if (alloced || n == 0) {
                        break;
                    }
                }
@@ -503,8 +505,13 @@ static void coroutine_fn backup_run(void *opaque)
                }
            }
            /* FULL sync mode we copy the whole drive. */
+            if (alloced < 0) {
+                ret = alloced;
+            } else {
                ret = backup_do_cow(job, start * sectors_per_cluster,
-                                sectors_per_cluster, &error_is_read, false);
+                                    sectors_per_cluster, &error_is_read,
+                                    false);
+            }
            if (ret < 0) {
                /* Depending on error action, fail now or retry cluster */
                BlockErrorAction action =
@@ -618,14 +625,24 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
        goto error;
    }

-    job = block_job_create(job_id, &backup_job_driver, bs, speed,
-                           creation_flags, cb, opaque, errp);
+    /* job->common.len is fixed, so we can't allow resize */
+    job = block_job_create(job_id, &backup_job_driver, bs,
+                           BLK_PERM_CONSISTENT_READ,
+                           BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE |
+                           BLK_PERM_WRITE_UNCHANGED | BLK_PERM_GRAPH_MOD,
+                           speed, creation_flags, cb, opaque, errp);
    if (!job) {
        goto error;
    }

-    job->target = blk_new();
-    blk_insert_bs(job->target, target);
+    /* The target must match the source in size, so no resize here either */
+    job->target = blk_new(BLK_PERM_WRITE,
+                          BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE |
+                          BLK_PERM_WRITE_UNCHANGED | BLK_PERM_GRAPH_MOD);
+    ret = blk_insert_bs(job->target, target, errp);
+    if (ret < 0) {
+        goto error;
+    }

    job->on_source_error = on_source_error;
    job->on_target_error = on_target_error;
@@ -638,7 +655,16 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
     * backup cluster size is smaller than the target cluster size. Even for
     * targets with a backing file, try to avoid COW if possible. */
    ret = bdrv_get_info(target, &bdi);
-    if (ret < 0 && !target->backing) {
+    if (ret == -ENOTSUP && !target->backing) {
+        /* Cluster size is not defined */
+        error_report("WARNING: The target block device doesn't provide "
+                     "information about the block size and it doesn't have a "
+                     "backing file. The default block size of %u bytes is "
+                     "used. If the actual block size of the target exceeds "
+                     "this default, the backup may be unusable",
+                     BACKUP_CLUSTER_SIZE_DEFAULT);
+        job->cluster_size = BACKUP_CLUSTER_SIZE_DEFAULT;
+    } else if (ret < 0 && !target->backing) {
        error_setg_errno(errp, -ret,
            "Couldn't determine the cluster size of the target image, "
            "which has no backing file");
@@ -652,7 +678,9 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
        job->cluster_size = MAX(BACKUP_CLUSTER_SIZE_DEFAULT, bdi.cluster_size);
    }

-    block_job_add_bdrv(&job->common, target);
+    /* Required permissions are already taken with target's blk_new() */
+    block_job_add_bdrv(&job->common, "target", target, 0, BLK_PERM_ALL,
+                       &error_abort);
    job->common.len = len;
    block_job_txn_add_job(txn, &job->common);

--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -405,12 +405,6 @@ out:
    return ret;
 }

-static void error_callback_bh(void *opaque)
-{
-    Coroutine *co = opaque;
-    qemu_coroutine_enter(co);
-}
-
 static int inject_error(BlockDriverState *bs, BlkdebugRule *rule)
 {
    BDRVBlkdebugState *s = bs->opaque;
@@ -423,8 +417,7 @@ static int inject_error(BlockDriverState *bs, BlkdebugRule *rule)
    }

    if (!immediately) {
-        aio_bh_schedule_oneshot(bdrv_get_aio_context(bs), error_callback_bh,
-                                qemu_coroutine_self());
+        aio_co_schedule(qemu_get_current_aio_context(), qemu_coroutine_self());
        qemu_coroutine_yield();
    }

@@ -670,7 +663,7 @@ static int64_t blkdebug_getlength(BlockDriverState *bs)

 static int blkdebug_truncate(BlockDriverState *bs, int64_t offset)
 {
-    return bdrv_truncate(bs->file->bs, offset);
+    return bdrv_truncate(bs->file, offset);
 }

 static void blkdebug_refresh_filename(BlockDriverState *bs, QDict *options)
@@ -741,6 +734,8 @@ static BlockDriver bdrv_blkdebug = {
    .bdrv_file_open         = blkdebug_open,
    .bdrv_close             = blkdebug_close,
    .bdrv_reopen_prepare    = blkdebug_reopen_prepare,
+    .bdrv_child_perm        = bdrv_filter_default_perms,
+
    .bdrv_getlength         = blkdebug_getlength,
    .bdrv_truncate          = blkdebug_truncate,
    .bdrv_refresh_filename  = blkdebug_refresh_filename,
--- a/block/blkreplay.c
+++ b/block/blkreplay.c
@@ -60,7 +60,7 @@ static int64_t blkreplay_getlength(BlockDriverState *bs)
 static void blkreplay_bh_cb(void *opaque)
 {
    Request *req = opaque;
-    qemu_coroutine_enter(req->co);
+    aio_co_wake(req->co);
    qemu_bh_delete(req->bh);
    g_free(req);
 }
@@ -137,6 +137,7 @@ static BlockDriver bdrv_blkreplay = {

    .bdrv_file_open         = blkreplay_open,
    .bdrv_close             = blkreplay_close,
+    .bdrv_child_perm        = bdrv_filter_default_perms,
    .bdrv_getlength         = blkreplay_getlength,

    .bdrv_co_preadv         = blkreplay_co_preadv,
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -320,6 +320,7 @@ static BlockDriver bdrv_blkverify = {
    .bdrv_parse_filename              = blkverify_parse_filename,
    .bdrv_file_open                   = blkverify_open,
    .bdrv_close                       = blkverify_close,
+    .bdrv_child_perm                  = bdrv_filter_default_perms,
    .bdrv_getlength                   = blkverify_getlength,
    .bdrv_refresh_filename            = blkverify_refresh_filename,

--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -59,9 +59,15 @@ struct BlockBackend {
    bool iostatus_enabled;
    BlockDeviceIoStatus iostatus;

+    uint64_t perm;
+    uint64_t shared_perm;
+    bool disable_perm;
+
    bool allow_write_beyond_eof;

    NotifierList remove_bs_notifiers, insert_bs_notifiers;
+
+    int quiesce_counter;
 };

 typedef struct BlockBackendAIOCB {
@@ -77,6 +83,7 @@ static const AIOCBInfo block_backend_aiocb_info = {

 static void drive_info_del(DriveInfo *dinfo);
 static BlockBackend *bdrv_first_blk(BlockDriverState *bs);
+static char *blk_get_attached_dev_id(BlockBackend *blk);

 /* All BlockBackends */
 static QTAILQ_HEAD(, BlockBackend) block_backends =
@@ -99,6 +106,25 @@ static void blk_root_drained_end(BdrvChild *child);
 static void blk_root_change_media(BdrvChild *child, bool load);
 static void blk_root_resize(BdrvChild *child);

+static char *blk_root_get_parent_desc(BdrvChild *child)
+{
+    BlockBackend *blk = child->opaque;
+    char *dev_id;
+
+    if (blk->name) {
+        return g_strdup(blk->name);
+    }
+
+    dev_id = blk_get_attached_dev_id(blk);
+    if (*dev_id) {
+        return dev_id;
+    } else {
+        /* TODO Callback into the BB owner for something more detailed */
+        g_free(dev_id);
+        return g_strdup("a block device");
+    }
+}
+
 static const char *blk_root_get_name(BdrvChild *child)
 {
    return blk_name(child->opaque);
@@ -110,6 +136,7 @@ static const BdrvChildRole child_root = {
    .change_media       = blk_root_change_media,
    .resize             = blk_root_resize,
    .get_name           = blk_root_get_name,
+    .get_parent_desc    = blk_root_get_parent_desc,

    .drained_begin      = blk_root_drained_begin,
    .drained_end        = blk_root_drained_end,
@@ -117,15 +144,23 @@ static const BdrvChildRole child_root = {

 /*
 * Create a new BlockBackend with a reference count of one.
- * Store an error through @errp on failure, unless it's null.
+ *
+ * @perm is a bitmasks of BLK_PERM_* constants which describes the permissions
+ * to request for a block driver node that is attached to this BlockBackend.
+ * @shared_perm is a bitmask which describes which permissions may be granted
+ * to other users of the attached node.
+ * Both sets of permissions can be changed later using blk_set_perm().
+ *
 * Return the new BlockBackend on success, null on failure.
 */
-BlockBackend *blk_new(void)
+BlockBackend *blk_new(uint64_t perm, uint64_t shared_perm)
 {
    BlockBackend *blk;

    blk = g_new0(BlockBackend, 1);
    blk->refcnt = 1;
+    blk->perm = perm;
+    blk->shared_perm = shared_perm;
    blk_set_enable_write_cache(blk, true);

    qemu_co_queue_init(&blk->public.throttled_reqs[0]);
@@ -155,15 +190,38 @@ BlockBackend *blk_new_open(const char *filename, const char *reference,
 {
    BlockBackend *blk;
    BlockDriverState *bs;
+    uint64_t perm;

-    blk = blk_new();
+    /* blk_new_open() is mainly used in .bdrv_create implementations and the
+     * tools where sharing isn't a concern because the BDS stays private, so we
+     * just request permission according to the flags.
+     *
+     * The exceptions are xen_disk and blockdev_init(); in these cases, the
+     * caller of blk_new_open() doesn't make use of the permissions, but they
+     * shouldn't hurt either. We can still share everything here because the
+     * guest devices will add their own blockers if they can't share. */
+    perm = BLK_PERM_CONSISTENT_READ;
+    if (flags & BDRV_O_RDWR) {
+        perm |= BLK_PERM_WRITE;
+    }
+    if (flags & BDRV_O_RESIZE) {
+        perm |= BLK_PERM_RESIZE;
+    }
+
+    blk = blk_new(perm, BLK_PERM_ALL);
    bs = bdrv_open(filename, reference, options, flags, errp);
    if (!bs) {
        blk_unref(blk);
        return NULL;
    }

-    blk->root = bdrv_root_attach_child(bs, "root", &child_root, blk);
+    blk->root = bdrv_root_attach_child(bs, "root", &child_root,
+                                       perm, BLK_PERM_ALL, blk, errp);
+    if (!blk->root) {
+        bdrv_unref(bs);
+        blk_unref(blk);
+        return NULL;
+    }

    return blk;
 }
@@ -173,6 +231,9 @@ static void blk_delete(BlockBackend *blk)
    assert(!blk->refcnt);
    assert(!blk->name);
    assert(!blk->dev);
+    if (blk->public.throttle_state) {
+        blk_io_limits_disable(blk);
+    }
    if (blk->root) {
        blk_remove_bs(blk);
    }
@@ -495,16 +556,77 @@ void blk_remove_bs(BlockBackend *blk)
 /*
 * Associates a new BlockDriverState with @blk.
 */
-void blk_insert_bs(BlockBackend *blk, BlockDriverState *bs)
+int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, Error **errp)
 {
+    blk->root = bdrv_root_attach_child(bs, "root", &child_root,
+                                       blk->perm, blk->shared_perm, blk, errp);
+    if (blk->root == NULL) {
+        return -EPERM;
+    }
    bdrv_ref(bs);
-    blk->root = bdrv_root_attach_child(bs, "root", &child_root, blk);

    notifier_list_notify(&blk->insert_bs_notifiers, blk);
    if (blk->public.throttle_state) {
        throttle_timers_attach_aio_context(
            &blk->public.throttle_timers, bdrv_get_aio_context(bs));
    }
+
+    return 0;
+}
+
+/*
+ * Sets the permission bitmasks that the user of the BlockBackend needs.
+ */
+int blk_set_perm(BlockBackend *blk, uint64_t perm, uint64_t shared_perm,
+                 Error **errp)
+{
+    int ret;
+
+    if (blk->root && !blk->disable_perm) {
+        ret = bdrv_child_try_set_perm(blk->root, perm, shared_perm, errp);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    blk->perm = perm;
+    blk->shared_perm = shared_perm;
+
+    return 0;
+}
+
+void blk_get_perm(BlockBackend *blk, uint64_t *perm, uint64_t *shared_perm)
+{
+    *perm = blk->perm;
+    *shared_perm = blk->shared_perm;
+}
+
+/*
+ * Notifies the user of all BlockBackends that migration has completed. qdev
+ * devices can tighten their permissions in response (specifically revoke
+ * shared write permissions that we needed for storage migration).
+ *
+ * If an error is returned, the VM cannot be allowed to be resumed.
+ */
+void blk_resume_after_migration(Error **errp)
+{
+    BlockBackend *blk;
+    Error *local_err = NULL;
+
+    for (blk = blk_all_next(NULL); blk; blk = blk_all_next(blk)) {
+        if (!blk->disable_perm) {
+            continue;
+        }
+
+        blk->disable_perm = false;
+
+        blk_set_perm(blk, blk->perm, blk->shared_perm, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            blk->disable_perm = true;
+            return;
+        }
+    }
 }

 static int blk_do_attach_dev(BlockBackend *blk, void *dev)
@@ -512,10 +634,19 @@ static int blk_do_attach_dev(BlockBackend *blk, void *dev)
    if (blk->dev) {
        return -EBUSY;
    }
+
+    /* While migration is still incoming, we don't need to apply the
+     * permissions of guest device BlockBackends. We might still have a block
+     * job or NBD server writing to the image for storage migration. */
+    if (runstate_check(RUN_STATE_INMIGRATE)) {
+        blk->disable_perm = true;
+    }
+
    blk_ref(blk);
    blk->dev = dev;
    blk->legacy_dev = false;
    blk_iostatus_reset(blk);
+
    return 0;
 }

@@ -553,6 +684,7 @@ void blk_detach_dev(BlockBackend *blk, void *dev)
    blk->dev_ops = NULL;
    blk->dev_opaque = NULL;
    blk->guest_block_size = 512;
+    blk_set_perm(blk, 0, BLK_PERM_ALL, &error_abort);
    blk_unref(blk);
 }

@@ -610,29 +742,44 @@ void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops,
                     void *opaque)
 {
    /* All drivers that use blk_set_dev_ops() are qdevified and we want to keep
-     * it that way, so we can assume blk->dev is a DeviceState if blk->dev_ops
-     * is set. */
+     * it that way, so we can assume blk->dev, if present, is a DeviceState if
+     * blk->dev_ops is set. Non-device users may use dev_ops without device. */
    assert(!blk->legacy_dev);

    blk->dev_ops = ops;
    blk->dev_opaque = opaque;
+
+    /* Are we currently quiesced? Should we enforce this right now? */
+    if (blk->quiesce_counter && ops->drained_begin) {
+        ops->drained_begin(opaque);
+    }
 }

 /*
 * Notify @blk's attached device model of media change.
- * If @load is true, notify of media load.
- * Else, notify of media eject.
+ *
+ * If @load is true, notify of media load. This action can fail, meaning that
+ * the medium cannot be loaded. @errp is set then.
+ *
+ * If @load is false, notify of media eject. This can never fail.
+ *
 * Also send DEVICE_TRAY_MOVED events as appropriate.
 */
-void blk_dev_change_media_cb(BlockBackend *blk, bool load)
+void blk_dev_change_media_cb(BlockBackend *blk, bool load, Error **errp)
 {
    if (blk->dev_ops && blk->dev_ops->change_media_cb) {
        bool tray_was_open, tray_is_open;
+        Error *local_err = NULL;

        assert(!blk->legacy_dev);

        tray_was_open = blk_dev_is_tray_open(blk);
-        blk->dev_ops->change_media_cb(blk->dev_opaque, load);
+        blk->dev_ops->change_media_cb(blk->dev_opaque, load, &local_err);
+        if (local_err) {
+            assert(load == true);
+            error_propagate(errp, local_err);
+            return;
+        }
        tray_is_open = blk_dev_is_tray_open(blk);

        if (tray_was_open != tray_is_open) {
@@ -646,7 +793,7 @@ void blk_dev_change_media_cb(BlockBackend *blk, bool load)

 static void blk_root_change_media(BdrvChild *child, bool load)
 {
-    blk_dev_change_media_cb(child->opaque, load);
+    blk_dev_change_media_cb(child->opaque, load, NULL);
 }

 /*
@@ -880,7 +1027,6 @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
 {
    QEMUIOVector qiov;
    struct iovec iov;
-    Coroutine *co;
    BlkRwCo rwco;

    iov = (struct iovec) {
@@ -897,9 +1043,14 @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
        .ret    = NOT_DONE,
    };

-    co = qemu_coroutine_create(co_entry, &rwco);
-    qemu_coroutine_enter(co);
+    if (qemu_in_coroutine()) {
+        /* Fast-path if already in coroutine context */
+        co_entry(&rwco);
+    } else {
+        Coroutine *co = qemu_coroutine_create(co_entry, &rwco);
+        bdrv_coroutine_enter(blk_bs(blk), co);
        BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
+    }

    return rwco.ret;
 }
@@ -979,7 +1130,6 @@ static void blk_aio_complete(BlkAioEmAIOCB *acb)
 static void blk_aio_complete_bh(void *opaque)
 {
    BlkAioEmAIOCB *acb = opaque;
-
    assert(acb->has_returned);
    blk_aio_complete(acb);
 }
@@ -1005,7 +1155,7 @@ static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
    acb->has_returned = false;

    co = qemu_coroutine_create(co_entry, acb);
-    qemu_coroutine_enter(co);
+    bdrv_coroutine_enter(blk_bs(blk), co);

    acb->has_returned = true;
    if (acb->rwco.ret != NOT_DONE) {
@@ -1602,7 +1752,7 @@ int blk_truncate(BlockBackend *blk, int64_t offset)
        return -ENOMEDIUM;
    }

-    return bdrv_truncate(blk_bs(blk), offset);
+    return bdrv_truncate(blk->root, offset);
 }

 static void blk_pdiscard_entry(void *opaque)
@@ -1768,6 +1918,12 @@ static void blk_root_drained_begin(BdrvChild *child)
 {
    BlockBackend *blk = child->opaque;

+    if (++blk->quiesce_counter == 1) {
+        if (blk->dev_ops && blk->dev_ops->drained_begin) {
+            blk->dev_ops->drained_begin(blk->dev_opaque);
+        }
+    }
+
    /* Note that blk->root may not be accessible here yet if we are just
     * attaching to a BlockDriverState that is drained. Use child instead. */

@@ -1779,7 +1935,14 @@ static void blk_root_drained_begin(BdrvChild *child)
 static void blk_root_drained_end(BdrvChild *child)
 {
    BlockBackend *blk = child->opaque;
+    assert(blk->quiesce_counter);

    assert(blk->public.io_limits_disabled);
    --blk->public.io_limits_disabled;
+
+    if (--blk->quiesce_counter == 0) {
+        if (blk->dev_ops && blk->dev_ops->drained_end) {
+            blk->dev_ops->drained_end(blk->dev_opaque);
+        }
+    }
 }
--- a/block/bochs.c
+++ b/block/bochs.c
@@ -104,7 +104,16 @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags,
    struct bochs_header bochs;
    int ret;

-    bs->read_only = true; /* no write support yet */
+    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
+                               false, errp);
+    if (!bs->file) {
+        return -EINVAL;
+    }
+
+    ret = bdrv_set_read_only(bs, true, errp); /* no write support yet */
+    if (ret < 0) {
+        return ret;
+    }

    ret = bdrv_pread(bs->file, 0, &bochs, sizeof(bochs));
    if (ret < 0) {
@@ -287,6 +296,7 @@ static BlockDriver bdrv_bochs = {
    .instance_size	= sizeof(BDRVBochsState),
    .bdrv_probe		= bochs_probe,
    .bdrv_open		= bochs_open,
+    .bdrv_child_perm     = bdrv_format_default_perms,
    .bdrv_refresh_limits = bochs_refresh_limits,
    .bdrv_co_preadv = bochs_co_preadv,
    .bdrv_close		= bochs_close,
--- a/block/cloop.c
+++ b/block/cloop.c
@@ -66,7 +66,16 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags,
    uint32_t offsets_size, max_compressed_block_size = 1, i;
    int ret;

-    bs->read_only = true;
+    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
+                               false, errp);
+    if (!bs->file) {
+        return -EINVAL;
+    }
+
+    ret = bdrv_set_read_only(bs, true, errp);
+    if (ret < 0) {
+        return ret;
+    }

    /* read header */
    ret = bdrv_pread(bs->file, 128, &s->block_size, 4);
@@ -284,6 +293,7 @@ static BlockDriver bdrv_cloop = {
    .instance_size  = sizeof(BDRVCloopState),
    .bdrv_probe     = cloop_probe,
    .bdrv_open      = cloop_open,
+    .bdrv_child_perm     = bdrv_format_default_perms,
    .bdrv_refresh_limits = cloop_refresh_limits,
    .bdrv_co_preadv = cloop_co_preadv,
    .bdrv_close     = cloop_close,
--- a/block/commit.c
+++ b/block/commit.c
@@ -13,6 +13,7 @@
 */

 #include "qemu/osdep.h"
+#include "qemu/cutils.h"
 #include "trace.h"
 #include "block/block_int.h"
 #include "block/blockjob_int.h"
@@ -36,6 +37,7 @@ typedef struct CommitBlockJob {
    BlockJob common;
    RateLimit limit;
    BlockDriverState *active;
+    BlockDriverState *commit_top_bs;
    BlockBackend *top;
    BlockBackend *base;
    BlockdevOnError on_error;
@@ -83,12 +85,23 @@ static void commit_complete(BlockJob *job, void *opaque)
    BlockDriverState *active = s->active;
    BlockDriverState *top = blk_bs(s->top);
    BlockDriverState *base = blk_bs(s->base);
-    BlockDriverState *overlay_bs = bdrv_find_overlay(active, top);
+    BlockDriverState *overlay_bs = bdrv_find_overlay(active, s->commit_top_bs);
    int ret = data->ret;
+    bool remove_commit_top_bs = false;
+
+    /* Remove base node parent that still uses BLK_PERM_WRITE/RESIZE before
+     * the normal backing chain can be restored. */
+    blk_unref(s->base);

    if (!block_job_is_cancelled(&s->common) && ret == 0) {
        /* success */
-        ret = bdrv_drop_intermediate(active, top, base, s->backing_file_str);
+        ret = bdrv_drop_intermediate(active, s->commit_top_bs, base,
+                                     s->backing_file_str);
+    } else if (overlay_bs) {
+        /* XXX Can (or should) we somehow keep 'consistent read' blocked even
+         * after the failed/cancelled commit job is gone? If we already wrote
+         * something to base, the intermediate images aren't valid any more. */
+        remove_commit_top_bs = true;
    }

    /* restore base open flags here if appropriate (e.g., change the base back
@@ -102,9 +115,15 @@ static void commit_complete(BlockJob *job, void *opaque)
    }
    g_free(s->backing_file_str);
    blk_unref(s->top);
-    blk_unref(s->base);
    block_job_completed(&s->common, ret);
    g_free(data);
+
+    /* If bdrv_drop_intermediate() didn't already do that, remove the commit
+     * filter driver from the backing chain. Do this as the final step so that
+     * the 'consistent read' permission can be granted.  */
+    if (remove_commit_top_bs) {
+        bdrv_set_backing_hd(overlay_bs, top, &error_abort);
+    }
 }

 static void coroutine_fn commit_run(void *opaque)
@@ -208,10 +227,57 @@ static const BlockJobDriver commit_job_driver = {
    .start         = commit_run,
 };

+static int coroutine_fn bdrv_commit_top_preadv(BlockDriverState *bs,
+    uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
+{
+    return bdrv_co_preadv(bs->backing, offset, bytes, qiov, flags);
+}
+
+static int64_t coroutine_fn bdrv_commit_top_get_block_status(
+    BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum,
+    BlockDriverState **file)
+{
+    *pnum = nb_sectors;
+    *file = bs->backing->bs;
+    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_DATA |
+           (sector_num << BDRV_SECTOR_BITS);
+}
+
+static void bdrv_commit_top_refresh_filename(BlockDriverState *bs, QDict *opts)
+{
+    bdrv_refresh_filename(bs->backing->bs);
+    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename),
+            bs->backing->bs->filename);
+}
+
+static void bdrv_commit_top_close(BlockDriverState *bs)
+{
+}
+
+static void bdrv_commit_top_child_perm(BlockDriverState *bs, BdrvChild *c,
+                                       const BdrvChildRole *role,
+                                       uint64_t perm, uint64_t shared,
+                                       uint64_t *nperm, uint64_t *nshared)
+{
+    *nperm = 0;
+    *nshared = BLK_PERM_ALL;
+}
+
+/* Dummy node that provides consistent read to its users without requiring it
+ * from its backing file and that allows writes on the backing file chain. */
+static BlockDriver bdrv_commit_top = {
+    .format_name                = "commit_top",
+    .bdrv_co_preadv             = bdrv_commit_top_preadv,
+    .bdrv_co_get_block_status   = bdrv_commit_top_get_block_status,
+    .bdrv_refresh_filename      = bdrv_commit_top_refresh_filename,
+    .bdrv_close                 = bdrv_commit_top_close,
+    .bdrv_child_perm            = bdrv_commit_top_child_perm,
+};
+
 void commit_start(const char *job_id, BlockDriverState *bs,
                  BlockDriverState *base, BlockDriverState *top, int64_t speed,
                  BlockdevOnError on_error, const char *backing_file_str,
-                  Error **errp)
+                  const char *filter_node_name, Error **errp)
 {
    CommitBlockJob *s;
    BlockReopenQueue *reopen_queue = NULL;
@@ -219,7 +285,9 @@ void commit_start(const char *job_id, BlockDriverState *bs,
    int orig_base_flags;
    BlockDriverState *iter;
    BlockDriverState *overlay_bs;
+    BlockDriverState *commit_top_bs = NULL;
    Error *local_err = NULL;
+    int ret;

    assert(top != bs);
    if (top == base) {
@@ -234,8 +302,8 @@ void commit_start(const char *job_id, BlockDriverState *bs,
        return;
    }

-    s = block_job_create(job_id, &commit_job_driver, bs, speed,
-                         BLOCK_JOB_DEFAULT, NULL, NULL, errp);
+    s = block_job_create(job_id, &commit_job_driver, bs, 0, BLK_PERM_ALL,
+                         speed, BLOCK_JOB_DEFAULT, NULL, NULL, errp);
    if (!s) {
        return;
    }
@@ -256,30 +324,84 @@ void commit_start(const char *job_id, BlockDriverState *bs,
        bdrv_reopen_multiple(bdrv_get_aio_context(bs), reopen_queue, &local_err);
        if (local_err != NULL) {
            error_propagate(errp, local_err);
-            block_job_unref(&s->common);
-            return;
+            goto fail;
        }
    }

+    /* Insert commit_top block node above top, so we can block consistent read
+     * on the backing chain below it */
+    commit_top_bs = bdrv_new_open_driver(&bdrv_commit_top, filter_node_name, 0,
+                                         errp);
+    if (commit_top_bs == NULL) {
+        goto fail;
+    }
+    commit_top_bs->total_sectors = top->total_sectors;
+    bdrv_set_aio_context(commit_top_bs, bdrv_get_aio_context(top));
+
+    bdrv_set_backing_hd(commit_top_bs, top, &local_err);
+    if (local_err) {
+        bdrv_unref(commit_top_bs);
+        commit_top_bs = NULL;
+        error_propagate(errp, local_err);
+        goto fail;
+    }
+    bdrv_set_backing_hd(overlay_bs, commit_top_bs, &local_err);
+    if (local_err) {
+        bdrv_unref(commit_top_bs);
+        commit_top_bs = NULL;
+        error_propagate(errp, local_err);
+        goto fail;
+    }
+
+    s->commit_top_bs = commit_top_bs;
+    bdrv_unref(commit_top_bs);

    /* Block all nodes between top and base, because they will
     * disappear from the chain after this operation. */
    assert(bdrv_chain_contains(top, base));
-    for (iter = top; iter != backing_bs(base); iter = backing_bs(iter)) {
-        block_job_add_bdrv(&s->common, iter);
+    for (iter = top; iter != base; iter = backing_bs(iter)) {
+        /* XXX BLK_PERM_WRITE needs to be allowed so we don't block ourselves
+         * at s->base (if writes are blocked for a node, they are also blocked
+         * for its backing file). The other options would be a second filter
+         * driver above s->base. */
+        ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0,
+                                 BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE,
+                                 errp);
+        if (ret < 0) {
+            goto fail;
        }
+    }
+
+    ret = block_job_add_bdrv(&s->common, "base", base, 0, BLK_PERM_ALL, errp);
+    if (ret < 0) {
+        goto fail;
+    }
+
    /* overlay_bs must be blocked because it needs to be modified to
-     * update the backing image string, but if it's the root node then
-     * don't block it again */
-    if (bs != overlay_bs) {
-        block_job_add_bdrv(&s->common, overlay_bs);
+     * update the backing image string. */
+    ret = block_job_add_bdrv(&s->common, "overlay of top", overlay_bs,
+                             BLK_PERM_GRAPH_MOD, BLK_PERM_ALL, errp);
+    if (ret < 0) {
+        goto fail;
    }

-    s->base = blk_new();
-    blk_insert_bs(s->base, base);
+    s->base = blk_new(BLK_PERM_CONSISTENT_READ
+                      | BLK_PERM_WRITE
+                      | BLK_PERM_RESIZE,
+                      BLK_PERM_CONSISTENT_READ
+                      | BLK_PERM_GRAPH_MOD
+                      | BLK_PERM_WRITE_UNCHANGED);
+    ret = blk_insert_bs(s->base, base, errp);
+    if (ret < 0) {
+        goto fail;
+    }

-    s->top = blk_new();
-    blk_insert_bs(s->top, top);
+    /* Required permissions are already taken with block_job_add_bdrv() */
+    s->top = blk_new(0, BLK_PERM_ALL);
+    ret = blk_insert_bs(s->top, top, errp);
+    if (ret < 0) {
+        goto fail;
+    }

    s->active = bs;

@@ -292,6 +414,19 @@ void commit_start(const char *job_id, BlockDriverState *bs,

    trace_commit_start(bs, base, top, s);
    block_job_start(&s->common);
+    return;
+
+fail:
+    if (s->base) {
+        blk_unref(s->base);
+    }
+    if (s->top) {
+        blk_unref(s->top);
+    }
+    if (commit_top_bs) {
+        bdrv_set_backing_hd(overlay_bs, top, &error_abort);
+    }
+    block_job_unref(&s->common);
 }


@@ -301,11 +436,14 @@ void commit_start(const char *job_id, BlockDriverState *bs,
 int bdrv_commit(BlockDriverState *bs)
 {
    BlockBackend *src, *backing;
+    BlockDriverState *backing_file_bs = NULL;
+    BlockDriverState *commit_top_bs = NULL;
    BlockDriver *drv = bs->drv;
    int64_t sector, total_sectors, length, backing_length;
    int n, ro, open_flags;
    int ret = 0;
    uint8_t *buf = NULL;
+    Error *local_err = NULL;

    if (!drv)
        return -ENOMEDIUM;
@@ -328,11 +466,34 @@ int bdrv_commit(BlockDriverState *bs)
        }
    }

-    src = blk_new();
-    blk_insert_bs(src, bs);
+    src = blk_new(BLK_PERM_CONSISTENT_READ, BLK_PERM_ALL);
+    backing = blk_new(BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);

-    backing = blk_new();
-    blk_insert_bs(backing, bs->backing->bs);
+    ret = blk_insert_bs(src, bs, &local_err);
+    if (ret < 0) {
+        error_report_err(local_err);
+        goto ro_cleanup;
+    }
+
+    /* Insert commit_top block node above backing, so we can write to it */
+    backing_file_bs = backing_bs(bs);
+
+    commit_top_bs = bdrv_new_open_driver(&bdrv_commit_top, NULL, BDRV_O_RDWR,
+                                         &local_err);
+    if (commit_top_bs == NULL) {
+        error_report_err(local_err);
+        goto ro_cleanup;
+    }
+    bdrv_set_aio_context(commit_top_bs, bdrv_get_aio_context(backing_file_bs));
+
+    bdrv_set_backing_hd(commit_top_bs, backing_file_bs, &error_abort);
+    bdrv_set_backing_hd(bs, commit_top_bs, &error_abort);
+
+    ret = blk_insert_bs(backing, backing_file_bs, &local_err);
+    if (ret < 0) {
+        error_report_err(local_err);
+        goto ro_cleanup;
+    }

    length = blk_getlength(src);
    if (length < 0) {
@@ -404,8 +565,12 @@ int bdrv_commit(BlockDriverState *bs)
 ro_cleanup:
    qemu_vfree(buf);

-    blk_unref(src);
    blk_unref(backing);
+    if (backing_file_bs) {
+        bdrv_set_backing_hd(bs, backing_file_bs, &error_abort);
+    }
+    bdrv_unref(commit_top_bs);
+    blk_unref(src);

    if (ro) {
        /* ignoring error return here */
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -56,11 +56,11 @@ static int block_crypto_probe_generic(QCryptoBlockFormat format,


 static ssize_t block_crypto_read_func(QCryptoBlock *block,
+                                      void *opaque,
                                      size_t offset,
                                      uint8_t *buf,
                                      size_t buflen,
-                                      Error **errp,
-                                      void *opaque)
+                                      Error **errp)
 {
    BlockDriverState *bs = opaque;
    ssize_t ret;
@@ -83,11 +83,11 @@ struct BlockCryptoCreateData {


 static ssize_t block_crypto_write_func(QCryptoBlock *block,
+                                       void *opaque,
                                       size_t offset,
                                       const uint8_t *buf,
                                       size_t buflen,
-                                       Error **errp,
-                                       void *opaque)
+                                       Error **errp)
 {
    struct BlockCryptoCreateData *data = opaque;
    ssize_t ret;
@@ -102,9 +102,9 @@ static ssize_t block_crypto_write_func(QCryptoBlock *block,


 static ssize_t block_crypto_init_func(QCryptoBlock *block,
+                                      void *opaque,
                                      size_t headerlen,
-                                      Error **errp,
-                                      void *opaque)
+                                      Error **errp)
 {
    struct BlockCryptoCreateData *data = opaque;
    int ret;
@@ -300,6 +300,12 @@ static int block_crypto_open_generic(QCryptoBlockFormat format,
    QCryptoBlockOpenOptions *open_opts = NULL;
    unsigned int cflags = 0;

+    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
+                               false, errp);
+    if (!bs->file) {
+        return -EINVAL;
+    }
+
    opts = qemu_opts_create(opts_spec, NULL, 0, &error_abort);
    qemu_opts_absorb_qdict(opts, options, &local_err);
    if (local_err) {
@@ -383,7 +389,7 @@ static int block_crypto_truncate(BlockDriverState *bs, int64_t offset)

    offset += payload_offset;

-    return bdrv_truncate(bs->file->bs, offset);
+    return bdrv_truncate(bs->file, offset);
 }

 static void block_crypto_close(BlockDriverState *bs)
@@ -622,6 +628,7 @@ BlockDriver bdrv_crypto_luks = {
    .bdrv_probe         = block_crypto_probe_luks,
    .bdrv_open          = block_crypto_open_luks,
    .bdrv_close         = block_crypto_close,
+    .bdrv_child_perm    = bdrv_format_default_perms,
    .bdrv_create        = block_crypto_create_luks,
    .bdrv_truncate      = block_crypto_truncate,
    .create_opts        = &block_crypto_create_opts_luks,
--- a/block/curl.c
+++ b/block/curl.c
@@ -135,6 +135,7 @@ typedef struct BDRVCURLState {
    char *cookie;
    bool accept_range;
    AioContext *aio_context;
+    QemuMutex mutex;
    char *username;
    char *password;
    char *proxyusername;
@@ -333,6 +334,7 @@ static int curl_find_buf(BDRVCURLState *s, size_t start, size_t len,
    return FIND_RET_NONE;
 }

+/* Called with s->mutex held.  */
 static void curl_multi_check_completion(BDRVCURLState *s)
 {
    int msgs_in_queue;
@@ -374,7 +376,9 @@ static void curl_multi_check_completion(BDRVCURLState *s)
                        continue;
                    }

-                    acb->common.cb(acb->common.opaque, -EPROTO);
+                    qemu_mutex_unlock(&s->mutex);
+                    acb->common.cb(acb->common.opaque, -EIO);
+                    qemu_mutex_lock(&s->mutex);
                    qemu_aio_unref(acb);
                    state->acb[i] = NULL;
                }
@@ -386,9 +390,9 @@ static void curl_multi_check_completion(BDRVCURLState *s)
    }
 }

-static void curl_multi_do(void *arg)
+/* Called with s->mutex held.  */
+static void curl_multi_do_locked(CURLState *s)
 {
-    CURLState *s = (CURLState *)arg;
    CURLSocket *socket, *next_socket;
    int running;
    int r;
@@ -406,12 +410,23 @@ static void curl_multi_do(void *arg)
    }
 }

+static void curl_multi_do(void *arg)
+{
+    CURLState *s = (CURLState *)arg;
+
+    qemu_mutex_lock(&s->s->mutex);
+    curl_multi_do_locked(s);
+    qemu_mutex_unlock(&s->s->mutex);
+}
+
 static void curl_multi_read(void *arg)
 {
    CURLState *s = (CURLState *)arg;

-    curl_multi_do(arg);
+    qemu_mutex_lock(&s->s->mutex);
+    curl_multi_do_locked(s);
    curl_multi_check_completion(s->s);
+    qemu_mutex_unlock(&s->s->mutex);
 }

 static void curl_multi_timeout_do(void *arg)
@@ -424,9 +439,11 @@ static void curl_multi_timeout_do(void *arg)
        return;
    }

+    qemu_mutex_lock(&s->mutex);
    curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running);

    curl_multi_check_completion(s);
+    qemu_mutex_unlock(&s->mutex);
 #else
    abort();
 #endif
@@ -642,6 +659,7 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags,
    const char *cookie;
    double d;
    const char *secretid;
+    const char *protocol_delimiter;

    static int inited = 0;

@@ -683,6 +701,15 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags,
        goto out_noclean;
    }

+    if (!strstart(file, bs->drv->protocol_name, &protocol_delimiter) ||
+        !strstart(protocol_delimiter, "://", NULL))
+    {
+        error_setg(errp, "%s curl driver cannot handle the URL '%s' (does not "
+                   "start with '%s://')", bs->drv->protocol_name, file,
+                   bs->drv->protocol_name);
+        goto out_noclean;
+    }
+
    s->username = g_strdup(qemu_opt_get(opts, CURL_BLOCK_OPT_USERNAME));
    secretid = qemu_opt_get(opts, CURL_BLOCK_OPT_PASSWORD_SECRET);

@@ -759,6 +786,7 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags,
    curl_easy_cleanup(state->curl);
    state->curl = NULL;

+    qemu_mutex_init(&s->mutex);
    curl_attach_aio_context(bs, bdrv_get_aio_context(bs));

    qemu_opts_del(opts);
@@ -784,13 +812,17 @@ static void curl_readv_bh_cb(void *p)
 {
    CURLState *state;
    int running;
+    int ret = -EINPROGRESS;

    CURLAIOCB *acb = p;
-    BDRVCURLState *s = acb->common.bs->opaque;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVCURLState *s = bs->opaque;

    size_t start = acb->sector_num * BDRV_SECTOR_SIZE;
    size_t end;

+    qemu_mutex_lock(&s->mutex);
+
    // In case we have the requested data already (e.g. read-ahead),
    // we can just call the callback and be done.
    switch (curl_find_buf(s, start, acb->nb_sectors * BDRV_SECTOR_SIZE, acb)) {
@@ -798,7 +830,7 @@ static void curl_readv_bh_cb(void *p)
            qemu_aio_unref(acb);
            // fall through
        case FIND_RET_WAIT:
-            return;
+            goto out;
        default:
            break;
    }
@@ -806,9 +838,8 @@ static void curl_readv_bh_cb(void *p)
    // No cache found, so let's start a new request
    state = curl_init_state(acb->common.bs, s);
    if (!state) {
-        acb->common.cb(acb->common.opaque, -EIO);
-        qemu_aio_unref(acb);
-        return;
+        ret = -EIO;
+        goto out;
    }

    acb->start = 0;
@@ -822,9 +853,8 @@ static void curl_readv_bh_cb(void *p)
    state->orig_buf = g_try_malloc(state->buf_len);
    if (state->buf_len && state->orig_buf == NULL) {
        curl_clean_state(state);
-        acb->common.cb(acb->common.opaque, -ENOMEM);
-        qemu_aio_unref(acb);
-        return;
+        ret = -ENOMEM;
+        goto out;
    }
    state->acb[0] = acb;

@@ -837,6 +867,13 @@ static void curl_readv_bh_cb(void *p)

    /* Tell curl it needs to kick things off */
    curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running);
+
+out:
+    qemu_mutex_unlock(&s->mutex);
+    if (ret != -EINPROGRESS) {
+        acb->common.cb(acb->common.opaque, ret);
+        qemu_aio_unref(acb);
+    }
 }

 static BlockAIOCB *curl_aio_readv(BlockDriverState *bs,
@@ -861,6 +898,7 @@ static void curl_close(BlockDriverState *bs)

    DPRINTF("CURL: Close\n");
    curl_detach_aio_context(bs);
+    qemu_mutex_destroy(&s->mutex);

    g_free(s->cookie);
    g_free(s->url);
--- a/block/dmg.c
+++ b/block/dmg.c
@@ -413,8 +413,18 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
    int64_t offset;
    int ret;

+    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
+                               false, errp);
+    if (!bs->file) {
+        return -EINVAL;
+    }
+
+    ret = bdrv_set_read_only(bs, true, errp);
+    if (ret < 0) {
+        return ret;
+    }
+
    block_module_load_one("dmg-bz2");
-    bs->read_only = true;

    s->n_chunks = 0;
    s->offsets = s->lengths = s->sectors = s->sectorcounts = NULL;
@@ -691,6 +701,7 @@ static BlockDriver bdrv_dmg = {
    .bdrv_probe     = dmg_probe,
    .bdrv_open      = dmg_open,
    .bdrv_refresh_limits = dmg_refresh_limits,
+    .bdrv_child_perm     = bdrv_format_default_perms,
    .bdrv_co_preadv = dmg_co_preadv,
    .bdrv_close     = dmg_close,
 };
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -144,6 +144,7 @@ typedef struct BDRVRawState {
    bool has_write_zeroes:1;
    bool discard_zeroes:1;
    bool use_linux_aio:1;
+    bool page_cache_inconsistent:1;
    bool has_fallocate;
    bool needs_alignment;
 } BDRVRawState;
@@ -219,28 +220,28 @@ static int probe_logical_blocksize(int fd, unsigned int *sector_size_p)
 {
    unsigned int sector_size;
    bool success = false;
+    int i;

    errno = ENOTSUP;
-
-    /* Try a few ioctls to get the right size */
+    static const unsigned long ioctl_list[] = {
 #ifdef BLKSSZGET
-    if (ioctl(fd, BLKSSZGET, &sector_size) >= 0) {
-        *sector_size_p = sector_size;
-        success = true;
-    }
+        BLKSSZGET,
 #endif
 #ifdef DKIOCGETBLOCKSIZE
-    if (ioctl(fd, DKIOCGETBLOCKSIZE, &sector_size) >= 0) {
-        *sector_size_p = sector_size;
-        success = true;
-    }
+        DKIOCGETBLOCKSIZE,
 #endif
 #ifdef DIOCGSECTORSIZE
-    if (ioctl(fd, DIOCGSECTORSIZE, &sector_size) >= 0) {
+        DIOCGSECTORSIZE,
+#endif
+    };
+
+    /* Try a few ioctls to get the right size */
+    for (i = 0; i < (int)ARRAY_SIZE(ioctl_list); i++) {
+        if (ioctl(fd, ioctl_list[i], &sector_size) >= 0) {
            *sector_size_p = sector_size;
            success = true;
        }
-#endif
+    }

    return success ? 0 : -errno;
 }
@@ -668,6 +669,51 @@ static int hdev_get_max_transfer_length(BlockDriverState *bs, int fd)
 #endif
 }

+static int hdev_get_max_segments(const struct stat *st)
+{
+#ifdef CONFIG_LINUX
+    char buf[32];
+    const char *end;
+    char *sysfspath;
+    int ret;
+    int fd = -1;
+    long max_segments;
+
+    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
+                                major(st->st_rdev), minor(st->st_rdev));
+    fd = open(sysfspath, O_RDONLY);
+    if (fd == -1) {
+        ret = -errno;
+        goto out;
+    }
+    do {
+        ret = read(fd, buf, sizeof(buf) - 1);
+    } while (ret == -1 && errno == EINTR);
+    if (ret < 0) {
+        ret = -errno;
+        goto out;
+    } else if (ret == 0) {
+        ret = -EIO;
+        goto out;
+    }
+    buf[ret] = 0;
+    /* The file is ended with '\n', pass 'end' to accept that. */
+    ret = qemu_strtol(buf, &end, 10, &max_segments);
+    if (ret == 0 && end && *end == '\n') {
+        ret = max_segments;
+    }
+
+out:
+    if (fd != -1) {
+        close(fd);
+    }
+    g_free(sysfspath);
+    return ret;
+#else
+    return -ENOTSUP;
+#endif
+}
+
 static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 {
    BDRVRawState *s = bs->opaque;
@@ -679,6 +725,11 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
            if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
                bs->bl.max_transfer = pow2floor(ret);
            }
+            ret = hdev_get_max_segments(&st);
+            if (ret > 0) {
+                bs->bl.max_transfer = MIN(bs->bl.max_transfer,
+                                          ret * getpagesize());
+            }
        }
    }

@@ -774,10 +825,31 @@ static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb)

 static ssize_t handle_aiocb_flush(RawPosixAIOData *aiocb)
 {
+    BDRVRawState *s = aiocb->bs->opaque;
    int ret;

+    if (s->page_cache_inconsistent) {
+        return -EIO;
+    }
+
    ret = qemu_fdatasync(aiocb->aio_fildes);
    if (ret == -1) {
+        /* There is no clear definition of the semantics of a failing fsync(),
+         * so we may have to assume the worst. The sad truth is that this
+         * assumption is correct for Linux. Some pages are now probably marked
+         * clean in the page cache even though they are inconsistent with the
+         * on-disk contents. The next fdatasync() call would succeed, but no
+         * further writeback attempt will be made. We can't get back to a state
+         * in which we know what is on disk (we would have to rewrite
+         * everything that was touched since the last fdatasync() at least), so
+         * make bdrv_flush() fail permanently. Given that the behaviour isn't
+         * really defined, I have little hope that other OSes are doing better.
+         *
+         * Obviously, this doesn't affect O_DIRECT, which bypasses the page
+         * cache. */
+        if ((s->open_flags & O_DIRECT) == 0) {
+            s->page_cache_inconsistent = true;
+        }
        return -errno;
    }
    return 0;
@@ -1591,18 +1663,17 @@ static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
 #endif
    }

-    if (ftruncate(fd, total_size) != 0) {
-        result = -errno;
-        error_setg_errno(errp, -result, "Could not resize file");
-        goto out_close;
-    }
-
    switch (prealloc) {
 #ifdef CONFIG_POSIX_FALLOCATE
    case PREALLOC_MODE_FALLOC:
-        /* posix_fallocate() doesn't set errno. */
+        /*
+         * Truncating before posix_fallocate() makes it about twice slower on
+         * file systems that do not support fallocate(), trying to check if a
+         * block is allocated before allocating it, so don't do that here.
+         */
        result = -posix_fallocate(fd, 0, total_size);
        if (result != 0) {
+            /* posix_fallocate() doesn't set errno. */
            error_setg_errno(errp, -result,
                             "Could not preallocate data for the new file");
        }
@@ -1610,6 +1681,17 @@ static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
 #endif
    case PREALLOC_MODE_FULL:
    {
+        /*
+         * Knowing the final size from the beginning could allow the file
+         * system driver to do less allocations and possibly avoid
+         * fragmentation of the file.
+         */
+        if (ftruncate(fd, total_size) != 0) {
+            result = -errno;
+            error_setg_errno(errp, -result, "Could not resize file");
+            goto out_close;
+        }
+
        int64_t num = 0, left = total_size;
        buf = g_malloc0(65536);

@@ -1636,6 +1718,10 @@ static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
        break;
    }
    case PREALLOC_MODE_OFF:
+        if (ftruncate(fd, total_size) != 0) {
+            result = -errno;
+            error_setg_errno(errp, -result, "Could not resize file");
+        }
        break;
    default:
        result = -EINVAL;
@@ -2107,6 +2193,12 @@ static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
    int ret;

 #if defined(__APPLE__) && defined(__MACH__)
+    /*
+     * Caution: while qdict_get_str() is fine, getting non-string types
+     * would require more care.  When @options come from -blockdev or
+     * blockdev_add, its members are typed according to the QAPI
+     * schema, but when they come from -drive, they're all QString.
+     */
    const char *filename = qdict_get_str(options, "filename");
    char bsd_path[MAXPATHLEN] = "";
    bool error_occurred = false;
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -12,6 +12,7 @@
 #include "block/block_int.h"
 #include "qapi/error.h"
 #include "qapi/qmp/qerror.h"
+#include "qapi/util.h"
 #include "qemu/uri.h"
 #include "qemu/error-report.h"
 #include "qemu/cutils.h"
@@ -151,7 +152,7 @@ static QemuOptsList runtime_type_opts = {
        {
            .name = GLUSTER_OPT_TYPE,
            .type = QEMU_OPT_STRING,
-            .help = "tcp|unix",
+            .help = "inet|unix",
        },
        { /* end of list */ }
    },
@@ -170,14 +171,14 @@ static QemuOptsList runtime_unix_opts = {
    },
 };

-static QemuOptsList runtime_tcp_opts = {
-    .name = "gluster_tcp",
-    .head = QTAILQ_HEAD_INITIALIZER(runtime_tcp_opts.head),
+static QemuOptsList runtime_inet_opts = {
+    .name = "gluster_inet",
+    .head = QTAILQ_HEAD_INITIALIZER(runtime_inet_opts.head),
    .desc = {
        {
            .name = GLUSTER_OPT_TYPE,
            .type = QEMU_OPT_STRING,
-            .help = "tcp|unix",
+            .help = "inet|unix",
        },
        {
            .name = GLUSTER_OPT_HOST,
@@ -320,7 +321,7 @@ static int parse_volume_options(BlockdevOptionsGluster *gconf, char *path)
 static int qemu_gluster_parse_uri(BlockdevOptionsGluster *gconf,
                                  const char *filename)
 {
-    GlusterServer *gsconf;
+    SocketAddressFlat *gsconf;
    URI *uri;
    QueryParams *qp = NULL;
    bool is_unix = false;
@@ -331,19 +332,19 @@ static int qemu_gluster_parse_uri(BlockdevOptionsGluster *gconf,
        return -EINVAL;
    }

-    gconf->server = g_new0(GlusterServerList, 1);
-    gconf->server->value = gsconf = g_new0(GlusterServer, 1);
+    gconf->server = g_new0(SocketAddressFlatList, 1);
+    gconf->server->value = gsconf = g_new0(SocketAddressFlat, 1);

    /* transport */
    if (!uri->scheme || !strcmp(uri->scheme, "gluster")) {
-        gsconf->type = GLUSTER_TRANSPORT_TCP;
+        gsconf->type = SOCKET_ADDRESS_FLAT_TYPE_INET;
    } else if (!strcmp(uri->scheme, "gluster+tcp")) {
-        gsconf->type = GLUSTER_TRANSPORT_TCP;
+        gsconf->type = SOCKET_ADDRESS_FLAT_TYPE_INET;
    } else if (!strcmp(uri->scheme, "gluster+unix")) {
-        gsconf->type = GLUSTER_TRANSPORT_UNIX;
+        gsconf->type = SOCKET_ADDRESS_FLAT_TYPE_UNIX;
        is_unix = true;
    } else if (!strcmp(uri->scheme, "gluster+rdma")) {
-        gsconf->type = GLUSTER_TRANSPORT_TCP;
+        gsconf->type = SOCKET_ADDRESS_FLAT_TYPE_INET;
        error_report("Warning: rdma feature is not supported, falling "
                     "back to tcp");
    } else {
@@ -373,11 +374,11 @@ static int qemu_gluster_parse_uri(BlockdevOptionsGluster *gconf,
        }
        gsconf->u.q_unix.path = g_strdup(qp->p[0].value);
    } else {
-        gsconf->u.tcp.host = g_strdup(uri->server ? uri->server : "localhost");
+        gsconf->u.inet.host = g_strdup(uri->server ? uri->server : "localhost");
        if (uri->port) {
-            gsconf->u.tcp.port = g_strdup_printf("%d", uri->port);
+            gsconf->u.inet.port = g_strdup_printf("%d", uri->port);
        } else {
-            gsconf->u.tcp.port = g_strdup_printf("%d", GLUSTER_DEFAULT_PORT);
+            gsconf->u.inet.port = g_strdup_printf("%d", GLUSTER_DEFAULT_PORT);
        }
    }

@@ -395,7 +396,7 @@ static struct glfs *qemu_gluster_glfs_init(BlockdevOptionsGluster *gconf,
    struct glfs *glfs;
    int ret;
    int old_errno;
-    GlusterServerList *server;
+    SocketAddressFlatList *server;
    unsigned long long port;

    glfs = glfs_find_preopened(gconf->volume);
@@ -411,22 +412,27 @@ static struct glfs *qemu_gluster_glfs_init(BlockdevOptionsGluster *gconf,
    glfs_set_preopened(gconf->volume, glfs);

    for (server = gconf->server; server; server = server->next) {
-        if (server->value->type  == GLUSTER_TRANSPORT_UNIX) {
-            ret = glfs_set_volfile_server(glfs,
-                                   GlusterTransport_lookup[server->value->type],
+        switch (server->value->type) {
+        case SOCKET_ADDRESS_FLAT_TYPE_UNIX:
+            ret = glfs_set_volfile_server(glfs, "unix",
                                   server->value->u.q_unix.path, 0);
-        } else {
-            if (parse_uint_full(server->value->u.tcp.port, &port, 10) < 0 ||
+            break;
+        case SOCKET_ADDRESS_FLAT_TYPE_INET:
+            if (parse_uint_full(server->value->u.inet.port, &port, 10) < 0 ||
                port > 65535) {
                error_setg(errp, "'%s' is not a valid port number",
-                           server->value->u.tcp.port);
+                           server->value->u.inet.port);
                errno = EINVAL;
                goto out;
            }
-            ret = glfs_set_volfile_server(glfs,
-                                   GlusterTransport_lookup[server->value->type],
-                                   server->value->u.tcp.host,
+            ret = glfs_set_volfile_server(glfs, "tcp",
+                                   server->value->u.inet.host,
                                   (int)port);
+            break;
+        case SOCKET_ADDRESS_FLAT_TYPE_VSOCK:
+        case SOCKET_ADDRESS_FLAT_TYPE_FD:
+        default:
+            abort();
        }

        if (ret < 0) {
@@ -444,13 +450,13 @@ static struct glfs *qemu_gluster_glfs_init(BlockdevOptionsGluster *gconf,
        error_setg(errp, "Gluster connection for volume %s, path %s failed"
                         " to connect", gconf->volume, gconf->path);
        for (server = gconf->server; server; server = server->next) {
-            if (server->value->type  == GLUSTER_TRANSPORT_UNIX) {
+            if (server->value->type  == SOCKET_ADDRESS_FLAT_TYPE_UNIX) {
                error_append_hint(errp, "hint: failed on socket %s ",
                                  server->value->u.q_unix.path);
            } else {
                error_append_hint(errp, "hint: failed on host %s and port %s ",
-                                  server->value->u.tcp.host,
-                                  server->value->u.tcp.port);
+                                  server->value->u.inet.host,
+                                  server->value->u.inet.port);
            }
        }

@@ -474,23 +480,6 @@ out:
    return NULL;
 }

-static int qapi_enum_parse(const char *opt)
-{
-    int i;
-
-    if (!opt) {
-        return GLUSTER_TRANSPORT__MAX;
-    }
-
-    for (i = 0; i < GLUSTER_TRANSPORT__MAX; i++) {
-        if (!strcmp(opt, GlusterTransport_lookup[i])) {
-            return i;
-        }
-    }
-
-    return i;
-}
-
 /*
 * Convert the json formatted command line into qapi.
 */
@@ -498,14 +487,14 @@ static int qemu_gluster_parse_json(BlockdevOptionsGluster *gconf,
                                  QDict *options, Error **errp)
 {
    QemuOpts *opts;
-    GlusterServer *gsconf;
-    GlusterServerList *curr = NULL;
+    SocketAddressFlat *gsconf = NULL;
+    SocketAddressFlatList *curr = NULL;
    QDict *backing_options = NULL;
    Error *local_err = NULL;
    char *str = NULL;
    const char *ptr;
    size_t num_servers;
-    int i;
+    int i, type;

    /* create opts info from runtime_json_opts list */
    opts = qemu_opts_create(&runtime_json_opts, NULL, 0, &error_abort);
@@ -547,25 +536,32 @@ static int qemu_gluster_parse_json(BlockdevOptionsGluster *gconf,
        }

        ptr = qemu_opt_get(opts, GLUSTER_OPT_TYPE);
-        gsconf = g_new0(GlusterServer, 1);
-        gsconf->type = qapi_enum_parse(ptr);
        if (!ptr) {
            error_setg(&local_err, QERR_MISSING_PARAMETER, GLUSTER_OPT_TYPE);
            error_append_hint(&local_err, GERR_INDEX_HINT, i);
            goto out;

        }
-        if (gsconf->type == GLUSTER_TRANSPORT__MAX) {
-            error_setg(&local_err, QERR_INVALID_PARAMETER_VALUE,
-                       GLUSTER_OPT_TYPE, "tcp or unix");
+        gsconf = g_new0(SocketAddressFlat, 1);
+        if (!strcmp(ptr, "tcp")) {
+            ptr = "inet";       /* accept legacy "tcp" */
+        }
+        type = qapi_enum_parse(SocketAddressFlatType_lookup, ptr,
+                               SOCKET_ADDRESS_FLAT_TYPE__MAX, -1, NULL);
+        if (type != SOCKET_ADDRESS_FLAT_TYPE_INET
+            && type != SOCKET_ADDRESS_FLAT_TYPE_UNIX) {
+            error_setg(&local_err,
+                       "Parameter '%s' may be 'inet' or 'unix'",
+                       GLUSTER_OPT_TYPE);
            error_append_hint(&local_err, GERR_INDEX_HINT, i);
            goto out;
        }
+        gsconf->type = type;
        qemu_opts_del(opts);

-        if (gsconf->type == GLUSTER_TRANSPORT_TCP) {
-            /* create opts info from runtime_tcp_opts list */
-            opts = qemu_opts_create(&runtime_tcp_opts, NULL, 0, &error_abort);
+        if (gsconf->type == SOCKET_ADDRESS_FLAT_TYPE_INET) {
+            /* create opts info from runtime_inet_opts list */
+            opts = qemu_opts_create(&runtime_inet_opts, NULL, 0, &error_abort);
            qemu_opts_absorb_qdict(opts, backing_options, &local_err);
            if (local_err) {
                goto out;
@@ -578,7 +574,7 @@ static int qemu_gluster_parse_json(BlockdevOptionsGluster *gconf,
                error_append_hint(&local_err, GERR_INDEX_HINT, i);
                goto out;
            }
-            gsconf->u.tcp.host = g_strdup(ptr);
+            gsconf->u.inet.host = g_strdup(ptr);
            ptr = qemu_opt_get(opts, GLUSTER_OPT_PORT);
            if (!ptr) {
                error_setg(&local_err, QERR_MISSING_PARAMETER,
@@ -586,28 +582,28 @@ static int qemu_gluster_parse_json(BlockdevOptionsGluster *gconf,
                error_append_hint(&local_err, GERR_INDEX_HINT, i);
                goto out;
            }
-            gsconf->u.tcp.port = g_strdup(ptr);
+            gsconf->u.inet.port = g_strdup(ptr);

            /* defend for unsupported fields in InetSocketAddress,
             * i.e. @ipv4, @ipv6  and @to
             */
            ptr = qemu_opt_get(opts, GLUSTER_OPT_TO);
            if (ptr) {
-                gsconf->u.tcp.has_to = true;
+                gsconf->u.inet.has_to = true;
            }
            ptr = qemu_opt_get(opts, GLUSTER_OPT_IPV4);
            if (ptr) {
-                gsconf->u.tcp.has_ipv4 = true;
+                gsconf->u.inet.has_ipv4 = true;
            }
            ptr = qemu_opt_get(opts, GLUSTER_OPT_IPV6);
            if (ptr) {
-                gsconf->u.tcp.has_ipv6 = true;
+                gsconf->u.inet.has_ipv6 = true;
            }
-            if (gsconf->u.tcp.has_to) {
+            if (gsconf->u.inet.has_to) {
                error_setg(&local_err, "Parameter 'to' not supported");
                goto out;
            }
-            if (gsconf->u.tcp.has_ipv4 || gsconf->u.tcp.has_ipv6) {
+            if (gsconf->u.inet.has_ipv4 || gsconf->u.inet.has_ipv6) {
                error_setg(&local_err, "Parameters 'ipv4/ipv6' not supported");
                goto out;
            }
@@ -632,16 +628,18 @@ static int qemu_gluster_parse_json(BlockdevOptionsGluster *gconf,
        }

        if (gconf->server == NULL) {
-            gconf->server = g_new0(GlusterServerList, 1);
+            gconf->server = g_new0(SocketAddressFlatList, 1);
            gconf->server->value = gsconf;
            curr = gconf->server;
        } else {
-            curr->next = g_new0(GlusterServerList, 1);
+            curr->next = g_new0(SocketAddressFlatList, 1);
            curr->next->value = gsconf;
            curr = curr->next;
        }
+        gsconf = NULL;

-        qdict_del(backing_options, str);
+        QDECREF(backing_options);
+        backing_options = NULL;
        g_free(str);
        str = NULL;
    }
@@ -650,11 +648,10 @@ static int qemu_gluster_parse_json(BlockdevOptionsGluster *gconf,

 out:
    error_propagate(errp, local_err);
+    qapi_free_SocketAddressFlat(gsconf);
    qemu_opts_del(opts);
-    if (str) {
-        qdict_del(backing_options, str);
    g_free(str);
-    }
+    QDECREF(backing_options);
    errno = EINVAL;
    return -errno;
 }
@@ -683,7 +680,7 @@ static struct glfs *qemu_gluster_init(BlockdevOptionsGluster *gconf,
                             "file.volume=testvol,file.path=/path/a.qcow2"
                             "[,file.debug=9]"
                             "[,file.logfile=/path/filename.log],"
-                             "file.server.0.type=tcp,"
+                             "file.server.0.type=inet,"
                             "file.server.0.host=1.2.3.4,"
                             "file.server.0.port=24007,"
                             "file.server.1.transport=unix,"
@@ -698,13 +695,6 @@ static struct glfs *qemu_gluster_init(BlockdevOptionsGluster *gconf,
    return qemu_gluster_glfs_init(gconf, errp);
 }

-static void qemu_gluster_complete_aio(void *opaque)
-{
-    GlusterAIOCB *acb = (GlusterAIOCB *)opaque;
-
-    qemu_coroutine_enter(acb->coroutine);
-}
-
 /*
 * AIO callback routine called from GlusterFS thread.
 */
@@ -720,7 +710,7 @@ static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
        acb->ret = -EIO; /* Partial read/write - fail it */
    }

-    aio_bh_schedule_oneshot(acb->aio_context, qemu_gluster_complete_aio, acb);
+    aio_co_schedule(acb->aio_context, acb->coroutine);
 }

 static void qemu_gluster_parse_flags(int bdrv_flags, int *open_flags)
--- a/block/io.c
+++ b/block/io.c
@@ -44,7 +44,7 @@ static void coroutine_fn bdrv_co_do_rw(void *opaque);
 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int count, BdrvRequestFlags flags);

-static void bdrv_parent_drained_begin(BlockDriverState *bs)
+void bdrv_parent_drained_begin(BlockDriverState *bs)
 {
    BdrvChild *c;

@@ -55,7 +55,7 @@ static void bdrv_parent_drained_begin(BlockDriverState *bs)
    }
 }

-static void bdrv_parent_drained_end(BlockDriverState *bs)
+void bdrv_parent_drained_end(BlockDriverState *bs)
 {
    BdrvChild *c;

@@ -158,7 +158,7 @@ bool bdrv_requests_pending(BlockDriverState *bs)

 static bool bdrv_drain_recurse(BlockDriverState *bs)
 {
-    BdrvChild *child;
+    BdrvChild *child, *tmp;
    bool waited;

    waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0);
@@ -167,8 +167,25 @@ static bool bdrv_drain_recurse(BlockDriverState *bs)
        bs->drv->bdrv_drain(bs);
    }

-    QLIST_FOREACH(child, &bs->children, next) {
-        waited |= bdrv_drain_recurse(child->bs);
+    QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
+        BlockDriverState *bs = child->bs;
+        bool in_main_loop =
+            qemu_get_current_aio_context() == qemu_get_aio_context();
+        assert(bs->refcnt > 0);
+        if (in_main_loop) {
+            /* In case the recursive bdrv_drain_recurse processes a
+             * block_job_defer_to_main_loop BH and modifies the graph,
+             * let's hold a reference to bs until we are done.
+             *
+             * IOThread doesn't have such a BH, and it is not safe to call
+             * bdrv_unref without BQL, so skip doing it there.
+             */
+            bdrv_ref(bs);
+        }
+        waited |= bdrv_drain_recurse(bs);
+        if (in_main_loop) {
+            bdrv_unref(bs);
+        }
    }

    return waited;
@@ -189,7 +206,7 @@ static void bdrv_co_drain_bh_cb(void *opaque)
    bdrv_dec_in_flight(bs);
    bdrv_drained_begin(bs);
    data->done = true;
-    qemu_coroutine_enter(co);
+    aio_co_wake(co);
 }

 static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs)
@@ -539,7 +556,7 @@ static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
-                    qemu_co_queue_wait(&req->wait_queue);
+                    qemu_co_queue_wait(&req->wait_queue, NULL);
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
@@ -616,7 +633,7 @@ static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
-        qemu_coroutine_enter(co);
+        bdrv_coroutine_enter(child->bs, co);
        BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
    }
    return rwco.ret;
@@ -813,7 +830,7 @@ static void bdrv_co_io_em_complete(void *opaque, int ret)
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
-    qemu_coroutine_enter(co->coroutine);
+    aio_co_wake(co->coroutine);
 }

 static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
@@ -925,9 +942,11 @@ bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
    return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
 }

-static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
+static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
        int64_t offset, unsigned int bytes, QEMUIOVector *qiov)
 {
+    BlockDriverState *bs = child->bs;
+
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
@@ -943,6 +962,15 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
    size_t skip_bytes;
    int ret;

+    /* FIXME We cannot require callers to have write permissions when all they
+     * are doing is a read request. If we did things right, write permissions
+     * would be obtained anyway, but internally by the copy-on-read code. As
+     * long as it is implemented here rather than in a separat filter driver,
+     * the copy-on-read code doesn't have its own BdrvChild, however, for which
+     * it could request permissions. Therefore we have to bypass the permission
+     * system for the moment. */
+    // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
+
    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
@@ -1001,10 +1029,11 @@ err:
 * handles copy on read, zeroing after EOF, and fragmentation of large
 * reads; any other features must be implemented by the caller.
 */
-static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
+static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
 {
+    BlockDriverState *bs = child->bs;
    int64_t total_bytes, max_bytes;
    int ret = 0;
    uint64_t bytes_remaining = bytes;
@@ -1050,7 +1079,7 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
        }

        if (!ret || pnum != nb_sectors) {
-            ret = bdrv_co_do_copy_on_readv(bs, offset, bytes, qiov);
+            ret = bdrv_co_do_copy_on_readv(child, offset, bytes, qiov);
            goto out;
        }
    }
@@ -1158,7 +1187,7 @@ int coroutine_fn bdrv_co_preadv(BdrvChild *child,
    }

    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
-    ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
+    ret = bdrv_aligned_preadv(child, &req, offset, bytes, align,
                              use_local_qiov ? &local_qiov : qiov,
                              flags);
    tracked_request_end(&req);
@@ -1306,10 +1335,11 @@ fail:
 * Forwards an already correctly aligned write request to the BlockDriver,
 * after possibly fragmenting it.
 */
-static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
+static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
 {
+    BlockDriverState *bs = child->bs;
    BlockDriver *drv = bs->drv;
    bool waited;
    int ret;
@@ -1332,6 +1362,16 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
    assert(!waited || !req->serialising);
    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
+    /* FIXME: Block migration uses the BlockBackend of the guest device at a
+     *        point when it has not yet taken write permissions. This will be
+     *        fixed by a future patch, but for now we have to bypass this
+     *        assertion for block migration to work. */
+    // assert(child->perm & BLK_PERM_WRITE);
+    /* FIXME: Because of the above, we also cannot guarantee that all format
+     *        BDS take the BLK_PERM_RESIZE permission on their file BDS, since
+     *        they are not obligated to do so if they do not have any parent
+     *        that has taken the permission to write to them. */
+    // assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);

@@ -1397,12 +1437,13 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
    return ret;
 }

-static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
+static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
                                                int64_t offset,
                                                unsigned int bytes,
                                                BdrvRequestFlags flags,
                                                BdrvTrackedRequest *req)
 {
+    BlockDriverState *bs = child->bs;
    uint8_t *buf = NULL;
    QEMUIOVector local_qiov;
    struct iovec iov;
@@ -1430,7 +1471,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
        mark_request_serialising(req, align);
        wait_serialising_requests(req);
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
-        ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align,
+        ret = bdrv_aligned_preadv(child, req, offset & ~(align - 1), align,
                                  align, &local_qiov, 0);
        if (ret < 0) {
            goto fail;
@@ -1438,7 +1479,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        memset(buf + head_padding_bytes, 0, zero_bytes);
-        ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align,
+        ret = bdrv_aligned_pwritev(child, req, offset & ~(align - 1), align,
                                   align, &local_qiov,
                                   flags & ~BDRV_REQ_ZERO_WRITE);
        if (ret < 0) {
@@ -1452,7 +1493,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
    if (bytes >= align) {
        /* Write the aligned part in the middle. */
        uint64_t aligned_bytes = bytes & ~(align - 1);
-        ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes, align,
+        ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
                                   NULL, flags);
        if (ret < 0) {
            goto fail;
@@ -1468,7 +1509,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
        mark_request_serialising(req, align);
        wait_serialising_requests(req);
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
-        ret = bdrv_aligned_preadv(bs, req, offset, align,
+        ret = bdrv_aligned_preadv(child, req, offset, align,
                                  align, &local_qiov, 0);
        if (ret < 0) {
            goto fail;
@@ -1476,7 +1517,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        memset(buf, 0, bytes);
-        ret = bdrv_aligned_pwritev(bs, req, offset, align, align,
+        ret = bdrv_aligned_pwritev(child, req, offset, align, align,
                                   &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
    }
 fail:
@@ -1523,7 +1564,7 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);

    if (!qiov) {
-        ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req);
+        ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
        goto out;
    }

@@ -1542,7 +1583,7 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
        qemu_iovec_init_external(&head_qiov, &head_iov, 1);

        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
-        ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
+        ret = bdrv_aligned_preadv(child, &req, offset & ~(align - 1), align,
                                  align, &head_qiov, 0);
        if (ret < 0) {
            goto fail;
@@ -1584,8 +1625,8 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);

        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
-        ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
-                                  align, &tail_qiov, 0);
+        ret = bdrv_aligned_preadv(child, &req, (offset + bytes) & ~(align - 1),
+                                  align, align, &tail_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
@@ -1603,7 +1644,7 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
        bytes = ROUND_UP(bytes, align);
    }

-    ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, align,
+    ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
                               use_local_qiov ? &local_qiov : qiov,
                               flags);

@@ -1751,7 +1792,7 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,

    if (ret & BDRV_BLOCK_RAW) {
        assert(ret & BDRV_BLOCK_OFFSET_VALID);
-        ret = bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS,
+        ret = bdrv_get_block_status(*file, ret >> BDRV_SECTOR_BITS,
                                    *pnum, pnum, file);
        goto out;
    }
@@ -1864,7 +1905,7 @@ int64_t bdrv_get_block_status_above(BlockDriverState *bs,
    } else {
        co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry,
                                   &data);
-        qemu_coroutine_enter(co);
+        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, !data.done);
    }
    return data.ret;
@@ -1990,7 +2031,7 @@ bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
        };
        Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data);

-        qemu_coroutine_enter(co);
+        bdrv_coroutine_enter(bs, co);
        while (data.ret == -EINPROGRESS) {
            aio_poll(bdrv_get_aio_context(bs), true);
        }
@@ -2080,6 +2121,11 @@ void bdrv_aio_cancel(BlockAIOCB *acb)
        if (acb->aiocb_info->get_aio_context) {
            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
        } else if (acb->bs) {
+            /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
+             * assert that we're not using an I/O thread.  Thread-safe
+             * code should use bdrv_aio_cancel_async exclusively.
+             */
+            assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
            aio_poll(bdrv_get_aio_context(acb->bs), true);
        } else {
            abort();
@@ -2202,7 +2248,7 @@ static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child,
    acb->is_write = is_write;

    co = qemu_coroutine_create(bdrv_co_do_rw, acb);
-    qemu_coroutine_enter(co);
+    bdrv_coroutine_enter(child->bs, co);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
@@ -2233,41 +2279,12 @@ BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
    acb->req.error = -EINPROGRESS;

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry, acb);
-    qemu_coroutine_enter(co);
+    bdrv_coroutine_enter(bs, co);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
 }

-void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
-                   BlockCompletionFunc *cb, void *opaque)
-{
-    BlockAIOCB *acb;
-
-    acb = g_malloc(aiocb_info->aiocb_size);
-    acb->aiocb_info = aiocb_info;
-    acb->bs = bs;
-    acb->cb = cb;
-    acb->opaque = opaque;
-    acb->refcnt = 1;
-    return acb;
-}
-
-void qemu_aio_ref(void *p)
-{
-    BlockAIOCB *acb = p;
-    acb->refcnt++;
-}
-
-void qemu_aio_unref(void *p)
-{
-    BlockAIOCB *acb = p;
-    assert(acb->refcnt > 0);
-    if (--acb->refcnt == 0) {
-        g_free(acb);
-    }
-}
-
 /**************************************************************/
 /* Coroutine block device emulation */

@@ -2286,20 +2303,21 @@ static void coroutine_fn bdrv_flush_co_entry(void *opaque)

 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
 {
-    int ret;
-
-    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
-        bdrv_is_sg(bs)) {
-        return 0;
-    }
+    int current_gen;
+    int ret = 0;

    bdrv_inc_in_flight(bs);

-    int current_gen = bs->write_gen;
+    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
+        bdrv_is_sg(bs)) {
+        goto early_exit;
+    }
+
+    current_gen = bs->write_gen;

    /* Wait until any previous flushes are completed */
    while (bs->active_flush_req) {
-        qemu_co_queue_wait(&bs->flush_queue);
+        qemu_co_queue_wait(&bs->flush_queue, NULL);
    }

    bs->active_flush_req = true;
@@ -2378,6 +2396,7 @@ out:
    /* Return value is ignored - it's ok if wait queue is empty */
    qemu_co_queue_next(&bs->flush_queue);

+early_exit:
    bdrv_dec_in_flight(bs);
    return ret;
 }
@@ -2395,7 +2414,7 @@ int bdrv_flush(BlockDriverState *bs)
        bdrv_flush_co_entry(&flush_co);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co);
-        qemu_coroutine_enter(co);
+        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE);
    }

@@ -2542,7 +2561,7 @@ int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int count)
        bdrv_pdiscard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
-        qemu_coroutine_enter(co);
+        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, rwco.ret == NOT_DONE);
    }

--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -58,6 +58,7 @@ typedef struct IscsiLun {
    int events;
    QEMUTimer *nop_timer;
    QEMUTimer *event_timer;
+    QemuMutex mutex;
    struct scsi_inquiry_logical_block_provisioning lbp;
    struct scsi_inquiry_block_limits bl;
    unsigned char *zeroblock;
@@ -102,7 +103,6 @@ typedef struct IscsiTask {

 typedef struct IscsiAIOCB {
    BlockAIOCB common;
-    QEMUIOVector *qiov;
    QEMUBH *bh;
    IscsiLun *iscsilun;
    struct scsi_task *task;
@@ -165,8 +165,9 @@ iscsi_schedule_bh(IscsiAIOCB *acb)
 static void iscsi_co_generic_bh_cb(void *opaque)
 {
    struct IscsiTask *iTask = opaque;
+
    iTask->complete = 1;
-    qemu_coroutine_enter(iTask->co);
+    aio_co_wake(iTask->co);
 }

 static void iscsi_retry_timer_expired(void *opaque)
@@ -174,7 +175,7 @@ static void iscsi_retry_timer_expired(void *opaque)
    struct IscsiTask *iTask = opaque;
    iTask->complete = 1;
    if (iTask->co) {
-        qemu_coroutine_enter(iTask->co);
+        aio_co_wake(iTask->co);
    }
 }

@@ -251,6 +252,7 @@ static int iscsi_translate_sense(struct scsi_sense *sense)
    return ret;
 }

+/* Called (via iscsi_service) with QemuMutex held.  */
 static void
 iscsi_co_generic_cb(struct iscsi_context *iscsi, int status,
                        void *command_data, void *opaque)
@@ -351,6 +353,7 @@ static const AIOCBInfo iscsi_aiocb_info = {
 static void iscsi_process_read(void *arg);
 static void iscsi_process_write(void *arg);

+/* Called with QemuMutex held.  */
 static void
 iscsi_set_events(IscsiLun *iscsilun)
 {
@@ -394,8 +397,10 @@ iscsi_process_read(void *arg)
    IscsiLun *iscsilun = arg;
    struct iscsi_context *iscsi = iscsilun->iscsi;

+    qemu_mutex_lock(&iscsilun->mutex);
    iscsi_service(iscsi, POLLIN);
    iscsi_set_events(iscsilun);
+    qemu_mutex_unlock(&iscsilun->mutex);
 }

 static void
@@ -404,8 +409,10 @@ iscsi_process_write(void *arg)
    IscsiLun *iscsilun = arg;
    struct iscsi_context *iscsi = iscsilun->iscsi;

+    qemu_mutex_lock(&iscsilun->mutex);
    iscsi_service(iscsi, POLLOUT);
    iscsi_set_events(iscsilun);
+    qemu_mutex_unlock(&iscsilun->mutex);
 }

 static int64_t sector_lun2qemu(int64_t sector, IscsiLun *iscsilun)
@@ -584,6 +591,7 @@ iscsi_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
    uint64_t lba;
    uint32_t num_sectors;
    bool fua = flags & BDRV_REQ_FUA;
+    int r = 0;

    if (fua) {
        assert(iscsilun->dpofua);
@@ -599,6 +607,7 @@ iscsi_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
    lba = sector_qemu2lun(sector_num, iscsilun);
    num_sectors = sector_qemu2lun(nb_sectors, iscsilun);
    iscsi_co_init_iscsitask(iscsilun, &iTask);
+    qemu_mutex_lock(&iscsilun->mutex);
 retry:
    if (iscsilun->use_16_for_rw) {
 #if LIBISCSI_API_VERSION >= (20160603)
@@ -627,6 +636,7 @@ retry:
    }
 #endif
    if (iTask.task == NULL) {
+        qemu_mutex_unlock(&iscsilun->mutex);
        return -ENOMEM;
    }
 #if LIBISCSI_API_VERSION < (20160603)
@@ -635,7 +645,9 @@ retry:
 #endif
    while (!iTask.complete) {
        iscsi_set_events(iscsilun);
+        qemu_mutex_unlock(&iscsilun->mutex);
        qemu_coroutine_yield();
+        qemu_mutex_lock(&iscsilun->mutex);
    }

    if (iTask.task != NULL) {
@@ -650,12 +662,15 @@ retry:

    if (iTask.status != SCSI_STATUS_GOOD) {
        iscsi_allocmap_set_invalid(iscsilun, sector_num, nb_sectors);
-        return iTask.err_code;
+        r = iTask.err_code;
+        goto out_unlock;
    }

    iscsi_allocmap_set_allocated(iscsilun, sector_num, nb_sectors);

-    return 0;
+out_unlock:
+    qemu_mutex_unlock(&iscsilun->mutex);
+    return r;
 }


@@ -688,18 +703,21 @@ static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs,
        goto out;
    }

+    qemu_mutex_lock(&iscsilun->mutex);
 retry:
    if (iscsi_get_lba_status_task(iscsilun->iscsi, iscsilun->lun,
                                  sector_qemu2lun(sector_num, iscsilun),
                                  8 + 16, iscsi_co_generic_cb,
                                  &iTask) == NULL) {
        ret = -ENOMEM;
-        goto out;
+        goto out_unlock;
    }

    while (!iTask.complete) {
        iscsi_set_events(iscsilun);
+        qemu_mutex_unlock(&iscsilun->mutex);
        qemu_coroutine_yield();
+        qemu_mutex_lock(&iscsilun->mutex);
    }

    if (iTask.do_retry) {
@@ -716,20 +734,20 @@ retry:
         * because the device is busy or the cmd is not
         * supported) we pretend all blocks are allocated
         * for backwards compatibility */
-        goto out;
+        goto out_unlock;
    }

    lbas = scsi_datain_unmarshall(iTask.task);
    if (lbas == NULL) {
        ret = -EIO;
-        goto out;
+        goto out_unlock;
    }

    lbasd = &lbas->descriptors[0];

    if (sector_qemu2lun(sector_num, iscsilun) != lbasd->lba) {
        ret = -EIO;
-        goto out;
+        goto out_unlock;
    }

    *pnum = sector_lun2qemu(lbasd->num_blocks, iscsilun);
@@ -751,6 +769,8 @@ retry:
    if (*pnum > nb_sectors) {
        *pnum = nb_sectors;
    }
+out_unlock:
+    qemu_mutex_unlock(&iscsilun->mutex);
 out:
    if (iTask.task != NULL) {
        scsi_free_scsi_task(iTask.task);
@@ -813,6 +833,7 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs,
    num_sectors = sector_qemu2lun(nb_sectors, iscsilun);

    iscsi_co_init_iscsitask(iscsilun, &iTask);
+    qemu_mutex_lock(&iscsilun->mutex);
 retry:
    if (iscsilun->use_16_for_rw) {
 #if LIBISCSI_API_VERSION >= (20160603)
@@ -843,6 +864,7 @@ retry:
    }
 #endif
    if (iTask.task == NULL) {
+        qemu_mutex_unlock(&iscsilun->mutex);
        return -ENOMEM;
    }
 #if LIBISCSI_API_VERSION < (20160603)
@@ -850,7 +872,9 @@ retry:
 #endif
    while (!iTask.complete) {
        iscsi_set_events(iscsilun);
+        qemu_mutex_unlock(&iscsilun->mutex);
        qemu_coroutine_yield();
+        qemu_mutex_lock(&iscsilun->mutex);
    }

    if (iTask.task != NULL) {
@@ -862,6 +886,7 @@ retry:
        iTask.complete = 0;
        goto retry;
    }
+    qemu_mutex_unlock(&iscsilun->mutex);

    if (iTask.status != SCSI_STATUS_GOOD) {
        return iTask.err_code;
@@ -876,15 +901,19 @@ static int coroutine_fn iscsi_co_flush(BlockDriverState *bs)
    struct IscsiTask iTask;

    iscsi_co_init_iscsitask(iscsilun, &iTask);
+    qemu_mutex_lock(&iscsilun->mutex);
 retry:
    if (iscsi_synchronizecache10_task(iscsilun->iscsi, iscsilun->lun, 0, 0, 0,
                                      0, iscsi_co_generic_cb, &iTask) == NULL) {
+        qemu_mutex_unlock(&iscsilun->mutex);
        return -ENOMEM;
    }

    while (!iTask.complete) {
        iscsi_set_events(iscsilun);
+        qemu_mutex_unlock(&iscsilun->mutex);
        qemu_coroutine_yield();
+        qemu_mutex_lock(&iscsilun->mutex);
    }

    if (iTask.task != NULL) {
@@ -896,6 +925,7 @@ retry:
        iTask.complete = 0;
        goto retry;
    }
+    qemu_mutex_unlock(&iscsilun->mutex);

    if (iTask.status != SCSI_STATUS_GOOD) {
        return iTask.err_code;
@@ -905,6 +935,7 @@ retry:
 }

 #ifdef __linux__
+/* Called (via iscsi_service) with QemuMutex held.  */
 static void
 iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status,
                     void *command_data, void *opaque)
@@ -1029,6 +1060,7 @@ static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
    acb->task->expxferlen = acb->ioh->dxfer_len;

    data.size = 0;
+    qemu_mutex_lock(&iscsilun->mutex);
    if (acb->task->xfer_dir == SCSI_XFER_WRITE) {
        if (acb->ioh->iovec_count == 0) {
            data.data = acb->ioh->dxferp;
@@ -1044,6 +1076,7 @@ static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
                                 iscsi_aio_ioctl_cb,
                                 (data.size > 0) ? &data : NULL,
                                 acb) != 0) {
+        qemu_mutex_unlock(&iscsilun->mutex);
        scsi_free_scsi_task(acb->task);
        qemu_aio_unref(acb);
        return NULL;
@@ -1063,6 +1096,7 @@ static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
    }

    iscsi_set_events(iscsilun);
+    qemu_mutex_unlock(&iscsilun->mutex);

    return &acb->common;
 }
@@ -1087,6 +1121,7 @@ coroutine_fn iscsi_co_pdiscard(BlockDriverState *bs, int64_t offset, int count)
    IscsiLun *iscsilun = bs->opaque;
    struct IscsiTask iTask;
    struct unmap_list list;
+    int r = 0;

    if (!is_byte_request_lun_aligned(offset, count, iscsilun)) {
        return -ENOTSUP;
@@ -1101,15 +1136,19 @@ coroutine_fn iscsi_co_pdiscard(BlockDriverState *bs, int64_t offset, int count)
    list.num = count / iscsilun->block_size;

    iscsi_co_init_iscsitask(iscsilun, &iTask);
+    qemu_mutex_lock(&iscsilun->mutex);
 retry:
    if (iscsi_unmap_task(iscsilun->iscsi, iscsilun->lun, 0, 0, &list, 1,
                         iscsi_co_generic_cb, &iTask) == NULL) {
-        return -ENOMEM;
+        r = -ENOMEM;
+        goto out_unlock;
    }

    while (!iTask.complete) {
        iscsi_set_events(iscsilun);
+        qemu_mutex_unlock(&iscsilun->mutex);
        qemu_coroutine_yield();
+        qemu_mutex_lock(&iscsilun->mutex);
    }

    if (iTask.task != NULL) {
@@ -1126,17 +1165,20 @@ retry:
        /* the target might fail with a check condition if it
           is not happy with the alignment of the UNMAP request
           we silently fail in this case */
-        return 0;
+        goto out_unlock;
    }

    if (iTask.status != SCSI_STATUS_GOOD) {
-        return iTask.err_code;
+        r = iTask.err_code;
+        goto out_unlock;
    }

    iscsi_allocmap_set_invalid(iscsilun, offset >> BDRV_SECTOR_BITS,
                               count >> BDRV_SECTOR_BITS);

-    return 0;
+out_unlock:
+    qemu_mutex_unlock(&iscsilun->mutex);
+    return r;
 }

 static int
@@ -1148,6 +1190,7 @@ coroutine_fn iscsi_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
    uint64_t lba;
    uint32_t nb_blocks;
    bool use_16_for_ws = iscsilun->use_16_for_rw;
+    int r = 0;

    if (!is_byte_request_lun_aligned(offset, count, iscsilun)) {
        return -ENOTSUP;
@@ -1181,6 +1224,7 @@ coroutine_fn iscsi_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
        }
    }

+    qemu_mutex_lock(&iscsilun->mutex);
    iscsi_co_init_iscsitask(iscsilun, &iTask);
 retry:
    if (use_16_for_ws) {
@@ -1195,12 +1239,15 @@ retry:
                                            0, 0, iscsi_co_generic_cb, &iTask);
    }
    if (iTask.task == NULL) {
+        qemu_mutex_unlock(&iscsilun->mutex);
        return -ENOMEM;
    }

    while (!iTask.complete) {
        iscsi_set_events(iscsilun);
+        qemu_mutex_unlock(&iscsilun->mutex);
        qemu_coroutine_yield();
+        qemu_mutex_lock(&iscsilun->mutex);
    }

    if (iTask.status == SCSI_STATUS_CHECK_CONDITION &&
@@ -1210,7 +1257,8 @@ retry:
        /* WRITE SAME is not supported by the target */
        iscsilun->has_write_same = false;
        scsi_free_scsi_task(iTask.task);
-        return -ENOTSUP;
+        r = -ENOTSUP;
+        goto out_unlock;
    }

    if (iTask.task != NULL) {
@@ -1226,7 +1274,8 @@ retry:
    if (iTask.status != SCSI_STATUS_GOOD) {
        iscsi_allocmap_set_invalid(iscsilun, offset >> BDRV_SECTOR_BITS,
                                   count >> BDRV_SECTOR_BITS);
-        return iTask.err_code;
+        r = iTask.err_code;
+        goto out_unlock;
    }

    if (flags & BDRV_REQ_MAY_UNMAP) {
@@ -1237,32 +1286,19 @@ retry:
                                     count >> BDRV_SECTOR_BITS);
    }

-    return 0;
+out_unlock:
+    qemu_mutex_unlock(&iscsilun->mutex);
+    return r;
 }

-static void parse_chap(struct iscsi_context *iscsi, const char *target,
+static void apply_chap(struct iscsi_context *iscsi, QemuOpts *opts,
                       Error **errp)
 {
-    QemuOptsList *list;
-    QemuOpts *opts;
    const char *user = NULL;
    const char *password = NULL;
    const char *secretid;
    char *secret = NULL;

-    list = qemu_find_opts("iscsi");
-    if (!list) {
-        return;
-    }
-
-    opts = qemu_opts_find(list, target);
-    if (opts == NULL) {
-        opts = QTAILQ_FIRST(&list->head);
-        if (!opts) {
-            return;
-        }
-    }
-
    user = qemu_opt_get(opts, "user");
    if (!user) {
        return;
@@ -1293,65 +1329,37 @@ static void parse_chap(struct iscsi_context *iscsi, const char *target,
    g_free(secret);
 }

-static void parse_header_digest(struct iscsi_context *iscsi, const char *target,
+static void apply_header_digest(struct iscsi_context *iscsi, QemuOpts *opts,
                                Error **errp)
 {
-    QemuOptsList *list;
-    QemuOpts *opts;
    const char *digest = NULL;

-    list = qemu_find_opts("iscsi");
-    if (!list) {
-        return;
-    }
-
-    opts = qemu_opts_find(list, target);
-    if (opts == NULL) {
-        opts = QTAILQ_FIRST(&list->head);
-        if (!opts) {
-            return;
-        }
-    }
-
    digest = qemu_opt_get(opts, "header-digest");
    if (!digest) {
-        return;
-    }
-
-    if (!strcmp(digest, "CRC32C")) {
+        iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_NONE_CRC32C);
+    } else if (!strcmp(digest, "crc32c")) {
        iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_CRC32C);
-    } else if (!strcmp(digest, "NONE")) {
+    } else if (!strcmp(digest, "none")) {
        iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_NONE);
-    } else if (!strcmp(digest, "CRC32C-NONE")) {
+    } else if (!strcmp(digest, "crc32c-none")) {
        iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_CRC32C_NONE);
-    } else if (!strcmp(digest, "NONE-CRC32C")) {
+    } else if (!strcmp(digest, "none-crc32c")) {
        iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_NONE_CRC32C);
    } else {
        error_setg(errp, "Invalid header-digest setting : %s", digest);
    }
 }

-static char *parse_initiator_name(const char *target)
+static char *get_initiator_name(QemuOpts *opts)
 {
-    QemuOptsList *list;
-    QemuOpts *opts;
    const char *name;
    char *iscsi_name;
    UuidInfo *uuid_info;

-    list = qemu_find_opts("iscsi");
-    if (list) {
-        opts = qemu_opts_find(list, target);
-        if (!opts) {
-            opts = QTAILQ_FIRST(&list->head);
-        }
-        if (opts) {
    name = qemu_opt_get(opts, "initiator-name");
    if (name) {
        return g_strdup(name);
    }
-        }
-    }

    uuid_info = qmp_query_uuid(NULL);
    if (strcmp(uuid_info->UUID, UUID_NONE) == 0) {
@@ -1365,43 +1373,24 @@ static char *parse_initiator_name(const char *target)
    return iscsi_name;
 }

-static int parse_timeout(const char *target)
-{
-    QemuOptsList *list;
-    QemuOpts *opts;
-    const char *timeout;
-
-    list = qemu_find_opts("iscsi");
-    if (list) {
-        opts = qemu_opts_find(list, target);
-        if (!opts) {
-            opts = QTAILQ_FIRST(&list->head);
-        }
-        if (opts) {
-            timeout = qemu_opt_get(opts, "timeout");
-            if (timeout) {
-                return atoi(timeout);
-            }
-        }
-    }
-
-    return 0;
-}
-
 static void iscsi_nop_timed_event(void *opaque)
 {
    IscsiLun *iscsilun = opaque;

+    qemu_mutex_lock(&iscsilun->mutex);
    if (iscsi_get_nops_in_flight(iscsilun->iscsi) >= MAX_NOP_FAILURES) {
        error_report("iSCSI: NOP timeout. Reconnecting...");
        iscsilun->request_timed_out = true;
    } else if (iscsi_nop_out_async(iscsilun->iscsi, NULL, NULL, 0, NULL) != 0) {
        error_report("iSCSI: failed to sent NOP-Out. Disabling NOP messages.");
-        return;
+        goto out;
    }

    timer_mod(iscsilun->nop_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL);
    iscsi_set_events(iscsilun);
+
+out:
+    qemu_mutex_unlock(&iscsilun->mutex);
 }

 static void iscsi_readcapacity_sync(IscsiLun *iscsilun, Error **errp)
@@ -1474,20 +1463,6 @@ static void iscsi_readcapacity_sync(IscsiLun *iscsilun, Error **errp)
    }
 }

-/* TODO Convert to fine grained options */
-static QemuOptsList runtime_opts = {
-    .name = "iscsi",
-    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
-    .desc = {
-        {
-            .name = "filename",
-            .type = QEMU_OPT_STRING,
-            .help = "URL to the iscsi image",
-        },
-        { /* end of list */ }
-    },
-};
-
 static struct scsi_task *iscsi_do_inquiry(struct iscsi_context *iscsi, int lun,
                                          int evpd, int pc, void **inq, Error **errp)
 {
@@ -1605,24 +1580,178 @@ out:
    }
 }

+static void iscsi_parse_iscsi_option(const char *target, QDict *options)
+{
+    QemuOptsList *list;
+    QemuOpts *opts;
+    const char *user, *password, *password_secret, *initiator_name,
+               *header_digest, *timeout;
+
+    list = qemu_find_opts("iscsi");
+    if (!list) {
+        return;
+    }
+
+    opts = qemu_opts_find(list, target);
+    if (opts == NULL) {
+        opts = QTAILQ_FIRST(&list->head);
+        if (!opts) {
+            return;
+        }
+    }
+
+    user = qemu_opt_get(opts, "user");
+    if (user) {
+        qdict_set_default_str(options, "user", user);
+    }
+
+    password = qemu_opt_get(opts, "password");
+    if (password) {
+        qdict_set_default_str(options, "password", password);
+    }
+
+    password_secret = qemu_opt_get(opts, "password-secret");
+    if (password_secret) {
+        qdict_set_default_str(options, "password-secret", password_secret);
+    }
+
+    initiator_name = qemu_opt_get(opts, "initiator-name");
+    if (initiator_name) {
+        qdict_set_default_str(options, "initiator-name", initiator_name);
+    }
+
+    header_digest = qemu_opt_get(opts, "header-digest");
+    if (header_digest) {
+        /* -iscsi takes upper case values, but QAPI only supports lower case
+         * enum constant names, so we have to convert here. */
+        char *qapi_value = g_ascii_strdown(header_digest, -1);
+        qdict_set_default_str(options, "header-digest", qapi_value);
+        g_free(qapi_value);
+    }
+
+    timeout = qemu_opt_get(opts, "timeout");
+    if (timeout) {
+        qdict_set_default_str(options, "timeout", timeout);
+    }
+}
+
 /*
 * We support iscsi url's on the form
 * iscsi://[<username>%<password>@]<host>[:<port>]/<targetname>/<lun>
 */
+static void iscsi_parse_filename(const char *filename, QDict *options,
+                                 Error **errp)
+{
+    struct iscsi_url *iscsi_url;
+    const char *transport_name;
+    char *lun_str;
+
+    iscsi_url = iscsi_parse_full_url(NULL, filename);
+    if (iscsi_url == NULL) {
+        error_setg(errp, "Failed to parse URL : %s", filename);
+        return;
+    }
+
+#if LIBISCSI_API_VERSION >= (20160603)
+    switch (iscsi_url->transport) {
+    case TCP_TRANSPORT:
+        transport_name = "tcp";
+        break;
+    case ISER_TRANSPORT:
+        transport_name = "iser";
+        break;
+    default:
+        error_setg(errp, "Unknown transport type (%d)",
+                   iscsi_url->transport);
+        return;
+    }
+#else
+    transport_name = "tcp";
+#endif
+
+    qdict_set_default_str(options, "transport", transport_name);
+    qdict_set_default_str(options, "portal", iscsi_url->portal);
+    qdict_set_default_str(options, "target", iscsi_url->target);
+
+    lun_str = g_strdup_printf("%d", iscsi_url->lun);
+    qdict_set_default_str(options, "lun", lun_str);
+    g_free(lun_str);
+
+    /* User/password from -iscsi take precedence over those from the URL */
+    iscsi_parse_iscsi_option(iscsi_url->target, options);
+
+    if (iscsi_url->user[0] != '\0') {
+        qdict_set_default_str(options, "user", iscsi_url->user);
+        qdict_set_default_str(options, "password", iscsi_url->passwd);
+    }
+
+    iscsi_destroy_url(iscsi_url);
+}
+
+static QemuOptsList runtime_opts = {
+    .name = "iscsi",
+    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+    .desc = {
+        {
+            .name = "transport",
+            .type = QEMU_OPT_STRING,
+        },
+        {
+            .name = "portal",
+            .type = QEMU_OPT_STRING,
+        },
+        {
+            .name = "target",
+            .type = QEMU_OPT_STRING,
+        },
+        {
+            .name = "user",
+            .type = QEMU_OPT_STRING,
+        },
+        {
+            .name = "password",
+            .type = QEMU_OPT_STRING,
+        },
+        {
+            .name = "password-secret",
+            .type = QEMU_OPT_STRING,
+        },
+        {
+            .name = "lun",
+            .type = QEMU_OPT_NUMBER,
+        },
+        {
+            .name = "initiator-name",
+            .type = QEMU_OPT_STRING,
+        },
+        {
+            .name = "header-digest",
+            .type = QEMU_OPT_STRING,
+        },
+        {
+            .name = "timeout",
+            .type = QEMU_OPT_NUMBER,
+        },
+        { /* end of list */ }
+    },
+};
+
 static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
                      Error **errp)
 {
    IscsiLun *iscsilun = bs->opaque;
    struct iscsi_context *iscsi = NULL;
-    struct iscsi_url *iscsi_url = NULL;
    struct scsi_task *task = NULL;
    struct scsi_inquiry_standard *inq = NULL;
    struct scsi_inquiry_supported_pages *inq_vpd;
    char *initiator_name = NULL;
    QemuOpts *opts;
    Error *local_err = NULL;
-    const char *filename;
-    int i, ret = 0, timeout = 0;
+    const char *transport_name, *portal, *target;
+#if LIBISCSI_API_VERSION >= (20160603)
+    enum iscsi_transport_type transport;
+#endif
+    int i, ret = 0, timeout = 0, lun;

    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
    qemu_opts_absorb_qdict(opts, options, &local_err);
@@ -1632,18 +1761,34 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
        goto out;
    }

-    filename = qemu_opt_get(opts, "filename");
+    transport_name = qemu_opt_get(opts, "transport");
+    portal = qemu_opt_get(opts, "portal");
+    target = qemu_opt_get(opts, "target");
+    lun = qemu_opt_get_number(opts, "lun", 0);

-    iscsi_url = iscsi_parse_full_url(iscsi, filename);
-    if (iscsi_url == NULL) {
-        error_setg(errp, "Failed to parse URL : %s", filename);
+    if (!transport_name || !portal || !target) {
+        error_setg(errp, "Need all of transport, portal and target options");
+        ret = -EINVAL;
+        goto out;
+    }
+
+    if (!strcmp(transport_name, "tcp")) {
+#if LIBISCSI_API_VERSION >= (20160603)
+        transport = TCP_TRANSPORT;
+    } else if (!strcmp(transport_name, "iser")) {
+        transport = ISER_TRANSPORT;
+#else
+        /* TCP is what older libiscsi versions always use */
+#endif
+    } else {
+        error_setg(errp, "Unknown transport: %s", transport_name);
        ret = -EINVAL;
        goto out;
    }

    memset(iscsilun, 0, sizeof(IscsiLun));

-    initiator_name = parse_initiator_name(iscsi_url->target);
+    initiator_name = get_initiator_name(opts);

    iscsi = iscsi_create_context(initiator_name);
    if (iscsi == NULL) {
@@ -1652,30 +1797,20 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
        goto out;
    }
 #if LIBISCSI_API_VERSION >= (20160603)
-    if (iscsi_init_transport(iscsi, iscsi_url->transport)) {
+    if (iscsi_init_transport(iscsi, transport)) {
        error_setg(errp, ("Error initializing transport."));
        ret = -EINVAL;
        goto out;
    }
 #endif
-    if (iscsi_set_targetname(iscsi, iscsi_url->target)) {
+    if (iscsi_set_targetname(iscsi, target)) {
        error_setg(errp, "iSCSI: Failed to set target name.");
        ret = -EINVAL;
        goto out;
    }

-    if (iscsi_url->user[0] != '\0') {
-        ret = iscsi_set_initiator_username_pwd(iscsi, iscsi_url->user,
-                                              iscsi_url->passwd);
-        if (ret != 0) {
-            error_setg(errp, "Failed to set initiator username and password");
-            ret = -EINVAL;
-            goto out;
-        }
-    }
-
    /* check if we got CHAP username/password via the options */
-    parse_chap(iscsi, iscsi_url->target, &local_err);
+    apply_chap(iscsi, opts, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
@@ -1688,10 +1823,8 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
        goto out;
    }

-    iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_NONE_CRC32C);
-
    /* check if we got HEADER_DIGEST via the options */
-    parse_header_digest(iscsi, iscsi_url->target, &local_err);
+    apply_header_digest(iscsi, opts, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
@@ -1699,7 +1832,7 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
    }

    /* timeout handling is broken in libiscsi before 1.15.0 */
-    timeout = parse_timeout(iscsi_url->target);
+    timeout = qemu_opt_get_number(opts, "timeout", 0);
 #if LIBISCSI_API_VERSION >= 20150621
    iscsi_set_timeout(iscsi, timeout);
 #else
@@ -1708,7 +1841,7 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
    }
 #endif

-    if (iscsi_full_connect_sync(iscsi, iscsi_url->portal, iscsi_url->lun) != 0) {
+    if (iscsi_full_connect_sync(iscsi, portal, lun) != 0) {
        error_setg(errp, "iSCSI: Failed to connect to LUN : %s",
            iscsi_get_error(iscsi));
        ret = -EINVAL;
@@ -1717,7 +1850,7 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,

    iscsilun->iscsi = iscsi;
    iscsilun->aio_context = bdrv_get_aio_context(bs);
-    iscsilun->lun   = iscsi_url->lun;
+    iscsilun->lun = lun;
    iscsilun->has_write_same = true;

    task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 0, 0,
@@ -1803,6 +1936,7 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
    scsi_free_scsi_task(task);
    task = NULL;

+    qemu_mutex_init(&iscsilun->mutex);
    iscsi_attach_aio_context(bs, iscsilun->aio_context);

    /* Guess the internal cluster (page) size of the iscsi target by the means
@@ -1820,9 +1954,6 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
 out:
    qemu_opts_del(opts);
    g_free(initiator_name);
-    if (iscsi_url != NULL) {
-        iscsi_destroy_url(iscsi_url);
-    }
    if (task != NULL) {
        scsi_free_scsi_task(task);
    }
@@ -1851,6 +1982,7 @@ static void iscsi_close(BlockDriverState *bs)
    iscsi_destroy_context(iscsi);
    g_free(iscsilun->zeroblock);
    iscsi_allocmap_free(iscsilun);
+    qemu_mutex_destroy(&iscsilun->mutex);
    memset(iscsilun, 0, sizeof(IscsiLun));
 }

@@ -1960,6 +2092,7 @@ static int iscsi_create(const char *filename, QemuOpts *opts, Error **errp)
    BlockDriverState *bs;
    IscsiLun *iscsilun = NULL;
    QDict *bs_options;
+    Error *local_err = NULL;

    bs = bdrv_new();

@@ -1970,8 +2103,13 @@ static int iscsi_create(const char *filename, QemuOpts *opts, Error **errp)
    iscsilun = bs->opaque;

    bs_options = qdict_new();
-    qdict_put(bs_options, "filename", qstring_from_str(filename));
+    iscsi_parse_filename(filename, bs_options, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        ret = -EINVAL;
+    } else {
        ret = iscsi_open(bs, bs_options, 0, NULL);
+    }
    QDECREF(bs_options);

    if (ret != 0) {
@@ -2032,7 +2170,7 @@ static BlockDriver bdrv_iscsi = {
    .protocol_name   = "iscsi",

    .instance_size          = sizeof(IscsiLun),
-    .bdrv_needs_filename = true,
+    .bdrv_parse_filename    = iscsi_parse_filename,
    .bdrv_file_open         = iscsi_open,
    .bdrv_close             = iscsi_close,
    .bdrv_create            = iscsi_create,
@@ -2067,7 +2205,7 @@ static BlockDriver bdrv_iser = {
    .protocol_name   = "iser",

    .instance_size          = sizeof(IscsiLun),
-    .bdrv_needs_filename = true,
+    .bdrv_parse_filename    = iscsi_parse_filename,
    .bdrv_file_open         = iscsi_open,
    .bdrv_close             = iscsi_close,
    .bdrv_create            = iscsi_create,
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -54,10 +54,10 @@ struct LinuxAioState {
    io_context_t ctx;
    EventNotifier e;

-    /* io queue for submit at batch */
+    /* io queue for submit at batch.  Protected by AioContext lock. */
    LaioQueue io_q;

-    /* I/O completion processing */
+    /* I/O completion processing.  Only runs in I/O thread.  */
    QEMUBH *completion_bh;
    int event_idx;
    int event_max;
@@ -100,7 +100,7 @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
         * that!
         */
        if (!qemu_coroutine_entered(laiocb->co)) {
-            qemu_coroutine_enter(laiocb->co);
+            aio_co_wake(laiocb->co);
        }
    } else {
        laiocb->common.cb(laiocb->common.opaque, ret);
@@ -234,9 +234,12 @@ static void qemu_laio_process_completions(LinuxAioState *s)
 static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
 {
    qemu_laio_process_completions(s);
+
+    aio_context_acquire(s->aio_context);
    if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
        ioq_submit(s);
    }
+    aio_context_release(s->aio_context);
 }

 static void qemu_laio_completion_bh(void *opaque)
@@ -455,6 +458,7 @@ void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
 {
    aio_set_event_notifier(old_context, &s->e, false, NULL, NULL);
    qemu_bh_delete(s->completion_bh);
+    s->aio_context = NULL;
 }

 void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -12,6 +12,7 @@
 */

 #include "qemu/osdep.h"
+#include "qemu/cutils.h"
 #include "trace.h"
 #include "block/blockjob_int.h"
 #include "block/block_int.h"
@@ -38,7 +39,10 @@ typedef struct MirrorBlockJob {
    BlockJob common;
    RateLimit limit;
    BlockBackend *target;
+    BlockDriverState *mirror_top_bs;
+    BlockDriverState *source;
    BlockDriverState *base;
+
    /* The name of the graph node to replace */
    char *replaces;
    /* The BDS to replace */
@@ -69,6 +73,7 @@ typedef struct MirrorBlockJob {
    bool waiting_for_io;
    int target_cluster_sectors;
    int max_iov;
+    bool initial_zeroing_ongoing;
 } MirrorBlockJob;

 typedef struct MirrorOp {
@@ -117,9 +122,10 @@ static void mirror_iteration_done(MirrorOp *op, int ret)
        if (s->cow_bitmap) {
            bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
        }
+        if (!s->initial_zeroing_ongoing) {
            s->common.offset += (uint64_t)op->nb_sectors * BDRV_SECTOR_SIZE;
        }
-
+    }
    qemu_iovec_destroy(&op->qiov);
    g_free(op);

@@ -132,6 +138,8 @@ static void mirror_write_complete(void *opaque, int ret)
 {
    MirrorOp *op = opaque;
    MirrorBlockJob *s = op->s;
+
+    aio_context_acquire(blk_get_aio_context(s->common.blk));
    if (ret < 0) {
        BlockErrorAction action;

@@ -142,12 +150,15 @@ static void mirror_write_complete(void *opaque, int ret)
        }
    }
    mirror_iteration_done(op, ret);
+    aio_context_release(blk_get_aio_context(s->common.blk));
 }

 static void mirror_read_complete(void *opaque, int ret)
 {
    MirrorOp *op = opaque;
    MirrorBlockJob *s = op->s;
+
+    aio_context_acquire(blk_get_aio_context(s->common.blk));
    if (ret < 0) {
        BlockErrorAction action;

@@ -158,10 +169,11 @@ static void mirror_read_complete(void *opaque, int ret)
        }

        mirror_iteration_done(op, ret);
-        return;
-    }
+    } else {
        blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov,
                        0, mirror_write_complete, op);
+    }
+    aio_context_release(blk_get_aio_context(s->common.blk));
 }

 static inline void mirror_clip_sectors(MirrorBlockJob *s,
@@ -319,7 +331,7 @@ static void mirror_do_zero_or_discard(MirrorBlockJob *s,

 static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
 {
-    BlockDriverState *source = blk_bs(s->common.blk);
+    BlockDriverState *source = s->source;
    int64_t sector_num, first_chunk;
    uint64_t delay_ns = 0;
    /* At least the first dirty chunk is mirrored in one iteration. */
@@ -378,7 +390,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
                            nb_chunks * sectors_per_chunk);
    bitmap_set(s->in_flight_bitmap, sector_num / sectors_per_chunk, nb_chunks);
    while (nb_chunks > 0 && sector_num < end) {
-        int ret;
+        int64_t ret;
        int io_sectors, io_sectors_acct;
        BlockDriverState *file;
        enum MirrorMethod {
@@ -489,12 +501,37 @@ static void mirror_exit(BlockJob *job, void *opaque)
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
    MirrorExitData *data = opaque;
    AioContext *replace_aio_context = NULL;
-    BlockDriverState *src = blk_bs(s->common.blk);
+    BlockDriverState *src = s->source;
    BlockDriverState *target_bs = blk_bs(s->target);
+    BlockDriverState *mirror_top_bs = s->mirror_top_bs;
+    Error *local_err = NULL;

    /* Make sure that the source BDS doesn't go away before we called
     * block_job_completed(). */
    bdrv_ref(src);
+    bdrv_ref(mirror_top_bs);
+    bdrv_ref(target_bs);
+
+    /* Remove target parent that still uses BLK_PERM_WRITE/RESIZE before
+     * inserting target_bs at s->to_replace, where we might not be able to get
+     * these permissions. */
+    blk_unref(s->target);
+    s->target = NULL;
+
+    /* We don't access the source any more. Dropping any WRITE/RESIZE is
+     * required before it could become a backing file of target_bs. */
+    bdrv_child_try_set_perm(mirror_top_bs->backing, 0, BLK_PERM_ALL,
+                            &error_abort);
+    if (s->backing_mode == MIRROR_SOURCE_BACKING_CHAIN) {
+        BlockDriverState *backing = s->is_none_mode ? src : s->base;
+        if (backing_bs(target_bs) != backing) {
+            bdrv_set_backing_hd(target_bs, backing, &local_err);
+            if (local_err) {
+                error_report_err(local_err);
+                data->ret = -EPERM;
+            }
+        }
+    }

    if (s->to_replace) {
        replace_aio_context = bdrv_get_aio_context(s->to_replace);
@@ -514,12 +551,12 @@ static void mirror_exit(BlockJob *job, void *opaque)
        /* The mirror job has no requests in flight any more, but we need to
         * drain potential other users of the BDS before changing the graph. */
        bdrv_drained_begin(target_bs);
-        bdrv_replace_in_backing_chain(to_replace, target_bs);
+        bdrv_replace_node(to_replace, target_bs, &local_err);
        bdrv_drained_end(target_bs);
-
-        /* We just changed the BDS the job BB refers to */
-        blk_remove_bs(job->blk);
-        blk_insert_bs(job->blk, src);
+        if (local_err) {
+            error_report_err(local_err);
+            data->ret = -EPERM;
+        }
    }
    if (s->to_replace) {
        bdrv_op_unblock_all(s->to_replace, s->replace_blocker);
@@ -530,11 +567,29 @@ static void mirror_exit(BlockJob *job, void *opaque)
        aio_context_release(replace_aio_context);
    }
    g_free(s->replaces);
-    blk_unref(s->target);
-    s->target = NULL;
+    bdrv_unref(target_bs);
+
+    /* Remove the mirror filter driver from the graph. Before this, get rid of
+     * the blockers on the intermediate nodes so that the resulting state is
+     * valid. Also give up permissions on mirror_top_bs->backing, which might
+     * block the removal. */
+    block_job_remove_all_bdrv(job);
+    bdrv_child_try_set_perm(mirror_top_bs->backing, 0, BLK_PERM_ALL,
+                            &error_abort);
+    bdrv_replace_node(mirror_top_bs, backing_bs(mirror_top_bs), &error_abort);
+
+    /* We just changed the BDS the job BB refers to (with either or both of the
+     * bdrv_replace_node() calls), so switch the BB back so the cleanup does
+     * the right thing. We don't need any permissions any more now. */
+    blk_remove_bs(job->blk);
+    blk_set_perm(job->blk, 0, BLK_PERM_ALL, &error_abort);
+    blk_insert_bs(job->blk, mirror_top_bs, &error_abort);
+
    block_job_completed(&s->common, data->ret);
+
    g_free(data);
    bdrv_drained_end(src);
+    bdrv_unref(mirror_top_bs);
    bdrv_unref(src);
 }

@@ -554,7 +609,7 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
 {
    int64_t sector_num, end;
    BlockDriverState *base = s->base;
-    BlockDriverState *bs = blk_bs(s->common.blk);
+    BlockDriverState *bs = s->source;
    BlockDriverState *target_bs = blk_bs(s->target);
    int ret, n;

@@ -566,6 +621,7 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
            return 0;
        }

+        s->initial_zeroing_ongoing = true;
        for (sector_num = 0; sector_num < end; ) {
            int nb_sectors = MIN(end - sector_num,
                QEMU_ALIGN_DOWN(INT_MAX, s->granularity) >> BDRV_SECTOR_BITS);
@@ -573,11 +629,13 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
            mirror_throttle(s);

            if (block_job_is_cancelled(&s->common)) {
+                s->initial_zeroing_ongoing = false;
                return 0;
            }

            if (s->in_flight >= MAX_IN_FLIGHT) {
-                trace_mirror_yield(s, s->in_flight, s->buf_free_count, -1);
+                trace_mirror_yield(s, UINT64_MAX, s->buf_free_count,
+                                   s->in_flight);
                mirror_wait_for_io(s);
                continue;
            }
@@ -587,6 +645,7 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
        }

        mirror_wait_for_all_io(s);
+        s->initial_zeroing_ongoing = false;
    }

    /* First part, loop on the sectors and initialize the dirty bitmap.  */
@@ -633,7 +692,7 @@ static void coroutine_fn mirror_run(void *opaque)
 {
    MirrorBlockJob *s = opaque;
    MirrorExitData *data;
-    BlockDriverState *bs = blk_bs(s->common.blk);
+    BlockDriverState *bs = s->source;
    BlockDriverState *target_bs = blk_bs(s->target);
    bool need_drain = true;
    int64_t length;
@@ -651,7 +710,28 @@ static void coroutine_fn mirror_run(void *opaque)
    if (s->bdev_length < 0) {
        ret = s->bdev_length;
        goto immediate_exit;
-    } else if (s->bdev_length == 0) {
+    }
+
+    /* Active commit must resize the base image if its size differs from the
+     * active layer. */
+    if (s->base == blk_bs(s->target)) {
+        int64_t base_length;
+
+        base_length = blk_getlength(s->target);
+        if (base_length < 0) {
+            ret = base_length;
+            goto immediate_exit;
+        }
+
+        if (s->bdev_length > base_length) {
+            ret = blk_truncate(s->target, s->bdev_length);
+            if (ret < 0) {
+                goto immediate_exit;
+            }
+        }
+    }
+
+    if (s->bdev_length == 0) {
        /* Report BLOCK_JOB_READY and wait for complete. */
        block_job_event_ready(&s->common);
        s->synced = true;
@@ -730,7 +810,7 @@ static void coroutine_fn mirror_run(void *opaque)
            s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
            if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 ||
                (cnt == 0 && s->in_flight > 0)) {
-                trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt);
+                trace_mirror_yield(s, cnt, s->buf_free_count, s->in_flight);
                mirror_wait_for_io(s);
                continue;
            } else if (cnt != 0) {
@@ -844,9 +924,8 @@ static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
 static void mirror_complete(BlockJob *job, Error **errp)
 {
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
-    BlockDriverState *src, *target;
+    BlockDriverState *target;

-    src = blk_bs(job->blk);
    target = blk_bs(s->target);

    if (!s->synced) {
@@ -878,6 +957,10 @@ static void mirror_complete(BlockJob *job, Error **errp)
        replace_aio_context = bdrv_get_aio_context(s->to_replace);
        aio_context_acquire(replace_aio_context);

+        /* TODO Translate this into permission system. Current definition of
+         * GRAPH_MOD would require to request it for the parents; they might
+         * not even be BlockDriverStates, however, so a BdrvChild can't address
+         * them. May need redefinition of GRAPH_MOD. */
        error_setg(&s->replace_blocker,
                   "block device is in use by block-job-complete");
        bdrv_op_block_all(s->to_replace, s->replace_blocker);
@@ -886,13 +969,6 @@ static void mirror_complete(BlockJob *job, Error **errp)
        aio_context_release(replace_aio_context);
    }

-    if (s->backing_mode == MIRROR_SOURCE_BACKING_CHAIN) {
-        BlockDriverState *backing = s->is_none_mode ? src : s->base;
-        if (backing_bs(target) != backing) {
-            bdrv_set_backing_hd(target, backing);
-        }
-    }
-
    s->should_complete = true;
    block_job_enter(&s->common);
 }
@@ -948,6 +1024,85 @@ static const BlockJobDriver commit_active_job_driver = {
    .drain                  = mirror_drain,
 };

+static int coroutine_fn bdrv_mirror_top_preadv(BlockDriverState *bs,
+    uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
+{
+    return bdrv_co_preadv(bs->backing, offset, bytes, qiov, flags);
+}
+
+static int coroutine_fn bdrv_mirror_top_pwritev(BlockDriverState *bs,
+    uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
+{
+    return bdrv_co_pwritev(bs->backing, offset, bytes, qiov, flags);
+}
+
+static int coroutine_fn bdrv_mirror_top_flush(BlockDriverState *bs)
+{
+    return bdrv_co_flush(bs->backing->bs);
+}
+
+static int64_t coroutine_fn bdrv_mirror_top_get_block_status(
+    BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum,
+    BlockDriverState **file)
+{
+    *pnum = nb_sectors;
+    *file = bs->backing->bs;
+    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_DATA |
+           (sector_num << BDRV_SECTOR_BITS);
+}
+
+static int coroutine_fn bdrv_mirror_top_pwrite_zeroes(BlockDriverState *bs,
+    int64_t offset, int count, BdrvRequestFlags flags)
+{
+    return bdrv_co_pwrite_zeroes(bs->backing, offset, count, flags);
+}
+
+static int coroutine_fn bdrv_mirror_top_pdiscard(BlockDriverState *bs,
+    int64_t offset, int count)
+{
+    return bdrv_co_pdiscard(bs->backing->bs, offset, count);
+}
+
+static void bdrv_mirror_top_refresh_filename(BlockDriverState *bs, QDict *opts)
+{
+    bdrv_refresh_filename(bs->backing->bs);
+    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename),
+            bs->backing->bs->filename);
+}
+
+static void bdrv_mirror_top_close(BlockDriverState *bs)
+{
+}
+
+static void bdrv_mirror_top_child_perm(BlockDriverState *bs, BdrvChild *c,
+                                       const BdrvChildRole *role,
+                                       uint64_t perm, uint64_t shared,
+                                       uint64_t *nperm, uint64_t *nshared)
+{
+    /* Must be able to forward guest writes to the real image */
+    *nperm = 0;
+    if (perm & BLK_PERM_WRITE) {
+        *nperm |= BLK_PERM_WRITE;
+    }
+
+    *nshared = BLK_PERM_ALL;
+}
+
+/* Dummy node that provides consistent read to its users without requiring it
+ * from its backing file and that allows writes on the backing file chain. */
+static BlockDriver bdrv_mirror_top = {
+    .format_name                = "mirror_top",
+    .bdrv_co_preadv             = bdrv_mirror_top_preadv,
+    .bdrv_co_pwritev            = bdrv_mirror_top_pwritev,
+    .bdrv_co_pwrite_zeroes      = bdrv_mirror_top_pwrite_zeroes,
+    .bdrv_co_pdiscard           = bdrv_mirror_top_pdiscard,
+    .bdrv_co_flush              = bdrv_mirror_top_flush,
+    .bdrv_co_get_block_status   = bdrv_mirror_top_get_block_status,
+    .bdrv_refresh_filename      = bdrv_mirror_top_refresh_filename,
+    .bdrv_close                 = bdrv_mirror_top_close,
+    .bdrv_child_perm            = bdrv_mirror_top_child_perm,
+};
+
 static void mirror_start_job(const char *job_id, BlockDriverState *bs,
                             int creation_flags, BlockDriverState *target,
                             const char *replaces, int64_t speed,
@@ -957,12 +1112,18 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
                             BlockdevOnError on_target_error,
                             bool unmap,
                             BlockCompletionFunc *cb,
-                             void *opaque, Error **errp,
+                             void *opaque,
                             const BlockJobDriver *driver,
                             bool is_none_mode, BlockDriverState *base,
-                             bool auto_complete)
+                             bool auto_complete, const char *filter_node_name,
+                             Error **errp)
 {
    MirrorBlockJob *s;
+    BlockDriverState *mirror_top_bs;
+    bool target_graph_mod;
+    bool target_is_backing;
+    Error *local_err = NULL;
+    int ret;

    if (granularity == 0) {
        granularity = bdrv_get_default_bitmap_granularity(target);
@@ -979,14 +1140,65 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
        buf_size = DEFAULT_MIRROR_BUF_SIZE;
    }

-    s = block_job_create(job_id, driver, bs, speed, creation_flags,
-                         cb, opaque, errp);
-    if (!s) {
+    /* In the case of active commit, add dummy driver to provide consistent
+     * reads on the top, while disabling it in the intermediate nodes, and make
+     * the backing chain writable. */
+    mirror_top_bs = bdrv_new_open_driver(&bdrv_mirror_top, filter_node_name,
+                                         BDRV_O_RDWR, errp);
+    if (mirror_top_bs == NULL) {
+        return;
+    }
+    mirror_top_bs->total_sectors = bs->total_sectors;
+    bdrv_set_aio_context(mirror_top_bs, bdrv_get_aio_context(bs));
+
+    /* bdrv_append takes ownership of the mirror_top_bs reference, need to keep
+     * it alive until block_job_create() succeeds even if bs has no parent. */
+    bdrv_ref(mirror_top_bs);
+    bdrv_drained_begin(bs);
+    bdrv_append(mirror_top_bs, bs, &local_err);
+    bdrv_drained_end(bs);
+
+    if (local_err) {
+        bdrv_unref(mirror_top_bs);
+        error_propagate(errp, local_err);
        return;
    }

-    s->target = blk_new();
-    blk_insert_bs(s->target, target);
+    /* Make sure that the source is not resized while the job is running */
+    s = block_job_create(job_id, driver, mirror_top_bs,
+                         BLK_PERM_CONSISTENT_READ,
+                         BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
+                         BLK_PERM_WRITE | BLK_PERM_GRAPH_MOD, speed,
+                         creation_flags, cb, opaque, errp);
+    if (!s) {
+        goto fail;
+    }
+    /* The block job now has a reference to this node */
+    bdrv_unref(mirror_top_bs);
+
+    s->source = bs;
+    s->mirror_top_bs = mirror_top_bs;
+
+    /* No resize for the target either; while the mirror is still running, a
+     * consistent read isn't necessarily possible. We could possibly allow
+     * writes and graph modifications, though it would likely defeat the
+     * purpose of a mirror, so leave them blocked for now.
+     *
+     * In the case of active commit, things look a bit different, though,
+     * because the target is an already populated backing file in active use.
+     * We can allow anything except resize there.*/
+    target_is_backing = bdrv_chain_contains(bs, target);
+    target_graph_mod = (backing_mode != MIRROR_LEAVE_BACKING_CHAIN);
+    s->target = blk_new(BLK_PERM_WRITE | BLK_PERM_RESIZE |
+                        (target_graph_mod ? BLK_PERM_GRAPH_MOD : 0),
+                        BLK_PERM_WRITE_UNCHANGED |
+                        (target_is_backing ? BLK_PERM_CONSISTENT_READ |
+                                             BLK_PERM_WRITE |
+                                             BLK_PERM_GRAPH_MOD : 0));
+    ret = blk_insert_bs(s->target, target, errp);
+    if (ret < 0) {
+        goto fail;
+    }

    s->replaces = g_strdup(replaces);
    s->on_source_error = on_source_error;
@@ -1003,24 +1215,51 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,

    s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
    if (!s->dirty_bitmap) {
-        g_free(s->replaces);
-        blk_unref(s->target);
-        block_job_unref(&s->common);
-        return;
+        goto fail;
    }

-    block_job_add_bdrv(&s->common, target);
+    /* Required permissions are already taken with blk_new() */
+    block_job_add_bdrv(&s->common, "target", target, 0, BLK_PERM_ALL,
+                       &error_abort);
+
    /* In commit_active_start() all intermediate nodes disappear, so
     * any jobs in them must be blocked */
-    if (bdrv_chain_contains(bs, target)) {
+    if (target_is_backing) {
        BlockDriverState *iter;
        for (iter = backing_bs(bs); iter != target; iter = backing_bs(iter)) {
-            block_job_add_bdrv(&s->common, iter);
+            /* XXX BLK_PERM_WRITE needs to be allowed so we don't block
+             * ourselves at s->base (if writes are blocked for a node, they are
+             * also blocked for its backing file). The other options would be a
+             * second filter driver above s->base (== target). */
+            ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0,
+                                     BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE,
+                                     errp);
+            if (ret < 0) {
+                goto fail;
+            }
        }
    }

    trace_mirror_start(bs, s, opaque);
    block_job_start(&s->common);
+    return;
+
+fail:
+    if (s) {
+        /* Make sure this BDS does not go away until we have completed the graph
+         * changes below */
+        bdrv_ref(mirror_top_bs);
+
+        g_free(s->replaces);
+        blk_unref(s->target);
+        block_job_unref(&s->common);
+    }
+
+    bdrv_child_try_set_perm(mirror_top_bs->backing, 0, BLK_PERM_ALL,
+                            &error_abort);
+    bdrv_replace_node(mirror_top_bs, backing_bs(mirror_top_bs), &error_abort);
+
+    bdrv_unref(mirror_top_bs);
 }

 void mirror_start(const char *job_id, BlockDriverState *bs,
@@ -1029,7 +1268,7 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
                  MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
                  BlockdevOnError on_source_error,
                  BlockdevOnError on_target_error,
-                  bool unmap, Error **errp)
+                  bool unmap, const char *filter_node_name, Error **errp)
 {
    bool is_none_mode;
    BlockDriverState *base;
@@ -1042,19 +1281,19 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
    base = mode == MIRROR_SYNC_MODE_TOP ? backing_bs(bs) : NULL;
    mirror_start_job(job_id, bs, BLOCK_JOB_DEFAULT, target, replaces,
                     speed, granularity, buf_size, backing_mode,
-                     on_source_error, on_target_error, unmap, NULL, NULL, errp,
-                     &mirror_job_driver, is_none_mode, base, false);
+                     on_source_error, on_target_error, unmap, NULL, NULL,
+                     &mirror_job_driver, is_none_mode, base, false,
+                     filter_node_name, errp);
 }

 void commit_active_start(const char *job_id, BlockDriverState *bs,
                         BlockDriverState *base, int creation_flags,
                         int64_t speed, BlockdevOnError on_error,
-                         BlockCompletionFunc *cb, void *opaque, Error **errp,
-                         bool auto_complete)
+                         const char *filter_node_name,
+                         BlockCompletionFunc *cb, void *opaque,
+                         bool auto_complete, Error **errp)
 {
-    int64_t length, base_length;
    int orig_base_flags;
-    int ret;
    Error *local_err = NULL;

    orig_base_flags = bdrv_get_flags(base);
@@ -1063,35 +1302,11 @@ void commit_active_start(const char *job_id, BlockDriverState *bs,
        return;
    }

-    length = bdrv_getlength(bs);
-    if (length < 0) {
-        error_setg_errno(errp, -length,
-                         "Unable to determine length of %s", bs->filename);
-        goto error_restore_flags;
-    }
-
-    base_length = bdrv_getlength(base);
-    if (base_length < 0) {
-        error_setg_errno(errp, -base_length,
-                         "Unable to determine length of %s", base->filename);
-        goto error_restore_flags;
-    }
-
-    if (length > base_length) {
-        ret = bdrv_truncate(base, length);
-        if (ret < 0) {
-            error_setg_errno(errp, -ret,
-                            "Top image %s is larger than base image %s, and "
-                             "resize of base image failed",
-                             bs->filename, base->filename);
-            goto error_restore_flags;
-        }
-    }
-
    mirror_start_job(job_id, bs, creation_flags, base, NULL, speed, 0, 0,
                     MIRROR_LEAVE_BACKING_CHAIN,
-                     on_error, on_error, true, cb, opaque, &local_err,
-                     &commit_active_job_driver, false, base, auto_complete);
+                     on_error, on_error, true, cb, opaque,
+                     &commit_active_job_driver, false, base, auto_complete,
+                     filter_node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        goto error_restore_flags;
--- a/block/nbd-client.c
+++ b/block/nbd-client.c
@@ -39,7 +39,7 @@ static void nbd_recv_coroutines_enter_all(NBDClientSession *s)

    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
        if (s->recv_coroutine[i]) {
-            qemu_coroutine_enter(s->recv_coroutine[i]);
+            aio_co_wake(s->recv_coroutine[i]);
        }
    }
 }
@@ -56,7 +56,7 @@ static void nbd_teardown_connection(BlockDriverState *bs)
    qio_channel_shutdown(client->ioc,
                         QIO_CHANNEL_SHUTDOWN_BOTH,
                         NULL);
-    nbd_recv_coroutines_enter_all(client);
+    BDRV_POLL_WHILE(bs, client->read_reply_co);

    nbd_client_detach_aio_context(bs);
    object_unref(OBJECT(client->sioc));
@@ -65,54 +65,45 @@ static void nbd_teardown_connection(BlockDriverState *bs)
    client->ioc = NULL;
 }

-static void nbd_reply_ready(void *opaque)
+static coroutine_fn void nbd_read_reply_entry(void *opaque)
 {
-    BlockDriverState *bs = opaque;
-    NBDClientSession *s = nbd_get_client_session(bs);
+    NBDClientSession *s = opaque;
    uint64_t i;
    int ret;

-    if (!s->ioc) { /* Already closed */
-        return;
-    }
-
-    if (s->reply.handle == 0) {
-        /* No reply already in flight.  Fetch a header.  It is possible
-         * that another thread has done the same thing in parallel, so
-         * the socket is not readable anymore.
-         */
+    for (;;) {
+        assert(s->reply.handle == 0);
        ret = nbd_receive_reply(s->ioc, &s->reply);
-        if (ret == -EAGAIN) {
-            return;
-        }
-        if (ret < 0) {
-            s->reply.handle = 0;
-            goto fail;
-        }
+        if (ret <= 0) {
+            break;
        }

        /* There's no need for a mutex on the receive side, because the
         * handler acts as a synchronization point and ensures that only
-     * one coroutine is called until the reply finishes.  */
+         * one coroutine is called until the reply finishes.
+         */
        i = HANDLE_TO_INDEX(s, s->reply.handle);
-    if (i >= MAX_NBD_REQUESTS) {
-        goto fail;
+        if (i >= MAX_NBD_REQUESTS || !s->recv_coroutine[i]) {
+            break;
        }

-    if (s->recv_coroutine[i]) {
-        qemu_coroutine_enter(s->recv_coroutine[i]);
-        return;
+        /* We're woken up by the recv_coroutine itself.  Note that there
+         * is no race between yielding and reentering read_reply_co.  This
+         * is because:
+         *
+         * - if recv_coroutine[i] runs on the same AioContext, it is only
+         *   entered after we yield
+         *
+         * - if recv_coroutine[i] runs on a different AioContext, reentering
+         *   read_reply_co happens through a bottom half, which can only
+         *   run after we yield.
+         */
+        aio_co_wake(s->recv_coroutine[i]);
+        qemu_coroutine_yield();
    }

-fail:
-    nbd_teardown_connection(bs);
-}
-
-static void nbd_restart_write(void *opaque)
-{
-    BlockDriverState *bs = opaque;
-
-    qemu_coroutine_enter(nbd_get_client_session(bs)->send_coroutine);
+    nbd_recv_coroutines_enter_all(s);
+    s->read_reply_co = NULL;
 }

 static int nbd_co_send_request(BlockDriverState *bs,
@@ -120,7 +111,6 @@ static int nbd_co_send_request(BlockDriverState *bs,
                               QEMUIOVector *qiov)
 {
    NBDClientSession *s = nbd_get_client_session(bs);
-    AioContext *aio_context;
    int rc, ret, i;

    qemu_co_mutex_lock(&s->send_mutex);
@@ -141,11 +131,6 @@ static int nbd_co_send_request(BlockDriverState *bs,
        return -EPIPE;
    }

-    s->send_coroutine = qemu_coroutine_self();
-    aio_context = bdrv_get_aio_context(bs);
-
-    aio_set_fd_handler(aio_context, s->sioc->fd, false,
-                       nbd_reply_ready, nbd_restart_write, NULL, bs);
    if (qiov) {
        qio_channel_set_cork(s->ioc, true);
        rc = nbd_send_request(s->ioc, request);
@@ -160,9 +145,6 @@ static int nbd_co_send_request(BlockDriverState *bs,
    } else {
        rc = nbd_send_request(s->ioc, request);
    }
-    aio_set_fd_handler(aio_context, s->sioc->fd, false,
-                       nbd_reply_ready, NULL, NULL, bs);
-    s->send_coroutine = NULL;
    qemu_co_mutex_unlock(&s->send_mutex);
    return rc;
 }
@@ -174,8 +156,7 @@ static void nbd_co_receive_reply(NBDClientSession *s,
 {
    int ret;

-    /* Wait until we're woken up by the read handler.  TODO: perhaps
-     * peek at the next reply and avoid yielding if it's ours?  */
+    /* Wait until we're woken up by nbd_read_reply_entry.  */
    qemu_coroutine_yield();
    *reply = s->reply;
    if (reply->handle != request->handle ||
@@ -201,7 +182,7 @@ static void nbd_coroutine_start(NBDClientSession *s,
    /* Poor man semaphore.  The free_sema is locked when no other request
     * can be accepted, and unlocked after receiving one reply.  */
    if (s->in_flight == MAX_NBD_REQUESTS) {
-        qemu_co_queue_wait(&s->free_sema);
+        qemu_co_queue_wait(&s->free_sema, NULL);
        assert(s->in_flight < MAX_NBD_REQUESTS);
    }
    s->in_flight++;
@@ -209,13 +190,19 @@ static void nbd_coroutine_start(NBDClientSession *s,
    /* s->recv_coroutine[i] is set as soon as we get the send_lock.  */
 }

-static void nbd_coroutine_end(NBDClientSession *s,
+static void nbd_coroutine_end(BlockDriverState *bs,
                              NBDRequest *request)
 {
+    NBDClientSession *s = nbd_get_client_session(bs);
    int i = HANDLE_TO_INDEX(s, request->handle);
+
    s->recv_coroutine[i] = NULL;
-    if (s->in_flight-- == MAX_NBD_REQUESTS) {
+    s->in_flight--;
    qemu_co_queue_next(&s->free_sema);
+
+    /* Kick the read_reply_co to get the next reply.  */
+    if (s->read_reply_co) {
+        aio_co_wake(s->read_reply_co);
    }
 }

@@ -241,7 +228,7 @@ int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset,
    } else {
        nbd_co_receive_reply(client, &request, &reply, qiov);
    }
-    nbd_coroutine_end(client, &request);
+    nbd_coroutine_end(bs, &request);
    return -reply.error;
 }

@@ -271,7 +258,7 @@ int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset,
    } else {
        nbd_co_receive_reply(client, &request, &reply, NULL);
    }
-    nbd_coroutine_end(client, &request);
+    nbd_coroutine_end(bs, &request);
    return -reply.error;
 }

@@ -306,7 +293,7 @@ int nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
    } else {
        nbd_co_receive_reply(client, &request, &reply, NULL);
    }
-    nbd_coroutine_end(client, &request);
+    nbd_coroutine_end(bs, &request);
    return -reply.error;
 }

@@ -331,7 +318,7 @@ int nbd_client_co_flush(BlockDriverState *bs)
    } else {
        nbd_co_receive_reply(client, &request, &reply, NULL);
    }
-    nbd_coroutine_end(client, &request);
+    nbd_coroutine_end(bs, &request);
    return -reply.error;
 }

@@ -357,23 +344,23 @@ int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int count)
    } else {
        nbd_co_receive_reply(client, &request, &reply, NULL);
    }
-    nbd_coroutine_end(client, &request);
+    nbd_coroutine_end(bs, &request);
    return -reply.error;

 }

 void nbd_client_detach_aio_context(BlockDriverState *bs)
 {
-    aio_set_fd_handler(bdrv_get_aio_context(bs),
-                       nbd_get_client_session(bs)->sioc->fd,
-                       false, NULL, NULL, NULL, NULL);
+    NBDClientSession *client = nbd_get_client_session(bs);
+    qio_channel_detach_aio_context(QIO_CHANNEL(client->sioc));
 }

 void nbd_client_attach_aio_context(BlockDriverState *bs,
                                   AioContext *new_context)
 {
-    aio_set_fd_handler(new_context, nbd_get_client_session(bs)->sioc->fd,
-                       false, nbd_reply_ready, NULL, NULL, bs);
+    NBDClientSession *client = nbd_get_client_session(bs);
+    qio_channel_attach_aio_context(QIO_CHANNEL(client->sioc), new_context);
+    aio_co_schedule(new_context, client->read_reply_co);
 }

 void nbd_client_close(BlockDriverState *bs)
@@ -434,7 +421,7 @@ int nbd_client_init(BlockDriverState *bs,
    /* Now that we're connected, set the socket to be non-blocking and
     * kick the reply mechanism.  */
    qio_channel_set_blocking(QIO_CHANNEL(sioc), false, NULL);
-
+    client->read_reply_co = qemu_coroutine_create(nbd_read_reply_entry, client);
    nbd_client_attach_aio_context(bs, bdrv_get_aio_context(bs));

    logout("Established connection with NBD server\n");
--- a/block/nbd-client.h
+++ b/block/nbd-client.h
@@ -25,13 +25,11 @@ typedef struct NBDClientSession {

    CoMutex send_mutex;
    CoQueue free_sema;
-    Coroutine *send_coroutine;
+    Coroutine *read_reply_co;
    int in_flight;

    Coroutine *recv_coroutine[MAX_NBD_REQUESTS];
    NBDReply reply;
-
-    bool is_unix;
 } NBDClientSession;

 NBDClientSession *nbd_get_client_session(BlockDriverState *bs);
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -47,7 +47,7 @@ typedef struct BDRVNBDState {
    NBDClientSession client;

    /* For nbd_refresh_filename() */
-    SocketAddress *saddr;
+    SocketAddressFlat *saddr;
    char *export, *tlscredsid;
 } BDRVNBDState;

@@ -95,7 +95,7 @@ static int nbd_parse_uri(const char *filename, QDict *options)
            goto out;
        }
        qdict_put(options, "server.type", qstring_from_str("unix"));
-        qdict_put(options, "server.data.path",
+        qdict_put(options, "server.path",
                  qstring_from_str(qp->p[0].value));
    } else {
        QString *host;
@@ -116,10 +116,10 @@ static int nbd_parse_uri(const char *filename, QDict *options)
        }

        qdict_put(options, "server.type", qstring_from_str("inet"));
-        qdict_put(options, "server.data.host", host);
+        qdict_put(options, "server.host", host);

        port_str = g_strdup_printf("%d", uri->port ?: NBD_DEFAULT_PORT);
-        qdict_put(options, "server.data.port", qstring_from_str(port_str));
+        qdict_put(options, "server.port", qstring_from_str(port_str));
        g_free(port_str);
    }

@@ -197,7 +197,7 @@ static void nbd_parse_filename(const char *filename, QDict *options,
    /* are we a UNIX or TCP socket? */
    if (strstart(host_spec, "unix:", &unixpath)) {
        qdict_put(options, "server.type", qstring_from_str("unix"));
-        qdict_put(options, "server.data.path", qstring_from_str(unixpath));
+        qdict_put(options, "server.path", qstring_from_str(unixpath));
    } else {
        InetSocketAddress *addr = NULL;

@@ -207,8 +207,8 @@ static void nbd_parse_filename(const char *filename, QDict *options,
        }

        qdict_put(options, "server.type", qstring_from_str("inet"));
-        qdict_put(options, "server.data.host", qstring_from_str(addr->host));
-        qdict_put(options, "server.data.port", qstring_from_str(addr->port));
+        qdict_put(options, "server.host", qstring_from_str(addr->host));
+        qdict_put(options, "server.port", qstring_from_str(addr->port));
        qapi_free_InetSocketAddress(addr);
    }

@@ -248,20 +248,21 @@ static bool nbd_process_legacy_socket_options(QDict *output_options,
        }

        qdict_put(output_options, "server.type", qstring_from_str("unix"));
-        qdict_put(output_options, "server.data.path", qstring_from_str(path));
+        qdict_put(output_options, "server.path", qstring_from_str(path));
    } else if (host) {
        qdict_put(output_options, "server.type", qstring_from_str("inet"));
-        qdict_put(output_options, "server.data.host", qstring_from_str(host));
-        qdict_put(output_options, "server.data.port",
+        qdict_put(output_options, "server.host", qstring_from_str(host));
+        qdict_put(output_options, "server.port",
                  qstring_from_str(port ?: stringify(NBD_DEFAULT_PORT)));
    }

    return true;
 }

-static SocketAddress *nbd_config(BDRVNBDState *s, QDict *options, Error **errp)
+static SocketAddressFlat *nbd_config(BDRVNBDState *s, QDict *options,
+                                     Error **errp)
 {
-    SocketAddress *saddr = NULL;
+    SocketAddressFlat *saddr = NULL;
    QDict *addr = NULL;
    QObject *crumpled_addr = NULL;
    Visitor *iv = NULL;
@@ -278,15 +279,21 @@ static SocketAddress *nbd_config(BDRVNBDState *s, QDict *options, Error **errp)
        goto done;
    }

-    iv = qobject_input_visitor_new(crumpled_addr, true);
-    visit_type_SocketAddress(iv, NULL, &saddr, &local_err);
+    /*
+     * FIXME .numeric, .to, .ipv4 or .ipv6 don't work with -drive
+     * server.type=inet.  .to doesn't matter, it's ignored anyway.
+     * That's because when @options come from -blockdev or
+     * blockdev_add, members are typed according to the QAPI schema,
+     * but when they come from -drive, they're all QString.  The
+     * visitor expects the former.
+     */
+    iv = qobject_input_visitor_new(crumpled_addr);
+    visit_type_SocketAddressFlat(iv, NULL, &saddr, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        goto done;
    }

-    s->client.is_unix = saddr->type == SOCKET_ADDRESS_KIND_UNIX;
-
 done:
    QDECREF(addr);
    qobject_decref(crumpled_addr);
@@ -300,9 +307,10 @@ NBDClientSession *nbd_get_client_session(BlockDriverState *bs)
    return &s->client;
 }

-static QIOChannelSocket *nbd_establish_connection(SocketAddress *saddr,
+static QIOChannelSocket *nbd_establish_connection(SocketAddressFlat *saddr_flat,
                                                  Error **errp)
 {
+    SocketAddress *saddr = socket_address_crumple(saddr_flat);
    QIOChannelSocket *sioc;
    Error *local_err = NULL;

@@ -312,7 +320,9 @@ static QIOChannelSocket *nbd_establish_connection(SocketAddress *saddr,
    qio_channel_socket_connect_sync(sioc,
                                    saddr,
                                    &local_err);
+    qapi_free_SocketAddress(saddr);
    if (local_err) {
+        object_unref(OBJECT(sioc));
        error_propagate(errp, local_err);
        return NULL;
    }
@@ -403,7 +413,7 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags,
        goto error;
    }

-    /* Translate @host, @port, and @path to a SocketAddress */
+    /* Translate @host, @port, and @path to a SocketAddressFlat */
    if (!nbd_process_legacy_socket_options(options, opts, errp)) {
        goto error;
    }
@@ -423,11 +433,12 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags,
            goto error;
        }

-        if (s->saddr->type != SOCKET_ADDRESS_KIND_INET) {
+        /* TODO SOCKET_ADDRESS_KIND_FD where fd has AF_INET or AF_INET6 */
+        if (s->saddr->type != SOCKET_ADDRESS_FLAT_TYPE_INET) {
            error_setg(errp, "TLS only supported over IP sockets");
            goto error;
        }
-        hostname = s->saddr->u.inet.data->host;
+        hostname = s->saddr->u.inet.host;
    }

    /* establish TCP connection, return error if it fails
@@ -450,7 +461,7 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags,
        object_unref(OBJECT(tlscreds));
    }
    if (ret < 0) {
-        qapi_free_SocketAddress(s->saddr);
+        qapi_free_SocketAddressFlat(s->saddr);
        g_free(s->export);
        g_free(s->tlscredsid);
    }
@@ -476,7 +487,7 @@ static void nbd_close(BlockDriverState *bs)

    nbd_client_close(bs);

-    qapi_free_SocketAddress(s->saddr);
+    qapi_free_SocketAddressFlat(s->saddr);
    g_free(s->export);
    g_free(s->tlscredsid);
 }
@@ -507,15 +518,15 @@ static void nbd_refresh_filename(BlockDriverState *bs, QDict *options)
    Visitor *ov;
    const char *host = NULL, *port = NULL, *path = NULL;

-    if (s->saddr->type == SOCKET_ADDRESS_KIND_INET) {
-        const InetSocketAddress *inet = s->saddr->u.inet.data;
+    if (s->saddr->type == SOCKET_ADDRESS_FLAT_TYPE_INET) {
+        const InetSocketAddress *inet = &s->saddr->u.inet;
        if (!inet->has_ipv4 && !inet->has_ipv6 && !inet->has_to) {
            host = inet->host;
            port = inet->port;
        }
-    } else if (s->saddr->type == SOCKET_ADDRESS_KIND_UNIX) {
-        path = s->saddr->u.q_unix.data->path;
-    }
+    } else if (s->saddr->type == SOCKET_ADDRESS_FLAT_TYPE_UNIX) {
+        path = s->saddr->u.q_unix.path;
+    } /* else can't represent as pseudo-filename */

    qdict_put(opts, "driver", qstring_from_str("nbd"));

@@ -534,11 +545,9 @@ static void nbd_refresh_filename(BlockDriverState *bs, QDict *options)
    }

    ov = qobject_output_visitor_new(&saddr_qdict);
-    visit_type_SocketAddress(ov, NULL, &s->saddr, &error_abort);
+    visit_type_SocketAddressFlat(ov, NULL, &s->saddr, &error_abort);
    visit_complete(ov, &saddr_qdict);
    visit_free(ov);
-    assert(qobject_type(saddr_qdict) == QTYPE_QDICT);
-
    qdict_put_obj(opts, "server", saddr_qdict);

    if (s->export) {
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -54,6 +54,7 @@ typedef struct NFSClient {
    int events;
    bool has_zero_init;
    AioContext *aio_context;
+    QemuMutex mutex;
    blkcnt_t st_blocks;
    bool cache_used;
    NFSServer *server;
@@ -108,12 +109,13 @@ static int nfs_parse_uri(const char *filename, QDict *options, Error **errp)
    qdict_put(options, "path", qstring_from_str(uri->path));

    for (i = 0; i < qp->n; i++) {
+        unsigned long long val;
        if (!qp->p[i].value) {
            error_setg(errp, "Value for NFS parameter expected: %s",
                       qp->p[i].name);
            goto out;
        }
-        if (parse_uint_full(qp->p[i].value, NULL, 0)) {
+        if (parse_uint_full(qp->p[i].value, &val, 0)) {
            error_setg(errp, "Illegal value for NFS parameter: %s",
                       qp->p[i].name);
            goto out;
@@ -190,6 +192,7 @@ static void nfs_parse_filename(const char *filename, QDict *options,
 static void nfs_process_read(void *arg);
 static void nfs_process_write(void *arg);

+/* Called with QemuMutex held.  */
 static void nfs_set_events(NFSClient *client)
 {
    int ev = nfs_which_events(client->context);
@@ -207,15 +210,21 @@ static void nfs_set_events(NFSClient *client)
 static void nfs_process_read(void *arg)
 {
    NFSClient *client = arg;
+
+    qemu_mutex_lock(&client->mutex);
    nfs_service(client->context, POLLIN);
    nfs_set_events(client);
+    qemu_mutex_unlock(&client->mutex);
 }

 static void nfs_process_write(void *arg)
 {
    NFSClient *client = arg;
+
+    qemu_mutex_lock(&client->mutex);
    nfs_service(client->context, POLLOUT);
    nfs_set_events(client);
+    qemu_mutex_unlock(&client->mutex);
 }

 static void nfs_co_init_task(BlockDriverState *bs, NFSRPC *task)
@@ -230,10 +239,12 @@ static void nfs_co_init_task(BlockDriverState *bs, NFSRPC *task)
 static void nfs_co_generic_bh_cb(void *opaque)
 {
    NFSRPC *task = opaque;
+
    task->complete = 1;
-    qemu_coroutine_enter(task->co);
+    aio_co_wake(task->co);
 }

+/* Called (via nfs_service) with QemuMutex held.  */
 static void
 nfs_co_generic_cb(int ret, struct nfs_context *nfs, void *data,
                  void *private_data)
@@ -255,9 +266,9 @@ nfs_co_generic_cb(int ret, struct nfs_context *nfs, void *data,
                            nfs_co_generic_bh_cb, task);
 }

-static int coroutine_fn nfs_co_readv(BlockDriverState *bs,
-                                     int64_t sector_num, int nb_sectors,
-                                     QEMUIOVector *iov)
+static int coroutine_fn nfs_co_preadv(BlockDriverState *bs, uint64_t offset,
+                                      uint64_t bytes, QEMUIOVector *iov,
+                                      int flags)
 {
    NFSClient *client = bs->opaque;
    NFSRPC task;
@@ -265,14 +276,15 @@ static int coroutine_fn nfs_co_readv(BlockDriverState *bs,
    nfs_co_init_task(bs, &task);
    task.iov = iov;

+    qemu_mutex_lock(&client->mutex);
    if (nfs_pread_async(client->context, client->fh,
-                        sector_num * BDRV_SECTOR_SIZE,
-                        nb_sectors * BDRV_SECTOR_SIZE,
-                        nfs_co_generic_cb, &task) != 0) {
+                        offset, bytes, nfs_co_generic_cb, &task) != 0) {
+        qemu_mutex_unlock(&client->mutex);
        return -ENOMEM;
    }

    nfs_set_events(client);
+    qemu_mutex_unlock(&client->mutex);
    while (!task.complete) {
        qemu_coroutine_yield();
    }
@@ -289,39 +301,50 @@ static int coroutine_fn nfs_co_readv(BlockDriverState *bs,
    return 0;
 }

-static int coroutine_fn nfs_co_writev(BlockDriverState *bs,
-                                        int64_t sector_num, int nb_sectors,
-                                        QEMUIOVector *iov)
+static int coroutine_fn nfs_co_pwritev(BlockDriverState *bs, uint64_t offset,
+                                       uint64_t bytes, QEMUIOVector *iov,
+                                       int flags)
 {
    NFSClient *client = bs->opaque;
    NFSRPC task;
    char *buf = NULL;
+    bool my_buffer = false;

    nfs_co_init_task(bs, &task);

-    buf = g_try_malloc(nb_sectors * BDRV_SECTOR_SIZE);
-    if (nb_sectors && buf == NULL) {
+    if (iov->niov != 1) {
+        buf = g_try_malloc(bytes);
+        if (bytes && buf == NULL) {
            return -ENOMEM;
        }
+        qemu_iovec_to_buf(iov, 0, buf, bytes);
+        my_buffer = true;
+    } else {
+        buf = iov->iov[0].iov_base;
+    }

-    qemu_iovec_to_buf(iov, 0, buf, nb_sectors * BDRV_SECTOR_SIZE);
-
+    qemu_mutex_lock(&client->mutex);
    if (nfs_pwrite_async(client->context, client->fh,
-                         sector_num * BDRV_SECTOR_SIZE,
-                         nb_sectors * BDRV_SECTOR_SIZE,
-                         buf, nfs_co_generic_cb, &task) != 0) {
+                         offset, bytes, buf,
+                         nfs_co_generic_cb, &task) != 0) {
+        qemu_mutex_unlock(&client->mutex);
+        if (my_buffer) {
            g_free(buf);
+        }
        return -ENOMEM;
    }

    nfs_set_events(client);
+    qemu_mutex_unlock(&client->mutex);
    while (!task.complete) {
        qemu_coroutine_yield();
    }

+    if (my_buffer) {
        g_free(buf);
+    }

-    if (task.ret != nb_sectors * BDRV_SECTOR_SIZE) {
+    if (task.ret != bytes) {
        return task.ret < 0 ? task.ret : -EIO;
    }

@@ -335,12 +358,15 @@ static int coroutine_fn nfs_co_flush(BlockDriverState *bs)

    nfs_co_init_task(bs, &task);

+    qemu_mutex_lock(&client->mutex);
    if (nfs_fsync_async(client->context, client->fh, nfs_co_generic_cb,
                        &task) != 0) {
+        qemu_mutex_unlock(&client->mutex);
        return -ENOMEM;
    }

    nfs_set_events(client);
+    qemu_mutex_unlock(&client->mutex);
    while (!task.complete) {
        qemu_coroutine_yield();
    }
@@ -358,27 +384,27 @@ static QemuOptsList runtime_opts = {
            .help = "Path of the image on the host",
        },
        {
-            .name = "uid",
+            .name = "user",
            .type = QEMU_OPT_NUMBER,
            .help = "UID value to use when talking to the server",
        },
        {
-            .name = "gid",
+            .name = "group",
            .type = QEMU_OPT_NUMBER,
            .help = "GID value to use when talking to the server",
        },
        {
-            .name = "tcp-syncnt",
+            .name = "tcp-syn-count",
            .type = QEMU_OPT_NUMBER,
            .help = "Number of SYNs to send during the session establish",
        },
        {
-            .name = "readahead",
+            .name = "readahead-size",
            .type = QEMU_OPT_NUMBER,
            .help = "Set the readahead size in bytes",
        },
        {
-            .name = "pagecache",
+            .name = "page-cache-size",
            .type = QEMU_OPT_NUMBER,
            .help = "Set the pagecache size in bytes",
        },
@@ -426,6 +452,7 @@ static void nfs_file_close(BlockDriverState *bs)
 {
    NFSClient *client = bs->opaque;
    nfs_client_close(client);
+    qemu_mutex_destroy(&client->mutex);
 }

 static NFSServer *nfs_config(QDict *options, Error **errp)
@@ -447,7 +474,14 @@ static NFSServer *nfs_config(QDict *options, Error **errp)
        goto out;
    }

-    iv = qobject_input_visitor_new(crumpled_addr, true);
+    /*
+     * Caution: this works only because all scalar members of
+     * NFSServer are QString in @crumpled_addr.  The visitor expects
+     * @crumpled_addr to be typed according to the QAPI schema.  It
+     * is when @options come from -blockdev or blockdev_add.  But when
+     * they come from -drive, they're all QString.
+     */
+    iv = qobject_input_visitor_new(crumpled_addr);
    visit_type_NFSServer(iv, NULL, &server, &local_error);
    if (local_error) {
        error_propagate(errp, local_error);
@@ -463,7 +497,7 @@ out:


 static int64_t nfs_client_open(NFSClient *client, QDict *options,
-                               int flags, Error **errp, int open_flags)
+                               int flags, int open_flags, Error **errp)
 {
    int ret = -EINVAL;
    QemuOpts *opts = NULL;
@@ -507,29 +541,29 @@ static int64_t nfs_client_open(NFSClient *client, QDict *options,
        goto fail;
    }

-    if (qemu_opt_get(opts, "uid")) {
-        client->uid = qemu_opt_get_number(opts, "uid", 0);
+    if (qemu_opt_get(opts, "user")) {
+        client->uid = qemu_opt_get_number(opts, "user", 0);
        nfs_set_uid(client->context, client->uid);
    }

-    if (qemu_opt_get(opts, "gid")) {
-        client->gid = qemu_opt_get_number(opts, "gid", 0);
+    if (qemu_opt_get(opts, "group")) {
+        client->gid = qemu_opt_get_number(opts, "group", 0);
        nfs_set_gid(client->context, client->gid);
    }

-    if (qemu_opt_get(opts, "tcp-syncnt")) {
-        client->tcp_syncnt = qemu_opt_get_number(opts, "tcp-syncnt", 0);
+    if (qemu_opt_get(opts, "tcp-syn-count")) {
+        client->tcp_syncnt = qemu_opt_get_number(opts, "tcp-syn-count", 0);
        nfs_set_tcp_syncnt(client->context, client->tcp_syncnt);
    }

 #ifdef LIBNFS_FEATURE_READAHEAD
-    if (qemu_opt_get(opts, "readahead")) {
+    if (qemu_opt_get(opts, "readahead-size")) {
        if (open_flags & BDRV_O_NOCACHE) {
            error_setg(errp, "Cannot enable NFS readahead "
                             "if cache.direct = on");
            goto fail;
        }
-        client->readahead = qemu_opt_get_number(opts, "readahead", 0);
+        client->readahead = qemu_opt_get_number(opts, "readahead-size", 0);
        if (client->readahead > QEMU_NFS_MAX_READAHEAD_SIZE) {
            error_report("NFS Warning: Truncating NFS readahead "
                         "size to %d", QEMU_NFS_MAX_READAHEAD_SIZE);
@@ -544,13 +578,13 @@ static int64_t nfs_client_open(NFSClient *client, QDict *options,
 #endif

 #ifdef LIBNFS_FEATURE_PAGECACHE
-    if (qemu_opt_get(opts, "pagecache")) {
+    if (qemu_opt_get(opts, "page-cache-size")) {
        if (open_flags & BDRV_O_NOCACHE) {
            error_setg(errp, "Cannot enable NFS pagecache "
                             "if cache.direct = on");
            goto fail;
        }
-        client->pagecache = qemu_opt_get_number(opts, "pagecache", 0);
+        client->pagecache = qemu_opt_get_number(opts, "page-cache-size", 0);
        if (client->pagecache > QEMU_NFS_MAX_PAGECACHE_SIZE) {
            error_report("NFS Warning: Truncating NFS pagecache "
                         "size to %d pages", QEMU_NFS_MAX_PAGECACHE_SIZE);
@@ -629,10 +663,11 @@ static int nfs_file_open(BlockDriverState *bs, QDict *options, int flags,

    ret = nfs_client_open(client, options,
                          (flags & BDRV_O_RDWR) ? O_RDWR : O_RDONLY,
-                          errp, bs->open_flags);
+                          bs->open_flags, errp);
    if (ret < 0) {
        return ret;
    }
+    qemu_mutex_init(&client->mutex);
    bs->total_sectors = ret;
    ret = 0;
    return ret;
@@ -670,7 +705,7 @@ static int nfs_file_create(const char *url, QemuOpts *opts, Error **errp)
        goto out;
    }

-    ret = nfs_client_open(client, options, O_CREAT, errp, 0);
+    ret = nfs_client_open(client, options, O_CREAT, 0, errp);
    if (ret < 0) {
        goto out;
    }
@@ -688,6 +723,7 @@ static int nfs_has_zero_init(BlockDriverState *bs)
    return client->has_zero_init;
 }

+/* Called (via nfs_service) with QemuMutex held.  */
 static void
 nfs_get_allocated_file_size_cb(int ret, struct nfs_context *nfs, void *data,
                               void *private_data)
@@ -797,27 +833,25 @@ static void nfs_refresh_filename(BlockDriverState *bs, QDict *options)
    ov = qobject_output_visitor_new(&server_qdict);
    visit_type_NFSServer(ov, NULL, &client->server, &error_abort);
    visit_complete(ov, &server_qdict);
-    assert(qobject_type(server_qdict) == QTYPE_QDICT);
-
    qdict_put_obj(opts, "server", server_qdict);
    qdict_put(opts, "path", qstring_from_str(client->path));

    if (client->uid) {
-        qdict_put(opts, "uid", qint_from_int(client->uid));
+        qdict_put(opts, "user", qint_from_int(client->uid));
    }
    if (client->gid) {
-        qdict_put(opts, "gid", qint_from_int(client->gid));
+        qdict_put(opts, "group", qint_from_int(client->gid));
    }
    if (client->tcp_syncnt) {
-        qdict_put(opts, "tcp-syncnt",
+        qdict_put(opts, "tcp-syn-cnt",
                  qint_from_int(client->tcp_syncnt));
    }
    if (client->readahead) {
-        qdict_put(opts, "readahead",
+        qdict_put(opts, "readahead-size",
                  qint_from_int(client->readahead));
    }
    if (client->pagecache) {
-        qdict_put(opts, "pagecache",
+        qdict_put(opts, "page-cache-size",
                  qint_from_int(client->pagecache));
    }
    if (client->debug) {
@@ -855,8 +889,8 @@ static BlockDriver bdrv_nfs = {
    .bdrv_create                    = nfs_file_create,
    .bdrv_reopen_prepare            = nfs_reopen_prepare,

-    .bdrv_co_readv                  = nfs_co_readv,
-    .bdrv_co_writev                 = nfs_co_writev,
+    .bdrv_co_preadv                 = nfs_co_preadv,
+    .bdrv_co_pwritev                = nfs_co_pwritev,
    .bdrv_co_flush_to_disk          = nfs_co_flush,

    .bdrv_detach_aio_context        = nfs_detach_aio_context,
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -114,7 +114,7 @@ static QemuOptsList parallels_runtime_opts = {
            .name = PARALLELS_OPT_PREALLOC_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Preallocation size on image expansion",
-            .def_value_str = "128MiB",
+            .def_value_str = "128M",
        },
        {
            .name = PARALLELS_OPT_PREALLOC_MODE,
@@ -192,8 +192,7 @@ static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors, int *pnum)
 {
    BDRVParallelsState *s = bs->opaque;
-    uint32_t idx, to_allocate, i;
-    int64_t pos, space;
+    int64_t pos, space, idx, to_allocate, i;

    pos = block_status(s, sector_num, nb_sectors, pnum);
    if (pos > 0) {
@@ -201,11 +200,19 @@ static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num,
    }

    idx = sector_num / s->tracks;
-    if (idx >= s->bat_size) {
-        return -EINVAL;
-    }
-
    to_allocate = DIV_ROUND_UP(sector_num + *pnum, s->tracks) - idx;
+
+    /* This function is called only by parallels_co_writev(), which will never
+     * pass a sector_num at or beyond the end of the image (because the block
+     * layer never passes such a sector_num to that function). Therefore, idx
+     * is always below s->bat_size.
+     * block_status() will limit *pnum so that sector_num + *pnum will not
+     * exceed the image end. Therefore, idx + to_allocate cannot exceed
+     * s->bat_size.
+     * Note that s->bat_size is an unsigned int, therefore idx + to_allocate
+     * will always fit into a uint32_t. */
+    assert(idx < s->bat_size && idx + to_allocate <= s->bat_size);
+
    space = to_allocate * s->tracks;
    if (s->data_end + space > bdrv_getlength(bs->file->bs) >> BDRV_SECTOR_BITS) {
        int ret;
@@ -215,7 +222,7 @@ static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num,
                                     s->data_end << BDRV_SECTOR_BITS,
                                     space << BDRV_SECTOR_BITS, 0);
        } else {
-            ret = bdrv_truncate(bs->file->bs,
+            ret = bdrv_truncate(bs->file,
                                (s->data_end + space) << BDRV_SECTOR_BITS);
        }
        if (ret < 0) {
@@ -449,7 +456,7 @@ static int parallels_check(BlockDriverState *bs, BdrvCheckResult *res,
                size - res->image_end_offset);
        res->leaks += count;
        if (fix & BDRV_FIX_LEAKS) {
-            ret = bdrv_truncate(bs->file->bs, res->image_end_offset);
+            ret = bdrv_truncate(bs->file, res->image_end_offset);
            if (ret < 0) {
                res->check_errors++;
                return ret;
@@ -488,7 +495,8 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp)
    }

    file = blk_new_open(filename, NULL, NULL,
-                        BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
+                        BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
+                        &local_err);
    if (file == NULL) {
        error_propagate(errp, local_err);
        return -EIO;
@@ -581,6 +589,12 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
    Error *local_err = NULL;
    char *buf;

+    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
+                               false, errp);
+    if (!bs->file) {
+        return -EINVAL;
+    }
+
    ret = bdrv_pread(bs->file, 0, &ph, sizeof(ph));
    if (ret < 0) {
        goto fail;
@@ -680,8 +694,9 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
    if (local_err != NULL) {
        goto fail_options;
    }
-    if (!bdrv_has_zero_init(bs->file->bs) ||
-            bdrv_truncate(bs->file->bs, bdrv_getlength(bs->file->bs)) != 0) {
+
+    if (!(flags & BDRV_O_RESIZE) || !bdrv_has_zero_init(bs->file->bs) ||
+            bdrv_truncate(bs->file, bdrv_getlength(bs->file->bs)) != 0) {
        s->prealloc_mode = PRL_PREALLOC_MODE_FALLOCATE;
    }

@@ -724,7 +739,7 @@ static void parallels_close(BlockDriverState *bs)
    }

    if (bs->open_flags & BDRV_O_RDWR) {
-        bdrv_truncate(bs->file->bs, s->data_end << BDRV_SECTOR_BITS);
+        bdrv_truncate(bs->file, s->data_end << BDRV_SECTOR_BITS);
    }

    g_free(s->bat_dirty_bmap);
@@ -756,6 +771,7 @@ static BlockDriver bdrv_parallels = {
    .bdrv_probe		= parallels_probe,
    .bdrv_open		= parallels_open,
    .bdrv_close		= parallels_close,
+    .bdrv_child_perm          = bdrv_format_default_perms,
    .bdrv_co_get_block_status = parallels_co_get_block_status,
    .bdrv_has_zero_init       = bdrv_has_zero_init_1,
    .bdrv_co_flush_to_os      = parallels_co_flush_to_os,
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -237,8 +237,8 @@ void bdrv_query_image_info(BlockDriverState *bs,

    size = bdrv_getlength(bs);
    if (size < 0) {
-        error_setg_errno(errp, -size, "Can't get size of device '%s'",
-                         bdrv_get_device_name(bs));
+        error_setg_errno(errp, -size, "Can't get image size '%s'",
+                         bs->exact_filename);
        goto out;
    }

@@ -357,10 +357,6 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info,
    qapi_free_BlockInfo(info);
 }

-static BlockStats *bdrv_query_stats(BlockBackend *blk,
-                                    const BlockDriverState *bs,
-                                    bool query_backing);
-
 static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
 {
    BlockAcctStats *stats = blk_get_stats(blk);
@@ -428,9 +424,18 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
    }
 }

-static void bdrv_query_bds_stats(BlockStats *s, const BlockDriverState *bs,
+static BlockStats *bdrv_query_bds_stats(const BlockDriverState *bs,
                                 bool query_backing)
 {
+    BlockStats *s = NULL;
+
+    s = g_malloc0(sizeof(*s));
+    s->stats = g_malloc0(sizeof(*s->stats));
+
+    if (!bs) {
+        return s;
+    }
+
    if (bdrv_get_node_name(bs)[0]) {
        s->has_node_name = true;
        s->node_name = g_strdup(bdrv_get_node_name(bs));
@@ -440,32 +445,12 @@ static void bdrv_query_bds_stats(BlockStats *s, const BlockDriverState *bs,

    if (bs->file) {
        s->has_parent = true;
-        s->parent = bdrv_query_stats(NULL, bs->file->bs, query_backing);
+        s->parent = bdrv_query_bds_stats(bs->file->bs, query_backing);
    }

    if (query_backing && bs->backing) {
        s->has_backing = true;
-        s->backing = bdrv_query_stats(NULL, bs->backing->bs, query_backing);
-    }
-
-}
-
-static BlockStats *bdrv_query_stats(BlockBackend *blk,
-                                    const BlockDriverState *bs,
-                                    bool query_backing)
-{
-    BlockStats *s;
-
-    s = g_malloc0(sizeof(*s));
-    s->stats = g_malloc0(sizeof(*s->stats));
-
-    if (blk) {
-        s->has_device = true;
-        s->device = g_strdup(blk_name(blk));
-        bdrv_query_blk_stats(s->stats, blk);
-    }
-    if (bs) {
-        bdrv_query_bds_stats(s, bs, query_backing);
+        s->backing = bdrv_query_bds_stats(bs->backing->bs, query_backing);
    }

    return s;
@@ -494,43 +479,45 @@ BlockInfoList *qmp_query_block(Error **errp)
    return head;
 }

-static bool next_query_bds(BlockBackend **blk, BlockDriverState **bs,
-                           bool query_nodes)
-{
-    if (query_nodes) {
-        *bs = bdrv_next_node(*bs);
-        return !!*bs;
-    }
-
-    *blk = blk_next(*blk);
-    *bs = *blk ? blk_bs(*blk) : NULL;
-
-    return !!*blk;
-}
-
 BlockStatsList *qmp_query_blockstats(bool has_query_nodes,
                                     bool query_nodes,
                                     Error **errp)
 {
    BlockStatsList *head = NULL, **p_next = &head;
-    BlockBackend *blk = NULL;
-    BlockDriverState *bs = NULL;
+    BlockBackend *blk;
+    BlockDriverState *bs;

    /* Just to be safe if query_nodes is not always initialized */
-    query_nodes = has_query_nodes && query_nodes;
-
-    while (next_query_bds(&blk, &bs, query_nodes)) {
+    if (has_query_nodes && query_nodes) {
+        for (bs = bdrv_next_node(NULL); bs; bs = bdrv_next_node(bs)) {
            BlockStatsList *info = g_malloc0(sizeof(*info));
-        AioContext *ctx = blk ? blk_get_aio_context(blk)
-                              : bdrv_get_aio_context(bs);
+            AioContext *ctx = bdrv_get_aio_context(bs);

            aio_context_acquire(ctx);
-        info->value = bdrv_query_stats(blk, bs, !query_nodes);
+            info->value = bdrv_query_bds_stats(bs, false);
            aio_context_release(ctx);

            *p_next = info;
            p_next = &info->next;
        }
+    } else {
+        for (blk = blk_next(NULL); blk; blk = blk_next(blk)) {
+            BlockStatsList *info = g_malloc0(sizeof(*info));
+            AioContext *ctx = blk_get_aio_context(blk);
+            BlockStats *s;
+
+            aio_context_acquire(ctx);
+            s = bdrv_query_bds_stats(blk_bs(blk), true);
+            s->has_device = true;
+            s->device = g_strdup(blk_name(blk));
+            bdrv_query_blk_stats(s->stats, blk);
+            aio_context_release(ctx);
+
+            info->value = s;
+            *p_next = info;
+            p_next = &info->next;
+        }
+    }

    return head;
 }
@@ -695,7 +682,6 @@ void bdrv_image_info_specific_dump(fprintf_function func_fprintf, void *f,

    visit_type_ImageInfoSpecific(v, NULL, &info_spec, &error_abort);
    visit_complete(v, &obj);
-    assert(qobject_type(obj) == QTYPE_QDICT);
    data = qdict_get(qobject_to_qdict(obj), "data");
    dump_qobject(func_fprintf, f, 1, data);
    qobject_decref(obj);
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -106,6 +106,12 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
    QCowHeader header;
    Error *local_err = NULL;

+    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
+                               false, errp);
+    if (!bs->file) {
+        return -EINVAL;
+    }
+
    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
    if (ret < 0) {
        goto fail;
@@ -467,7 +473,7 @@ static uint64_t get_cluster_offset(BlockDriverState *bs,
                /* round to cluster size */
                cluster_offset = (cluster_offset + s->cluster_size - 1) &
                    ~(s->cluster_size - 1);
-                bdrv_truncate(bs->file->bs, cluster_offset + s->cluster_size);
+                bdrv_truncate(bs->file, cluster_offset + s->cluster_size);
                /* if encrypted, we must initialize the cluster
                   content which won't be written */
                if (bs->encrypted &&
@@ -817,7 +823,8 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp)
    }

    qcow_blk = blk_new_open(filename, NULL, NULL,
-                            BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
+                            BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
+                            &local_err);
    if (qcow_blk == NULL) {
        error_propagate(errp, local_err);
        ret = -EIO;
@@ -909,7 +916,7 @@ static int qcow_make_empty(BlockDriverState *bs)
    if (bdrv_pwrite_sync(bs->file, s->l1_table_offset, s->l1_table,
            l1_length) < 0)
        return -1;
-    ret = bdrv_truncate(bs->file->bs, s->l1_table_offset + l1_length);
+    ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length);
    if (ret < 0)
        return ret;

@@ -1046,6 +1053,7 @@ static BlockDriver bdrv_qcow = {
    .bdrv_probe		= qcow_probe,
    .bdrv_open		= qcow_open,
    .bdrv_close		= qcow_close,
+    .bdrv_child_perm        = bdrv_format_default_perms,
    .bdrv_reopen_prepare    = qcow_reopen_prepare,
    .bdrv_create            = qcow_create,
    .bdrv_has_zero_init     = bdrv_has_zero_init_1,
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -932,9 +932,7 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
            if (bytes == 0) {
                /* Wait for the dependency to complete. We need to recheck
                 * the free/allocated clusters when we continue. */
-                qemu_co_mutex_unlock(&s->lock);
-                qemu_co_queue_wait(&old_alloc->dependent_requests);
-                qemu_co_mutex_lock(&s->lock);
+                qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock);
                return -EAGAIN;
            }
        }
@@ -1521,12 +1519,10 @@ int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,

    end_offset = offset + (nb_sectors << BDRV_SECTOR_BITS);

-    /* Round start up and end down */
-    offset = align_offset(offset, s->cluster_size);
+    /* The caller must cluster-align start; round end down except at EOF */
+    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
+    if (end_offset != bs->total_sectors * BDRV_SECTOR_SIZE) {
        end_offset = start_of_cluster(s, end_offset);
-
-    if (offset > end_offset) {
-        return 0;
    }

    nb_clusters = size_to_clusters(s, end_offset - offset);
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -83,6 +83,16 @@ static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
 /*********************************************************/
 /* refcount handling */

+static void update_max_refcount_table_index(BDRVQcow2State *s)
+{
+    unsigned i = s->refcount_table_size - 1;
+    while (i > 0 && (s->refcount_table[i] & REFT_OFFSET_MASK) == 0) {
+        i--;
+    }
+    /* Set s->max_refcount_table_index to the index of the last used entry */
+    s->max_refcount_table_index = i;
+}
+
 int qcow2_refcount_init(BlockDriverState *bs)
 {
    BDRVQcow2State *s = bs->opaque;
@@ -111,6 +121,7 @@ int qcow2_refcount_init(BlockDriverState *bs)
        }
        for(i = 0; i < s->refcount_table_size; i++)
            be64_to_cpus(&s->refcount_table[i]);
+        update_max_refcount_table_index(s);
    }
    return 0;
 fail:
@@ -439,6 +450,10 @@ static int alloc_refcount_block(BlockDriverState *bs,
        }

        s->refcount_table[refcount_table_index] = new_block;
+        /* If there's a hole in s->refcount_table then it can happen
+         * that refcount_table_index < s->max_refcount_table_index */
+        s->max_refcount_table_index =
+            MAX(s->max_refcount_table_index, refcount_table_index);

        /* The new refcount block may be where the caller intended to put its
         * data, so let it restart the search. */
@@ -580,6 +595,7 @@ static int alloc_refcount_block(BlockDriverState *bs,
    s->refcount_table = new_table;
    s->refcount_table_size = table_size;
    s->refcount_table_offset = table_offset;
+    update_max_refcount_table_index(s);

    /* Free old table. */
    qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
@@ -1718,7 +1734,7 @@ static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
                    goto resize_fail;
                }

-                ret = bdrv_truncate(bs->file->bs, offset + s->cluster_size);
+                ret = bdrv_truncate(bs->file, offset + s->cluster_size);
                if (ret < 0) {
                    goto resize_fail;
                }
@@ -2171,6 +2187,7 @@ write_refblocks:
    s->refcount_table = on_disk_reftable;
    s->refcount_table_offset = reftable_offset;
    s->refcount_table_size = reftable_size;
+    update_max_refcount_table_index(s);

    return 0;

@@ -2383,7 +2400,11 @@ int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
    }

    if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
-        for (i = 0; i < s->refcount_table_size; i++) {
+        unsigned last_entry = s->max_refcount_table_index;
+        assert(last_entry < s->refcount_table_size);
+        assert(last_entry + 1 == s->refcount_table_size ||
+               (s->refcount_table[last_entry + 1] & REFT_OFFSET_MASK) == 0);
+        for (i = 0; i <= last_entry; i++) {
            if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
                overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
                s->cluster_size)) {
@@ -2871,6 +2892,7 @@ int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
    /* Now update the rest of the in-memory information */
    old_reftable = s->refcount_table;
    s->refcount_table = new_reftable;
+    update_max_refcount_table_index(s);

    s->refcount_bits = 1 << refcount_order;
    s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -814,7 +814,7 @@ static int qcow2_update_options(BlockDriverState *bs, QDict *options,
    return ret;
 }

-static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
+static int qcow2_do_open(BlockDriverState *bs, QDict *options, int flags,
                         Error **errp)
 {
    BDRVQcow2State *s = bs->opaque;
@@ -1205,6 +1205,18 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
    return ret;
 }

+static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
+                      Error **errp)
+{
+    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
+                               false, errp);
+    if (!bs->file) {
+        return -EINVAL;
+    }
+
+    return qcow2_do_open(bs, options, flags, errp);
+}
+
 static void qcow2_refresh_limits(BlockDriverState *bs, Error **errp)
 {
    BDRVQcow2State *s = bs->opaque;
@@ -1785,7 +1797,7 @@ static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp)
    options = qdict_clone_shallow(bs->options);

    flags &= ~BDRV_O_INACTIVE;
-    ret = qcow2_open(bs, options, flags, &local_err);
+    ret = qcow2_do_open(bs, options, flags, &local_err);
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
@@ -2190,7 +2202,8 @@ static int qcow2_create2(const char *filename, int64_t total_size,
    }

    blk = blk_new_open(filename, NULL, NULL,
-                       BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
+                       BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
+                       &local_err);
    if (blk == NULL) {
        error_propagate(errp, local_err);
        return -EIO;
@@ -2254,7 +2267,8 @@ static int qcow2_create2(const char *filename, int64_t total_size,
    options = qdict_new();
    qdict_put(options, "driver", qstring_from_str("qcow2"));
    blk = blk_new_open(filename, NULL, options,
-                       BDRV_O_RDWR | BDRV_O_NO_FLUSH, &local_err);
+                       BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_NO_FLUSH,
+                       &local_err);
    if (blk == NULL) {
        error_propagate(errp, local_err);
        ret = -EIO;
@@ -2570,7 +2584,7 @@ qcow2_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
        /* align end of file to a sector boundary to ease reading with
           sector based I/Os */
        cluster_offset = bdrv_getlength(bs->file->bs);
-        return bdrv_truncate(bs->file->bs, cluster_offset);
+        return bdrv_truncate(bs->file, cluster_offset);
    }

    buf = qemu_blockalign(bs, s->cluster_size);
@@ -2743,6 +2757,7 @@ static int make_completely_empty(BlockDriverState *bs)

    s->refcount_table_offset = s->cluster_size;
    s->refcount_table_size   = s->cluster_size / sizeof(uint64_t);
+    s->max_refcount_table_index = 0;

    g_free(s->refcount_table);
    s->refcount_table = new_reftable;
@@ -2783,7 +2798,7 @@ static int make_completely_empty(BlockDriverState *bs)
        goto fail;
    }

-    ret = bdrv_truncate(bs->file->bs, (3 + l1_clusters) * s->cluster_size);
+    ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size);
    if (ret < 0) {
        goto fail;
    }
@@ -3100,6 +3115,7 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
    uint64_t cluster_size = s->cluster_size;
    bool encrypt;
    int refcount_bits = s->refcount_bits;
+    Error *local_err = NULL;
    int ret;
    QemuOptDesc *desc = opts->list->desc;
    Qcow2AmendHelperCBInfo helper_cb_info;
@@ -3249,7 +3265,16 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
    }

    if (new_size) {
-        ret = bdrv_truncate(bs, new_size);
+        BlockBackend *blk = blk_new(BLK_PERM_RESIZE, BLK_PERM_ALL);
+        ret = blk_insert_bs(blk, bs, &local_err);
+        if (ret < 0) {
+            error_report_err(local_err);
+            blk_unref(blk);
+            return ret;
+        }
+
+        ret = blk_truncate(blk, new_size);
+        blk_unref(blk);
        if (ret < 0) {
            return ret;
        }
@@ -3386,6 +3411,7 @@ BlockDriver bdrv_qcow2 = {
    .bdrv_reopen_commit   = qcow2_reopen_commit,
    .bdrv_reopen_abort    = qcow2_reopen_abort,
    .bdrv_join_options    = qcow2_join_options,
+    .bdrv_child_perm      = bdrv_format_default_perms,
    .bdrv_create        = qcow2_create,
    .bdrv_has_zero_init = bdrv_has_zero_init_1,
    .bdrv_co_get_block_status = qcow2_co_get_block_status,
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -251,6 +251,7 @@ typedef struct BDRVQcow2State {
    uint64_t *refcount_table;
    uint64_t refcount_table_offset;
    uint32_t refcount_table_size;
+    uint32_t max_refcount_table_index; /* Last used entry in refcount_table */
    uint64_t free_cluster_index;
    uint64_t free_byte_offset;

--- a/block/qed-cluster.c
+++ b/block/qed-cluster.c
@@ -83,6 +83,7 @@ static void qed_find_cluster_cb(void *opaque, int ret)
    unsigned int index;
    unsigned int n;

+    qed_acquire(s);
    if (ret) {
        goto out;
    }
@@ -109,6 +110,7 @@ static void qed_find_cluster_cb(void *opaque, int ret)

 out:
    find_cluster_cb->cb(find_cluster_cb->opaque, ret, offset, len);
+    qed_release(s);
    g_free(find_cluster_cb);
 }

--- a/block/qed-table.c
+++ b/block/qed-table.c
@@ -31,6 +31,7 @@ static void qed_read_table_cb(void *opaque, int ret)
 {
    QEDReadTableCB *read_table_cb = opaque;
    QEDTable *table = read_table_cb->table;
+    BDRVQEDState *s = read_table_cb->s;
    int noffsets = read_table_cb->qiov.size / sizeof(uint64_t);
    int i;

@@ -40,13 +41,15 @@ static void qed_read_table_cb(void *opaque, int ret)
    }

    /* Byteswap offsets */
+    qed_acquire(s);
    for (i = 0; i < noffsets; i++) {
        table->offsets[i] = le64_to_cpu(table->offsets[i]);
    }
+    qed_release(s);

 out:
    /* Completion */
-    trace_qed_read_table_cb(read_table_cb->s, read_table_cb->table, ret);
+    trace_qed_read_table_cb(s, read_table_cb->table, ret);
    gencb_complete(&read_table_cb->gencb, ret);
 }

@@ -84,8 +87,9 @@ typedef struct {
 static void qed_write_table_cb(void *opaque, int ret)
 {
    QEDWriteTableCB *write_table_cb = opaque;
+    BDRVQEDState *s = write_table_cb->s;

-    trace_qed_write_table_cb(write_table_cb->s,
+    trace_qed_write_table_cb(s,
                             write_table_cb->orig_table,
                             write_table_cb->flush,
                             ret);
@@ -97,8 +101,10 @@ static void qed_write_table_cb(void *opaque, int ret)
    if (write_table_cb->flush) {
        /* We still need to flush first */
        write_table_cb->flush = false;
+        qed_acquire(s);
        bdrv_aio_flush(write_table_cb->s->bs, qed_write_table_cb,
                       write_table_cb);
+        qed_release(s);
        return;
    }

@@ -213,6 +219,7 @@ static void qed_read_l2_table_cb(void *opaque, int ret)
    CachedL2Table *l2_table = request->l2_table;
    uint64_t l2_offset = read_l2_table_cb->l2_offset;

+    qed_acquire(s);
    if (ret) {
        /* can't trust loaded L2 table anymore */
        qed_unref_l2_cache_entry(l2_table);
@@ -228,6 +235,7 @@ static void qed_read_l2_table_cb(void *opaque, int ret)
        request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
        assert(request->l2_table != NULL);
    }
+    qed_release(s);

    gencb_complete(&read_l2_table_cb->gencb, ret);
 }
--- a/block/qed.c
+++ b/block/qed.c
@@ -273,7 +273,19 @@ static CachedL2Table *qed_new_l2_table(BDRVQEDState *s)
    return l2_table;
 }

-static void qed_aio_next_io(void *opaque, int ret);
+static void qed_aio_next_io(QEDAIOCB *acb, int ret);
+
+static void qed_aio_start_io(QEDAIOCB *acb)
+{
+    qed_aio_next_io(acb, 0);
+}
+
+static void qed_aio_next_io_cb(void *opaque, int ret)
+{
+    QEDAIOCB *acb = opaque;
+
+    qed_aio_next_io(acb, ret);
+}

 static void qed_plug_allocating_write_reqs(BDRVQEDState *s)
 {
@@ -292,7 +304,7 @@ static void qed_unplug_allocating_write_reqs(BDRVQEDState *s)

    acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
    if (acb) {
-        qed_aio_next_io(acb, 0);
+        qed_aio_start_io(acb);
    }
 }

@@ -333,10 +345,22 @@ static void qed_need_check_timer_cb(void *opaque)

    trace_qed_need_check_timer_cb(s);

+    qed_acquire(s);
    qed_plug_allocating_write_reqs(s);

    /* Ensure writes are on disk before clearing flag */
    bdrv_aio_flush(s->bs->file->bs, qed_clear_need_check, s);
+    qed_release(s);
+}
+
+void qed_acquire(BDRVQEDState *s)
+{
+    aio_context_acquire(bdrv_get_aio_context(s->bs));
+}
+
+void qed_release(BDRVQEDState *s)
+{
+    aio_context_release(bdrv_get_aio_context(s->bs));
 }

 static void qed_start_need_check_timer(BDRVQEDState *s)
@@ -391,7 +415,7 @@ static void bdrv_qed_drain(BlockDriverState *bs)
    }
 }

-static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
+static int bdrv_qed_do_open(BlockDriverState *bs, QDict *options, int flags,
                            Error **errp)
 {
    BDRVQEDState *s = bs->opaque;
@@ -526,6 +550,18 @@ out:
    return ret;
 }

+static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
+                         Error **errp)
+{
+    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
+                               false, errp);
+    if (!bs->file) {
+        return -EINVAL;
+    }
+
+    return bdrv_qed_do_open(bs, options, flags, errp);
+}
+
 static void bdrv_qed_refresh_limits(BlockDriverState *bs, Error **errp)
 {
    BDRVQEDState *s = bs->opaque;
@@ -589,7 +625,8 @@ static int qed_create(const char *filename, uint32_t cluster_size,
    }

    blk = blk_new_open(filename, NULL, NULL,
-                       BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
+                       BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
+                       &local_err);
    if (blk == NULL) {
        error_propagate(errp, local_err);
        return -EIO;
@@ -721,7 +758,7 @@ static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t l
    }

    if (cb->co) {
-        qemu_coroutine_enter(cb->co);
+        aio_co_wake(cb->co);
    }
 }

@@ -918,6 +955,7 @@ static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index,
 static void qed_aio_complete_bh(void *opaque)
 {
    QEDAIOCB *acb = opaque;
+    BDRVQEDState *s = acb_to_s(acb);
    BlockCompletionFunc *cb = acb->common.cb;
    void *user_opaque = acb->common.opaque;
    int ret = acb->bh_ret;
@@ -925,7 +963,9 @@ static void qed_aio_complete_bh(void *opaque)
    qemu_aio_unref(acb);

    /* Invoke callback */
+    qed_acquire(s);
    cb(user_opaque, ret);
+    qed_release(s);
 }

 static void qed_aio_complete(QEDAIOCB *acb, int ret)
@@ -959,7 +999,7 @@ static void qed_aio_complete(QEDAIOCB *acb, int ret)
        QSIMPLEQ_REMOVE_HEAD(&s->allocating_write_reqs, next);
        acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
        if (acb) {
-            qed_aio_next_io(acb, 0);
+            qed_aio_start_io(acb);
        } else if (s->header.features & QED_F_NEED_CHECK) {
            qed_start_need_check_timer(s);
        }
@@ -984,7 +1024,7 @@ static void qed_commit_l2_update(void *opaque, int ret)
    acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
    assert(acb->request.l2_table != NULL);

-    qed_aio_next_io(opaque, ret);
+    qed_aio_next_io(acb, ret);
 }

 /**
@@ -1036,7 +1076,7 @@ static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset)
    } else {
        /* Write out only the updated part of the L2 table */
        qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters, false,
-                            qed_aio_next_io, acb);
+                           qed_aio_next_io_cb, acb);
    }
    return;

@@ -1088,7 +1128,7 @@ static void qed_aio_write_main(void *opaque, int ret)
    }

    if (acb->find_cluster_ret == QED_CLUSTER_FOUND) {
-        next_fn = qed_aio_next_io;
+        next_fn = qed_aio_next_io_cb;
    } else {
        if (s->bs->backing) {
            next_fn = qed_aio_write_flush_before_l2_update;
@@ -1201,7 +1241,7 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
    if (acb->flags & QED_AIOCB_ZERO) {
        /* Skip ahead if the clusters are already zero */
        if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
-            qed_aio_next_io(acb, 0);
+            qed_aio_start_io(acb);
            return;
        }

@@ -1321,18 +1361,18 @@ static void qed_aio_read_data(void *opaque, int ret,
    /* Handle zero cluster and backing file reads */
    if (ret == QED_CLUSTER_ZERO) {
        qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size);
-        qed_aio_next_io(acb, 0);
+        qed_aio_start_io(acb);
        return;
    } else if (ret != QED_CLUSTER_FOUND) {
        qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov,
-                              &acb->backing_qiov, qed_aio_next_io, acb);
+                              &acb->backing_qiov, qed_aio_next_io_cb, acb);
        return;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
    bdrv_aio_readv(bs->file, offset / BDRV_SECTOR_SIZE,
                   &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE,
-                   qed_aio_next_io, acb);
+                   qed_aio_next_io_cb, acb);
    return;

 err:
@@ -1342,9 +1382,8 @@ err:
 /**
 * Begin next I/O or complete the request
 */
-static void qed_aio_next_io(void *opaque, int ret)
+static void qed_aio_next_io(QEDAIOCB *acb, int ret)
 {
-    QEDAIOCB *acb = opaque;
    BDRVQEDState *s = acb_to_s(acb);
    QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ?
                                qed_aio_write_data : qed_aio_read_data;
@@ -1400,7 +1439,7 @@ static BlockAIOCB *qed_aio_setup(BlockDriverState *bs,
    qemu_iovec_init(&acb->cur_qiov, qiov->niov);

    /* Start request */
-    qed_aio_next_io(acb, 0);
+    qed_aio_start_io(acb);
    return &acb->common;
 }

@@ -1436,7 +1475,7 @@ static void coroutine_fn qed_co_pwrite_zeroes_cb(void *opaque, int ret)
    cb->done = true;
    cb->ret = ret;
    if (cb->co) {
-        qemu_coroutine_enter(cb->co);
+        aio_co_wake(cb->co);
    }
 }

@@ -1603,7 +1642,7 @@ static void bdrv_qed_invalidate_cache(BlockDriverState *bs, Error **errp)
    bdrv_qed_close(bs);

    memset(s, 0, sizeof(BDRVQEDState));
-    ret = bdrv_qed_open(bs, NULL, bs->open_flags, &local_err);
+    ret = bdrv_qed_do_open(bs, NULL, bs->open_flags, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        error_prepend(errp, "Could not reopen qed layer: ");
@@ -1666,6 +1705,7 @@ static BlockDriver bdrv_qed = {
    .bdrv_open                = bdrv_qed_open,
    .bdrv_close               = bdrv_qed_close,
    .bdrv_reopen_prepare      = bdrv_qed_reopen_prepare,
+    .bdrv_child_perm          = bdrv_format_default_perms,
    .bdrv_create              = bdrv_qed_create,
    .bdrv_has_zero_init       = bdrv_has_zero_init_1,
    .bdrv_co_get_block_status = bdrv_qed_co_get_block_status,
--- a/block/qed.h
+++ b/block/qed.h
@@ -198,6 +198,9 @@ enum {
 */
 typedef void QEDFindClusterFunc(void *opaque, int ret, uint64_t offset, size_t len);

+void qed_acquire(BDRVQEDState *s);
+void qed_release(BDRVQEDState *s);
+
 /**
 * Generic callback for chaining async callbacks
 */
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -1032,10 +1032,17 @@ static void quorum_add_child(BlockDriverState *bs, BlockDriverState *child_bs,

    /* We can safely add the child now */
    bdrv_ref(child_bs);
-    child = bdrv_attach_child(bs, child_bs, indexstr, &child_format);
+
+    child = bdrv_attach_child(bs, child_bs, indexstr, &child_format, errp);
+    if (child == NULL) {
+        s->next_child_index--;
+        bdrv_unref(child_bs);
+        goto out;
+    }
    s->children = g_renew(BdrvChild *, s->children, s->num_children + 1);
    s->children[s->num_children++] = child;

+out:
    bdrv_drained_end(bs);
 }

@@ -1126,6 +1133,8 @@ static BlockDriver bdrv_quorum = {
    .bdrv_add_child                     = quorum_add_child,
    .bdrv_del_child                     = quorum_del_child,

+    .bdrv_child_perm                    = bdrv_filter_default_perms,
+
    .is_filter                          = true,
    .bdrv_recurse_is_first_non_filter   = quorum_recurse_is_first_non_filter,
 };
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -341,7 +341,7 @@ static int raw_truncate(BlockDriverState *bs, int64_t offset)

    s->size = offset;
    offset += s->offset;
-    return bdrv_truncate(bs->file->bs, offset);
+    return bdrv_truncate(bs->file, offset);
 }

 static int raw_media_changed(BlockDriverState *bs)
@@ -384,6 +384,12 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
    BDRVRawState *s = bs->opaque;
    int ret;

+    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
+                               false, errp);
+    if (!bs->file) {
+        return -EINVAL;
+    }
+
    bs->sg = bs->file->bs->sg;
    bs->supported_write_flags = BDRV_REQ_FUA &
        bs->file->bs->supported_write_flags;
@@ -461,6 +467,7 @@ BlockDriver bdrv_raw = {
    .bdrv_reopen_abort    = &raw_reopen_abort,
    .bdrv_open            = &raw_open,
    .bdrv_close           = &raw_close,
+    .bdrv_child_perm      = bdrv_filter_default_perms,
    .bdrv_create          = &raw_create,
    .bdrv_co_preadv       = &raw_co_preadv,
    .bdrv_co_pwritev      = &raw_co_pwritev,
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -13,13 +13,14 @@

 #include "qemu/osdep.h"

+#include <rbd/librbd.h>
 #include "qapi/error.h"
 #include "qemu/error-report.h"
 #include "block/block_int.h"
 #include "crypto/secret.h"
 #include "qemu/cutils.h"
-
-#include <rbd/librbd.h>
+#include "qapi/qmp/qstring.h"
+#include "qapi/qmp/qjson.h"

 /*
 * When specifying the image filename use:
@@ -55,13 +56,15 @@

 #define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_OBJ_ORDER)

-#define RBD_MAX_CONF_NAME_SIZE 128
-#define RBD_MAX_CONF_VAL_SIZE 512
-#define RBD_MAX_CONF_SIZE 1024
-#define RBD_MAX_POOL_NAME_SIZE 128
-#define RBD_MAX_SNAP_NAME_SIZE 128
 #define RBD_MAX_SNAPS 100

+/* The LIBRBD_SUPPORTS_IOVEC is defined in librbd.h */
+#ifdef LIBRBD_SUPPORTS_IOVEC
+#define LIBRBD_USE_IOVEC 1
+#else
+#define LIBRBD_USE_IOVEC 0
+#endif
+
 typedef enum {
    RBD_AIO_READ,
    RBD_AIO_WRITE,
@@ -91,21 +94,16 @@ typedef struct BDRVRBDState {
    rados_t cluster;
    rados_ioctx_t io_ctx;
    rbd_image_t image;
-    char name[RBD_MAX_IMAGE_NAME_SIZE];
+    char *image_name;
    char *snap;
 } BDRVRBDState;

-static int qemu_rbd_next_tok(char *dst, int dst_len,
-                             char *src, char delim,
-                             const char *name,
-                             char **p, Error **errp)
+static char *qemu_rbd_next_tok(char *src, char delim, char **p)
 {
-    int l;
    char *end;

    *p = NULL;

-    if (delim != '\0') {
    for (end = src; *end; ++end) {
        if (*end == delim) {
            break;
@@ -118,19 +116,7 @@ static int qemu_rbd_next_tok(char *dst, int dst_len,
        *p = end + 1;
        *end = '\0';
    }
-    }
-    l = strlen(src);
-    if (l >= dst_len) {
-        error_setg(errp, "%s too long", name);
-        return -EINVAL;
-    } else if (l == 0) {
-        error_setg(errp, "%s too short", name);
-        return -EINVAL;
-    }
-
-    pstrcpy(dst, dst_len, src);
-
-    return 0;
+    return src;
 }

 static void qemu_rbd_unescape(char *src)
@@ -146,87 +132,92 @@ static void qemu_rbd_unescape(char *src)
    *p = '\0';
 }

-static int qemu_rbd_parsename(const char *filename,
-                              char *pool, int pool_len,
-                              char *snap, int snap_len,
-                              char *name, int name_len,
-                              char *conf, int conf_len,
+static void qemu_rbd_parse_filename(const char *filename, QDict *options,
                                    Error **errp)
 {
    const char *start;
    char *p, *buf;
-    int ret;
+    QList *keypairs = NULL;
+    char *found_str;

    if (!strstart(filename, "rbd:", &start)) {
        error_setg(errp, "File name must start with 'rbd:'");
-        return -EINVAL;
+        return;
    }

    buf = g_strdup(start);
    p = buf;
-    *snap = '\0';
-    *conf = '\0';

-    ret = qemu_rbd_next_tok(pool, pool_len, p,
-                            '/', "pool name", &p, errp);
-    if (ret < 0 || !p) {
-        ret = -EINVAL;
+    found_str = qemu_rbd_next_tok(p, '/', &p);
+    if (!p) {
+        error_setg(errp, "Pool name is required");
        goto done;
    }
-    qemu_rbd_unescape(pool);
+    qemu_rbd_unescape(found_str);
+    qdict_put(options, "pool", qstring_from_str(found_str));

    if (strchr(p, '@')) {
-        ret = qemu_rbd_next_tok(name, name_len, p,
-                                '@', "object name", &p, errp);
-        if (ret < 0) {
-            goto done;
-        }
-        ret = qemu_rbd_next_tok(snap, snap_len, p,
-                                ':', "snap name", &p, errp);
-        qemu_rbd_unescape(snap);
+        found_str = qemu_rbd_next_tok(p, '@', &p);
+        qemu_rbd_unescape(found_str);
+        qdict_put(options, "image", qstring_from_str(found_str));
+
+        found_str = qemu_rbd_next_tok(p, ':', &p);
+        qemu_rbd_unescape(found_str);
+        qdict_put(options, "snapshot", qstring_from_str(found_str));
    } else {
-        ret = qemu_rbd_next_tok(name, name_len, p,
-                                ':', "object name", &p, errp);
+        found_str = qemu_rbd_next_tok(p, ':', &p);
+        qemu_rbd_unescape(found_str);
+        qdict_put(options, "image", qstring_from_str(found_str));
    }
-    qemu_rbd_unescape(name);
-    if (ret < 0 || !p) {
+    if (!p) {
        goto done;
    }

-    ret = qemu_rbd_next_tok(conf, conf_len, p,
-                            '\0', "configuration", &p, errp);
+    /* The following are essentially all key/value pairs, and we treat
+     * 'id' and 'conf' a bit special.  Key/value pairs may be in any order. */
+    while (p) {
+        char *name, *value;
+        name = qemu_rbd_next_tok(p, '=', &p);
+        if (!p) {
+            error_setg(errp, "conf option %s has no value", name);
+            break;
+        }
+
+        qemu_rbd_unescape(name);
+
+        value = qemu_rbd_next_tok(p, ':', &p);
+        qemu_rbd_unescape(value);
+
+        if (!strcmp(name, "conf")) {
+            qdict_put(options, "conf", qstring_from_str(value));
+        } else if (!strcmp(name, "id")) {
+            qdict_put(options, "user" , qstring_from_str(value));
+        } else {
+            /*
+             * We pass these internally to qemu_rbd_set_keypairs(), so
+             * we can get away with the simpler list of [ "key1",
+             * "value1", "key2", "value2" ] rather than a raw dict
+             * { "key1": "value1", "key2": "value2" } where we can't
+             * guarantee order, or even a more correct but complex
+             * [ { "key1": "value1" }, { "key2": "value2" } ]
+             */
+            if (!keypairs) {
+                keypairs = qlist_new();
+            }
+            qlist_append(keypairs, qstring_from_str(name));
+            qlist_append(keypairs, qstring_from_str(value));
+        }
+    }
+
+    if (keypairs) {
+        qdict_put(options, "=keyvalue-pairs",
+                  qobject_to_json(QOBJECT(keypairs)));
+    }

 done:
    g_free(buf);
-    return ret;
-}
-
-static char *qemu_rbd_parse_clientname(const char *conf, char *clientname)
-{
-    const char *p = conf;
-
-    while (*p) {
-        int len;
-        const char *end = strchr(p, ':');
-
-        if (end) {
-            len = end - p;
-        } else {
-            len = strlen(p);
-        }
-
-        if (strncmp(p, "id=", 3) == 0) {
-            len -= 3;
-            strncpy(clientname, p + 3, len);
-            clientname[len] = '\0';
-            return clientname;
-        }
-        if (end == NULL) {
-            break;
-        }
-        p = end + 1;
-    }
-    return NULL;
+    QDECREF(keypairs);
+    return;
 }


@@ -249,94 +240,125 @@ static int qemu_rbd_set_auth(rados_t cluster, const char *secretid,
    return 0;
 }

-
-static int qemu_rbd_set_conf(rados_t cluster, const char *conf,
-                             bool only_read_conf_file,
+static int qemu_rbd_set_keypairs(rados_t cluster, const char *keypairs_json,
                                 Error **errp)
 {
-    char *p, *buf;
-    char name[RBD_MAX_CONF_NAME_SIZE];
-    char value[RBD_MAX_CONF_VAL_SIZE];
+    QList *keypairs;
+    QString *name;
+    QString *value;
+    const char *key;
+    size_t remaining;
    int ret = 0;

-    buf = g_strdup(conf);
-    p = buf;
+    if (!keypairs_json) {
+        return ret;
+    }
+    keypairs = qobject_to_qlist(qobject_from_json(keypairs_json,
+                                                  &error_abort));
+    remaining = qlist_size(keypairs) / 2;
+    assert(remaining);

-    while (p) {
-        ret = qemu_rbd_next_tok(name, sizeof(name), p,
-                                '=', "conf option name", &p, errp);
-        if (ret < 0) {
-            break;
-        }
-        qemu_rbd_unescape(name);
+    while (remaining--) {
+        name = qobject_to_qstring(qlist_pop(keypairs));
+        value = qobject_to_qstring(qlist_pop(keypairs));
+        assert(name && value);
+        key = qstring_get_str(name);

-        if (!p) {
-            error_setg(errp, "conf option %s has no value", name);
-            ret = -EINVAL;
-            break;
-        }
-
-        ret = qemu_rbd_next_tok(value, sizeof(value), p,
-                                ':', "conf option value", &p, errp);
+        ret = rados_conf_set(cluster, key, qstring_get_str(value));
+        QDECREF(name);
+        QDECREF(value);
        if (ret < 0) {
-            break;
-        }
-        qemu_rbd_unescape(value);
-
-        if (strcmp(name, "conf") == 0) {
-            /* read the conf file alone, so it doesn't override more
-               specific settings for a particular device */
-            if (only_read_conf_file) {
-                ret = rados_conf_read_file(cluster, value);
-                if (ret < 0) {
-                    error_setg_errno(errp, -ret, "error reading conf file %s",
-                                     value);
-                    break;
-                }
-            }
-        } else if (strcmp(name, "id") == 0) {
-            /* ignore, this is parsed by qemu_rbd_parse_clientname() */
-        } else if (!only_read_conf_file) {
-            ret = rados_conf_set(cluster, name, value);
-            if (ret < 0) {
-                error_setg_errno(errp, -ret, "invalid conf option %s", name);
+            error_setg_errno(errp, -ret, "invalid conf option %s", key);
            ret = -EINVAL;
            break;
        }
    }
-    }

-    g_free(buf);
+    QDECREF(keypairs);
    return ret;
 }

+static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs)
+{
+    if (LIBRBD_USE_IOVEC) {
+        RBDAIOCB *acb = rcb->acb;
+        iov_memset(acb->qiov->iov, acb->qiov->niov, offs, 0,
+                   acb->qiov->size - offs);
+    } else {
+        memset(rcb->buf + offs, 0, rcb->size - offs);
+    }
+}
+
+static QemuOptsList runtime_opts = {
+    .name = "rbd",
+    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+    .desc = {
+        {
+            .name = "pool",
+            .type = QEMU_OPT_STRING,
+            .help = "Rados pool name",
+        },
+        {
+            .name = "image",
+            .type = QEMU_OPT_STRING,
+            .help = "Image name in the pool",
+        },
+        {
+            .name = "conf",
+            .type = QEMU_OPT_STRING,
+            .help = "Rados config file location",
+        },
+        {
+            .name = "snapshot",
+            .type = QEMU_OPT_STRING,
+            .help = "Ceph snapshot name",
+        },
+        {
+            /* maps to 'id' in rados_create() */
+            .name = "user",
+            .type = QEMU_OPT_STRING,
+            .help = "Rados id name",
+        },
+        /*
+         * server.* extracted manually, see qemu_rbd_mon_host()
+         */
+        {
+            .name = "password-secret",
+            .type = QEMU_OPT_STRING,
+            .help = "ID of secret providing the password",
+        },
+
+        /*
+         * Keys for qemu_rbd_parse_filename(), not in the QAPI schema
+         */
+        {
+            /*
+             * HACK: name starts with '=' so that qemu_opts_parse()
+             * can't set it
+             */
+            .name = "=keyvalue-pairs",
+            .type = QEMU_OPT_STRING,
+            .help = "Legacy rados key/value option parameters",
+        },
+        { /* end of list */ }
+    },
+};
+
 static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp)
 {
    Error *local_err = NULL;
    int64_t bytes = 0;
    int64_t objsize;
    int obj_order = 0;
-    char pool[RBD_MAX_POOL_NAME_SIZE];
-    char name[RBD_MAX_IMAGE_NAME_SIZE];
-    char snap_buf[RBD_MAX_SNAP_NAME_SIZE];
-    char conf[RBD_MAX_CONF_SIZE];
-    char clientname_buf[RBD_MAX_CONF_SIZE];
-    char *clientname;
+    const char *pool, *image_name, *conf, *user, *keypairs;
    const char *secretid;
    rados_t cluster;
    rados_ioctx_t io_ctx;
-    int ret;
+    QDict *options = NULL;
+    int ret = 0;

    secretid = qemu_opt_get(opts, "password-secret");

-    if (qemu_rbd_parsename(filename, pool, sizeof(pool),
-                           snap_buf, sizeof(snap_buf),
-                           name, sizeof(name),
-                           conf, sizeof(conf), &local_err) < 0) {
-        error_propagate(errp, local_err);
-        return -EINVAL;
-    }
-
    /* Read out options */
    bytes = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
                     BDRV_SECTOR_SIZE);
@@ -344,35 +366,53 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp)
    if (objsize) {
        if ((objsize - 1) & objsize) {    /* not a power of 2? */
            error_setg(errp, "obj size needs to be power of 2");
-            return -EINVAL;
+            ret = -EINVAL;
+            goto exit;
        }
        if (objsize < 4096) {
            error_setg(errp, "obj size too small");
-            return -EINVAL;
+            ret = -EINVAL;
+            goto exit;
        }
        obj_order = ctz32(objsize);
    }

-    clientname = qemu_rbd_parse_clientname(conf, clientname_buf);
-    ret = rados_create(&cluster, clientname);
-    if (ret < 0) {
-        error_setg_errno(errp, -ret, "error initializing");
-        return ret;
+    options = qdict_new();
+    qemu_rbd_parse_filename(filename, options, &local_err);
+    if (local_err) {
+        ret = -EINVAL;
+        error_propagate(errp, local_err);
+        goto exit;
    }

-    if (strstr(conf, "conf=") == NULL) {
-        /* try default location, but ignore failure */
-        rados_conf_read_file(cluster, NULL);
-    } else if (conf[0] != '\0' &&
-               qemu_rbd_set_conf(cluster, conf, true, &local_err) < 0) {
-        error_propagate(errp, local_err);
+    /*
+     * Caution: while qdict_get_try_str() is fine, getting non-string
+     * types would require more care.  When @options come from -blockdev
+     * or blockdev_add, its members are typed according to the QAPI
+     * schema, but when they come from -drive, they're all QString.
+     */
+    pool       = qdict_get_try_str(options, "pool");
+    conf       = qdict_get_try_str(options, "conf");
+    user       = qdict_get_try_str(options, "user");
+    image_name = qdict_get_try_str(options, "image");
+    keypairs   = qdict_get_try_str(options, "=keyvalue-pairs");
+
+    ret = rados_create(&cluster, user);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "error initializing");
+        goto exit;
+    }
+
+    /* try default location when conf=NULL, but ignore failure */
+    ret = rados_conf_read_file(cluster, conf);
+    if (conf && ret < 0) {
+        error_setg_errno(errp, -ret, "error reading conf file %s", conf);
        ret = -EIO;
        goto shutdown;
    }

-    if (conf[0] != '\0' &&
-        qemu_rbd_set_conf(cluster, conf, false, &local_err) < 0) {
-        error_propagate(errp, local_err);
+    ret = qemu_rbd_set_keypairs(cluster, keypairs, errp);
+    if (ret < 0) {
        ret = -EIO;
        goto shutdown;
    }
@@ -394,7 +434,7 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp)
        goto shutdown;
    }

-    ret = rbd_create(io_ctx, name, bytes, &obj_order);
+    ret = rbd_create(io_ctx, image_name, bytes, &obj_order);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "error rbd create");
    }
@@ -403,6 +443,9 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp)

 shutdown:
    rados_shutdown(cluster);
+
+exit:
+    QDECREF(options);
    return ret;
 }

@@ -426,11 +469,11 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
        }
    } else {
        if (r < 0) {
-            memset(rcb->buf, 0, rcb->size);
+            qemu_rbd_memset(rcb, 0);
            acb->ret = r;
            acb->error = 1;
        } else if (r < rcb->size) {
-            memset(rcb->buf + r, 0, rcb->size - r);
+            qemu_rbd_memset(rcb, r);
            if (!acb->error) {
                acb->ret = rcb->size;
            }
@@ -441,92 +484,122 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)

    g_free(rcb);

+    if (!LIBRBD_USE_IOVEC) {
        if (acb->cmd == RBD_AIO_READ) {
            qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
        }
        qemu_vfree(acb->bounce);
+    }
+
    acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));

    qemu_aio_unref(acb);
 }

-/* TODO Convert to fine grained options */
-static QemuOptsList runtime_opts = {
-    .name = "rbd",
-    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
-    .desc = {
-        {
-            .name = "filename",
-            .type = QEMU_OPT_STRING,
-            .help = "Specification of the rbd image",
-        },
-        {
-            .name = "password-secret",
-            .type = QEMU_OPT_STRING,
-            .help = "ID of secret providing the password",
-        },
-        { /* end of list */ }
-    },
-};
+static char *qemu_rbd_mon_host(QDict *options, Error **errp)
+{
+    const char **vals = g_new(const char *, qdict_size(options) + 1);
+    char keybuf[32];
+    const char *host, *port;
+    char *rados_str;
+    int i;
+
+    for (i = 0;; i++) {
+        sprintf(keybuf, "server.%d.host", i);
+        host = qdict_get_try_str(options, keybuf);
+        qdict_del(options, keybuf);
+        sprintf(keybuf, "server.%d.port", i);
+        port = qdict_get_try_str(options, keybuf);
+        qdict_del(options, keybuf);
+        if (!host && !port) {
+            break;
+        }
+        if (!host) {
+            error_setg(errp, "Parameter server.%d.host is missing", i);
+            rados_str = NULL;
+            goto out;
+        }
+
+        if (strchr(host, ':')) {
+            vals[i] = port ? g_strdup_printf("[%s]:%s", host, port)
+                : g_strdup_printf("[%s]", host);
+        } else {
+            vals[i] = port ? g_strdup_printf("%s:%s", host, port)
+                : g_strdup(host);
+        }
+    }
+    vals[i] = NULL;
+
+    rados_str = i ? g_strjoinv(";", (char **)vals) : NULL;
+out:
+    g_strfreev((char **)vals);
+    return rados_str;
+}

 static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
                         Error **errp)
 {
    BDRVRBDState *s = bs->opaque;
-    char pool[RBD_MAX_POOL_NAME_SIZE];
-    char snap_buf[RBD_MAX_SNAP_NAME_SIZE];
-    char conf[RBD_MAX_CONF_SIZE];
-    char clientname_buf[RBD_MAX_CONF_SIZE];
-    char *clientname;
+    const char *pool, *snap, *conf, *user, *image_name, *keypairs;
    const char *secretid;
    QemuOpts *opts;
    Error *local_err = NULL;
-    const char *filename;
+    char *mon_host = NULL;
    int r;

    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
    qemu_opts_absorb_qdict(opts, options, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
-        qemu_opts_del(opts);
-        return -EINVAL;
-    }
-
-    filename = qemu_opt_get(opts, "filename");
-    secretid = qemu_opt_get(opts, "password-secret");
-
-    if (qemu_rbd_parsename(filename, pool, sizeof(pool),
-                           snap_buf, sizeof(snap_buf),
-                           s->name, sizeof(s->name),
-                           conf, sizeof(conf), errp) < 0) {
        r = -EINVAL;
        goto failed_opts;
    }

-    clientname = qemu_rbd_parse_clientname(conf, clientname_buf);
-    r = rados_create(&s->cluster, clientname);
+    mon_host = qemu_rbd_mon_host(options, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        r = -EINVAL;
+        goto failed_opts;
+    }
+
+    secretid = qemu_opt_get(opts, "password-secret");
+
+    pool           = qemu_opt_get(opts, "pool");
+    conf           = qemu_opt_get(opts, "conf");
+    snap           = qemu_opt_get(opts, "snapshot");
+    user           = qemu_opt_get(opts, "user");
+    image_name     = qemu_opt_get(opts, "image");
+    keypairs       = qemu_opt_get(opts, "=keyvalue-pairs");
+
+    if (!pool || !image_name) {
+        error_setg(errp, "Parameters 'pool' and 'image' are required");
+        r = -EINVAL;
+        goto failed_opts;
+    }
+
+    r = rados_create(&s->cluster, user);
    if (r < 0) {
        error_setg_errno(errp, -r, "error initializing");
        goto failed_opts;
    }

-    s->snap = NULL;
-    if (snap_buf[0] != '\0') {
-        s->snap = g_strdup(snap_buf);
+    s->snap = g_strdup(snap);
+    s->image_name = g_strdup(image_name);
+
+    /* try default location when conf=NULL, but ignore failure */
+    r = rados_conf_read_file(s->cluster, conf);
+    if (conf && r < 0) {
+        error_setg_errno(errp, -r, "error reading conf file %s", conf);
+        goto failed_shutdown;
    }

-    if (strstr(conf, "conf=") == NULL) {
-        /* try default location, but ignore failure */
-        rados_conf_read_file(s->cluster, NULL);
-    } else if (conf[0] != '\0') {
-        r = qemu_rbd_set_conf(s->cluster, conf, true, errp);
+    r = qemu_rbd_set_keypairs(s->cluster, keypairs, errp);
    if (r < 0) {
        goto failed_shutdown;
    }
-    }

-    if (conf[0] != '\0') {
-        r = qemu_rbd_set_conf(s->cluster, conf, false, errp);
+    if (mon_host) {
+        r = rados_conf_set(s->cluster, "mon_host", mon_host);
        if (r < 0) {
            goto failed_shutdown;
        }
@@ -562,13 +635,23 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
        goto failed_shutdown;
    }

-    r = rbd_open(s->io_ctx, s->name, &s->image, s->snap);
+    /* rbd_open is always r/w */
+    r = rbd_open(s->io_ctx, s->image_name, &s->image, s->snap);
    if (r < 0) {
-        error_setg_errno(errp, -r, "error reading header from %s", s->name);
+        error_setg_errno(errp, -r, "error reading header from %s",
+                         s->image_name);
        goto failed_open;
    }

-    bs->read_only = (s->snap != NULL);
+    /* If we are using an rbd snapshot, we must be r/o, otherwise
+     * leave as-is */
+    if (s->snap != NULL) {
+        r = bdrv_set_read_only(bs, true, &local_err);
+        if (r < 0) {
+            error_propagate(errp, local_err);
+            goto failed_open;
+        }
+    }

    qemu_opts_del(opts);
    return 0;
@@ -578,11 +661,33 @@ failed_open:
 failed_shutdown:
    rados_shutdown(s->cluster);
    g_free(s->snap);
+    g_free(s->image_name);
 failed_opts:
    qemu_opts_del(opts);
+    g_free(mon_host);
    return r;
 }

+
+/* Since RBD is currently always opened R/W via the API,
+ * we just need to check if we are using a snapshot or not, in
+ * order to determine if we will allow it to be R/W */
+static int qemu_rbd_reopen_prepare(BDRVReopenState *state,
+                                   BlockReopenQueue *queue, Error **errp)
+{
+    BDRVRBDState *s = state->bs->opaque;
+    int ret = 0;
+
+    if (s->snap && state->flags & BDRV_O_RDWR) {
+        error_setg(errp,
+                   "Cannot change node '%s' to r/w when using RBD snapshot",
+                   bdrv_get_device_or_node_name(state->bs));
+        ret = -EINVAL;
+    }
+
+    return ret;
+}
+
 static void qemu_rbd_close(BlockDriverState *bs)
 {
    BDRVRBDState *s = bs->opaque;
@@ -590,6 +695,7 @@ static void qemu_rbd_close(BlockDriverState *bs)
    rbd_close(s->image);
    rados_ioctx_destroy(s->io_ctx);
    g_free(s->snap);
+    g_free(s->image_name);
    rados_shutdown(s->cluster);
 }

@@ -655,7 +761,6 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,
    RBDAIOCB *acb;
    RADOSCB *rcb = NULL;
    rbd_completion_t c;
-    char *buf;
    int r;

    BDRVRBDState *s = bs->opaque;
@@ -664,6 +769,10 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,
    acb->cmd = cmd;
    acb->qiov = qiov;
    assert(!qiov || qiov->size == size);
+
+    rcb = g_new(RADOSCB, 1);
+
+    if (!LIBRBD_USE_IOVEC) {
        if (cmd == RBD_AIO_DISCARD || cmd == RBD_AIO_FLUSH) {
            acb->bounce = NULL;
        } else {
@@ -672,19 +781,17 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,
                goto failed;
            }
        }
+        if (cmd == RBD_AIO_WRITE) {
+            qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
+        }
+        rcb->buf = acb->bounce;
+    }
+
    acb->ret = 0;
    acb->error = 0;
    acb->s = s;

-    if (cmd == RBD_AIO_WRITE) {
-        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
-    }
-
-    buf = acb->bounce;
-
-    rcb = g_new(RADOSCB, 1);
    rcb->acb = acb;
-    rcb->buf = buf;
    rcb->s = acb->s;
    rcb->size = size;
    r = rbd_aio_create_completion(rcb, (rbd_callback_t) rbd_finish_aiocb, &c);
@@ -694,10 +801,18 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,

    switch (cmd) {
    case RBD_AIO_WRITE:
-        r = rbd_aio_write(s->image, off, size, buf, c);
+#ifdef LIBRBD_SUPPORTS_IOVEC
+            r = rbd_aio_writev(s->image, qiov->iov, qiov->niov, off, c);
+#else
+            r = rbd_aio_write(s->image, off, size, rcb->buf, c);
+#endif
        break;
    case RBD_AIO_READ:
-        r = rbd_aio_read(s->image, off, size, buf, c);
+#ifdef LIBRBD_SUPPORTS_IOVEC
+            r = rbd_aio_readv(s->image, qiov->iov, qiov->niov, off, c);
+#else
+            r = rbd_aio_read(s->image, off, size, rcb->buf, c);
+#endif
        break;
    case RBD_AIO_DISCARD:
        r = rbd_aio_discard_wrapper(s->image, off, size, c);
@@ -712,14 +827,16 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,
    if (r < 0) {
        goto failed_completion;
    }
-
    return &acb->common;

 failed_completion:
    rbd_aio_release(c);
 failed:
    g_free(rcb);
+    if (!LIBRBD_USE_IOVEC) {
        qemu_vfree(acb->bounce);
+    }
+
    qemu_aio_unref(acb);
    return NULL;
 }
@@ -974,9 +1091,10 @@ static QemuOptsList qemu_rbd_create_opts = {
 static BlockDriver bdrv_rbd = {
    .format_name            = "rbd",
    .instance_size          = sizeof(BDRVRBDState),
-    .bdrv_needs_filename = true,
+    .bdrv_parse_filename    = qemu_rbd_parse_filename,
    .bdrv_file_open         = qemu_rbd_open,
    .bdrv_close             = qemu_rbd_close,
+    .bdrv_reopen_prepare    = qemu_rbd_reopen_prepare,
    .bdrv_create            = qemu_rbd_create,
    .bdrv_has_zero_init     = bdrv_has_zero_init_1,
    .bdrv_get_info          = qemu_rbd_getinfo,
--- a/block/replication.c
+++ b/block/replication.c
@@ -86,6 +86,12 @@ static int replication_open(BlockDriverState *bs, QDict *options,
    const char *mode;
    const char *top_id;

+    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
+                               false, errp);
+    if (!bs->file) {
+        return -EINVAL;
+    }
+
    ret = -EINVAL;
    opts = qemu_opts_create(&replication_runtime_opts, NULL, 0, &error_abort);
    qemu_opts_absorb_qdict(opts, options, &local_err);
@@ -149,6 +155,18 @@ static void replication_close(BlockDriverState *bs)
    replication_remove(s->rs);
 }

+static void replication_child_perm(BlockDriverState *bs, BdrvChild *c,
+                                   const BdrvChildRole *role,
+                                   uint64_t perm, uint64_t shared,
+                                   uint64_t *nperm, uint64_t *nshared)
+{
+    *nperm = *nshared = BLK_PERM_CONSISTENT_READ \
+                        | BLK_PERM_WRITE \
+                        | BLK_PERM_WRITE_UNCHANGED;
+
+    return;
+}
+
 static int64_t replication_getlength(BlockDriverState *bs)
 {
    return bdrv_getlength(bs->file->bs);
@@ -638,7 +656,7 @@ static void replication_stop(ReplicationState *rs, bool failover, Error **errp)
        s->replication_state = BLOCK_REPLICATION_FAILOVER;
        commit_active_start(NULL, s->active_disk->bs, s->secondary_disk->bs,
                            BLOCK_JOB_INTERNAL, 0, BLOCKDEV_ON_ERROR_REPORT,
-                            replication_done, bs, errp, true);
+                            NULL, replication_done, bs, true, errp);
        break;
    default:
        aio_context_release(aio_context);
@@ -654,6 +672,7 @@ BlockDriver bdrv_replication = {

    .bdrv_open                  = replication_open,
    .bdrv_close                 = replication_close,
+    .bdrv_child_perm            = replication_child_perm,

    .bdrv_getlength             = replication_getlength,
    .bdrv_co_readv              = replication_co_readv,
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -13,7 +13,11 @@
 */

 #include "qemu/osdep.h"
+#include "qapi-visit.h"
 #include "qapi/error.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qint.h"
+#include "qapi/qobject-input-visitor.h"
 #include "qemu/uri.h"
 #include "qemu/error-report.h"
 #include "qemu/sockets.h"
@@ -374,8 +378,7 @@ struct BDRVSheepdogState {
    uint32_t cache_flags;
    bool discard_supported;

-    char *host_spec;
-    bool is_unix;
+    SocketAddress *addr;
    int fd;

    CoMutex lock;
@@ -486,7 +489,7 @@ static void wait_for_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *acb)
 retry:
    QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) {
        if (AIOCBOverlapping(acb, cb)) {
-            qemu_co_queue_wait(&s->overlapping_queue);
+            qemu_co_queue_wait(&s->overlapping_queue, NULL);
            goto retry;
        }
    }
@@ -527,23 +530,79 @@ static void sd_aio_setup(SheepdogAIOCB *acb, BDRVSheepdogState *s,
    QLIST_INSERT_HEAD(&s->inflight_aiocb_head, acb, aiocb_siblings);
 }

+static SocketAddress *sd_socket_address(const char *path,
+                                        const char *host, const char *port)
+{
+    SocketAddress *addr = g_new0(SocketAddress, 1);
+
+    if (path) {
+        addr->type = SOCKET_ADDRESS_KIND_UNIX;
+        addr->u.q_unix.data = g_new0(UnixSocketAddress, 1);
+        addr->u.q_unix.data->path = g_strdup(path);
+    } else {
+        addr->type = SOCKET_ADDRESS_KIND_INET;
+        addr->u.inet.data = g_new0(InetSocketAddress, 1);
+        addr->u.inet.data->host = g_strdup(host ?: SD_DEFAULT_ADDR);
+        addr->u.inet.data->port = g_strdup(port ?: stringify(SD_DEFAULT_PORT));
+    }
+
+    return addr;
+}
+
+static SocketAddress *sd_server_config(QDict *options, Error **errp)
+{
+    QDict *server = NULL;
+    QObject *crumpled_server = NULL;
+    Visitor *iv = NULL;
+    SocketAddressFlat *saddr_flat = NULL;
+    SocketAddress *saddr = NULL;
+    Error *local_err = NULL;
+
+    qdict_extract_subqdict(options, &server, "server.");
+
+    crumpled_server = qdict_crumple(server, errp);
+    if (!crumpled_server) {
+        goto done;
+    }
+
+    /*
+     * FIXME .numeric, .to, .ipv4 or .ipv6 don't work with -drive
+     * server.type=inet.  .to doesn't matter, it's ignored anyway.
+     * That's because when @options come from -blockdev or
+     * blockdev_add, members are typed according to the QAPI schema,
+     * but when they come from -drive, they're all QString.  The
+     * visitor expects the former.
+     */
+    iv = qobject_input_visitor_new(crumpled_server);
+    visit_type_SocketAddressFlat(iv, NULL, &saddr_flat, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        goto done;
+    }
+
+    saddr = socket_address_crumple(saddr_flat);
+
+done:
+    qapi_free_SocketAddressFlat(saddr_flat);
+    visit_free(iv);
+    qobject_decref(crumpled_server);
+    QDECREF(server);
+    return saddr;
+}
+
 /* Return -EIO in case of error, file descriptor on success */
 static int connect_to_sdog(BDRVSheepdogState *s, Error **errp)
 {
    int fd;

-    if (s->is_unix) {
-        fd = unix_connect(s->host_spec, errp);
-    } else {
-        fd = inet_connect(s->host_spec, errp);
+    fd = socket_connect(s->addr, NULL, NULL, errp);

-        if (fd >= 0) {
+    if (s->addr->type == SOCKET_ADDRESS_KIND_INET && fd >= 0) {
        int ret = socket_set_nodelay(fd);
        if (ret < 0) {
            error_report("%s", strerror(errno));
        }
    }
-    }

    if (fd >= 0) {
        qemu_set_nonblock(fd);
@@ -575,13 +634,6 @@ static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
    return ret;
 }

-static void restart_co_req(void *opaque)
-{
-    Coroutine *co = opaque;
-
-    qemu_coroutine_enter(co);
-}
-
 typedef struct SheepdogReqCo {
    int sockfd;
    BlockDriverState *bs;
@@ -592,12 +644,19 @@ typedef struct SheepdogReqCo {
    unsigned int *rlen;
    int ret;
    bool finished;
+    Coroutine *co;
 } SheepdogReqCo;

+static void restart_co_req(void *opaque)
+{
+    SheepdogReqCo *srco = opaque;
+
+    aio_co_wake(srco->co);
+}
+
 static coroutine_fn void do_co_req(void *opaque)
 {
    int ret;
-    Coroutine *co;
    SheepdogReqCo *srco = opaque;
    int sockfd = srco->sockfd;
    SheepdogReq *hdr = srco->hdr;
@@ -605,9 +664,9 @@ static coroutine_fn void do_co_req(void *opaque)
    unsigned int *wlen = srco->wlen;
    unsigned int *rlen = srco->rlen;

-    co = qemu_coroutine_self();
+    srco->co = qemu_coroutine_self();
    aio_set_fd_handler(srco->aio_context, sockfd, false,
-                       NULL, restart_co_req, NULL, co);
+                       NULL, restart_co_req, NULL, srco);

    ret = send_co_req(sockfd, hdr, data, wlen);
    if (ret < 0) {
@@ -615,7 +674,7 @@ static coroutine_fn void do_co_req(void *opaque)
    }

    aio_set_fd_handler(srco->aio_context, sockfd, false,
-                       restart_co_req, NULL, NULL, co);
+                       restart_co_req, NULL, NULL, srco);

    ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr));
    if (ret != sizeof(*hdr)) {
@@ -643,6 +702,7 @@ out:
    aio_set_fd_handler(srco->aio_context, sockfd, false,
                       NULL, NULL, NULL, NULL);

+    srco->co = NULL;
    srco->ret = ret;
    srco->finished = true;
    if (srco->bs) {
@@ -676,7 +736,7 @@ static int do_req(int sockfd, BlockDriverState *bs, SheepdogReq *hdr,
    } else {
        co = qemu_coroutine_create(do_co_req, &srco);
        if (bs) {
-            qemu_coroutine_enter(co);
+            bdrv_coroutine_enter(bs, co);
            BDRV_POLL_WHILE(bs, !srco.finished);
        } else {
            qemu_coroutine_enter(co);
@@ -819,8 +879,7 @@ static void coroutine_fn aio_read_response(void *opaque)
    case AIOCB_DISCARD_OBJ:
        switch (rsp.result) {
        case SD_RES_INVALID_PARMS:
-            error_report("sheep(%s) doesn't support discard command",
-                         s->host_spec);
+            error_report("server doesn't support discard command");
            rsp.result = SD_RES_SUCCESS;
            s->discard_supported = false;
            break;
@@ -866,7 +925,7 @@ static void coroutine_fn aio_read_response(void *opaque)
         * We've finished all requests which belong to the AIOCB, so
         * we can switch back to sd_co_readv/writev now.
         */
-        qemu_coroutine_enter(acb->coroutine);
+        aio_co_wake(acb->coroutine);
    }

    return;
@@ -883,14 +942,14 @@ static void co_read_response(void *opaque)
        s->co_recv = qemu_coroutine_create(aio_read_response, opaque);
    }

-    qemu_coroutine_enter(s->co_recv);
+    aio_co_enter(s->aio_context, s->co_recv);
 }

 static void co_write_request(void *opaque)
 {
    BDRVSheepdogState *s = opaque;

-    qemu_coroutine_enter(s->co_send);
+    aio_co_wake(s->co_send);
 }

 /*
@@ -913,71 +972,155 @@ static int get_sheep_fd(BDRVSheepdogState *s, Error **errp)
    return fd;
 }

-static int sd_parse_uri(BDRVSheepdogState *s, const char *filename,
-                        char *vdi, uint32_t *snapid, char *tag)
+/*
+ * Parse numeric snapshot ID in @str
+ * If @str can't be parsed as number, return false.
+ * Else, if the number is zero or too large, set *@snapid to zero and
+ * return true.
+ * Else, set *@snapid to the number and return true.
+ */
+static bool sd_parse_snapid(const char *str, uint32_t *snapid)
 {
-    URI *uri;
-    QueryParams *qp = NULL;
-    int ret = 0;
+    unsigned long ul;
+    int ret;

-    uri = uri_parse(filename);
+    ret = qemu_strtoul(str, NULL, 10, &ul);
+    if (ret == -ERANGE) {
+        ul = ret = 0;
+    }
+    if (ret) {
+        return false;
+    }
+    if (ul > UINT32_MAX) {
+        ul = 0;
+    }
+
+    *snapid = ul;
+    return true;
+}
+
+static bool sd_parse_snapid_or_tag(const char *str,
+                                   uint32_t *snapid, char tag[])
+{
+    if (!sd_parse_snapid(str, snapid)) {
+        *snapid = 0;
+        if (g_strlcpy(tag, str, SD_MAX_VDI_TAG_LEN) >= SD_MAX_VDI_TAG_LEN) {
+            return false;
+        }
+    } else if (!*snapid) {
+        return false;
+    } else {
+        tag[0] = 0;
+    }
+    return true;
+}
+
+typedef struct {
+    const char *path;           /* non-null iff transport is tcp */
+    const char *host;           /* valid when transport is tcp */
+    int port;                   /* valid when transport is tcp */
+    char vdi[SD_MAX_VDI_LEN];
+    char tag[SD_MAX_VDI_TAG_LEN];
+    uint32_t snap_id;
+    /* Remainder is only for sd_config_done() */
+    URI *uri;
+    QueryParams *qp;
+} SheepdogConfig;
+
+static void sd_config_done(SheepdogConfig *cfg)
+{
+    if (cfg->qp) {
+        query_params_free(cfg->qp);
+    }
+    uri_free(cfg->uri);
+}
+
+static void sd_parse_uri(SheepdogConfig *cfg, const char *filename,
+                         Error **errp)
+{
+    Error *err = NULL;
+    QueryParams *qp = NULL;
+    bool is_unix;
+    URI *uri;
+
+    memset(cfg, 0, sizeof(*cfg));
+
+    cfg->uri = uri = uri_parse(filename);
    if (!uri) {
-        return -EINVAL;
+        error_setg(&err, "invalid URI");
+        goto out;
    }

    /* transport */
    if (!strcmp(uri->scheme, "sheepdog")) {
-        s->is_unix = false;
+        is_unix = false;
    } else if (!strcmp(uri->scheme, "sheepdog+tcp")) {
-        s->is_unix = false;
+        is_unix = false;
    } else if (!strcmp(uri->scheme, "sheepdog+unix")) {
-        s->is_unix = true;
+        is_unix = true;
    } else {
-        ret = -EINVAL;
+        error_setg(&err, "URI scheme must be 'sheepdog', 'sheepdog+tcp',"
+                   " or 'sheepdog+unix'");
        goto out;
    }

    if (uri->path == NULL || !strcmp(uri->path, "/")) {
-        ret = -EINVAL;
+        error_setg(&err, "missing file path in URI");
        goto out;
    }
-    pstrcpy(vdi, SD_MAX_VDI_LEN, uri->path + 1);
-
-    qp = query_params_parse(uri->query);
-    if (qp->n > 1 || (s->is_unix && !qp->n) || (!s->is_unix && qp->n)) {
-        ret = -EINVAL;
+    if (g_strlcpy(cfg->vdi, uri->path + 1, SD_MAX_VDI_LEN)
+        >= SD_MAX_VDI_LEN) {
+        error_setg(&err, "VDI name is too long");
        goto out;
    }

-    if (s->is_unix) {
+    cfg->qp = qp = query_params_parse(uri->query);
+
+    if (is_unix) {
        /* sheepdog+unix:///vdiname?socket=path */
-        if (uri->server || uri->port || strcmp(qp->p[0].name, "socket")) {
-            ret = -EINVAL;
+        if (uri->server || uri->port) {
+            error_setg(&err, "URI scheme %s doesn't accept a server address",
+                       uri->scheme);
            goto out;
        }
-        s->host_spec = g_strdup(qp->p[0].value);
+        if (!qp->n) {
+            error_setg(&err,
+                       "URI scheme %s requires query parameter 'socket'",
+                       uri->scheme);
+            goto out;
+        }
+        if (qp->n != 1 || strcmp(qp->p[0].name, "socket")) {
+            error_setg(&err, "unexpected query parameters");
+            goto out;
+        }
+        cfg->path = qp->p[0].value;
    } else {
        /* sheepdog[+tcp]://[host:port]/vdiname */
-        s->host_spec = g_strdup_printf("%s:%d", uri->server ?: SD_DEFAULT_ADDR,
-                                       uri->port ?: SD_DEFAULT_PORT);
+        if (qp->n) {
+            error_setg(&err, "unexpected query parameters");
+            goto out;
+        }
+        cfg->host = uri->server;
+        cfg->port = uri->port;
    }

    /* snapshot tag */
    if (uri->fragment) {
-        *snapid = strtoul(uri->fragment, NULL, 10);
-        if (*snapid == 0) {
-            pstrcpy(tag, SD_MAX_VDI_TAG_LEN, uri->fragment);
+        if (!sd_parse_snapid_or_tag(uri->fragment,
+                                    &cfg->snap_id, cfg->tag)) {
+            error_setg(&err, "'%s' is not a valid snapshot ID",
+                       uri->fragment);
+            goto out;
        }
    } else {
-        *snapid = CURRENT_VDI_ID; /* search current vdi */
+        cfg->snap_id = CURRENT_VDI_ID; /* search current vdi */
    }

 out:
-    if (qp) {
-        query_params_free(qp);
+    if (err) {
+        error_propagate(errp, err);
+        sd_config_done(cfg);
    }
-    uri_free(uri);
-    return ret;
 }

 /*
@@ -997,12 +1140,13 @@ out:
 * You can run VMs outside the Sheepdog cluster by specifying
 * `hostname' and `port' (experimental).
 */
-static int parse_vdiname(BDRVSheepdogState *s, const char *filename,
-                         char *vdi, uint32_t *snapid, char *tag)
+static void parse_vdiname(SheepdogConfig *cfg, const char *filename,
+                          Error **errp)
 {
+    Error *err = NULL;
    char *p, *q, *uri;
    const char *host_spec, *vdi_spec;
-    int nr_sep, ret;
+    int nr_sep;

    strstart(filename, "sheepdog:", &filename);
    p = q = g_strdup(filename);
@@ -1037,12 +1181,60 @@ static int parse_vdiname(BDRVSheepdogState *s, const char *filename,

    uri = g_strdup_printf("sheepdog://%s/%s", host_spec, vdi_spec);

-    ret = sd_parse_uri(s, uri, vdi, snapid, tag);
+    /*
+     * FIXME We to escape URI meta-characters, e.g. "x?y=z"
+     * produces "sheepdog://x?y=z".  Because of that ...
+     */
+    sd_parse_uri(cfg, uri, &err);
+    if (err) {
+        /*
+         * ... this can fail, but the error message is misleading.
+         * Replace it by the traditional useless one until the
+         * escaping is fixed.
+         */
+        error_free(err);
+        error_setg(errp, "Can't parse filename");
+    }

    g_free(q);
    g_free(uri);
+}

-    return ret;
+static void sd_parse_filename(const char *filename, QDict *options,
+                              Error **errp)
+{
+    Error *err = NULL;
+    SheepdogConfig cfg;
+    char buf[32];
+
+    if (strstr(filename, "://")) {
+        sd_parse_uri(&cfg, filename, &err);
+    } else {
+        parse_vdiname(&cfg, filename, &err);
+    }
+    if (err) {
+        error_propagate(errp, err);
+        return;
+    }
+
+    if (cfg.path) {
+        qdict_set_default_str(options, "server.path", cfg.path);
+        qdict_set_default_str(options, "server.type", "unix");
+    } else {
+        qdict_set_default_str(options, "server.type", "inet");
+        qdict_set_default_str(options, "server.host",
+                              cfg.host ?: SD_DEFAULT_ADDR);
+        snprintf(buf, sizeof(buf), "%d", cfg.port ?: SD_DEFAULT_PORT);
+        qdict_set_default_str(options, "server.port", buf);
+    }
+    qdict_set_default_str(options, "vdi", cfg.vdi);
+    qdict_set_default_str(options, "tag", cfg.tag);
+    if (cfg.snap_id) {
+        snprintf(buf, sizeof(buf), "%d", cfg.snap_id);
+        qdict_set_default_str(options, "snap-id", buf);
+    }
+
+    sd_config_done(&cfg);
 }

 static int find_vdi_name(BDRVSheepdogState *s, const char *filename,
@@ -1356,15 +1548,21 @@ static void sd_attach_aio_context(BlockDriverState *bs,
                       co_read_response, NULL, NULL, s);
 }

-/* TODO Convert to fine grained options */
 static QemuOptsList runtime_opts = {
    .name = "sheepdog",
    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
    .desc = {
        {
-            .name = "filename",
+            .name = "vdi",
+            .type = QEMU_OPT_STRING,
+        },
+        {
+            .name = "snap-id",
+            .type = QEMU_OPT_NUMBER,
+        },
+        {
+            .name = "tag",
            .type = QEMU_OPT_STRING,
-            .help = "URL to the sheepdog image",
        },
        { /* end of list */ }
    },
@@ -1376,12 +1574,11 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags,
    int ret, fd;
    uint32_t vid = 0;
    BDRVSheepdogState *s = bs->opaque;
-    char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
-    uint32_t snapid;
+    const char *vdi, *snap_id_str, *tag;
+    uint64_t snap_id;
    char *buf = NULL;
    QemuOpts *opts;
    Error *local_err = NULL;
-    const char *filename;

    s->bs = bs;
    s->aio_context = bdrv_get_aio_context(bs);
@@ -1391,37 +1588,63 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags,
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
-        goto out;
+        goto err_no_fd;
    }

-    filename = qemu_opt_get(opts, "filename");
+    s->addr = sd_server_config(options, errp);
+    if (!s->addr) {
+        ret = -EINVAL;
+        goto err_no_fd;
+    }
+
+    vdi = qemu_opt_get(opts, "vdi");
+    snap_id_str = qemu_opt_get(opts, "snap-id");
+    snap_id = qemu_opt_get_number(opts, "snap-id", CURRENT_VDI_ID);
+    tag = qemu_opt_get(opts, "tag");
+
+    if (!vdi) {
+        error_setg(errp, "parameter 'vdi' is missing");
+        ret = -EINVAL;
+        goto err_no_fd;
+    }
+    if (strlen(vdi) >= SD_MAX_VDI_LEN) {
+        error_setg(errp, "value of parameter 'vdi' is too long");
+        ret = -EINVAL;
+        goto err_no_fd;
+    }
+
+    if (snap_id > UINT32_MAX) {
+        snap_id = 0;
+    }
+    if (snap_id_str && !snap_id) {
+        error_setg(errp, "'snap-id=%s' is not a valid snapshot ID",
+                   snap_id_str);
+        ret = -EINVAL;
+        goto err_no_fd;
+    }
+
+    if (!tag) {
+        tag = "";
+    }
+    if (tag && strlen(tag) >= SD_MAX_VDI_TAG_LEN) {
+        error_setg(errp, "value of parameter 'tag' is too long");
+        ret = -EINVAL;
+        goto err_no_fd;
+    }

    QLIST_INIT(&s->inflight_aio_head);
    QLIST_INIT(&s->failed_aio_head);
    QLIST_INIT(&s->inflight_aiocb_head);
-    s->fd = -1;

-    memset(vdi, 0, sizeof(vdi));
-    memset(tag, 0, sizeof(tag));
-
-    if (strstr(filename, "://")) {
-        ret = sd_parse_uri(s, filename, vdi, &snapid, tag);
-    } else {
-        ret = parse_vdiname(s, filename, vdi, &snapid, tag);
-    }
-    if (ret < 0) {
-        error_setg(errp, "Can't parse filename");
-        goto out;
-    }
    s->fd = get_sheep_fd(s, errp);
    if (s->fd < 0) {
        ret = s->fd;
-        goto out;
+        goto err_no_fd;
    }

-    ret = find_vdi_name(s, vdi, snapid, tag, &vid, true, errp);
+    ret = find_vdi_name(s, vdi, (uint32_t)snap_id, tag, &vid, true, errp);
    if (ret) {
-        goto out;
+        goto err;
    }

    /*
@@ -1434,7 +1657,7 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags,
    }
    s->discard_supported = true;

-    if (snapid || tag[0] != '\0') {
+    if (snap_id || tag[0]) {
        DPRINTF("%" PRIx32 " snapshot inode was open.\n", vid);
        s->is_snapshot = true;
    }
@@ -1442,7 +1665,7 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags,
    fd = connect_to_sdog(s, errp);
    if (fd < 0) {
        ret = fd;
-        goto out;
+        goto err;
    }

    buf = g_malloc(SD_INODE_SIZE);
@@ -1453,7 +1676,7 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags,

    if (ret) {
        error_setg(errp, "Can't read snapshot inode");
-        goto out;
+        goto err;
    }

    memcpy(&s->inode, buf, sizeof(s->inode));
@@ -1465,12 +1688,12 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags,
    qemu_opts_del(opts);
    g_free(buf);
    return 0;
-out:
+
+err:
    aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd,
                       false, NULL, NULL, NULL, NULL);
-    if (s->fd >= 0) {
    closesocket(s->fd);
-    }
+err_no_fd:
    qemu_opts_del(opts);
    g_free(buf);
    return ret;
@@ -1608,7 +1831,7 @@ static int sd_prealloc(const char *filename, Error **errp)
    int ret;

    blk = blk_new_open(filename, NULL, NULL,
-                       BDRV_O_RDWR | BDRV_O_PROTOCOL, errp);
+                       BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
    if (blk == NULL) {
        ret = -EIO;
        goto out_with_err_set;
@@ -1684,6 +1907,7 @@ static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
    }

    copy = strtol(n1, NULL, 10);
+    /* FIXME fix error checking by switching to qemu_strtol() */
    if (copy > SD_MAX_COPIES || copy < 1) {
        return -EINVAL;
    }
@@ -1698,6 +1922,7 @@ static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
    }

    parity = strtol(n2, NULL, 10);
+    /* FIXME fix error checking by switching to qemu_strtol() */
    if (parity >= SD_EC_MAX_STRIP || parity < 1) {
        return -EINVAL;
    }
@@ -1736,29 +1961,34 @@ static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt)
 static int sd_create(const char *filename, QemuOpts *opts,
                     Error **errp)
 {
+    Error *err = NULL;
    int ret = 0;
    uint32_t vid = 0;
    char *backing_file = NULL;
    char *buf = NULL;
    BDRVSheepdogState *s;
-    char tag[SD_MAX_VDI_TAG_LEN];
-    uint32_t snapid;
+    SheepdogConfig cfg;
    uint64_t max_vdi_size;
    bool prealloc = false;

    s = g_new0(BDRVSheepdogState, 1);

-    memset(tag, 0, sizeof(tag));
    if (strstr(filename, "://")) {
-        ret = sd_parse_uri(s, filename, s->name, &snapid, tag);
+        sd_parse_uri(&cfg, filename, &err);
    } else {
-        ret = parse_vdiname(s, filename, s->name, &snapid, tag);
+        parse_vdiname(&cfg, filename, &err);
    }
-    if (ret < 0) {
-        error_setg(errp, "Can't parse filename");
+    if (err) {
+        error_propagate(errp, err);
        goto out;
    }

+    buf = cfg.port ? g_strdup_printf("%d", cfg.port) : NULL;
+    s->addr = sd_socket_address(cfg.path, cfg.host, buf);
+    g_free(buf);
+    strcpy(s->name, cfg.vdi);
+    sd_config_done(&cfg);
+
    s->inode.vdi_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
                                 BDRV_SECTOR_SIZE);
    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
@@ -1828,14 +2058,12 @@ static int sd_create(const char *filename, QemuOpts *opts,
    if (s->inode.block_size_shift == 0) {
        SheepdogVdiReq hdr;
        SheepdogClusterRsp *rsp = (SheepdogClusterRsp *)&hdr;
-        Error *local_err = NULL;
        int fd;
        unsigned int wlen = 0, rlen = 0;

-        fd = connect_to_sdog(s, &local_err);
+        fd = connect_to_sdog(s, errp);
        if (fd < 0) {
-            error_report_err(local_err);
-            ret = -EIO;
+            ret = fd;
            goto out;
        }

@@ -1921,7 +2149,7 @@ static void sd_close(BlockDriverState *bs)
    aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd,
                       false, NULL, NULL, NULL, NULL);
    closesocket(s->fd);
-    g_free(s->host_spec);
+    qapi_free_SocketAddress(s->addr);
 }

 static int64_t sd_getlength(BlockDriverState *bs)
@@ -2366,19 +2594,16 @@ static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
    BDRVSheepdogState *old_s;
    char tag[SD_MAX_VDI_TAG_LEN];
    uint32_t snapid = 0;
-    int ret = 0;
+    int ret;
+
+    if (!sd_parse_snapid_or_tag(snapshot_id, &snapid, tag)) {
+        return -EINVAL;
+    }

    old_s = g_new(BDRVSheepdogState, 1);

    memcpy(old_s, s, sizeof(BDRVSheepdogState));

-    snapid = strtoul(snapshot_id, NULL, 10);
-    if (snapid) {
-        tag[0] = 0;
-    } else {
-        pstrcpy(tag, sizeof(tag), snapshot_id);
-    }
-
    ret = reload_inode(s, snapid, tag);
    if (ret) {
        goto out;
@@ -2404,18 +2629,15 @@ out:

 #define NR_BATCHED_DISCARD 128

-static bool remove_objects(BDRVSheepdogState *s)
+static int remove_objects(BDRVSheepdogState *s, Error **errp)
 {
    int fd, i = 0, nr_objs = 0;
-    Error *local_err = NULL;
-    int ret = 0;
-    bool result = true;
+    int ret;
    SheepdogInode *inode = &s->inode;

-    fd = connect_to_sdog(s, &local_err);
+    fd = connect_to_sdog(s, errp);
    if (fd < 0) {
-        error_report_err(local_err);
-        return false;
+        return fd;
    }

    nr_objs = count_data_objs(inode);
@@ -2445,15 +2667,15 @@ static bool remove_objects(BDRVSheepdogState *s)
                                    data_vdi_id[start_idx]),
                           false, s->cache_flags);
        if (ret < 0) {
-            error_report("failed to discard snapshot inode.");
-            result = false;
+            error_setg(errp, "Failed to discard snapshot inode");
            goto out;
        }
    }

+    ret = 0;
 out:
    closesocket(fd);
-    return result;
+    return ret;
 }

 static int sd_snapshot_delete(BlockDriverState *bs,
@@ -2461,9 +2683,12 @@ static int sd_snapshot_delete(BlockDriverState *bs,
                              const char *name,
                              Error **errp)
 {
+    /*
+     * FIXME should delete the snapshot matching both @snapshot_id and
+     * @name, but @name not used here
+     */
    unsigned long snap_id = 0;
    char snap_tag[SD_MAX_VDI_TAG_LEN];
-    Error *local_err = NULL;
    int fd, ret;
    char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN];
    BDRVSheepdogState *s = bs->opaque;
@@ -2476,15 +2701,22 @@ static int sd_snapshot_delete(BlockDriverState *bs,
    };
    SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;

-    if (!remove_objects(s)) {
-        return -1;
+    ret = remove_objects(s, errp);
+    if (ret) {
+        return ret;
    }

    memset(buf, 0, sizeof(buf));
    memset(snap_tag, 0, sizeof(snap_tag));
    pstrcpy(buf, SD_MAX_VDI_LEN, s->name);
+    /* TODO Use sd_parse_snapid() once this mess is cleaned up */
    ret = qemu_strtoul(snapshot_id, NULL, 10, &snap_id);
    if (ret || snap_id > UINT32_MAX) {
+        /*
+         * FIXME Since qemu_strtoul() returns -EINVAL when
+         * @snapshot_id is null, @snapshot_id is mandatory.  Correct
+         * would be to require at least one of @snapshot_id and @name.
+         */
        error_setg(errp, "Invalid snapshot ID: %s",
                         snapshot_id ? snapshot_id : "<null>");
        return -EINVAL;
@@ -2493,40 +2725,42 @@ static int sd_snapshot_delete(BlockDriverState *bs,
    if (snap_id) {
        hdr.snapid = (uint32_t) snap_id;
    } else {
+        /* FIXME I suspect we should use @name here */
+        /* FIXME don't truncate silently */
        pstrcpy(snap_tag, sizeof(snap_tag), snapshot_id);
        pstrcpy(buf + SD_MAX_VDI_LEN, SD_MAX_VDI_TAG_LEN, snap_tag);
    }

-    ret = find_vdi_name(s, s->name, snap_id, snap_tag, &vid, true,
-                        &local_err);
+    ret = find_vdi_name(s, s->name, snap_id, snap_tag, &vid, true, errp);
    if (ret) {
        return ret;
    }

-    fd = connect_to_sdog(s, &local_err);
+    fd = connect_to_sdog(s, errp);
    if (fd < 0) {
-        error_report_err(local_err);
-        return -1;
+        return fd;
    }

    ret = do_req(fd, s->bs, (SheepdogReq *)&hdr,
                 buf, &wlen, &rlen);
    closesocket(fd);
    if (ret) {
+        error_setg_errno(errp, -ret, "Couldn't send request to server");
        return ret;
    }

    switch (rsp->result) {
    case SD_RES_NO_VDI:
-        error_report("%s was already deleted", s->name);
+        error_setg(errp, "Can't find the snapshot");
+        return -ENOENT;
    case SD_RES_SUCCESS:
        break;
    default:
-        error_report("%s, %s", sd_strerror(rsp->result), s->name);
-        return -1;
+        error_setg(errp, "%s", sd_strerror(rsp->result));
+        return -EIO;
    }

-    return ret;
+    return 0;
 }

 static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
@@ -2831,7 +3065,7 @@ static BlockDriver bdrv_sheepdog = {
    .format_name    = "sheepdog",
    .protocol_name  = "sheepdog",
    .instance_size  = sizeof(BDRVSheepdogState),
-    .bdrv_needs_filename = true,
+    .bdrv_parse_filename    = sd_parse_filename,
    .bdrv_file_open = sd_open,
    .bdrv_reopen_prepare    = sd_reopen_prepare,
    .bdrv_reopen_commit     = sd_reopen_commit,
@@ -2867,7 +3101,7 @@ static BlockDriver bdrv_sheepdog_tcp = {
    .format_name    = "sheepdog",
    .protocol_name  = "sheepdog+tcp",
    .instance_size  = sizeof(BDRVSheepdogState),
-    .bdrv_needs_filename = true,
+    .bdrv_parse_filename    = sd_parse_filename,
    .bdrv_file_open = sd_open,
    .bdrv_reopen_prepare    = sd_reopen_prepare,
    .bdrv_reopen_commit     = sd_reopen_commit,
@@ -2903,7 +3137,7 @@ static BlockDriver bdrv_sheepdog_unix = {
    .format_name    = "sheepdog",
    .protocol_name  = "sheepdog+unix",
    .instance_size  = sizeof(BDRVSheepdogState),
-    .bdrv_needs_filename = true,
+    .bdrv_parse_filename    = sd_parse_filename,
    .bdrv_file_open = sd_open,
    .bdrv_reopen_prepare    = sd_reopen_prepare,
    .bdrv_reopen_commit     = sd_reopen_commit,
--- a/block/snapshot.c
+++ b/block/snapshot.c
@@ -27,6 +27,7 @@
 #include "block/block_int.h"
 #include "qapi/error.h"
 #include "qapi/qmp/qerror.h"
+#include "qapi/qmp/qstring.h"

 QemuOptsList internal_snapshot_opts = {
    .name = "snapshot",
@@ -189,14 +190,33 @@ int bdrv_snapshot_goto(BlockDriverState *bs,
    }

    if (bs->file) {
+        BlockDriverState *file;
+        QDict *options = qdict_clone_shallow(bs->options);
+        QDict *file_options;
+
+        file = bs->file->bs;
+        /* Prevent it from getting deleted when detached from bs */
+        bdrv_ref(file);
+
+        qdict_extract_subqdict(options, &file_options, "file.");
+        QDECREF(file_options);
+        qdict_put(options, "file", qstring_from_str(bdrv_get_node_name(file)));
+
        drv->bdrv_close(bs);
-        ret = bdrv_snapshot_goto(bs->file->bs, snapshot_id);
-        open_ret = drv->bdrv_open(bs, NULL, bs->open_flags, NULL);
+        bdrv_unref_child(bs, bs->file);
+        bs->file = NULL;
+
+        ret = bdrv_snapshot_goto(file, snapshot_id);
+        open_ret = drv->bdrv_open(bs, options, bs->open_flags, NULL);
+        QDECREF(options);
        if (open_ret < 0) {
-            bdrv_unref(bs->file->bs);
+            bdrv_unref(file);
            bs->drv = NULL;
            return open_ret;
        }
+
+        assert(bs->file->bs == file);
+        bdrv_unref(file);
        return ret;
    }

--- a/block/ssh.c
+++ b/block/ssh.c
@@ -601,7 +601,15 @@ static InetSocketAddress *ssh_config(QDict *options, Error **errp)
        goto out;
    }

-    iv = qobject_input_visitor_new(crumpled_addr, true);
+    /*
+     * FIXME .numeric, .to, .ipv4 or .ipv6 don't work with -drive.
+     * .to doesn't matter, it's ignored anyway.
+     * That's because when @options come from -blockdev or
+     * blockdev_add, members are typed according to the QAPI schema,
+     * but when they come from -drive, they're all QString.  The
+     * visitor expects the former.
+     */
+    iv = qobject_input_visitor_new(crumpled_addr);
    visit_type_InetSocketAddress(iv, NULL, &inet, &local_error);
    if (local_error) {
        error_propagate(errp, local_error);
@@ -673,7 +681,7 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options,
    }

    /* Open the socket and connect. */
-    s->sock = inet_connect_saddr(s->inet, errp, NULL, NULL);
+    s->sock = inet_connect_saddr(s->inet, NULL, NULL, errp);
    if (s->sock < 0) {
        ret = -EIO;
        goto err;
@@ -889,10 +897,14 @@ static void restart_coroutine(void *opaque)

    DPRINTF("co=%p", co);

-    qemu_coroutine_enter(co);
+    aio_co_wake(co);
 }

-static coroutine_fn void set_fd_handler(BDRVSSHState *s, BlockDriverState *bs)
+/* A non-blocking call returned EAGAIN, so yield, ensuring the
+ * handlers are set up so that we'll be rescheduled when there is an
+ * interesting event on the socket.
+ */
+static coroutine_fn void co_yield(BDRVSSHState *s, BlockDriverState *bs)
 {
    int r;
    IOHandler *rd_handler = NULL, *wr_handler = NULL;
@@ -912,25 +924,10 @@ static coroutine_fn void set_fd_handler(BDRVSSHState *s, BlockDriverState *bs)

    aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock,
                       false, rd_handler, wr_handler, NULL, co);
-}
-
-static coroutine_fn void clear_fd_handler(BDRVSSHState *s,
-                                          BlockDriverState *bs)
-{
-    DPRINTF("s->sock=%d", s->sock);
-    aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock,
-                       false, NULL, NULL, NULL, NULL);
-}
-
-/* A non-blocking call returned EAGAIN, so yield, ensuring the
- * handlers are set up so that we'll be rescheduled when there is an
- * interesting event on the socket.
- */
-static coroutine_fn void co_yield(BDRVSSHState *s, BlockDriverState *bs)
-{
-    set_fd_handler(s, bs);
    qemu_coroutine_yield();
-    clear_fd_handler(s, bs);
+    DPRINTF("s->sock=%d - back", s->sock);
+    aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock, false,
+                       NULL, NULL, NULL, NULL);
 }

 /* SFTP has a function `libssh2_sftp_seek64' which seeks to a position
--- a/block/stream.c
+++ b/block/stream.c
@@ -68,6 +68,7 @@ static void stream_complete(BlockJob *job, void *opaque)
    StreamCompleteData *data = opaque;
    BlockDriverState *bs = blk_bs(job->blk);
    BlockDriverState *base = s->base;
+    Error *local_err = NULL;

    if (!block_job_is_cancelled(&s->common) && data->reached_end &&
        data->ret == 0) {
@@ -79,11 +80,19 @@ static void stream_complete(BlockJob *job, void *opaque)
            }
        }
        data->ret = bdrv_change_backing_file(bs, base_id, base_fmt);
-        bdrv_set_backing_hd(bs, base);
+        bdrv_set_backing_hd(bs, base, &local_err);
+        if (local_err) {
+            error_report_err(local_err);
+            data->ret = -EPERM;
+            goto out;
+        }
    }

+out:
    /* Reopen the image back in read-only mode if necessary */
    if (s->bs_flags != bdrv_get_flags(bs)) {
+        /* Give up write permissions before making it read-only */
+        blk_set_perm(job->blk, 0, BLK_PERM_ALL, &error_abort);
        bdrv_reopen(bs, s->bs_flags, NULL);
    }

@@ -229,25 +238,35 @@ void stream_start(const char *job_id, BlockDriverState *bs,
    BlockDriverState *iter;
    int orig_bs_flags;

-    s = block_job_create(job_id, &stream_job_driver, bs, speed,
-                         BLOCK_JOB_DEFAULT, NULL, NULL, errp);
-    if (!s) {
-        return;
-    }
-
    /* Make sure that the image is opened in read-write mode */
    orig_bs_flags = bdrv_get_flags(bs);
    if (!(orig_bs_flags & BDRV_O_RDWR)) {
        if (bdrv_reopen(bs, orig_bs_flags | BDRV_O_RDWR, errp) != 0) {
-            block_job_unref(&s->common);
            return;
        }
    }

-    /* Block all intermediate nodes between bs and base, because they
-     * will disappear from the chain after this operation */
+    /* Prevent concurrent jobs trying to modify the graph structure here, we
+     * already have our own plans. Also don't allow resize as the image size is
+     * queried only at the job start and then cached. */
+    s = block_job_create(job_id, &stream_job_driver, bs,
+                         BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
+                         BLK_PERM_GRAPH_MOD,
+                         BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
+                         BLK_PERM_WRITE,
+                         speed, BLOCK_JOB_DEFAULT, NULL, NULL, errp);
+    if (!s) {
+        goto fail;
+    }
+
+    /* Block all intermediate nodes between bs and base, because they will
+     * disappear from the chain after this operation. The streaming job reads
+     * every block only once, assuming that it doesn't change, so block writes
+     * and resizes. */
    for (iter = backing_bs(bs); iter && iter != base; iter = backing_bs(iter)) {
-        block_job_add_bdrv(&s->common, iter);
+        block_job_add_bdrv(&s->common, "intermediate node", iter, 0,
+                           BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED,
+                           &error_abort);
    }

    s->base = base;
@@ -257,4 +276,10 @@ void stream_start(const char *job_id, BlockDriverState *bs,
    s->on_error = on_error;
    trace_stream_start(bs, base, s);
    block_job_start(&s->common);
+    return;
+
+fail:
+    if (orig_bs_flags != bdrv_get_flags(bs)) {
+        bdrv_reopen(bs, s->bs_flags, NULL);
+    }
 }
--- a/block/throttle-groups.c
+++ b/block/throttle-groups.c
@@ -326,7 +326,7 @@ void coroutine_fn throttle_group_co_io_limits_intercept(BlockBackend *blk,
    if (must_wait || blkp->pending_reqs[is_write]) {
        blkp->pending_reqs[is_write]++;
        qemu_mutex_unlock(&tg->lock);
-        qemu_co_queue_wait(&blkp->throttled_reqs[is_write]);
+        qemu_co_queue_wait(&blkp->throttled_reqs[is_write], NULL);
        qemu_mutex_lock(&tg->lock);
        blkp->pending_reqs[is_write]--;
    }
@@ -416,7 +416,9 @@ static void timer_cb(BlockBackend *blk, bool is_write)
    qemu_mutex_unlock(&tg->lock);

    /* Run the request that was waiting for this timer */
+    aio_context_acquire(blk_get_aio_context(blk));
    empty_queue = !qemu_co_enter_next(&blkp->throttled_reqs[is_write]);
+    aio_context_release(blk_get_aio_context(blk));

    /* If the request queue was empty then we have to take care of
     * scheduling the next one */
--- a/block/trace-events
+++ b/block/trace-events
@@ -110,3 +110,20 @@ qed_aio_write_data(void *s, void *acb, int ret, uint64_t offset, size_t len) "s
 qed_aio_write_prefill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64
 qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64
 qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu"
+
+# block/vxhs.c
+vxhs_iio_callback(int error) "ctx is NULL: error %d"
+vxhs_iio_callback_chnfail(int err, int error) "QNIO channel failed, no i/o %d, %d"
+vxhs_iio_callback_unknwn(int opcode, int err) "unexpected opcode %d, errno %d"
+vxhs_aio_rw_invalid(int req) "Invalid I/O request iodir %d"
+vxhs_aio_rw_ioerr(char *guid, int iodir, uint64_t size, uint64_t off, void *acb, int ret, int err) "IO ERROR (vDisk %s) FOR : Read/Write = %d size = %"PRIu64" offset = %"PRIu64" ACB = %p. Error = %d, errno = %d"
+vxhs_get_vdisk_stat_err(char *guid, int ret, int err) "vDisk (%s) stat ioctl failed, ret = %d, errno = %d"
+vxhs_get_vdisk_stat(char *vdisk_guid, uint64_t vdisk_size) "vDisk %s stat ioctl returned size %"PRIu64
+vxhs_complete_aio(void *acb, uint64_t ret) "aio failed acb %p ret %"PRIu64
+vxhs_parse_uri_filename(const char *filename) "URI passed via bdrv_parse_filename %s"
+vxhs_open_vdiskid(const char *vdisk_id) "Opening vdisk-id %s"
+vxhs_open_hostinfo(char *of_vsa_addr, int port) "Adding host %s:%d to BDRVVXHSState"
+vxhs_open_iio_open(const char *host) "Failed to connect to storage agent on host %s"
+vxhs_parse_uri_hostinfo(char *host, int port) "Host: IP %s, Port %d"
+vxhs_close(char *vdisk_guid) "Closing vdisk %s"
+vxhs_get_creds(const char *cacert, const char *client_key, const char *client_cert) "cacert %s, client_key %s, client_cert %s"
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -363,6 +363,12 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags,
    int ret;
    Error *local_err = NULL;

+    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
+                               false, errp);
+    if (!bs->file) {
+        return -EINVAL;
+    }
+
    logout("\n");

    ret = bdrv_read(bs->file, 0, (uint8_t *)&header, 1);
@@ -757,7 +763,8 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
    }

    blk = blk_new_open(filename, NULL, NULL,
-                       BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
+                       BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
+                       &local_err);
    if (blk == NULL) {
        error_propagate(errp, local_err);
        ret = -EIO;
@@ -885,6 +892,7 @@ static BlockDriver bdrv_vdi = {
    .bdrv_open = vdi_open,
    .bdrv_close = vdi_close,
    .bdrv_reopen_prepare = vdi_reopen_prepare,
+    .bdrv_child_perm          = bdrv_format_default_perms,
    .bdrv_create = vdi_create,
    .bdrv_has_zero_init = bdrv_has_zero_init_1,
    .bdrv_co_get_block_status = vdi_co_get_block_status,
--- a/block/vhdx-log.c
+++ b/block/vhdx-log.c
@@ -548,7 +548,7 @@ static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s,
            if (new_file_size % (1024*1024)) {
                /* round up to nearest 1MB boundary */
                new_file_size = ((new_file_size >> 20) + 1) << 20;
-                bdrv_truncate(bs->file->bs, new_file_size);
+                bdrv_truncate(bs->file, new_file_size);
            }
        }
        qemu_vfree(desc_entries);
--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -898,6 +898,12 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags,
    uint64_t signature;
    Error *local_err = NULL;

+    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
+                               false, errp);
+    if (!bs->file) {
+        return -EINVAL;
+    }
+
    s->bat = NULL;
    s->first_visible_write = true;

@@ -1165,7 +1171,7 @@ static int vhdx_allocate_block(BlockDriverState *bs, BDRVVHDXState *s,
    /* per the spec, the address for a block is in units of 1MB */
    *new_offset = ROUND_UP(*new_offset, 1024 * 1024);

-    return bdrv_truncate(bs->file->bs, *new_offset + s->block_size);
+    return bdrv_truncate(bs->file, *new_offset + s->block_size);
 }

 /*
@@ -1853,7 +1859,8 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp)
    }

    blk = blk_new_open(filename, NULL, NULL,
-                       BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
+                       BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
+                       &local_err);
    if (blk == NULL) {
        error_propagate(errp, local_err);
        ret = -EIO;
@@ -1977,6 +1984,7 @@ static BlockDriver bdrv_vhdx = {
    .bdrv_open              = vhdx_open,
    .bdrv_close             = vhdx_close,
    .bdrv_reopen_prepare    = vhdx_reopen_prepare,
+    .bdrv_child_perm        = bdrv_format_default_perms,
    .bdrv_co_readv          = vhdx_co_readv,
    .bdrv_co_writev         = vhdx_co_writev,
    .bdrv_create            = vhdx_create,
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -943,6 +943,12 @@ static int vmdk_open(BlockDriverState *bs, QDict *options, int flags,
    uint32_t magic;
    Error *local_err = NULL;

+    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
+                               false, errp);
+    if (!bs->file) {
+        return -EINVAL;
+    }
+
    buf = vmdk_read_desc(bs->file, 0, errp);
    if (!buf) {
        return -EINVAL;
@@ -1361,8 +1367,8 @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
            goto out;
        }

-        data->lba = offset >> BDRV_SECTOR_BITS;
-        data->size = buf_len;
+        data->lba = cpu_to_le64(offset >> BDRV_SECTOR_BITS);
+        data->size = cpu_to_le32(buf_len);

        n_bytes = buf_len + sizeof(VmdkGrainMarker);
        iov = (struct iovec) {
@@ -1697,7 +1703,8 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
    }

    blk = blk_new_open(filename, NULL, NULL,
-                       BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
+                       BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
+                       &local_err);
    if (blk == NULL) {
        error_propagate(errp, local_err);
        ret = -EIO;
@@ -2065,7 +2072,8 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
    }

    new_blk = blk_new_open(filename, NULL, NULL,
-                           BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
+                           BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
+                           &local_err);
    if (new_blk == NULL) {
        error_propagate(errp, local_err);
        ret = -EIO;
@@ -2353,6 +2361,7 @@ static BlockDriver bdrv_vmdk = {
    .bdrv_open                    = vmdk_open,
    .bdrv_check                   = vmdk_check,
    .bdrv_reopen_prepare          = vmdk_reopen_prepare,
+    .bdrv_child_perm              = bdrv_format_default_perms,
    .bdrv_co_preadv               = vmdk_co_preadv,
    .bdrv_co_pwritev              = vmdk_co_pwritev,
    .bdrv_co_pwritev_compressed   = vmdk_co_pwritev_compressed,
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -220,6 +220,12 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
    int disk_type = VHD_DYNAMIC;
    int ret;

+    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
+                               false, errp);
+    if (!bs->file) {
+        return -EINVAL;
+    }
+
    opts = qemu_opts_create(&vpc_runtime_opts, NULL, 0, &error_abort);
    qemu_opts_absorb_qdict(opts, options, &local_err);
    if (local_err) {
@@ -909,7 +915,8 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
    }

    blk = blk_new_open(filename, NULL, NULL,
-                       BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
+                       BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
+                       &local_err);
    if (blk == NULL) {
        error_propagate(errp, local_err);
        ret = -EIO;
@@ -1061,6 +1068,7 @@ static BlockDriver bdrv_vpc = {
    .bdrv_open              = vpc_open,
    .bdrv_close             = vpc_close,
    .bdrv_reopen_prepare    = vpc_reopen_prepare,
+    .bdrv_child_perm        = bdrv_format_default_perms,
    .bdrv_create            = vpc_create,

    .bdrv_co_preadv             = vpc_co_preadv,
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -1156,8 +1156,6 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags,

    s->current_cluster=0xffffffff;

-    /* read only is the default for safety */
-    bs->read_only = true;
    s->qcow = NULL;
    s->qcow_filename = NULL;
    s->fat2 = NULL;
@@ -1169,11 +1167,24 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags,
    s->sector_count = cyls * heads * secs - (s->first_sectors_number - 1);

    if (qemu_opt_get_bool(opts, "rw", false)) {
+        if (!bdrv_is_read_only(bs)) {
            ret = enable_write_target(bs, errp);
            if (ret < 0) {
                goto fail;
            }
-        bs->read_only = false;
+        } else {
+            ret = -EPERM;
+            error_setg(errp,
+                       "Unable to set VVFAT to 'rw' when drive is read-only");
+            goto fail;
+        }
+    } else  {
+        /* read only is the default for safety */
+        ret = bdrv_set_read_only(bs, true, &local_err);
+        if (ret < 0) {
+            error_propagate(errp, local_err);
+            goto fail;
+        }
    }

    bs->total_sectors = cyls * heads * secs;
@@ -1394,7 +1405,13 @@ static int vvfat_read(BlockDriverState *bs, int64_t sector_num,
 	   return -1;
 	if (s->qcow) {
 	    int n;
-            if (bdrv_is_allocated(s->qcow->bs, sector_num, nb_sectors-i, &n)) {
+            int ret;
+            ret = bdrv_is_allocated(s->qcow->bs, sector_num,
+                                    nb_sectors - i, &n);
+            if (ret < 0) {
+                return ret;
+            }
+            if (ret) {
                DLOG(fprintf(stderr, "sectors %d+%d allocated\n",
                             (int)sector_num, n));
                if (bdrv_read(s->qcow, sector_num, buf + i * 0x200, n)) {
@@ -1668,7 +1685,8 @@ static inline uint32_t modified_fat_get(BDRVVVFATState* s,
    }
 }

-static inline int cluster_was_modified(BDRVVVFATState* s, uint32_t cluster_num)
+static inline bool cluster_was_modified(BDRVVVFATState *s,
+                                        uint32_t cluster_num)
 {
    int was_modified = 0;
    int i, dummy;
@@ -1683,7 +1701,13 @@ static inline int cluster_was_modified(BDRVVVFATState* s, uint32_t cluster_num)
                                         1, &dummy);
    }

-    return was_modified;
+    /*
+     * Note that this treats failures to learn allocation status the
+     * same as if an allocation has occurred.  It's as safe as
+     * anything else, given that a failure to learn allocation status
+     * will probably result in more failures.
+     */
+    return !!was_modified;
 }

 static const char* get_basename(const char* path)
@@ -1833,6 +1857,9 @@ static uint32_t get_cluster_count_for_direntry(BDRVVVFATState* s,
                    int res;

                    res = bdrv_is_allocated(s->qcow->bs, offset + i, 1, &dummy);
+                    if (res < 0) {
+                        return -1;
+                    }
                    if (!res) {
                        res = vvfat_read(s->bs, offset, s->cluster_buffer, 1);
                        if (res) {
@@ -2968,6 +2995,7 @@ static void write_target_close(BlockDriverState *bs) {

 static BlockDriver vvfat_write_target = {
    .format_name        = "vvfat_write_target",
+    .instance_size      = sizeof(void*),
    .bdrv_co_pwritev    = write_target_commit,
    .bdrv_close         = write_target_close,
 };
@@ -3036,13 +3064,12 @@ static int enable_write_target(BlockDriverState *bs, Error **errp)
    unlink(s->qcow_filename);
 #endif

-    backing = bdrv_new();
-    bdrv_set_backing_hd(s->bs, backing);
-    bdrv_unref(backing);
+    backing = bdrv_new_open_driver(&vvfat_write_target, NULL, BDRV_O_ALLOW_RDWR,
+                                   &error_abort);
+    *(void**) backing->opaque = s;

-    s->bs->backing->bs->drv = &vvfat_write_target;
-    s->bs->backing->bs->opaque = g_new(void *, 1);
-    *(void**)s->bs->backing->bs->opaque = s;
+    bdrv_set_backing_hd(s->bs, backing, &error_abort);
+    bdrv_unref(backing);

    return 0;

@@ -3052,6 +3079,27 @@ err:
    return ret;
 }

+static void vvfat_child_perm(BlockDriverState *bs, BdrvChild *c,
+                             const BdrvChildRole *role,
+                             uint64_t perm, uint64_t shared,
+                             uint64_t *nperm, uint64_t *nshared)
+{
+    BDRVVVFATState *s = bs->opaque;
+
+    assert(c == s->qcow || role == &child_backing);
+
+    if (c == s->qcow) {
+        /* This is a private node, nobody should try to attach to it */
+        *nperm = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE;
+        *nshared = BLK_PERM_WRITE_UNCHANGED;
+    } else {
+        /* The backing file is there so 'commit' can use it. vvfat doesn't
+         * access it in any way. */
+        *nperm = 0;
+        *nshared = BLK_PERM_ALL;
+    }
+}
+
 static void vvfat_close(BlockDriverState *bs)
 {
    BDRVVVFATState *s = bs->opaque;
@@ -3077,6 +3125,7 @@ static BlockDriver bdrv_vvfat = {
    .bdrv_file_open         = vvfat_open,
    .bdrv_refresh_limits    = vvfat_refresh_limits,
    .bdrv_close             = vvfat_close,
+    .bdrv_child_perm        = vvfat_child_perm,

    .bdrv_co_preadv         = vvfat_co_preadv,
    .bdrv_co_pwritev        = vvfat_co_pwritev,
--- a/block/vxhs.c
+++ b/block/vxhs.c
@@ -0,0 +1,575 @@
+/*
+ * QEMU Block driver for Veritas HyperScale (VxHS)
+ *
+ * Copyright (c) 2017 Veritas Technologies LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include <qnio/qnio_api.h>
+#include <sys/param.h>
+#include "block/block_int.h"
+#include "qapi/qmp/qerror.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qstring.h"
+#include "trace.h"
+#include "qemu/uri.h"
+#include "qapi/error.h"
+#include "qemu/uuid.h"
+#include "crypto/tlscredsx509.h"
+
+#define VXHS_OPT_FILENAME           "filename"
+#define VXHS_OPT_VDISK_ID           "vdisk-id"
+#define VXHS_OPT_SERVER             "server"
+#define VXHS_OPT_HOST               "host"
+#define VXHS_OPT_PORT               "port"
+
+/* Only accessed under QEMU global mutex */
+static uint32_t vxhs_ref;
+
+typedef enum {
+    VDISK_AIO_READ,
+    VDISK_AIO_WRITE,
+} VDISKAIOCmd;
+
+/*
+ * HyperScale AIO callbacks structure
+ */
+typedef struct VXHSAIOCB {
+    BlockAIOCB common;
+    int err;
+} VXHSAIOCB;
+
+typedef struct VXHSvDiskHostsInfo {
+    void *dev_handle; /* Device handle */
+    char *host; /* Host name or IP */
+    int port; /* Host's port number */
+} VXHSvDiskHostsInfo;
+
+/*
+ * Structure per vDisk maintained for state
+ */
+typedef struct BDRVVXHSState {
+    VXHSvDiskHostsInfo vdisk_hostinfo; /* Per host info */
+    char *vdisk_guid;
+    char *tlscredsid; /* tlscredsid */
+} BDRVVXHSState;
+
+static void vxhs_complete_aio_bh(void *opaque)
+{
+    VXHSAIOCB *acb = opaque;
+    BlockCompletionFunc *cb = acb->common.cb;
+    void *cb_opaque = acb->common.opaque;
+    int ret = 0;
+
+    if (acb->err != 0) {
+        trace_vxhs_complete_aio(acb, acb->err);
+        ret = (-EIO);
+    }
+
+    qemu_aio_unref(acb);
+    cb(cb_opaque, ret);
+}
+
+/*
+ * Called from a libqnio thread
+ */
+static void vxhs_iio_callback(void *ctx, uint32_t opcode, uint32_t error)
+{
+    VXHSAIOCB *acb = NULL;
+
+    switch (opcode) {
+    case IRP_READ_REQUEST:
+    case IRP_WRITE_REQUEST:
+
+        /*
+         * ctx is VXHSAIOCB*
+         * ctx is NULL if error is QNIOERROR_CHANNEL_HUP
+         */
+        if (ctx) {
+            acb = ctx;
+        } else {
+            trace_vxhs_iio_callback(error);
+            goto out;
+        }
+
+        if (error) {
+            if (!acb->err) {
+                acb->err = error;
+            }
+            trace_vxhs_iio_callback(error);
+        }
+
+        aio_bh_schedule_oneshot(bdrv_get_aio_context(acb->common.bs),
+                                vxhs_complete_aio_bh, acb);
+        break;
+
+    default:
+        if (error == QNIOERROR_HUP) {
+            /*
+             * Channel failed, spontaneous notification,
+             * not in response to I/O
+             */
+            trace_vxhs_iio_callback_chnfail(error, errno);
+        } else {
+            trace_vxhs_iio_callback_unknwn(opcode, error);
+        }
+        break;
+    }
+out:
+    return;
+}
+
+static QemuOptsList runtime_opts = {
+    .name = "vxhs",
+    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+    .desc = {
+        {
+            .name = VXHS_OPT_FILENAME,
+            .type = QEMU_OPT_STRING,
+            .help = "URI to the Veritas HyperScale image",
+        },
+        {
+            .name = VXHS_OPT_VDISK_ID,
+            .type = QEMU_OPT_STRING,
+            .help = "UUID of the VxHS vdisk",
+        },
+        {
+            .name = "tls-creds",
+            .type = QEMU_OPT_STRING,
+            .help = "ID of the TLS/SSL credentials to use",
+        },
+        { /* end of list */ }
+    },
+};
+
+static QemuOptsList runtime_tcp_opts = {
+    .name = "vxhs_tcp",
+    .head = QTAILQ_HEAD_INITIALIZER(runtime_tcp_opts.head),
+    .desc = {
+        {
+            .name = VXHS_OPT_HOST,
+            .type = QEMU_OPT_STRING,
+            .help = "host address (ipv4 addresses)",
+        },
+        {
+            .name = VXHS_OPT_PORT,
+            .type = QEMU_OPT_NUMBER,
+            .help = "port number on which VxHSD is listening (default 9999)",
+            .def_value_str = "9999"
+        },
+        { /* end of list */ }
+    },
+};
+
+/*
+ * Parse incoming URI and populate *options with the host
+ * and device information
+ */
+static int vxhs_parse_uri(const char *filename, QDict *options)
+{
+    URI *uri = NULL;
+    char *port;
+    int ret = 0;
+
+    trace_vxhs_parse_uri_filename(filename);
+    uri = uri_parse(filename);
+    if (!uri || !uri->server || !uri->path) {
+        uri_free(uri);
+        return -EINVAL;
+    }
+
+    qdict_put(options, VXHS_OPT_SERVER".host", qstring_from_str(uri->server));
+
+    if (uri->port) {
+        port = g_strdup_printf("%d", uri->port);
+        qdict_put(options, VXHS_OPT_SERVER".port", qstring_from_str(port));
+        g_free(port);
+    }
+
+    qdict_put(options, "vdisk-id", qstring_from_str(uri->path));
+
+    trace_vxhs_parse_uri_hostinfo(uri->server, uri->port);
+    uri_free(uri);
+
+    return ret;
+}
+
+static void vxhs_parse_filename(const char *filename, QDict *options,
+                                Error **errp)
+{
+    if (qdict_haskey(options, "vdisk-id") || qdict_haskey(options, "server")) {
+        error_setg(errp, "vdisk-id/server and a file name may not be specified "
+                         "at the same time");
+        return;
+    }
+
+    if (strstr(filename, "://")) {
+        int ret = vxhs_parse_uri(filename, options);
+        if (ret < 0) {
+            error_setg(errp, "Invalid URI. URI should be of the form "
+                       "  vxhs://<host_ip>:<port>/<vdisk-id>");
+        }
+    }
+}
+
+static int vxhs_init_and_ref(void)
+{
+    if (vxhs_ref++ == 0) {
+        if (iio_init(QNIO_VERSION, vxhs_iio_callback)) {
+            return -ENODEV;
+        }
+    }
+    return 0;
+}
+
+static void vxhs_unref(void)
+{
+    if (--vxhs_ref == 0) {
+        iio_fini();
+    }
+}
+
+static void vxhs_get_tls_creds(const char *id, char **cacert,
+                               char **key, char **cert, Error **errp)
+{
+    Object *obj;
+    QCryptoTLSCreds *creds;
+    QCryptoTLSCredsX509 *creds_x509;
+
+    obj = object_resolve_path_component(
+        object_get_objects_root(), id);
+
+    if (!obj) {
+        error_setg(errp, "No TLS credentials with id '%s'",
+                   id);
+        return;
+    }
+
+    creds_x509 = (QCryptoTLSCredsX509 *)
+        object_dynamic_cast(obj, TYPE_QCRYPTO_TLS_CREDS_X509);
+
+    if (!creds_x509) {
+        error_setg(errp, "Object with id '%s' is not TLS credentials",
+                   id);
+        return;
+    }
+
+    creds = &creds_x509->parent_obj;
+
+    if (creds->endpoint != QCRYPTO_TLS_CREDS_ENDPOINT_CLIENT) {
+        error_setg(errp,
+                   "Expecting TLS credentials with a client endpoint");
+        return;
+    }
+
+    /*
+     * Get the cacert, client_cert and client_key file names.
+     */
+    if (!creds->dir) {
+        error_setg(errp, "TLS object missing 'dir' property value");
+        return;
+    }
+
+    *cacert = g_strdup_printf("%s/%s", creds->dir,
+                              QCRYPTO_TLS_CREDS_X509_CA_CERT);
+    *cert = g_strdup_printf("%s/%s", creds->dir,
+                            QCRYPTO_TLS_CREDS_X509_CLIENT_CERT);
+    *key = g_strdup_printf("%s/%s", creds->dir,
+                           QCRYPTO_TLS_CREDS_X509_CLIENT_KEY);
+}
+
+static int vxhs_open(BlockDriverState *bs, QDict *options,
+                     int bdrv_flags, Error **errp)
+{
+    BDRVVXHSState *s = bs->opaque;
+    void *dev_handlep;
+    QDict *backing_options = NULL;
+    QemuOpts *opts = NULL;
+    QemuOpts *tcp_opts = NULL;
+    char *of_vsa_addr = NULL;
+    Error *local_err = NULL;
+    const char *vdisk_id_opt;
+    const char *server_host_opt;
+    int ret = 0;
+    char *cacert = NULL;
+    char *client_key = NULL;
+    char *client_cert = NULL;
+
+    ret = vxhs_init_and_ref();
+    if (ret < 0) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    /* Create opts info from runtime_opts and runtime_tcp_opts list */
+    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
+    tcp_opts = qemu_opts_create(&runtime_tcp_opts, NULL, 0, &error_abort);
+
+    qemu_opts_absorb_qdict(opts, options, &local_err);
+    if (local_err) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    /* vdisk-id is the disk UUID */
+    vdisk_id_opt = qemu_opt_get(opts, VXHS_OPT_VDISK_ID);
+    if (!vdisk_id_opt) {
+        error_setg(&local_err, QERR_MISSING_PARAMETER, VXHS_OPT_VDISK_ID);
+        ret = -EINVAL;
+        goto out;
+    }
+
+    /* vdisk-id may contain a leading '/' */
+    if (strlen(vdisk_id_opt) > UUID_FMT_LEN + 1) {
+        error_setg(&local_err, "vdisk-id cannot be more than %d characters",
+                   UUID_FMT_LEN);
+        ret = -EINVAL;
+        goto out;
+    }
+
+    s->vdisk_guid = g_strdup(vdisk_id_opt);
+    trace_vxhs_open_vdiskid(vdisk_id_opt);
+
+    /* get the 'server.' arguments */
+    qdict_extract_subqdict(options, &backing_options, VXHS_OPT_SERVER".");
+
+    qemu_opts_absorb_qdict(tcp_opts, backing_options, &local_err);
+    if (local_err != NULL) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    server_host_opt = qemu_opt_get(tcp_opts, VXHS_OPT_HOST);
+    if (!server_host_opt) {
+        error_setg(&local_err, QERR_MISSING_PARAMETER,
+                   VXHS_OPT_SERVER"."VXHS_OPT_HOST);
+        ret = -EINVAL;
+        goto out;
+    }
+
+    if (strlen(server_host_opt) > MAXHOSTNAMELEN) {
+        error_setg(&local_err, "server.host cannot be more than %d characters",
+                   MAXHOSTNAMELEN);
+        ret = -EINVAL;
+        goto out;
+    }
+
+    /* check if we got tls-creds via the --object argument */
+    s->tlscredsid = g_strdup(qemu_opt_get(opts, "tls-creds"));
+    if (s->tlscredsid) {
+        vxhs_get_tls_creds(s->tlscredsid, &cacert, &client_key,
+                           &client_cert, &local_err);
+        if (local_err != NULL) {
+            ret = -EINVAL;
+            goto out;
+        }
+        trace_vxhs_get_creds(cacert, client_key, client_cert);
+    }
+
+    s->vdisk_hostinfo.host = g_strdup(server_host_opt);
+    s->vdisk_hostinfo.port = g_ascii_strtoll(qemu_opt_get(tcp_opts,
+                                                          VXHS_OPT_PORT),
+                                                          NULL, 0);
+
+    trace_vxhs_open_hostinfo(s->vdisk_hostinfo.host,
+                             s->vdisk_hostinfo.port);
+
+    of_vsa_addr = g_strdup_printf("of://%s:%d",
+                                  s->vdisk_hostinfo.host,
+                                  s->vdisk_hostinfo.port);
+
+    /*
+     * Open qnio channel to storage agent if not opened before
+     */
+    dev_handlep = iio_open(of_vsa_addr, s->vdisk_guid, 0,
+                           cacert, client_key, client_cert);
+    if (dev_handlep == NULL) {
+        trace_vxhs_open_iio_open(of_vsa_addr);
+        ret = -ENODEV;
+        goto out;
+    }
+    s->vdisk_hostinfo.dev_handle = dev_handlep;
+
+out:
+    g_free(of_vsa_addr);
+    QDECREF(backing_options);
+    qemu_opts_del(tcp_opts);
+    qemu_opts_del(opts);
+    g_free(cacert);
+    g_free(client_key);
+    g_free(client_cert);
+
+    if (ret < 0) {
+        vxhs_unref();
+        error_propagate(errp, local_err);
+        g_free(s->vdisk_hostinfo.host);
+        g_free(s->vdisk_guid);
+        g_free(s->tlscredsid);
+        s->vdisk_guid = NULL;
+    }
+
+    return ret;
+}
+
+static const AIOCBInfo vxhs_aiocb_info = {
+    .aiocb_size = sizeof(VXHSAIOCB)
+};
+
+/*
+ * This allocates QEMU-VXHS callback for each IO
+ * and is passed to QNIO. When QNIO completes the work,
+ * it will be passed back through the callback.
+ */
+static BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs, int64_t sector_num,
+                               QEMUIOVector *qiov, int nb_sectors,
+                               BlockCompletionFunc *cb, void *opaque,
+                               VDISKAIOCmd iodir)
+{
+    VXHSAIOCB *acb = NULL;
+    BDRVVXHSState *s = bs->opaque;
+    size_t size;
+    uint64_t offset;
+    int iio_flags = 0;
+    int ret = 0;
+    void *dev_handle = s->vdisk_hostinfo.dev_handle;
+
+    offset = sector_num * BDRV_SECTOR_SIZE;
+    size = nb_sectors * BDRV_SECTOR_SIZE;
+    acb = qemu_aio_get(&vxhs_aiocb_info, bs, cb, opaque);
+
+    /*
+     * Initialize VXHSAIOCB.
+     */
+    acb->err = 0;
+
+    iio_flags = IIO_FLAG_ASYNC;
+
+    switch (iodir) {
+    case VDISK_AIO_WRITE:
+            ret = iio_writev(dev_handle, acb, qiov->iov, qiov->niov,
+                             offset, (uint64_t)size, iio_flags);
+            break;
+    case VDISK_AIO_READ:
+            ret = iio_readv(dev_handle, acb, qiov->iov, qiov->niov,
+                            offset, (uint64_t)size, iio_flags);
+            break;
+    default:
+            trace_vxhs_aio_rw_invalid(iodir);
+            goto errout;
+    }
+
+    if (ret != 0) {
+        trace_vxhs_aio_rw_ioerr(s->vdisk_guid, iodir, size, offset,
+                                acb, ret, errno);
+        goto errout;
+    }
+    return &acb->common;
+
+errout:
+    qemu_aio_unref(acb);
+    return NULL;
+}
+
+static BlockAIOCB *vxhs_aio_readv(BlockDriverState *bs,
+                                   int64_t sector_num, QEMUIOVector *qiov,
+                                   int nb_sectors,
+                                   BlockCompletionFunc *cb, void *opaque)
+{
+    return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors, cb,
+                       opaque, VDISK_AIO_READ);
+}
+
+static BlockAIOCB *vxhs_aio_writev(BlockDriverState *bs,
+                                   int64_t sector_num, QEMUIOVector *qiov,
+                                   int nb_sectors,
+                                   BlockCompletionFunc *cb, void *opaque)
+{
+    return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors,
+                       cb, opaque, VDISK_AIO_WRITE);
+}
+
+static void vxhs_close(BlockDriverState *bs)
+{
+    BDRVVXHSState *s = bs->opaque;
+
+    trace_vxhs_close(s->vdisk_guid);
+
+    g_free(s->vdisk_guid);
+    s->vdisk_guid = NULL;
+
+    /*
+     * Close vDisk device
+     */
+    if (s->vdisk_hostinfo.dev_handle) {
+        iio_close(s->vdisk_hostinfo.dev_handle);
+        s->vdisk_hostinfo.dev_handle = NULL;
+    }
+
+    vxhs_unref();
+
+    /*
+     * Free the dynamically allocated host string etc
+     */
+    g_free(s->vdisk_hostinfo.host);
+    g_free(s->tlscredsid);
+    s->tlscredsid = NULL;
+    s->vdisk_hostinfo.host = NULL;
+    s->vdisk_hostinfo.port = 0;
+}
+
+static int64_t vxhs_get_vdisk_stat(BDRVVXHSState *s)
+{
+    int64_t vdisk_size = -1;
+    int ret = 0;
+    void *dev_handle = s->vdisk_hostinfo.dev_handle;
+
+    ret = iio_ioctl(dev_handle, IOR_VDISK_STAT, &vdisk_size, 0);
+    if (ret < 0) {
+        trace_vxhs_get_vdisk_stat_err(s->vdisk_guid, ret, errno);
+        return -EIO;
+    }
+
+    trace_vxhs_get_vdisk_stat(s->vdisk_guid, vdisk_size);
+    return vdisk_size;
+}
+
+/*
+ * Returns the size of vDisk in bytes. This is required
+ * by QEMU block upper block layer so that it is visible
+ * to guest.
+ */
+static int64_t vxhs_getlength(BlockDriverState *bs)
+{
+    BDRVVXHSState *s = bs->opaque;
+    int64_t vdisk_size;
+
+    vdisk_size = vxhs_get_vdisk_stat(s);
+    if (vdisk_size < 0) {
+        return -EIO;
+    }
+
+    return vdisk_size;
+}
+
+static BlockDriver bdrv_vxhs = {
+    .format_name                  = "vxhs",
+    .protocol_name                = "vxhs",
+    .instance_size                = sizeof(BDRVVXHSState),
+    .bdrv_file_open               = vxhs_open,
+    .bdrv_parse_filename          = vxhs_parse_filename,
+    .bdrv_close                   = vxhs_close,
+    .bdrv_getlength               = vxhs_getlength,
+    .bdrv_aio_readv               = vxhs_aio_readv,
+    .bdrv_aio_writev              = vxhs_aio_writev,
+};
+
+static void bdrv_vxhs_init(void)
+{
+    bdrv_register(&bdrv_vxhs);
+}
+
+block_init(bdrv_vxhs_init);
--- a/block/win32-aio.c
+++ b/block/win32-aio.c
@@ -41,7 +41,7 @@ struct QEMUWin32AIOState {
    HANDLE hIOCP;
    EventNotifier e;
    int count;
-    bool is_aio_context_attached;
+    AioContext *aio_ctx;
 };

 typedef struct QEMUWin32AIOCB {
@@ -87,7 +87,6 @@ static void win32_aio_process_completion(QEMUWin32AIOState *s,
        qemu_vfree(waiocb->buf);
    }

-
    waiocb->common.cb(waiocb->common.opaque, ret);
    qemu_aio_unref(waiocb);
 }
@@ -176,13 +175,13 @@ void win32_aio_detach_aio_context(QEMUWin32AIOState *aio,
                                  AioContext *old_context)
 {
    aio_set_event_notifier(old_context, &aio->e, false, NULL, NULL);
-    aio->is_aio_context_attached = false;
+    aio->aio_ctx = NULL;
 }

 void win32_aio_attach_aio_context(QEMUWin32AIOState *aio,
                                  AioContext *new_context)
 {
-    aio->is_aio_context_attached = true;
+    aio->aio_ctx = new_context;
    aio_set_event_notifier(new_context, &aio->e, false,
                           win32_aio_completion_cb, NULL);
 }
@@ -212,7 +211,7 @@ out_free_state:

 void win32_aio_cleanup(QEMUWin32AIOState *aio)
 {
-    assert(!aio->is_aio_context_attached);
+    assert(!aio->aio_ctx);
    CloseHandle(aio->hIOCP);
    event_notifier_cleanup(&aio->e);
    g_free(aio);
--- a/blockdev-nbd.c
+++ b/blockdev-nbd.c
@@ -124,6 +124,7 @@ void qmp_nbd_server_start(SocketAddress *addr,
            goto error;
        }

+        /* TODO SOCKET_ADDRESS_KIND_FD where fd has AF_INET or AF_INET6 */
        if (addr->type != SOCKET_ADDRESS_KIND_INET) {
            error_setg(errp, "TLS is only supported with IPv4/IPv6");
            goto error;
--- a/blockdev.c
+++ b/blockdev.c
@@ -52,6 +52,7 @@
 #include "sysemu/arch_init.h"
 #include "qemu/cutils.h"
 #include "qemu/help_option.h"
+#include "qemu/throttle-options.h"

 static QTAILQ_HEAD(, BlockDriverState) monitor_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(monitor_bdrv_states);
@@ -227,27 +228,30 @@ DriveInfo *drive_get(BlockInterfaceType type, int bus, int unit)
    return NULL;
 }

-bool drive_check_orphaned(void)
+void drive_check_orphaned(void)
 {
    BlockBackend *blk;
    DriveInfo *dinfo;
-    bool rs = false;
+    Location loc;
+    bool orphans = false;

    for (blk = blk_next(NULL); blk; blk = blk_next(blk)) {
        dinfo = blk_legacy_dinfo(blk);
-        /* If dinfo->bdrv->dev is NULL, it has no device attached. */
-        /* Unless this is a default drive, this may be an oversight. */
        if (!blk_get_attached_dev(blk) && !dinfo->is_default &&
            dinfo->type != IF_NONE) {
-            fprintf(stderr, "Warning: Orphaned drive without device: "
-                    "id=%s,file=%s,if=%s,bus=%d,unit=%d\n",
-                    blk_name(blk), blk_bs(blk) ? blk_bs(blk)->filename : "",
+            loc_push_none(&loc);
+            qemu_opts_loc_restore(dinfo->opts);
+            error_report("machine type does not support"
+                         " if=%s,bus=%d,unit=%d",
                         if_name[dinfo->type], dinfo->bus, dinfo->unit);
-            rs = true;
+            loc_pop(&loc);
+            orphans = true;
        }
    }

-    return rs;
+    if (orphans) {
+        exit(1);
+    }
 }

 DriveInfo *drive_get_by_index(BlockInterfaceType type, int index)
@@ -554,7 +558,7 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts,
    if ((!file || !*file) && !qdict_size(bs_opts)) {
        BlockBackendRootState *blk_rs;

-        blk = blk_new();
+        blk = blk_new(0, BLK_PERM_ALL);
        blk_rs = blk_get_root_state(blk);
        blk_rs->open_flags    = bdrv_flags;
        blk_rs->read_only     = read_only;
@@ -1610,6 +1614,7 @@ typedef struct ExternalSnapshotState {
    BlockDriverState *old_bs;
    BlockDriverState *new_bs;
    AioContext *aio_context;
+    bool overlay_appended;
 } ExternalSnapshotState;

 static void external_snapshot_prepare(BlkActionState *common,
@@ -1723,7 +1728,7 @@ static void external_snapshot_prepare(BlkActionState *common,
            bdrv_img_create(new_image_file, format,
                            state->old_bs->filename,
                            state->old_bs->drv->format_name,
-                            NULL, size, flags, &local_err, false);
+                            NULL, size, flags, false, &local_err);
            if (local_err) {
                error_propagate(errp, local_err);
                return;
@@ -1764,7 +1769,21 @@ static void external_snapshot_prepare(BlkActionState *common,

    if (!state->new_bs->drv->supports_backing) {
        error_setg(errp, "The snapshot does not support backing images");
+        return;
    }
+
+    bdrv_set_aio_context(state->new_bs, state->aio_context);
+
+    /* This removes our old bs and adds the new bs. This is an operation that
+     * can fail, so we need to do it in .prepare; undoing it for abort is
+     * always possible. */
+    bdrv_ref(state->new_bs);
+    bdrv_append(state->new_bs, state->old_bs, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+    state->overlay_appended = true;
 }

 static void external_snapshot_commit(BlkActionState *common)
@@ -1772,10 +1791,6 @@ static void external_snapshot_commit(BlkActionState *common)
    ExternalSnapshotState *state =
                             DO_UPCAST(ExternalSnapshotState, common, common);

-    bdrv_set_aio_context(state->new_bs, state->aio_context);
-
-    /* This removes our old bs and adds the new bs */
-    bdrv_append(state->new_bs, state->old_bs);
    /* We don't need (or want) to use the transactional
     * bdrv_reopen_multiple() across all the entries at once, because we
     * don't want to abort all of them if one of them fails the reopen */
@@ -1790,7 +1805,9 @@ static void external_snapshot_abort(BlkActionState *common)
    ExternalSnapshotState *state =
                             DO_UPCAST(ExternalSnapshotState, common, common);
    if (state->new_bs) {
-        bdrv_unref(state->new_bs);
+        if (state->overlay_appended) {
+            bdrv_replace_node(state->new_bs, state->old_bs, &error_abort);
+        }
    }
 }

@@ -1801,6 +1818,7 @@ static void external_snapshot_clean(BlkActionState *common)
    if (state->aio_context) {
        bdrv_drained_end(state->old_bs);
        aio_context_release(state->aio_context);
+        bdrv_unref(state->new_bs);
    }
 }

@@ -2029,7 +2047,9 @@ static void block_dirty_bitmap_clear_abort(BlkActionState *common)
    BlockDirtyBitmapState *state = DO_UPCAST(BlockDirtyBitmapState,
                                             common, common);

+    if (state->backup) {
        bdrv_undo_clear_dirty_bitmap(state->bitmap, state->backup);
+    }
 }

 static void block_dirty_bitmap_clear_commit(BlkActionState *common)
@@ -2307,7 +2327,7 @@ static int do_open_tray(const char *blk_name, const char *qdev_id,
    }

    if (!locked || force) {
-        blk_dev_change_media_cb(blk, false);
+        blk_dev_change_media_cb(blk, false, &error_abort);
    }

    if (locked && !force) {
@@ -2345,6 +2365,7 @@ void qmp_blockdev_close_tray(bool has_device, const char *device,
                             Error **errp)
 {
    BlockBackend *blk;
+    Error *local_err = NULL;

    device = has_device ? device : NULL;
    id = has_id ? id : NULL;
@@ -2368,7 +2389,11 @@ void qmp_blockdev_close_tray(bool has_device, const char *device,
        return;
    }

-    blk_dev_change_media_cb(blk, true);
+    blk_dev_change_media_cb(blk, true, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
 }

 void qmp_x_blockdev_remove_medium(bool has_device, const char *device,
@@ -2421,7 +2446,7 @@ void qmp_x_blockdev_remove_medium(bool has_device, const char *device,
         * called at all); therefore, the medium needs to be ejected here.
         * Do it after blk_remove_bs() so blk_is_inserted(blk) returns the @load
         * value passed here (i.e. false). */
-        blk_dev_change_media_cb(blk, false);
+        blk_dev_change_media_cb(blk, false, &error_abort);
    }

 out:
@@ -2431,7 +2456,9 @@ out:
 static void qmp_blockdev_insert_anon_medium(BlockBackend *blk,
                                            BlockDriverState *bs, Error **errp)
 {
+    Error *local_err = NULL;
    bool has_device;
+    int ret;

    /* For BBs without a device, we can exchange the BDS tree at will */
    has_device = blk_get_attached_dev(blk);
@@ -2451,7 +2478,10 @@ static void qmp_blockdev_insert_anon_medium(BlockBackend *blk,
        return;
    }

-    blk_insert_bs(blk, bs);
+    ret = blk_insert_bs(blk, bs, errp);
+    if (ret < 0) {
+        return;
+    }

    if (!blk_dev_has_tray(blk)) {
        /* For tray-less devices, blockdev-close-tray is a no-op (or may not be
@@ -2459,7 +2489,12 @@ static void qmp_blockdev_insert_anon_medium(BlockBackend *blk,
         * slot here.
         * Do it after blk_insert_bs() so blk_is_inserted(blk) returns the @load
         * value passed here (i.e. true). */
-        blk_dev_change_media_cb(blk, true);
+        blk_dev_change_media_cb(blk, true, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            blk_remove_bs(blk);
+            return;
+        }
    }
 }

@@ -2800,7 +2835,7 @@ void hmp_drive_del(Monitor *mon, const QDict *qdict)

    bs = bdrv_find_node(id);
    if (bs) {
-        qmp_x_blockdev_del(id, &local_err);
+        qmp_blockdev_del(id, &local_err);
        if (local_err) {
            error_report_err(local_err);
        }
@@ -2855,6 +2890,7 @@ void qmp_block_resize(bool has_device, const char *device,
                      int64_t size, Error **errp)
 {
    Error *local_err = NULL;
+    BlockBackend *blk = NULL;
    BlockDriverState *bs;
    AioContext *aio_context;
    int ret;
@@ -2885,10 +2921,16 @@ void qmp_block_resize(bool has_device, const char *device,
        goto out;
    }

+    blk = blk_new(BLK_PERM_RESIZE, BLK_PERM_ALL);
+    ret = blk_insert_bs(blk, bs, errp);
+    if (ret < 0) {
+        goto out;
+    }
+
    /* complete all in-flight operations before resizing the device */
    bdrv_drain_all();

-    ret = bdrv_truncate(bs, size);
+    ret = blk_truncate(blk, size);
    switch (ret) {
    case 0:
        break;
@@ -2910,6 +2952,7 @@ void qmp_block_resize(bool has_device, const char *device,
    }

 out:
+    blk_unref(blk);
    aio_context_release(aio_context);
 }

@@ -3005,6 +3048,7 @@ void qmp_block_commit(bool has_job_id, const char *job_id, const char *device,
                      bool has_top, const char *top,
                      bool has_backing_file, const char *backing_file,
                      bool has_speed, int64_t speed,
+                      bool has_filter_node_name, const char *filter_node_name,
                      Error **errp)
 {
    BlockDriverState *bs;
@@ -3020,6 +3064,9 @@ void qmp_block_commit(bool has_job_id, const char *job_id, const char *device,
    if (!has_speed) {
        speed = 0;
    }
+    if (!has_filter_node_name) {
+        filter_node_name = NULL;
+    }

    /* Important Note:
     *  libvirt relies on the DeviceNotFound error class in order to probe for
@@ -3094,8 +3141,8 @@ void qmp_block_commit(bool has_job_id, const char *job_id, const char *device,
            goto out;
        }
        commit_active_start(has_job_id ? job_id : NULL, bs, base_bs,
-                            BLOCK_JOB_DEFAULT, speed, on_error, NULL, NULL,
-                            &local_err, false);
+                            BLOCK_JOB_DEFAULT, speed, on_error,
+                            filter_node_name, NULL, NULL, false, &local_err);
    } else {
        BlockDriverState *overlay_bs = bdrv_find_overlay(bs, top_bs);
        if (bdrv_op_is_blocked(overlay_bs, BLOCK_OP_TYPE_COMMIT_TARGET, errp)) {
@@ -3103,7 +3150,7 @@ void qmp_block_commit(bool has_job_id, const char *job_id, const char *device,
        }
        commit_start(has_job_id ? job_id : NULL, bs, base_bs, top_bs, speed,
                     on_error, has_backing_file ? backing_file : NULL,
-                     &local_err);
+                     filter_node_name, &local_err);
    }
    if (local_err != NULL) {
        error_propagate(errp, local_err);
@@ -3190,10 +3237,10 @@ static BlockJob *do_drive_backup(DriveBackup *backup, BlockJobTxn *txn,
        if (source) {
            bdrv_img_create(backup->target, backup->format, source->filename,
                            source->drv->format_name, NULL,
-                            size, flags, &local_err, false);
+                            size, flags, false, &local_err);
        } else {
            bdrv_img_create(backup->target, backup->format, NULL, NULL, NULL,
-                            size, flags, &local_err, false);
+                            size, flags, false, &local_err);
        }
    }

@@ -3339,6 +3386,8 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
                                   bool has_on_target_error,
                                   BlockdevOnError on_target_error,
                                   bool has_unmap, bool unmap,
+                                   bool has_filter_node_name,
+                                   const char *filter_node_name,
                                   Error **errp)
 {

@@ -3360,6 +3409,9 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
    if (!has_unmap) {
        unmap = true;
    }
+    if (!has_filter_node_name) {
+        filter_node_name = NULL;
+    }

    if (granularity != 0 && (granularity < 512 || granularity > 1048576 * 64)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "granularity",
@@ -3389,7 +3441,8 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
    mirror_start(job_id, bs, target,
                 has_replaces ? replaces : NULL,
                 speed, granularity, buf_size, sync, backing_mode,
-                 on_source_error, on_target_error, unmap, errp);
+                 on_source_error, on_target_error, unmap, filter_node_name,
+                 errp);
 }

 void qmp_drive_mirror(DriveMirror *arg, Error **errp)
@@ -3478,7 +3531,7 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
        /* create new image w/o backing file */
        assert(format);
        bdrv_img_create(arg->target, format,
-                        NULL, NULL, NULL, size, flags, &local_err, false);
+                        NULL, NULL, NULL, size, flags, false, &local_err);
    } else {
        switch (arg->mode) {
        case NEW_IMAGE_MODE_EXISTING:
@@ -3488,7 +3541,7 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
            bdrv_img_create(arg->target, format,
                            source->filename,
                            source->drv->format_name,
-                            NULL, size, flags, &local_err, false);
+                            NULL, size, flags, false, &local_err);
            break;
        default:
            abort();
@@ -3527,6 +3580,7 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
                           arg->has_on_source_error, arg->on_source_error,
                           arg->has_on_target_error, arg->on_target_error,
                           arg->has_unmap, arg->unmap,
+                           false, NULL,
                           &local_err);
    bdrv_unref(target_bs);
    error_propagate(errp, local_err);
@@ -3545,6 +3599,8 @@ void qmp_blockdev_mirror(bool has_job_id, const char *job_id,
                         BlockdevOnError on_source_error,
                         bool has_on_target_error,
                         BlockdevOnError on_target_error,
+                         bool has_filter_node_name,
+                         const char *filter_node_name,
                         Error **errp)
 {
    BlockDriverState *bs;
@@ -3576,6 +3632,7 @@ void qmp_blockdev_mirror(bool has_job_id, const char *job_id,
                           has_on_source_error, on_source_error,
                           has_on_target_error, on_target_error,
                           true, true,
+                           has_filter_node_name, filter_node_name,
                           &local_err);
    error_propagate(errp, local_err);

@@ -3843,7 +3900,7 @@ fail:
    visit_free(v);
 }

-void qmp_x_blockdev_del(const char *node_name, Error **errp)
+void qmp_blockdev_del(const char *node_name, Error **errp)
 {
    AioContext *aio_context;
    BlockDriverState *bs;
@@ -3999,83 +4056,11 @@ QemuOptsList qemu_common_drive_opts = {
            .name = BDRV_OPT_READ_ONLY,
            .type = QEMU_OPT_BOOL,
            .help = "open drive file as read-only",
-        },{
-            .name = "throttling.iops-total",
-            .type = QEMU_OPT_NUMBER,
-            .help = "limit total I/O operations per second",
-        },{
-            .name = "throttling.iops-read",
-            .type = QEMU_OPT_NUMBER,
-            .help = "limit read operations per second",
-        },{
-            .name = "throttling.iops-write",
-            .type = QEMU_OPT_NUMBER,
-            .help = "limit write operations per second",
-        },{
-            .name = "throttling.bps-total",
-            .type = QEMU_OPT_NUMBER,
-            .help = "limit total bytes per second",
-        },{
-            .name = "throttling.bps-read",
-            .type = QEMU_OPT_NUMBER,
-            .help = "limit read bytes per second",
-        },{
-            .name = "throttling.bps-write",
-            .type = QEMU_OPT_NUMBER,
-            .help = "limit write bytes per second",
-        },{
-            .name = "throttling.iops-total-max",
-            .type = QEMU_OPT_NUMBER,
-            .help = "I/O operations burst",
-        },{
-            .name = "throttling.iops-read-max",
-            .type = QEMU_OPT_NUMBER,
-            .help = "I/O operations read burst",
-        },{
-            .name = "throttling.iops-write-max",
-            .type = QEMU_OPT_NUMBER,
-            .help = "I/O operations write burst",
-        },{
-            .name = "throttling.bps-total-max",
-            .type = QEMU_OPT_NUMBER,
-            .help = "total bytes burst",
-        },{
-            .name = "throttling.bps-read-max",
-            .type = QEMU_OPT_NUMBER,
-            .help = "total bytes read burst",
-        },{
-            .name = "throttling.bps-write-max",
-            .type = QEMU_OPT_NUMBER,
-            .help = "total bytes write burst",
-        },{
-            .name = "throttling.iops-total-max-length",
-            .type = QEMU_OPT_NUMBER,
-            .help = "length of the iops-total-max burst period, in seconds",
-        },{
-            .name = "throttling.iops-read-max-length",
-            .type = QEMU_OPT_NUMBER,
-            .help = "length of the iops-read-max burst period, in seconds",
-        },{
-            .name = "throttling.iops-write-max-length",
-            .type = QEMU_OPT_NUMBER,
-            .help = "length of the iops-write-max burst period, in seconds",
-        },{
-            .name = "throttling.bps-total-max-length",
-            .type = QEMU_OPT_NUMBER,
-            .help = "length of the bps-total-max burst period, in seconds",
-        },{
-            .name = "throttling.bps-read-max-length",
-            .type = QEMU_OPT_NUMBER,
-            .help = "length of the bps-read-max burst period, in seconds",
-        },{
-            .name = "throttling.bps-write-max-length",
-            .type = QEMU_OPT_NUMBER,
-            .help = "length of the bps-write-max burst period, in seconds",
-        },{
-            .name = "throttling.iops-size",
-            .type = QEMU_OPT_NUMBER,
-            .help = "when limiting by iops max size of an I/O in bytes",
-        },{
+        },
+
+        THROTTLE_OPTS,
+
+        {
            .name = "throttling.group",
            .type = QEMU_OPT_STRING,
            .help = "name of the block throttling group",
--- a/blockjob.c
+++ b/blockjob.c
@@ -55,6 +55,36 @@ struct BlockJobTxn {

 static QLIST_HEAD(, BlockJob) block_jobs = QLIST_HEAD_INITIALIZER(block_jobs);

+static char *child_job_get_parent_desc(BdrvChild *c)
+{
+    BlockJob *job = c->opaque;
+    return g_strdup_printf("%s job '%s'",
+                           BlockJobType_lookup[job->driver->job_type],
+                           job->id);
+}
+
+static const BdrvChildRole child_job = {
+    .get_parent_desc    = child_job_get_parent_desc,
+    .stay_at_node       = true,
+};
+
+static void block_job_drained_begin(void *opaque)
+{
+    BlockJob *job = opaque;
+    block_job_pause(job);
+}
+
+static void block_job_drained_end(void *opaque)
+{
+    BlockJob *job = opaque;
+    block_job_resume(job);
+}
+
+static const BlockDevOps block_job_dev_ops = {
+    .drained_begin = block_job_drained_begin,
+    .drained_end = block_job_drained_end,
+};
+
 BlockJob *block_job_next(BlockJob *job)
 {
    if (!job) {
@@ -115,19 +145,44 @@ static void block_job_detach_aio_context(void *opaque)
    block_job_unref(job);
 }

-void block_job_add_bdrv(BlockJob *job, BlockDriverState *bs)
+void block_job_remove_all_bdrv(BlockJob *job)
 {
-    job->nodes = g_slist_prepend(job->nodes, bs);
+    GSList *l;
+    for (l = job->nodes; l; l = l->next) {
+        BdrvChild *c = l->data;
+        bdrv_op_unblock_all(c->bs, job->blocker);
+        bdrv_root_unref_child(c);
+    }
+    g_slist_free(job->nodes);
+    job->nodes = NULL;
+}
+
+int block_job_add_bdrv(BlockJob *job, const char *name, BlockDriverState *bs,
+                       uint64_t perm, uint64_t shared_perm, Error **errp)
+{
+    BdrvChild *c;
+
+    c = bdrv_root_attach_child(bs, name, &child_job, perm, shared_perm,
+                               job, errp);
+    if (c == NULL) {
+        return -EPERM;
+    }
+
+    job->nodes = g_slist_prepend(job->nodes, c);
    bdrv_ref(bs);
    bdrv_op_block_all(bs, job->blocker);
+
+    return 0;
 }

 void *block_job_create(const char *job_id, const BlockJobDriver *driver,
-                       BlockDriverState *bs, int64_t speed, int flags,
+                       BlockDriverState *bs, uint64_t perm,
+                       uint64_t shared_perm, int64_t speed, int flags,
                       BlockCompletionFunc *cb, void *opaque, Error **errp)
 {
    BlockBackend *blk;
    BlockJob *job;
+    int ret;

    if (bs->job) {
        error_setg(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
@@ -159,15 +214,14 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
        }
    }

-    blk = blk_new();
-    blk_insert_bs(blk, bs);
+    blk = blk_new(perm, shared_perm);
+    ret = blk_insert_bs(blk, bs, errp);
+    if (ret < 0) {
+        blk_unref(blk);
+        return NULL;
+    }

    job = g_malloc0(driver->instance_size);
-    error_setg(&job->blocker, "block device is in use by block job: %s",
-               BlockJobType_lookup[driver->job_type]);
-    block_job_add_bdrv(job, bs);
-    bdrv_op_unblock(bs, BLOCK_OP_TYPE_DATAPLANE, job->blocker);
-
    job->driver        = driver;
    job->id            = g_strdup(job_id);
    job->blk           = blk;
@@ -177,8 +231,15 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
    job->paused        = true;
    job->pause_count   = 1;
    job->refcnt        = 1;
+
+    error_setg(&job->blocker, "block device is in use by block job: %s",
+               BlockJobType_lookup[driver->job_type]);
+    block_job_add_bdrv(job, "main node", bs, 0, BLK_PERM_ALL, &error_abort);
    bs->job = job;

+    blk_set_dev_ops(blk, &block_job_dev_ops, job);
+    bdrv_op_unblock(bs, BLOCK_OP_TYPE_DATAPLANE, job->blocker);
+
    QLIST_INSERT_HEAD(&block_jobs, job, job_list);

    blk_add_aio_context_notifier(blk, block_job_attached_aio_context,
@@ -208,16 +269,28 @@ static bool block_job_started(BlockJob *job)
    return job->co;
 }

+/**
+ * All jobs must allow a pause point before entering their job proper. This
+ * ensures that jobs can be paused prior to being started, then resumed later.
+ */
+static void coroutine_fn block_job_co_entry(void *opaque)
+{
+    BlockJob *job = opaque;
+
+    assert(job && job->driver && job->driver->start);
+    block_job_pause_point(job);
+    job->driver->start(job);
+}
+
 void block_job_start(BlockJob *job)
 {
    assert(job && !block_job_started(job) && job->paused &&
-           !job->busy && job->driver->start);
-    job->co = qemu_coroutine_create(job->driver->start, job);
-    if (--job->pause_count == 0) {
-        job->paused = false;
+           job->driver && job->driver->start);
+    job->co = qemu_coroutine_create(block_job_co_entry, job);
+    job->pause_count--;
    job->busy = true;
-        qemu_coroutine_enter(job->co);
-    }
+    job->paused = false;
+    bdrv_coroutine_enter(blk_bs(job->blk), job->co);
 }

 void block_job_ref(BlockJob *job)
@@ -228,15 +301,9 @@ void block_job_ref(BlockJob *job)
 void block_job_unref(BlockJob *job)
 {
    if (--job->refcnt == 0) {
-        GSList *l;
        BlockDriverState *bs = blk_bs(job->blk);
        bs->job = NULL;
-        for (l = job->nodes; l; l = l->next) {
-            bs = l->data;
-            bdrv_op_unblock_all(bs, job->blocker);
-            bdrv_unref(bs);
-        }
-        g_slist_free(job->nodes);
+        block_job_remove_all_bdrv(job);
        blk_remove_aio_context_notifier(job->blk,
                                        block_job_attached_aio_context,
                                        block_job_detach_aio_context, job);
@@ -465,7 +532,7 @@ void block_job_user_resume(BlockJob *job)
 void block_job_enter(BlockJob *job)
 {
    if (job->co && !job->busy) {
-        qemu_coroutine_enter(job->co);
+        bdrv_coroutine_enter(blk_bs(job->blk), job->co);
    }
 }

@@ -719,12 +786,16 @@ static void block_job_defer_to_main_loop_bh(void *opaque)

    /* Fetch BDS AioContext again, in case it has changed */
    aio_context = blk_get_aio_context(data->job->blk);
+    if (aio_context != data->aio_context) {
        aio_context_acquire(aio_context);
+    }

    data->job->deferred_to_main_loop = false;
    data->fn(data->job, data->opaque);

+    if (aio_context != data->aio_context) {
        aio_context_release(aio_context);
+    }

    aio_context_release(data->aio_context);

--- a/bsd-user/mmap.c
+++ b/bsd-user/mmap.c
@@ -24,8 +24,7 @@

 //#define DEBUG_MMAP

-#if defined(CONFIG_USE_NPTL)
-pthread_mutex_t mmap_mutex;
+static pthread_mutex_t mmap_mutex = PTHREAD_MUTEX_INITIALIZER;
 static int __thread mmap_lock_count;

 void mmap_lock(void)
@@ -62,16 +61,6 @@ void mmap_fork_end(int child)
    else
        pthread_mutex_unlock(&mmap_mutex);
 }
-#else
-/* We aren't threadsafe to start with, so no need to worry about locking.  */
-void mmap_lock(void)
-{
-}
-
-void mmap_unlock(void)
-{
-}
-#endif

 /* NOTE: all the constants are the HOST ones, but addresses are target. */
 int target_mprotect(abi_ulong start, abi_ulong len, int prot)
@@ -199,12 +188,7 @@ static int mmap_frag(abi_ulong real_start,
    return 0;
 }

-#if defined(__CYGWIN__)
-/* Cygwin doesn't have a whole lot of address space.  */
-static abi_ulong mmap_next_start = 0x18000000;
-#else
 static abi_ulong mmap_next_start = 0x40000000;
-#endif

 unsigned long last_brk;

--- a/bsd-user/qemu.h
+++ b/bsd-user/qemu.h
@@ -209,10 +209,8 @@ abi_long target_mremap(abi_ulong old_addr, abi_ulong old_size,
                       abi_ulong new_addr);
 int target_msync(abi_ulong start, abi_ulong len, int flags);
 extern unsigned long last_brk;
-#if defined(CONFIG_USE_NPTL)
 void mmap_fork_start(void);
 void mmap_fork_end(int child);
-#endif

 /* main.c */
 extern unsigned long x86_stack_size;
--- a/chardev/char-fd.c
+++ b/chardev/char-fd.c
@@ -58,7 +58,7 @@ static gboolean fd_chr_read(QIOChannel *chan, GIOCondition cond, void *opaque)
    ret = qio_channel_read(
        chan, (gchar *)buf, len, NULL);
    if (ret == 0) {
-        remove_fd_in_watch(chr);
+        remove_fd_in_watch(chr, NULL);
        qemu_chr_be_event(chr, CHR_EVENT_CLOSED);
        return FALSE;
    }
@@ -89,7 +89,7 @@ static void fd_chr_update_read_handler(Chardev *chr,
 {
    FDChardev *s = FD_CHARDEV(chr);

-    remove_fd_in_watch(chr);
+    remove_fd_in_watch(chr, NULL);
    if (s->ioc_in) {
        chr->fd_in_tag = io_add_watch_poll(chr, s->ioc_in,
                                           fd_chr_read_poll,
@@ -103,7 +103,7 @@ static void char_fd_finalize(Object *obj)
    Chardev *chr = CHARDEV(obj);
    FDChardev *s = FD_CHARDEV(obj);

-    remove_fd_in_watch(chr);
+    remove_fd_in_watch(chr, NULL);
    if (s->ioc_in) {
        object_unref(OBJECT(s->ioc_in));
    }
--- a/chardev/char-io.c
+++ b/chardev/char-io.c
@@ -127,14 +127,14 @@ guint io_add_watch_poll(Chardev *chr,
    return tag;
 }

-static void io_remove_watch_poll(guint tag)
+static void io_remove_watch_poll(guint tag, GMainContext *context)
 {
    GSource *source;
    IOWatchPoll *iwp;

    g_return_if_fail(tag > 0);

-    source = g_main_context_find_source_by_id(NULL, tag);
+    source = g_main_context_find_source_by_id(context, tag);
    g_return_if_fail(source != NULL);

    iwp = io_watch_poll_from_source(source);
@@ -146,10 +146,10 @@ static void io_remove_watch_poll(guint tag)
    g_source_destroy(&iwp->parent);
 }

-void remove_fd_in_watch(Chardev *chr)
+void remove_fd_in_watch(Chardev *chr, GMainContext *context)
 {
    if (chr->fd_in_tag) {
-        io_remove_watch_poll(chr->fd_in_tag);
+        io_remove_watch_poll(chr->fd_in_tag, context);
        chr->fd_in_tag = 0;
    }
 }
--- a/chardev/char-io.h
+++ b/chardev/char-io.h
@@ -36,7 +36,7 @@ guint io_add_watch_poll(Chardev *chr,
                        gpointer user_data,
                        GMainContext *context);

-void remove_fd_in_watch(Chardev *chr);
+void remove_fd_in_watch(Chardev *chr, GMainContext *context);

 int io_channel_send(QIOChannel *ioc, const void *buf, size_t len);

--- a/chardev/char-pty.c
+++ b/chardev/char-pty.c
@@ -129,7 +129,7 @@ static int char_pty_chr_write(Chardev *chr, const uint8_t *buf, int len)
        /* guest sends data, check for (re-)connect */
        pty_chr_update_read_handler_locked(chr);
        if (!s->connected) {
-            return 0;
+            return len;
        }
    }
    return io_channel_send(s->ioc, buf, len);
@@ -199,7 +199,7 @@ static void pty_chr_state(Chardev *chr, int connected)
            g_source_remove(s->open_tag);
            s->open_tag = 0;
        }
-        remove_fd_in_watch(chr);
+        remove_fd_in_watch(chr, NULL);
        s->connected = 0;
        /* (re-)connect poll interval for idle guests: once per second.
         * We check more frequently in case the guests sends data to
--- a/chardev/char-socket.c
+++ b/chardev/char-socket.c
@@ -47,7 +47,6 @@ typedef struct {
    int max_size;
    int do_telnetopt;
    int do_nodelay;
-    int is_unix;
    int *read_msgfds;
    size_t read_msgfds_num;
    int *write_msgfds;
@@ -97,6 +96,9 @@ static gboolean tcp_chr_accept(QIOChannel *chan,
                               GIOCondition cond,
                               void *opaque);

+static int tcp_chr_read_poll(void *opaque);
+static void tcp_chr_disconnect(Chardev *chr);
+
 /* Called with chr_write_lock held.  */
 static int tcp_chr_write(Chardev *chr, const uint8_t *buf, int len)
 {
@@ -114,6 +116,13 @@ static int tcp_chr_write(Chardev *chr, const uint8_t *buf, int len)
            s->write_msgfds_num = 0;
        }

+        if (ret < 0 && errno != EAGAIN) {
+            if (tcp_chr_read_poll(chr) <= 0) {
+                tcp_chr_disconnect(chr);
+                return len;
+            } /* else let the read handler finish it properly */
+        }
+
        return ret;
    } else {
        /* XXX: indicate an error ? */
@@ -318,7 +327,7 @@ static void tcp_chr_free_connection(Chardev *chr)
    }

    tcp_set_msgfds(chr, NULL, 0);
-    remove_fd_in_watch(chr);
+    remove_fd_in_watch(chr, NULL);
    object_unref(OBJECT(s->sioc));
    s->sioc = NULL;
    object_unref(OBJECT(s->ioc));
@@ -348,6 +357,10 @@ static char *SocketAddress_to_str(const char *prefix, SocketAddress *addr,
        return g_strdup_printf("%sfd:%s%s", prefix, addr->u.fd.data->str,
                               is_listen ? ",server" : "");
        break;
+    case SOCKET_ADDRESS_KIND_VSOCK:
+        return g_strdup_printf("%svsock:%s:%s", prefix,
+                               addr->u.vsock.data->cid,
+                               addr->u.vsock.data->port);
    default:
        abort();
    }
@@ -488,7 +501,7 @@ static void tcp_chr_update_read_handler(Chardev *chr,
        return;
    }

-    remove_fd_in_watch(chr);
+    remove_fd_in_watch(chr, NULL);
    if (s->ioc) {
        chr->fd_in_tag = io_add_watch_poll(chr, s->ioc,
                                           tcp_chr_read_poll,
@@ -815,7 +828,6 @@ static void qmp_chardev_open_socket(Chardev *chr,
    int64_t reconnect   = sock->has_reconnect ? sock->reconnect : 0;
    QIOChannelSocket *sioc = NULL;

-    s->is_unix = addr->type == SOCKET_ADDRESS_KIND_UNIX;
    s->is_listen = is_listen;
    s->is_telnet = is_telnet;
    s->do_nodelay = do_nodelay;
@@ -855,7 +867,8 @@ static void qmp_chardev_open_socket(Chardev *chr,
    s->addr = QAPI_CLONE(SocketAddress, sock->addr);

    qemu_chr_set_feature(chr, QEMU_CHAR_FEATURE_RECONNECTABLE);
-    if (s->is_unix) {
+    /* TODO SOCKET_ADDRESS_FD where fd has AF_UNIX */
+    if (addr->type == SOCKET_ADDRESS_KIND_UNIX) {
        qemu_chr_set_feature(chr, QEMU_CHAR_FEATURE_FD_PASS);
    }

--- a/chardev/char-udp.c
+++ b/chardev/char-udp.c
@@ -81,7 +81,7 @@ static gboolean udp_chr_read(QIOChannel *chan, GIOCondition cond, void *opaque)
    ret = qio_channel_read(
        s->ioc, (char *)s->buf, sizeof(s->buf), NULL);
    if (ret <= 0) {
-        remove_fd_in_watch(chr);
+        remove_fd_in_watch(chr, NULL);
        return FALSE;
    }
    s->bufcnt = ret;
@@ -101,7 +101,7 @@ static void udp_chr_update_read_handler(Chardev *chr,
 {
    UdpChardev *s = UDP_CHARDEV(chr);

-    remove_fd_in_watch(chr);
+    remove_fd_in_watch(chr, NULL);
    if (s->ioc) {
        chr->fd_in_tag = io_add_watch_poll(chr, s->ioc,
                                           udp_chr_read_poll,
@@ -115,7 +115,7 @@ static void char_udp_finalize(Object *obj)
    Chardev *chr = CHARDEV(obj);
    UdpChardev *s = UDP_CHARDEV(obj);

-    remove_fd_in_watch(chr);
+    remove_fd_in_watch(chr, NULL);
    if (s->ioc) {
        object_unref(OBJECT(s->ioc));
    }
--- a/chardev/char.c
+++ b/chardev/char.c
@@ -560,7 +560,7 @@ void qemu_chr_fe_set_handlers(CharBackend *b,
    cc = CHARDEV_GET_CLASS(s);
    if (!opaque && !fd_can_read && !fd_read && !fd_event) {
        fe_open = 0;
-        remove_fd_in_watch(s);
+        remove_fd_in_watch(s, context);
    } else {
        fe_open = 1;
    }
@@ -652,6 +652,7 @@ QemuOpts *qemu_chr_parse_compat(const char *label, const char *filename)
    if (strcmp(filename, "null")    == 0 ||
        strcmp(filename, "pty")     == 0 ||
        strcmp(filename, "msmouse") == 0 ||
+        strcmp(filename, "wctablet") == 0 ||
        strcmp(filename, "braille") == 0 ||
        strcmp(filename, "testdev") == 0 ||
        strcmp(filename, "stdio")   == 0) {
--- a/315
+++ b/315
@@ -301,7 +301,6 @@ glusterfs=""
 glusterfs_xlator_opt="no"
 glusterfs_discard="no"
 glusterfs_zerofill="no"
-archipelago="no"
 gtk=""
 gtkabi=""
 gtk_gl="no"
@@ -321,6 +320,11 @@ numa=""
 tcmalloc="no"
 jemalloc="no"
 replication="yes"
+vxhs=""
+
+supported_cpu="no"
+supported_os="no"
+bogus_os="no"

 # parse CC options first
 for opt do
@@ -518,26 +522,36 @@ ARCH=
 # Normalise host CPU name and set ARCH.
 # Note that this case should only have supported host CPUs, not guests.
 case "$cpu" in
-  ia64|ppc|ppc64|s390|s390x|sparc64|x32)
+  ppc|ppc64|s390|s390x|sparc64|x32)
+    cpu="$cpu"
+    supported_cpu="yes"
+  ;;
+  ia64)
    cpu="$cpu"
  ;;
  i386|i486|i586|i686|i86pc|BePC)
    cpu="i386"
+    supported_cpu="yes"
  ;;
  x86_64|amd64)
    cpu="x86_64"
+    supported_cpu="yes"
  ;;
  armv*b|armv*l|arm)
    cpu="arm"
+    supported_cpu="yes"
  ;;
  aarch64)
    cpu="aarch64"
+    supported_cpu="yes"
  ;;
  mips*)
    cpu="mips"
+    supported_cpu="yes"
  ;;
  sparc|sun4[cdmuv])
    cpu="sparc"
+    supported_cpu="yes"
  ;;
  *)
    # This will result in either an error or falling back to TCI later
@@ -554,12 +568,6 @@ fi
 HOST_VARIANT_DIR=""

 case $targetos in
-CYGWIN*)
-  mingw32="yes"
-  QEMU_CFLAGS="-mno-cygwin $QEMU_CFLAGS"
-  audio_possible_drivers="sdl"
-  audio_drv_list="sdl"
-;;
 MINGW32*)
  mingw32="yes"
  hax="yes"
@@ -569,6 +577,7 @@ MINGW32*)
  else
    audio_drv_list=""
  fi
+  supported_os="yes"
 ;;
 GNU/kFreeBSD)
  bsd="yes"
@@ -586,6 +595,7 @@ FreeBSD)
  libs_qga="-lutil $libs_qga"
  netmap=""  # enable netmap autodetect
  HOST_VARIANT_DIR="freebsd"
+  supported_os="yes"
 ;;
 DragonFly)
  bsd="yes"
@@ -627,6 +637,7 @@ Darwin)
  # won't work when we're compiling with gcc as a C compiler.
  QEMU_CFLAGS="-DOS_OBJECT_USE_OBJC=0 $QEMU_CFLAGS"
  HOST_VARIANT_DIR="darwin"
+  supported_os="yes"
 ;;
 SunOS)
  solaris="yes"
@@ -673,7 +684,7 @@ Haiku)
  QEMU_CFLAGS="-DB_USE_POSITIVE_POSIX_ERRORS $QEMU_CFLAGS"
  LIBS="-lposix_error_mapper -lnetwork $LIBS"
 ;;
-*)
+Linux)
  audio_drv_list="oss"
  audio_possible_drivers="oss alsa sdl pa"
  linux="yes"
@@ -683,6 +694,13 @@ Haiku)
  vhost_scsi="yes"
  vhost_vsock="yes"
  QEMU_INCLUDES="-I\$(SRC_PATH)/linux-headers -I$(pwd)/linux-headers $QEMU_INCLUDES"
+  supported_os="yes"
+;;
+*)
+  # This is a fatal error, but don't report it yet, because we
+  # might be going to just print the --help text, or it might
+  # be the result of a missing compiler.
+  bogus_os="yes"
 ;;
 esac

@@ -725,7 +743,7 @@ if test "$mingw32" = "yes" ; then
  sysconfdir="\${prefix}"
  local_statedir=
  confsuffix=""
-  libs_qga="-lws2_32 -lwinmm -lpowrprof -liphlpapi -lnetapi32 $libs_qga"
+  libs_qga="-lws2_32 -lwinmm -lpowrprof -lwtsapi32 -liphlpapi -lnetapi32 $libs_qga"
 fi

 werror=""
@@ -1101,10 +1119,6 @@ for opt do
  ;;
  --enable-glusterfs) glusterfs="yes"
  ;;
-  --disable-archipelago) archipelago="no"
-  ;;
-  --enable-archipelago) archipelago="yes"
-  ;;
  --disable-virtio-blk-data-plane|--enable-virtio-blk-data-plane)
      echo "$0: $opt is obsolete, virtio-blk data-plane is always on" >&2
  ;;
@@ -1170,6 +1184,10 @@ for opt do
  ;;
  --enable-replication) replication="yes"
  ;;
+  --disable-vxhs) vxhs="no"
+  ;;
+  --enable-vxhs) vxhs="yes"
+  ;;
  *)
      echo "ERROR: unknown option $opt"
      echo "Try '$0 --help' for more information"
@@ -1178,21 +1196,6 @@ for opt do
  esac
 done

-if ! has $python; then
-  error_exit "Python not found. Use --python=/path/to/python"
-fi
-
-# Note that if the Python conditional here evaluates True we will exit
-# with status 1 which is a shell 'false' value.
-if ! $python -c 'import sys; sys.exit(sys.version_info < (2,6) or sys.version_info >= (3,))'; then
-  error_exit "Cannot use '$python', Python 2.6 or later is required." \
-      "Note that Python 3 or later is not yet supported." \
-      "Use --python=/path/to/python to specify a supported Python."
-fi
-
-# Suppress writing compiled files
-python="$python -B"
-
 case "$cpu" in
    ppc)
           CPU_CFLAGS="-m32"
@@ -1267,6 +1270,9 @@ for config in $mak_wilds; do
    default_target_list="${default_target_list} $(basename "$config" .mak)"
 done

+# Enumerate public trace backends for --help output
+trace_backend_list=$(echo $(grep -le '^PUBLIC = True$' "$source_path"/scripts/tracetool/backend/*.py | sed -e 's/^.*\/\(.*\)\.py$/\1/'))
+
 if test x"$show_help" = x"yes" ; then
 cat << EOF

@@ -1320,7 +1326,7 @@ Advanced options (experts only):
                           set block driver read-only whitelist
                           (affects only QEMU, not qemu-img)
  --enable-trace-backends=B Set trace backend
-                           Available backends: $($python $source_path/scripts/tracetool.py --list-backends)
+                           Available backends: $trace_backend_list
  --with-trace-file=NAME   Full PATH,NAME of file to store traces
                           Default:trace-<pid>
  --disable-slirp          disable SLIRP userspace network connectivity
@@ -1335,6 +1341,12 @@ Advanced options (experts only):
  --with-vss-sdk=SDK-path  enable Windows VSS support in QEMU Guest Agent
  --with-win-sdk=SDK-path  path to Windows Platform SDK (to build VSS .tlb)
  --tls-priority           default TLS protocol/cipher priority string
+  --enable-gprof           QEMU profiling with gprof
+  --enable-profiler        profiler support
+  --enable-xen-pv-domain-build
+                           xen pv domain builder
+  --enable-debug-stack-usage
+                           track the maximum stack usage of stacks created by qemu_alloc_stack

 Optional features, enabled with --enable-FEATURE and
 disabled with --disable-FEATURE, default is enabled if available:
@@ -1396,19 +1408,40 @@ disabled with --disable-FEATURE, default is enabled if available:
  seccomp         seccomp support
  coroutine-pool  coroutine freelist (better performance)
  glusterfs       GlusterFS backend
-  archipelago     Archipelago backend
  tpm             TPM support
  libssh2         ssh block device support
  numa            libnuma support
  tcmalloc        tcmalloc support
  jemalloc        jemalloc support
  replication     replication support
+  vhost-vsock     virtio sockets device support
+  opengl          opengl support
+  virglrenderer   virgl rendering support
+  xfsctl          xfsctl support
+  qom-cast-debug  cast debugging support
+  tools           build qemu-io, qemu-nbd and qemu-image tools
+  vxhs            Veritas HyperScale vDisk backend support

 NOTE: The object files are built at the place where configure is launched
 EOF
 exit 0
 fi

+if ! has $python; then
+  error_exit "Python not found. Use --python=/path/to/python"
+fi
+
+# Note that if the Python conditional here evaluates True we will exit
+# with status 1 which is a shell 'false' value.
+if ! $python -c 'import sys; sys.exit(sys.version_info < (2,6) or sys.version_info >= (3,))'; then
+  error_exit "Cannot use '$python', Python 2.6 or later is required." \
+      "Note that Python 3 or later is not yet supported." \
+      "Use --python=/path/to/python to specify a supported Python."
+fi
+
+# Suppress writing compiled files
+python="$python -B"
+
 # Now we have handled --enable-tcg-interpreter and know we're not just
 # printing the help message, bail out if the host CPU isn't supported.
 if test "$ARCH" = "unknown"; then
@@ -1441,6 +1474,14 @@ if ! compile_prog ; then
    error_exit "\"$cc\" cannot build an executable (is your linker broken?)"
 fi

+if test "$bogus_os" = "yes"; then
+    # Now that we know that we're not printing the help and that
+    # the compiler works (so the results of the check_defines we used
+    # to identify the OS are reliable), if we didn't recognize the
+    # host OS we should stop now.
+    error_exit "Unrecognized host OS $targetos"
+fi
+
 # Check that the C++ compiler exists and works with the C compiler
 if has $cxx; then
    cat > $TMPC <<EOF
@@ -1956,8 +1997,24 @@ fi
 # xen probe

 if test "$xen" != "no" ; then
+  # Check whether Xen library path is specified via --extra-ldflags to avoid
+  # overriding this setting with pkg-config output. If not, try pkg-config
+  # to obtain all needed flags.
+
+  if ! echo $EXTRA_LDFLAGS | grep tools/libxc > /dev/null && \
+     $pkg_config --exists xencontrol ; then
+    xen_ctrl_version="$(printf '%d%02d%02d' \
+      $($pkg_config --modversion xencontrol | sed 's/\./ /g') )"
+    xen=yes
+    xen_pc="xencontrol xenstore xenguest xenforeignmemory xengnttab"
+    xen_pc="$xen_pc xenevtchn xendevicemodel"
+    QEMU_CFLAGS="$QEMU_CFLAGS $($pkg_config --cflags $xen_pc)"
+    libs_softmmu="$($pkg_config --libs $xen_pc) $libs_softmmu"
+    LDFLAGS="$($pkg_config --libs $xen_pc) $LDFLAGS"
+  else
+
    xen_libs="-lxenstore -lxenctrl -lxenguest"
-  xen_stable_libs="-lxenforeignmemory -lxengnttab -lxenevtchn"
+    xen_stable_libs="-lxencall -lxenforeignmemory -lxengnttab -lxenevtchn"

    # First we test whether Xen headers and libraries are available.
    # If no, we are done and there is no Xen support.
@@ -1980,6 +2037,25 @@ EOF
    # Xen unstable
    elif
        cat > $TMPC <<EOF &&
+#undef XC_WANT_COMPAT_DEVICEMODEL_API
+#define __XEN_TOOLS__
+#include <xendevicemodel.h>
+int main(void) {
+  xendevicemodel_handle *xd;
+
+  xd = xendevicemodel_open(0, 0);
+  xendevicemodel_close(xd);
+
+  return 0;
+}
+EOF
+        compile_prog "" "$xen_libs -lxendevicemodel $xen_stable_libs"
+      then
+      xen_stable_libs="-lxendevicemodel $xen_stable_libs"
+      xen_ctrl_version=40900
+      xen=yes
+    elif
+        cat > $TMPC <<EOF &&
 /*
 * If we have stable libs the we don't want the libxc compat
 * layers, regardless of what CFLAGS we may have been given.
@@ -2031,7 +2107,7 @@ int main(void) {
 EOF
        compile_prog "" "$xen_libs $xen_stable_libs"
      then
-    xen_ctrl_version=480
+      xen_ctrl_version=40800
      xen=yes
    elif
        cat > $TMPC <<EOF &&
@@ -2082,7 +2158,7 @@ int main(void) {
 EOF
        compile_prog "" "$xen_libs $xen_stable_libs"
      then
-    xen_ctrl_version=471
+      xen_ctrl_version=40701
      xen=yes
    elif
        cat > $TMPC <<EOF &&
@@ -2097,7 +2173,7 @@ int main(void) {
 EOF
        compile_prog "" "$xen_libs"
      then
-    xen_ctrl_version=470
+      xen_ctrl_version=40700
      xen=yes

    # Xen 4.6
@@ -2125,7 +2201,7 @@ int main(void) {
 EOF
        compile_prog "" "$xen_libs"
      then
-    xen_ctrl_version=460
+      xen_ctrl_version=40600
      xen=yes

    # Xen 4.5
@@ -2152,7 +2228,7 @@ int main(void) {
 EOF
        compile_prog "" "$xen_libs"
      then
-    xen_ctrl_version=450
+      xen_ctrl_version=40500
      xen=yes

    elif
@@ -2177,7 +2253,7 @@ int main(void) {
 EOF
        compile_prog "" "$xen_libs"
      then
-    xen_ctrl_version=420
+      xen_ctrl_version=40200
      xen=yes

    else
@@ -2189,11 +2265,12 @@ EOF
    fi

    if test "$xen" = yes; then
-    if test $xen_ctrl_version -ge 471  ; then
+      if test $xen_ctrl_version -ge 40701  ; then
        libs_softmmu="$xen_stable_libs $libs_softmmu"
      fi
      libs_softmmu="$xen_libs $libs_softmmu"
    fi
+  fi
 fi

 if test "$xen_pci_passthrough" != "no"; then
@@ -2888,6 +2965,12 @@ for drv in $audio_drv_list; do
    audio_pt_int="yes"
    ;;

+    sdl)
+    if test "$sdl" = "no"; then
+        error_exit "sdl not found or disabled, can not use sdl audio driver"
+    fi
+    ;;
+
    coreaudio)
      libs_softmmu="-framework CoreAudio $libs_softmmu"
    ;;
@@ -2901,8 +2984,8 @@ for drv in $audio_drv_list; do
      libs_softmmu="$oss_lib $libs_softmmu"
    ;;

-    sdl|wav)
-    # XXX: Probes for CoreAudio, DirectSound, SDL(?)
+    wav)
+    # XXX: Probes for CoreAudio, DirectSound
    ;;

    *)
@@ -3035,12 +3118,23 @@ fi
 ##########################################
 # glib support probe

-glib_req_ver=2.22
+if test "$mingw32" = yes; then
+    glib_req_ver=2.30
+else
+    glib_req_ver=2.22
+fi
 glib_modules=gthread-2.0
 if test "$modules" = yes; then
    glib_modules="$glib_modules gmodule-2.0"
 fi

+# This workaround is required due to a bug in pkg-config file for glib as it
+# doesn't define GLIB_STATIC_COMPILATION for pkg-config --static
+
+if test "$static" = yes -a "$mingw32" = yes; then
+    QEMU_CFLAGS="-DGLIB_STATIC_COMPILATION $QEMU_CFLAGS"
+fi
+
 for i in $glib_modules; do
    if $pkg_config --atleast-version=$glib_req_ver $i; then
        glib_cflags=$($pkg_config --cflags $i)
@@ -3378,7 +3472,7 @@ fi
 fdt_required=no
 for target in $target_list; do
  case $target in
-    aarch64*-softmmu|arm*-softmmu|ppc*-softmmu|microblaze*-softmmu)
+    aarch64*-softmmu|arm*-softmmu|ppc*-softmmu|microblaze*-softmmu|mips64el-softmmu)
      fdt_required=yes
    ;;
  esac
@@ -3396,11 +3490,11 @@ fi
 if test "$fdt" != "no" ; then
  fdt_libs="-lfdt"
  # explicitly check for libfdt_env.h as it is missing in some stable installs
-  # and test for required functions to make sure we are on a version >= 1.4.0
+  # and test for required functions to make sure we are on a version >= 1.4.2
  cat > $TMPC << EOF
 #include <libfdt.h>
 #include <libfdt_env.h>
-int main(void) { fdt_get_property_by_offset(0, 0, 0); return 0; }
+int main(void) { fdt_first_subnode(0, 0); return 0; }
 EOF
  if compile_prog "" "$fdt_libs" ; then
    # system DTC is good - use it
@@ -3418,7 +3512,7 @@ EOF
    fdt_libs="-L\$(BUILD_DIR)/dtc/libfdt $fdt_libs"
  elif test "$fdt" = "yes" ; then
    # have neither and want - prompt for system/submodule install
-    error_exit "DTC (libfdt) version >= 1.4.0 not present. Your options:" \
+    error_exit "DTC (libfdt) version >= 1.4.2 not present. Your options:" \
        "  (1) Preferred: Install the DTC (libfdt) devel package" \
        "  (2) Fetch the DTC submodule, using:" \
        "      git submodule update --init dtc"
@@ -3443,6 +3537,7 @@ if test "$opengl" != "no" ; then
    if test "$gtk" = "yes" && $pkg_config --exists "$gtkpackage >= 3.16"; then
        gtk_gl="yes"
    fi
+    QEMU_CFLAGS="$QEMU_CFLAGS $opengl_cflags"
  else
    if test "$opengl" = "yes" ; then
      feature_not_found "opengl" "Please install opengl (mesa) devel pkgs: $opengl_pkgs"
@@ -3466,37 +3561,6 @@ EOF
  fi
 fi

-##########################################
-# archipelago probe
-if test "$archipelago" != "no" ; then
-    cat > $TMPC <<EOF
-#include <stdio.h>
-#include <xseg/xseg.h>
-#include <xseg/protocol.h>
-int main(void) {
-    xseg_initialize();
-    return 0;
-}
-EOF
-    archipelago_libs=-lxseg
-    if compile_prog "" "$archipelago_libs"; then
-        archipelago="yes"
-        libs_tools="$archipelago_libs $libs_tools"
-        libs_softmmu="$archipelago_libs $libs_softmmu"
-
-	echo "WARNING: Please check the licenses of QEMU and libxseg carefully."
-	echo "GPLv3 versions of libxseg may not be compatible with QEMU's"
-	echo "license and therefore prevent redistribution."
-	echo
-	echo "To disable Archipelago, use --disable-archipelago"
-    else
-      if test "$archipelago" = "yes" ; then
-        feature_not_found "Archipelago backend support" "Install libxseg devel"
-      fi
-      archipelago="no"
-    fi
-fi
-

 ##########################################
 # glusterfs probe
@@ -4747,6 +4811,47 @@ if test "$modules" = "yes" && test "$LD_REL_FLAGS" = ""; then
  feature_not_found "modules" "Cannot find how to build relocatable objects"
 fi

+##########################################
+# check for sysmacros.h
+
+have_sysmacros=no
+cat > $TMPC << EOF
+#include <sys/sysmacros.h>
+int main(void) {
+    return makedev(0, 0);
+}
+EOF
+if compile_prog "" "" ; then
+    have_sysmacros=yes
+fi
+
+##########################################
+# Veritas HyperScale block driver VxHS
+# Check if libvxhs is installed
+
+if test "$vxhs" != "no" ; then
+  cat > $TMPC <<EOF
+#include <stdint.h>
+#include <qnio/qnio_api.h>
+
+void *vxhs_callback;
+
+int main(void) {
+    iio_init(QNIO_VERSION, vxhs_callback);
+    return 0;
+}
+EOF
+  vxhs_libs="-lvxhs -lssl"
+  if compile_prog "" "$vxhs_libs" ; then
+    vxhs=yes
+  else
+    if test "$vxhs" = "yes" ; then
+      feature_not_found "vxhs block device" "Install libvxhs See github"
+    fi
+    vxhs=no
+  fi
+fi
+
 ##########################################
 # End of CC checks
 # After here, no more $cc or $ld runs
@@ -5099,7 +5204,6 @@ echo "coroutine backend $coroutine"
 echo "coroutine pool    $coroutine_pool"
 echo "debug stack usage $debug_stack_usage"
 echo "GlusterFS support $glusterfs"
-echo "Archipelago support $archipelago"
 echo "gcov              $gcov_tool"
 echo "gcov enabled      $gcov"
 echo "TPM support       $tpm"
@@ -5114,11 +5218,38 @@ echo "tcmalloc support  $tcmalloc"
 echo "jemalloc support  $jemalloc"
 echo "avx2 optimization $avx2_opt"
 echo "replication support $replication"
+echo "VxHS block device $vxhs"

 if test "$sdl_too_old" = "yes"; then
 echo "-> Your SDL version is too old - please upgrade to have SDL support"
 fi

+if test "$supported_cpu" = "no"; then
+    echo
+    echo "WARNING: SUPPORT FOR THIS HOST CPU WILL GO AWAY IN FUTURE RELEASES!"
+    echo
+    echo "CPU host architecture $cpu support is not currently maintained."
+    echo "The QEMU project intends to remove support for this host CPU in"
+    echo "a future release if nobody volunteers to maintain it and to"
+    echo "provide a build host for our continuous integration setup."
+    echo "configure has succeeded and you can continue to build, but"
+    echo "if you care about QEMU on this platform you should contact"
+    echo "us upstream at qemu-devel@nongnu.org."
+fi
+
+if test "$supported_os" = "no"; then
+    echo
+    echo "WARNING: SUPPORT FOR THIS HOST OS WILL GO AWAY IN FUTURE RELEASES!"
+    echo
+    echo "Host OS $targetos support is not currently maintained."
+    echo "The QEMU project intends to remove support for this host OS in"
+    echo "a future release if nobody volunteers to maintain it and to"
+    echo "provide a build host for our continuous integration setup."
+    echo "configure has succeeded and you can continue to build, but"
+    echo "if you care about QEMU on this platform you should contact"
+    echo "us upstream at qemu-devel@nongnu.org."
+fi
+
 config_host_mak="config-host.mak"

 echo "# Automatically generated by configure - do not modify" >config-all-disas.mak
@@ -5515,7 +5646,6 @@ fi

 if test "$opengl" = "yes" ; then
  echo "CONFIG_OPENGL=y" >> $config_host_mak
-  echo "OPENGL_CFLAGS=$opengl_cflags" >> $config_host_mak
  echo "OPENGL_LIBS=$opengl_libs" >> $config_host_mak
  if test "$opengl_dmabuf" = "yes" ; then
    echo "CONFIG_OPENGL_DMABUF=y" >> $config_host_mak
@@ -5640,11 +5770,6 @@ if test "$glusterfs_zerofill" = "yes" ; then
  echo "CONFIG_GLUSTERFS_ZEROFILL=y" >> $config_host_mak
 fi

-if test "$archipelago" = "yes" ; then
-  echo "CONFIG_ARCHIPELAGO=m" >> $config_host_mak
-  echo "ARCHIPELAGO_LIBS=$archipelago_libs" >> $config_host_mak
-fi
-
 if test "$libssh2" = "yes" ; then
  echo "CONFIG_LIBSSH2=m" >> $config_host_mak
  echo "LIBSSH2_CFLAGS=$libssh2_cflags" >> $config_host_mak
@@ -5719,6 +5844,10 @@ if test "$have_af_vsock" = "yes" ; then
  echo "CONFIG_AF_VSOCK=y" >> $config_host_mak
 fi

+if test "$have_sysmacros" = "yes" ; then
+  echo "CONFIG_SYSMACROS=y" >> $config_host_mak
+fi
+
 # Hold two types of flag:
 #   CONFIG_THREAD_SETNAME_BYTHREAD  - we've got a way of setting the name on
 #                                     a thread we have a handle to
@@ -5729,6 +5858,11 @@ if test "$pthread_setname_np" = "yes" ; then
  echo "CONFIG_PTHREAD_SETNAME_NP=y" >> $config_host_mak
 fi

+if test "$vxhs" = "yes" ; then
+  echo "CONFIG_VXHS=y" >> $config_host_mak
+  echo "VXHS_LIBS=$vxhs_libs" >> $config_host_mak
+fi
+
 if test "$tcg_interpreter" = "yes"; then
  QEMU_INCLUDES="-I\$(SRC_PATH)/tcg/tci $QEMU_INCLUDES"
 elif test "$ARCH" = "sparc64" ; then
@@ -5843,7 +5977,7 @@ target_name=$(echo $target | cut -d '-' -f 1)
 target_bigendian="no"

 case "$target_name" in
-  armeb|hppa|lm32|m68k|microblaze|mips|mipsn32|mips64|moxie|or32|ppc|ppcemb|ppc64|ppc64abi32|s390x|sh4eb|sparc|sparc64|sparc32plus|xtensaeb)
+  armeb|hppa|lm32|m68k|microblaze|mips|mipsn32|mips64|moxie|or1k|ppc|ppcemb|ppc64|ppc64abi32|s390x|sh4eb|sparc|sparc64|sparc32plus|xtensaeb)
  target_bigendian=yes
  ;;
 esac
@@ -5879,6 +6013,7 @@ mkdir -p $target_dir
 echo "# Automatically generated by configure - do not modify" > $config_target_mak

 bflt="no"
+mttcg="no"
 interp_prefix1=$(echo "$interp_prefix" | sed "s/%M/$target_name/g")
 gdb_xml_files=""

@@ -5893,15 +6028,18 @@ case "$target_name" in
    TARGET_BASE_ARCH=i386
  ;;
  alpha)
+    mttcg="yes"
  ;;
  arm|armeb)
    TARGET_ARCH=arm
    bflt="yes"
+    mttcg="yes"
    gdb_xml_files="arm-core.xml arm-vfp.xml arm-vfp3.xml arm-neon.xml"
  ;;
  aarch64)
    TARGET_BASE_ARCH=arm
    bflt="yes"
+    mttcg="yes"
    gdb_xml_files="aarch64-core.xml aarch64-fpu.xml arm-core.xml arm-vfp.xml arm-vfp3.xml arm-neon.xml"
  ;;
  cris)
@@ -5937,7 +6075,7 @@ case "$target_name" in
  ;;
  nios2)
  ;;
-  or32)
+  or1k)
    TARGET_ARCH=openrisc
    TARGET_BASE_ARCH=openrisc
  ;;
@@ -6066,6 +6204,9 @@ if test "$target_bigendian" = "yes" ; then
 fi
 if test "$target_softmmu" = "yes" ; then
  echo "CONFIG_SOFTMMU=y" >> $config_target_mak
+  if test "$mttcg" = "yes" ; then
+    echo "TARGET_SUPPORTS_MTTCG=y" >> $config_target_mak
+  fi
 fi
 if test "$target_user_only" = "yes" ; then
  echo "CONFIG_USER_ONLY=y" >> $config_target_mak
@@ -6145,7 +6286,7 @@ for i in $ARCH $TARGET_BASE_ARCH ; do
  nios2)
    disas_config "NIOS2"
  ;;
-  or32)
+  or1k)
    disas_config "OPENRISC"
  ;;
  ppc*)
--- a/cpu-exec-common.c
+++ b/cpu-exec-common.c
@@ -23,9 +23,6 @@
 #include "exec/exec-all.h"
 #include "exec/memory-internal.h"

-bool exit_request;
-CPUState *tcg_current_cpu;
-
 /* exit the current TB, but without causing any exception to be raised */
 void cpu_loop_exit_noexc(CPUState *cpu)
 {
@@ -38,7 +35,7 @@ void cpu_loop_exit_noexc(CPUState *cpu)
 #if defined(CONFIG_SOFTMMU)
 void cpu_reloading_memory_map(void)
 {
-    if (qemu_in_vcpu_thread()) {
+    if (qemu_in_vcpu_thread() && current_cpu->running) {
        /* The guest can in theory prolong the RCU critical section as long
         * as it feels like. The major problem with this is that because it
         * can do multiple reconfigurations of the memory map within the
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -29,9 +29,11 @@
 #include "qemu/rcu.h"
 #include "exec/tb-hash.h"
 #include "exec/log.h"
+#include "qemu/main-loop.h"
 #if defined(TARGET_I386) && !defined(CONFIG_USER_ONLY)
 #include "hw/i386/apic.h"
 #endif
+#include "sysemu/cpus.h"
 #include "sysemu/replay.h"

 /* -icount align implementation. */
@@ -185,12 +187,6 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
            cc->set_pc(cpu, last_tb->pc);
        }
    }
-    if (tb_exit == TB_EXIT_REQUESTED) {
-        /* We were asked to stop executing TBs (probably a pending
-         * interrupt. We've now stopped, so clear the flag.
-         */
-        atomic_set(&cpu->tcg_exit_req, 0);
-    }
    return ret;
 }

@@ -227,20 +223,43 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles,

 static void cpu_exec_step(CPUState *cpu)
 {
+    CPUClass *cc = CPU_GET_CLASS(cpu);
    CPUArchState *env = (CPUArchState *)cpu->env_ptr;
    TranslationBlock *tb;
    target_ulong cs_base, pc;
    uint32_t flags;

    cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
+    if (sigsetjmp(cpu->jmp_env, 0) == 0) {
+        mmap_lock();
+        tb_lock();
        tb = tb_gen_code(cpu, pc, cs_base, flags,
                         1 | CF_NOCACHE | CF_IGNORE_ICOUNT);
        tb->orig_tb = NULL;
+        tb_unlock();
+        mmap_unlock();
+
+        cc->cpu_exec_enter(cpu);
        /* execute the generated code */
        trace_exec_tb_nocache(tb, pc);
        cpu_tb_exec(cpu, tb);
+        cc->cpu_exec_exit(cpu);
+
+        tb_lock();
        tb_phys_invalidate(tb, -1);
        tb_free(tb);
+        tb_unlock();
+    } else {
+        /* We may have exited due to another problem here, so we need
+         * to reset any tb_locks we may have taken but didn't release.
+         * The mmap_lock is dropped by tb_gen_code if it runs out of
+         * memory.
+         */
+#ifndef CONFIG_SOFTMMU
+        tcg_debug_assert(!have_mmap_lock());
+#endif
+        tb_lock_reset();
+    }
 }

 void cpu_exec_step_atomic(CPUState *cpu)
@@ -384,12 +403,13 @@ static inline bool cpu_handle_halt(CPUState *cpu)
        if ((cpu->interrupt_request & CPU_INTERRUPT_POLL)
            && replay_interrupt()) {
            X86CPU *x86_cpu = X86_CPU(cpu);
+            qemu_mutex_lock_iothread();
            apic_poll_irq(x86_cpu->apic_state);
            cpu_reset_interrupt(cpu, CPU_INTERRUPT_POLL);
+            qemu_mutex_unlock_iothread();
        }
 #endif
        if (!cpu_has_work(cpu)) {
-            current_cpu = NULL;
            return true;
        }

@@ -439,7 +459,9 @@ static inline bool cpu_handle_exception(CPUState *cpu, int *ret)
 #else
            if (replay_exception()) {
                CPUClass *cc = CPU_GET_CLASS(cpu);
+                qemu_mutex_lock_iothread();
                cc->do_interrupt(cpu);
+                qemu_mutex_unlock_iothread();
                cpu->exception_index = -1;
            } else if (!replay_has_interrupt()) {
                /* give a chance to iothread in replay mode */
@@ -461,13 +483,15 @@ static inline bool cpu_handle_exception(CPUState *cpu, int *ret)
    return false;
 }

-static inline void cpu_handle_interrupt(CPUState *cpu,
+static inline bool cpu_handle_interrupt(CPUState *cpu,
                                        TranslationBlock **last_tb)
 {
    CPUClass *cc = CPU_GET_CLASS(cpu);
-    int interrupt_request = cpu->interrupt_request;

-    if (unlikely(interrupt_request)) {
+    if (unlikely(atomic_read(&cpu->interrupt_request))) {
+        int interrupt_request;
+        qemu_mutex_lock_iothread();
+        interrupt_request = cpu->interrupt_request;
        if (unlikely(cpu->singlestep_enabled & SSTEP_NOIRQ)) {
            /* Mask out external interrupts for this step. */
            interrupt_request &= ~CPU_INTERRUPT_SSTEP_MASK;
@@ -475,7 +499,8 @@ static inline void cpu_handle_interrupt(CPUState *cpu,
        if (interrupt_request & CPU_INTERRUPT_DEBUG) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_DEBUG;
            cpu->exception_index = EXCP_DEBUG;
-            cpu_loop_exit(cpu);
+            qemu_mutex_unlock_iothread();
+            return true;
        }
        if (replay_mode == REPLAY_MODE_PLAY && !replay_has_interrupt()) {
            /* Do nothing */
@@ -484,23 +509,26 @@ static inline void cpu_handle_interrupt(CPUState *cpu,
            cpu->interrupt_request &= ~CPU_INTERRUPT_HALT;
            cpu->halted = 1;
            cpu->exception_index = EXCP_HLT;
-            cpu_loop_exit(cpu);
+            qemu_mutex_unlock_iothread();
+            return true;
        }
 #if defined(TARGET_I386)
        else if (interrupt_request & CPU_INTERRUPT_INIT) {
            X86CPU *x86_cpu = X86_CPU(cpu);
            CPUArchState *env = &x86_cpu->env;
            replay_interrupt();
-            cpu_svm_check_intercept_param(env, SVM_EXIT_INIT, 0);
+            cpu_svm_check_intercept_param(env, SVM_EXIT_INIT, 0, 0);
            do_cpu_init(x86_cpu);
            cpu->exception_index = EXCP_HALTED;
-            cpu_loop_exit(cpu);
+            qemu_mutex_unlock_iothread();
+            return true;
        }
 #else
        else if (interrupt_request & CPU_INTERRUPT_RESET) {
            replay_interrupt();
            cpu_reset(cpu);
-            cpu_loop_exit(cpu);
+            qemu_mutex_unlock_iothread();
+            return true;
        }
 #endif
        /* The target hook has 3 exit conditions:
@@ -522,72 +550,71 @@ static inline void cpu_handle_interrupt(CPUState *cpu,
               the program flow was changed */
            *last_tb = NULL;
        }
+
+        /* If we exit via cpu_loop_exit/longjmp it is reset in cpu_exec */
+        qemu_mutex_unlock_iothread();
    }
-    if (unlikely(atomic_read(&cpu->exit_request) || replay_has_interrupt())) {
+
+    /* Finally, check if we need to exit to the main loop.  */
+    if (unlikely(atomic_read(&cpu->exit_request)
+        || (use_icount && cpu->icount_decr.u16.low + cpu->icount_extra == 0))) {
        atomic_set(&cpu->exit_request, 0);
        cpu->exception_index = EXCP_INTERRUPT;
-        cpu_loop_exit(cpu);
+        return true;
    }
+
+    return false;
 }

 static inline void cpu_loop_exec_tb(CPUState *cpu, TranslationBlock *tb,
-                                    TranslationBlock **last_tb, int *tb_exit,
-                                    SyncClocks *sc)
+                                    TranslationBlock **last_tb, int *tb_exit)
 {
    uintptr_t ret;
-
-    if (unlikely(atomic_read(&cpu->exit_request))) {
-        return;
-    }
+    int32_t insns_left;

    trace_exec_tb(tb, tb->pc);
    ret = cpu_tb_exec(cpu, tb);
-    *last_tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK);
+    tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK);
    *tb_exit = ret & TB_EXIT_MASK;
-    switch (*tb_exit) {
-    case TB_EXIT_REQUESTED:
-        /* Something asked us to stop executing
-         * chained TBs; just continue round the main
-         * loop. Whatever requested the exit will also
-         * have set something else (eg exit_request or
-         * interrupt_request) which we will handle
-         * next time around the loop.  But we need to
-         * ensure the tcg_exit_req read in generated code
+    if (*tb_exit != TB_EXIT_REQUESTED) {
+        *last_tb = tb;
+        return;
+    }
+
+    *last_tb = NULL;
+    insns_left = atomic_read(&cpu->icount_decr.u32);
+    atomic_set(&cpu->icount_decr.u16.high, 0);
+    if (insns_left < 0) {
+        /* Something asked us to stop executing chained TBs; just
+         * continue round the main loop. Whatever requested the exit
+         * will also have set something else (eg exit_request or
+         * interrupt_request) which we will handle next time around
+         * the loop.  But we need to ensure the zeroing of icount_decr
         * comes before the next read of cpu->exit_request
         * or cpu->interrupt_request.
         */
-        smp_rmb();
-        *last_tb = NULL;
-        break;
-    case TB_EXIT_ICOUNT_EXPIRED:
-    {
+        smp_mb();
+        return;
+    }
+
    /* Instruction counter expired.  */
-#ifdef CONFIG_USER_ONLY
-        abort();
-#else
-        int insns_left = cpu->icount_decr.u32;
-        if (cpu->icount_extra && insns_left >= 0) {
+    assert(use_icount);
+#ifndef CONFIG_USER_ONLY
+    /* Ensure global icount has gone forward */
+    cpu_update_icount(cpu);
    /* Refill decrementer and continue execution.  */
-            cpu->icount_extra += insns_left;
-            insns_left = MIN(0xffff, cpu->icount_extra);
-            cpu->icount_extra -= insns_left;
+    insns_left = MIN(0xffff, cpu->icount_budget);
    cpu->icount_decr.u16.low = insns_left;
-        } else {
+    cpu->icount_extra = cpu->icount_budget - insns_left;
+    if (!cpu->icount_extra) {
+        /* Execute any remaining instructions, then let the main loop
+         * handle the next event.
+         */
        if (insns_left > 0) {
-                /* Execute remaining instructions.  */
-                cpu_exec_nocache(cpu, insns_left, *last_tb, false);
-                align_clocks(sc, cpu);
+            cpu_exec_nocache(cpu, insns_left, tb, false);
        }
-            cpu->exception_index = EXCP_INTERRUPT;
-            *last_tb = NULL;
-            cpu_loop_exit(cpu);
    }
-        break;
 #endif
-    }
-    default:
-        break;
-    }
 }

 /* main execution loop */
@@ -596,7 +623,7 @@ int cpu_exec(CPUState *cpu)
 {
    CPUClass *cc = CPU_GET_CLASS(cpu);
    int ret;
-    SyncClocks sc;
+    SyncClocks sc = { 0 };

    /* replay_interrupt may need current_cpu */
    current_cpu = cpu;
@@ -605,13 +632,8 @@ int cpu_exec(CPUState *cpu)
        return EXCP_HALTED;
    }

-    atomic_mb_set(&tcg_current_cpu, cpu);
    rcu_read_lock();

-    if (unlikely(atomic_mb_read(&exit_request))) {
-        cpu->exit_request = 1;
-    }
-
    cc->cpu_exec_enter(cpu);

    /* Calculate difference between guest clock and host clock.
@@ -621,26 +643,8 @@ int cpu_exec(CPUState *cpu)
     */
    init_delay_params(&sc, cpu);

-    for(;;) {
    /* prepare setjmp context for exception handling */
-        if (sigsetjmp(cpu->jmp_env, 0) == 0) {
-            TranslationBlock *tb, *last_tb = NULL;
-            int tb_exit = 0;
-
-            /* if an exception is pending, we execute it here */
-            if (cpu_handle_exception(cpu, &ret)) {
-                break;
-            }
-
-            for(;;) {
-                cpu_handle_interrupt(cpu, &last_tb);
-                tb = tb_find(cpu, last_tb, tb_exit);
-                cpu_loop_exec_tb(cpu, tb, &last_tb, &tb_exit, &sc);
-                /* Try to align the host and virtual clocks
-                   if the guest is in advance */
-                align_clocks(&sc, cpu);
-            } /* for(;;) */
-        } else {
+    if (sigsetjmp(cpu->jmp_env, 0) != 0) {
 #if defined(__clang__) || !QEMU_GNUC_PREREQ(4, 6)
        /* Some compilers wrongly smash all local variables after
         * siglongjmp. There were bug reports for gcc 4.5.0 and clang.
@@ -655,16 +659,27 @@ int cpu_exec(CPUState *cpu)
 #endif /* buggy compiler */
        cpu->can_do_io = 1;
        tb_lock_reset();
+        if (qemu_mutex_iothread_locked()) {
+            qemu_mutex_unlock_iothread();
+        }
+    }
+
+    /* if an exception is pending, we execute it here */
+    while (!cpu_handle_exception(cpu, &ret)) {
+        TranslationBlock *last_tb = NULL;
+        int tb_exit = 0;
+
+        while (!cpu_handle_interrupt(cpu, &last_tb)) {
+            TranslationBlock *tb = tb_find(cpu, last_tb, tb_exit);
+            cpu_loop_exec_tb(cpu, tb, &last_tb, &tb_exit);
+            /* Try to align the host and virtual clocks
+               if the guest is in advance */
+            align_clocks(&sc, cpu);
+        }
    }
-    } /* for(;;) */

    cc->cpu_exec_exit(cpu);
    rcu_read_unlock();

-    /* fail safe : never use current_cpu outside cpu_exec() */
-    current_cpu = NULL;
-
-    /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
-    atomic_set(&tcg_current_cpu, NULL);
    return ret;
 }
--- a/cpus.c
+++ b/cpus.c
@@ -25,6 +25,7 @@
 /* Needed early for CONFIG_BSD etc. */
 #include "qemu/osdep.h"
 #include "qemu-common.h"
+#include "qemu/config-file.h"
 #include "cpu.h"
 #include "monitor/monitor.h"
 #include "qapi/qmp/qerror.h"
@@ -45,14 +46,11 @@
 #include "qemu/main-loop.h"
 #include "qemu/bitmap.h"
 #include "qemu/seqlock.h"
+#include "tcg.h"
 #include "qapi-event.h"
 #include "hw/nmi.h"
 #include "sysemu/replay.h"

-#ifndef _WIN32
-#include "qemu/compatfd.h"
-#endif
-
 #ifdef CONFIG_LINUX

 #include <sys/prctl.h>
@@ -150,21 +148,126 @@ typedef struct TimersState {
 } TimersState;

 static TimersState timers_state;
+bool mttcg_enabled;
+
+/*
+ * We default to false if we know other options have been enabled
+ * which are currently incompatible with MTTCG. Otherwise when each
+ * guest (target) has been updated to support:
+ *   - atomic instructions
+ *   - memory ordering primitives (barriers)
+ * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
+ *
+ * Once a guest architecture has been converted to the new primitives
+ * there are two remaining limitations to check.
+ *
+ * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
+ * - The host must have a stronger memory order than the guest
+ *
+ * It may be possible in future to support strong guests on weak hosts
+ * but that will require tagging all load/stores in a guest with their
+ * implicit memory order requirements which would likely slow things
+ * down a lot.
+ */
+
+static bool check_tcg_memory_orders_compatible(void)
+{
+#if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
+    return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
+#else
+    return false;
+#endif
+}
+
+static bool default_mttcg_enabled(void)
+{
+    if (use_icount || TCG_OVERSIZED_GUEST) {
+        return false;
+    } else {
+#ifdef TARGET_SUPPORTS_MTTCG
+        return check_tcg_memory_orders_compatible();
+#else
+        return false;
+#endif
+    }
+}
+
+void qemu_tcg_configure(QemuOpts *opts, Error **errp)
+{
+    const char *t = qemu_opt_get(opts, "thread");
+    if (t) {
+        if (strcmp(t, "multi") == 0) {
+            if (TCG_OVERSIZED_GUEST) {
+                error_setg(errp, "No MTTCG when guest word size > hosts");
+            } else if (use_icount) {
+                error_setg(errp, "No MTTCG when icount is enabled");
+            } else {
+#ifndef TARGET_SUPPORTS_MTTCG
+                error_report("Guest not yet converted to MTTCG - "
+                             "you may get unexpected results");
+#endif
+                if (!check_tcg_memory_orders_compatible()) {
+                    error_report("Guest expects a stronger memory ordering "
+                                 "than the host provides");
+                    error_printf("This may cause strange/hard to debug errors\n");
+                }
+                mttcg_enabled = true;
+            }
+        } else if (strcmp(t, "single") == 0) {
+            mttcg_enabled = false;
+        } else {
+            error_setg(errp, "Invalid 'thread' setting %s", t);
+        }
+    } else {
+        mttcg_enabled = default_mttcg_enabled();
+    }
+}
+
+/* The current number of executed instructions is based on what we
+ * originally budgeted minus the current state of the decrementing
+ * icount counters in extra/u16.low.
+ */
+static int64_t cpu_get_icount_executed(CPUState *cpu)
+{
+    return cpu->icount_budget - (cpu->icount_decr.u16.low + cpu->icount_extra);
+}
+
+/*
+ * Update the global shared timer_state.qemu_icount to take into
+ * account executed instructions. This is done by the TCG vCPU
+ * thread so the main-loop can see time has moved forward.
+ */
+void cpu_update_icount(CPUState *cpu)
+{
+    int64_t executed = cpu_get_icount_executed(cpu);
+    cpu->icount_budget -= executed;
+
+#ifdef CONFIG_ATOMIC64
+    atomic_set__nocheck(&timers_state.qemu_icount,
+                        atomic_read__nocheck(&timers_state.qemu_icount) +
+                        executed);
+#else /* FIXME: we need 64bit atomics to do this safely */
+    timers_state.qemu_icount += executed;
+#endif
+}

 int64_t cpu_get_icount_raw(void)
 {
-    int64_t icount;
    CPUState *cpu = current_cpu;

-    icount = timers_state.qemu_icount;
-    if (cpu) {
+    if (cpu && cpu->running) {
        if (!cpu->can_do_io) {
            fprintf(stderr, "Bad icount read\n");
            exit(1);
        }
-        icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
+        /* Take into account what has run */
+        cpu_update_icount(cpu);
    }
-    return icount;
+#ifdef CONFIG_ATOMIC64
+    return atomic_read__nocheck(&timers_state.qemu_icount);
+#else /* FIXME: we need 64bit atomics to do this safely */
+    return timers_state.qemu_icount;
+#endif
 }

 /* Return the virtual CPU time, based on the instruction counter.  */
@@ -694,6 +797,84 @@ void configure_icount(QemuOpts *opts, Error **errp)
                   NANOSECONDS_PER_SECOND / 10);
 }

+/***********************************************************/
+/* TCG vCPU kick timer
+ *
+ * The kick timer is responsible for moving single threaded vCPU
+ * emulation on to the next vCPU. If more than one vCPU is running a
+ * timer event with force a cpu->exit so the next vCPU can get
+ * scheduled.
+ *
+ * The timer is removed if all vCPUs are idle and restarted again once
+ * idleness is complete.
+ */
+
+static QEMUTimer *tcg_kick_vcpu_timer;
+static CPUState *tcg_current_rr_cpu;
+
+#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
+
+static inline int64_t qemu_tcg_next_kick(void)
+{
+    return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
+}
+
+/* Kick the currently round-robin scheduled vCPU */
+static void qemu_cpu_kick_rr_cpu(void)
+{
+    CPUState *cpu;
+    do {
+        cpu = atomic_mb_read(&tcg_current_rr_cpu);
+        if (cpu) {
+            cpu_exit(cpu);
+        }
+    } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
+}
+
+static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
+{
+}
+
+void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
+{
+    if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
+        qemu_notify_event();
+        return;
+    }
+
+    if (!qemu_in_vcpu_thread() && first_cpu) {
+        /* qemu_cpu_kick is not enough to kick a halted CPU out of
+         * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
+         * causes cpu_thread_is_idle to return false.  This way,
+         * handle_icount_deadline can run.
+         */
+        async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
+    }
+}
+
+static void kick_tcg_thread(void *opaque)
+{
+    timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
+    qemu_cpu_kick_rr_cpu();
+}
+
+static void start_tcg_kick_timer(void)
+{
+    if (!mttcg_enabled && !tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
+        tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
+                                           kick_tcg_thread, NULL);
+        timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
+    }
+}
+
+static void stop_tcg_kick_timer(void)
+{
+    if (tcg_kick_vcpu_timer) {
+        timer_del(tcg_kick_vcpu_timer);
+        tcg_kick_vcpu_timer = NULL;
+    }
+}
+
 /***********************************************************/
 void hw_error(const char *fmt, ...)
 {
@@ -794,13 +975,23 @@ static void sigbus_reraise(void)
    abort();
 }

-static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
-                           void *ctx)
+static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
 {
-    if (kvm_on_sigbus(siginfo->ssi_code,
-                      (void *)(intptr_t)siginfo->ssi_addr)) {
+    if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
        sigbus_reraise();
    }
+
+    if (current_cpu) {
+        /* Called asynchronously in VCPU thread.  */
+        if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
+            sigbus_reraise();
+        }
+    } else {
+        /* Called synchronously (via signalfd) in main thread.  */
+        if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
+            sigbus_reraise();
+        }
+    }
 }

 static void qemu_init_sigbus(void)
@@ -809,95 +1000,18 @@ static void qemu_init_sigbus(void)

    memset(&action, 0, sizeof(action));
    action.sa_flags = SA_SIGINFO;
-    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
+    action.sa_sigaction = sigbus_handler;
    sigaction(SIGBUS, &action, NULL);

    prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
 }
-
-static void qemu_kvm_eat_signals(CPUState *cpu)
-{
-    struct timespec ts = { 0, 0 };
-    siginfo_t siginfo;
-    sigset_t waitset;
-    sigset_t chkset;
-    int r;
-
-    sigemptyset(&waitset);
-    sigaddset(&waitset, SIG_IPI);
-    sigaddset(&waitset, SIGBUS);
-
-    do {
-        r = sigtimedwait(&waitset, &siginfo, &ts);
-        if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
-            perror("sigtimedwait");
-            exit(1);
-        }
-
-        switch (r) {
-        case SIGBUS:
-            if (kvm_on_sigbus_vcpu(cpu, siginfo.si_code, siginfo.si_addr)) {
-                sigbus_reraise();
-            }
-            break;
-        default:
-            break;
-        }
-
-        r = sigpending(&chkset);
-        if (r == -1) {
-            perror("sigpending");
-            exit(1);
-        }
-    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
-}
-
 #else /* !CONFIG_LINUX */
-
 static void qemu_init_sigbus(void)
 {
 }
-
-static void qemu_kvm_eat_signals(CPUState *cpu)
-{
-}
 #endif /* !CONFIG_LINUX */

-#ifndef _WIN32
-static void dummy_signal(int sig)
-{
-}
-
-static void qemu_kvm_init_cpu_signals(CPUState *cpu)
-{
-    int r;
-    sigset_t set;
-    struct sigaction sigact;
-
-    memset(&sigact, 0, sizeof(sigact));
-    sigact.sa_handler = dummy_signal;
-    sigaction(SIG_IPI, &sigact, NULL);
-
-    pthread_sigmask(SIG_BLOCK, NULL, &set);
-    sigdelset(&set, SIG_IPI);
-    sigdelset(&set, SIGBUS);
-    r = kvm_set_signal_mask(cpu, &set);
-    if (r) {
-        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
-        exit(1);
-    }
-}
-
-#else /* _WIN32 */
-static void qemu_kvm_init_cpu_signals(CPUState *cpu)
-{
-    abort();
-}
-#endif /* _WIN32 */
-
 static QemuMutex qemu_global_mutex;
-static QemuCond qemu_io_proceeded_cond;
-static unsigned iothread_requesting_mutex;

 static QemuThread io_thread;

@@ -911,7 +1025,6 @@ void qemu_init_cpu_loop(void)
    qemu_init_sigbus();
    qemu_cond_init(&qemu_cpu_cond);
    qemu_cond_init(&qemu_pause_cond);
-    qemu_cond_init(&qemu_io_proceeded_cond);
    qemu_mutex_init(&qemu_global_mutex);

    qemu_thread_get_self(&io_thread);
@@ -936,28 +1049,34 @@ static void qemu_tcg_destroy_vcpu(CPUState *cpu)

 static void qemu_wait_io_event_common(CPUState *cpu)
 {
+    atomic_mb_set(&cpu->thread_kicked, false);
    if (cpu->stop) {
        cpu->stop = false;
        cpu->stopped = true;
        qemu_cond_broadcast(&qemu_pause_cond);
    }
    process_queued_cpu_work(cpu);
-    cpu->thread_kicked = false;
+}
+
+static bool qemu_tcg_should_sleep(CPUState *cpu)
+{
+    if (mttcg_enabled) {
+        return cpu_thread_is_idle(cpu);
+    } else {
+        return all_cpu_threads_idle();
+    }
 }

 static void qemu_tcg_wait_io_event(CPUState *cpu)
 {
-    while (all_cpu_threads_idle()) {
+    while (qemu_tcg_should_sleep(cpu)) {
+        stop_tcg_kick_timer();
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

-    while (iothread_requesting_mutex) {
-        qemu_cond_wait(&qemu_io_proceeded_cond, &qemu_global_mutex);
-    }
+    start_tcg_kick_timer();

-    CPU_FOREACH(cpu) {
    qemu_wait_io_event_common(cpu);
-    }
 }

 static void qemu_kvm_wait_io_event(CPUState *cpu)
@@ -966,7 +1085,6 @@ static void qemu_kvm_wait_io_event(CPUState *cpu)
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

-    qemu_kvm_eat_signals(cpu);
    qemu_wait_io_event_common(cpu);
 }

@@ -989,7 +1107,7 @@ static void *qemu_kvm_cpu_thread_fn(void *arg)
        exit(1);
    }

-    qemu_kvm_init_cpu_signals(cpu);
+    kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu->created = true;
@@ -1028,6 +1146,7 @@ static void *qemu_dummy_cpu_thread_fn(void *arg)
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
+    current_cpu = cpu;

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);
@@ -1036,9 +1155,7 @@ static void *qemu_dummy_cpu_thread_fn(void *arg)
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

-    current_cpu = cpu;
    while (1) {
-        current_cpu = NULL;
        qemu_mutex_unlock_iothread();
        do {
            int sig;
@@ -1049,7 +1166,6 @@ static void *qemu_dummy_cpu_thread_fn(void *arg)
            exit(1);
        }
        qemu_mutex_lock_iothread();
-        current_cpu = cpu;
        qemu_wait_io_event_common(cpu);
    }

@@ -1081,16 +1197,54 @@ static int64_t tcg_get_icount_limit(void)

 static void handle_icount_deadline(void)
 {
+    assert(qemu_in_vcpu_thread());
    if (use_icount) {
        int64_t deadline =
            qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);

        if (deadline == 0) {
+            /* Wake up other AioContexts.  */
            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
+            qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
        }
    }
 }

+static void prepare_icount_for_run(CPUState *cpu)
+{
+    if (use_icount) {
+        int insns_left;
+
+        /* These should always be cleared by process_icount_data after
+         * each vCPU execution. However u16.high can be raised
+         * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
+         */
+        g_assert(cpu->icount_decr.u16.low == 0);
+        g_assert(cpu->icount_extra == 0);
+
+        cpu->icount_budget = tcg_get_icount_limit();
+        insns_left = MIN(0xffff, cpu->icount_budget);
+        cpu->icount_decr.u16.low = insns_left;
+        cpu->icount_extra = cpu->icount_budget - insns_left;
+    }
+}
+
+static void process_icount_data(CPUState *cpu)
+{
+    if (use_icount) {
+        /* Account for executed instructions */
+        cpu_update_icount(cpu);
+
+        /* Reset the counters */
+        cpu->icount_decr.u16.low = 0;
+        cpu->icount_extra = 0;
+        cpu->icount_budget = 0;
+
+        replay_account_executed_instructions();
+    }
+}
+
+
 static int tcg_cpu_exec(CPUState *cpu)
 {
    int ret;
@@ -1101,35 +1255,14 @@ static int tcg_cpu_exec(CPUState *cpu)
 #ifdef CONFIG_PROFILER
    ti = profile_getclock();
 #endif
-    if (use_icount) {
-        int64_t count;
-        int decr;
-        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
-                                    + cpu->icount_extra);
-        cpu->icount_decr.u16.low = 0;
-        cpu->icount_extra = 0;
-        count = tcg_get_icount_limit();
-        timers_state.qemu_icount += count;
-        decr = (count > 0xffff) ? 0xffff : count;
-        count -= decr;
-        cpu->icount_decr.u16.low = decr;
-        cpu->icount_extra = count;
-    }
+    qemu_mutex_unlock_iothread();
    cpu_exec_start(cpu);
    ret = cpu_exec(cpu);
    cpu_exec_end(cpu);
+    qemu_mutex_lock_iothread();
 #ifdef CONFIG_PROFILER
    tcg_time += profile_getclock() - ti;
 #endif
-    if (use_icount) {
-        /* Fold pending instructions back into the
-           instruction counter, and clear the interrupt flag.  */
-        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
-                        + cpu->icount_extra);
-        cpu->icount_decr.u32 = 0;
-        cpu->icount_extra = 0;
-        replay_account_executed_instructions();
-    }
    return ret;
 }

@@ -1150,7 +1283,16 @@ static void deal_with_unplugged_cpus(void)
    }
 }

-static void *qemu_tcg_cpu_thread_fn(void *arg)
+/* Single-threaded TCG
+ *
+ * In the single-threaded case each vCPU is simulated in turn. If
+ * there is more than a single vCPU we create a simple timer to kick
+ * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
+ * This is done explicitly rather than relying on side-effects
+ * elsewhere.
+ */
+
+static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
 {
    CPUState *cpu = arg;

@@ -1172,50 +1314,75 @@ static void *qemu_tcg_cpu_thread_fn(void *arg)

        /* process any pending work */
        CPU_FOREACH(cpu) {
+            current_cpu = cpu;
            qemu_wait_io_event_common(cpu);
        }
    }

-    /* process any pending work */
-    atomic_mb_set(&exit_request, 1);
+    start_tcg_kick_timer();

    cpu = first_cpu;

+    /* process any pending work */
+    cpu->exit_request = 1;
+
    while (1) {
        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
        qemu_account_warp_timer();

+        /* Run the timers here.  This is much more efficient than
+         * waking up the I/O thread and waiting for completion.
+         */
+        handle_icount_deadline();
+
        if (!cpu) {
            cpu = first_cpu;
        }

-        for (; cpu != NULL && !exit_request; cpu = CPU_NEXT(cpu)) {
+        while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
+
+            atomic_mb_set(&tcg_current_rr_cpu, cpu);
+            current_cpu = cpu;

            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;
+
+                prepare_icount_for_run(cpu);
+
                r = tcg_cpu_exec(cpu);
+
+                process_icount_data(cpu);
+
                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
+                } else if (r == EXCP_ATOMIC) {
+                    qemu_mutex_unlock_iothread();
+                    cpu_exec_step_atomic(cpu);
+                    qemu_mutex_lock_iothread();
+                    break;
                }
-            } else if (cpu->stop || cpu->stopped) {
+            } else if (cpu->stop) {
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

-        } /* for cpu.. */
+            cpu = CPU_NEXT(cpu);
+        } /* while (cpu && !cpu->exit_request).. */

-        /* Pairs with smp_wmb in qemu_cpu_kick.  */
-        atomic_mb_set(&exit_request, 0);
+        /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
+        atomic_set(&tcg_current_rr_cpu, NULL);

-        handle_icount_deadline();
+        if (cpu && cpu->exit_request) {
+            atomic_mb_set(&cpu->exit_request, 0);
+        }

-        qemu_tcg_wait_io_event(QTAILQ_FIRST(&cpus));
+        qemu_tcg_wait_io_event(cpu ? cpu : QTAILQ_FIRST(&cpus));
        deal_with_unplugged_cpus();
    }

@@ -1226,8 +1393,9 @@ static void *qemu_hax_cpu_thread_fn(void *arg)
 {
    CPUState *cpu = arg;
    int r;
+
+    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
-    qemu_mutex_lock(&qemu_global_mutex);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
@@ -1262,6 +1430,68 @@ static void CALLBACK dummy_apc_func(ULONG_PTR unused)
 }
 #endif

+/* Multi-threaded TCG
+ *
+ * In the multi-threaded case each vCPU has its own thread. The TLS
+ * variable current_cpu can be used deep in the code to find the
+ * current CPUState for a given thread.
+ */
+
+static void *qemu_tcg_cpu_thread_fn(void *arg)
+{
+    CPUState *cpu = arg;
+
+    g_assert(!use_icount);
+
+    rcu_register_thread();
+
+    qemu_mutex_lock_iothread();
+    qemu_thread_get_self(cpu->thread);
+
+    cpu->thread_id = qemu_get_thread_id();
+    cpu->created = true;
+    cpu->can_do_io = 1;
+    current_cpu = cpu;
+    qemu_cond_signal(&qemu_cpu_cond);
+
+    /* process any pending work */
+    cpu->exit_request = 1;
+
+    while (1) {
+        if (cpu_can_run(cpu)) {
+            int r;
+            r = tcg_cpu_exec(cpu);
+            switch (r) {
+            case EXCP_DEBUG:
+                cpu_handle_guest_debug(cpu);
+                break;
+            case EXCP_HALTED:
+                /* during start-up the vCPU is reset and the thread is
+                 * kicked several times. If we don't ensure we go back
+                 * to sleep in the halted state we won't cleanly
+                 * start-up when the vCPU is enabled.
+                 *
+                 * cpu->halted should ensure we sleep in wait_io_event
+                 */
+                g_assert(cpu->halted);
+                break;
+            case EXCP_ATOMIC:
+                qemu_mutex_unlock_iothread();
+                cpu_exec_step_atomic(cpu);
+                qemu_mutex_lock_iothread();
+            default:
+                /* Ignore everything else? */
+                break;
+            }
+        }
+
+        atomic_mb_set(&cpu->exit_request, 0);
+        qemu_tcg_wait_io_event(cpu);
+    }
+
+    return NULL;
+}
+
 static void qemu_cpu_kick_thread(CPUState *cpu)
 {
 #ifndef _WIN32
@@ -1287,24 +1517,13 @@ static void qemu_cpu_kick_thread(CPUState *cpu)
 #endif
 }

-static void qemu_cpu_kick_no_halt(void)
-{
-    CPUState *cpu;
-    /* Ensure whatever caused the exit has reached the CPU threads before
-     * writing exit_request.
-     */
-    atomic_mb_set(&exit_request, 1);
-    cpu = atomic_mb_read(&tcg_current_cpu);
-    if (cpu) {
-        cpu_exit(cpu);
-    }
-}
-
 void qemu_cpu_kick(CPUState *cpu)
 {
    qemu_cond_broadcast(cpu->halt_cond);
    if (tcg_enabled()) {
-        qemu_cpu_kick_no_halt();
+        cpu_exit(cpu);
+        /* NOP unless doing single-thread RR */
+        qemu_cpu_kick_rr_cpu();
    } else {
        if (hax_enabled()) {
            /*
@@ -1342,27 +1561,14 @@ bool qemu_mutex_iothread_locked(void)

 void qemu_mutex_lock_iothread(void)
 {
-    atomic_inc(&iothread_requesting_mutex);
-    /* In the simple case there is no need to bump the VCPU thread out of
-     * TCG code execution.
-     */
-    if (!tcg_enabled() || qemu_in_vcpu_thread() ||
-        !first_cpu || !first_cpu->created) {
+    g_assert(!qemu_mutex_iothread_locked());
    qemu_mutex_lock(&qemu_global_mutex);
-        atomic_dec(&iothread_requesting_mutex);
-    } else {
-        if (qemu_mutex_trylock(&qemu_global_mutex)) {
-            qemu_cpu_kick_no_halt();
-            qemu_mutex_lock(&qemu_global_mutex);
-        }
-        atomic_dec(&iothread_requesting_mutex);
-        qemu_cond_broadcast(&qemu_io_proceeded_cond);
-    }
    iothread_locked = true;
 }

 void qemu_mutex_unlock_iothread(void)
 {
+    g_assert(qemu_mutex_iothread_locked());
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
 }
@@ -1392,13 +1598,6 @@ void pause_all_vcpus(void)

    if (qemu_in_vcpu_thread()) {
        cpu_stop_current();
-        if (!kvm_enabled()) {
-            CPU_FOREACH(cpu) {
-                cpu->stop = false;
-                cpu->stopped = true;
-            }
-            return;
-        }
    }

    while (!all_vcpus_paused()) {
@@ -1447,29 +1646,43 @@ void cpu_remove_sync(CPUState *cpu)
 static void qemu_tcg_init_vcpu(CPUState *cpu)
 {
    char thread_name[VCPU_THREAD_NAME_SIZE];
-    static QemuCond *tcg_halt_cond;
-    static QemuThread *tcg_cpu_thread;
+    static QemuCond *single_tcg_halt_cond;
+    static QemuThread *single_tcg_cpu_thread;

-    /* share a single thread for all cpus with TCG */
-    if (!tcg_cpu_thread) {
+    if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
        cpu->thread = g_malloc0(sizeof(QemuThread));
        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
        qemu_cond_init(cpu->halt_cond);
-        tcg_halt_cond = cpu->halt_cond;
+
+        if (qemu_tcg_mttcg_enabled()) {
+            /* create a thread per vCPU with TCG (MTTCG) */
+            parallel_cpus = true;
            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
                 cpu->cpu_index);
+
            qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
                               cpu, QEMU_THREAD_JOINABLE);
+
+        } else {
+            /* share a single thread for all cpus with TCG */
+            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
+            qemu_thread_create(cpu->thread, thread_name,
+                               qemu_tcg_rr_cpu_thread_fn,
+                               cpu, QEMU_THREAD_JOINABLE);
+
+            single_tcg_halt_cond = cpu->halt_cond;
+            single_tcg_cpu_thread = cpu->thread;
+        }
 #ifdef _WIN32
        cpu->hThread = qemu_thread_get_handle(cpu->thread);
 #endif
        while (!cpu->created) {
            qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
        }
-        tcg_cpu_thread = cpu->thread;
    } else {
-        cpu->thread = tcg_cpu_thread;
-        cpu->halt_cond = tcg_halt_cond;
+        /* For non-MTTCG cases we share the thread */
+        cpu->thread = single_tcg_cpu_thread;
+        cpu->halt_cond = single_tcg_halt_cond;
    }
 }

@@ -1578,6 +1791,48 @@ int vm_stop(RunState state)
    return do_vm_stop(state);
 }

+/**
+ * Prepare for (re)starting the VM.
+ * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
+ * running or in case of an error condition), 0 otherwise.
+ */
+int vm_prepare_start(void)
+{
+    RunState requested;
+    int res = 0;
+
+    qemu_vmstop_requested(&requested);
+    if (runstate_is_running() && requested == RUN_STATE__MAX) {
+        return -1;
+    }
+
+    /* Ensure that a STOP/RESUME pair of events is emitted if a
+     * vmstop request was pending.  The BLOCK_IO_ERROR event, for
+     * example, according to documentation is always followed by
+     * the STOP event.
+     */
+    if (runstate_is_running()) {
+        qapi_event_send_stop(&error_abort);
+        res = -1;
+    } else {
+        replay_enable_events();
+        cpu_enable_ticks();
+        runstate_set(RUN_STATE_RUNNING);
+        vm_state_notify(1, RUN_STATE_RUNNING);
+    }
+
+    /* We are sending this now, but the CPUs will be resumed shortly later */
+    qapi_event_send_resume(&error_abort);
+    return res;
+}
+
+void vm_start(void)
+{
+    if (!vm_prepare_start()) {
+        resume_all_vcpus();
+    }
+}
+
 /* does a state transition even if the VM is already stopped,
   current state is forgotten forever */
 int vm_stop_force_state(RunState state)
--- a/cputlb.c
+++ b/cputlb.c
@@ -18,6 +18,7 @@
 */

 #include "qemu/osdep.h"
+#include "qemu/main-loop.h"
 #include "cpu.h"
 #include "exec/exec-all.h"
 #include "exec/memory.h"
@@ -57,6 +58,40 @@
    } \
 } while (0)

+#define assert_cpu_is_self(this_cpu) do {                         \
+        if (DEBUG_TLB_GATE) {                                     \
+            g_assert(!cpu->created || qemu_cpu_is_self(cpu));     \
+        }                                                         \
+    } while (0)
+
+/* run_on_cpu_data.target_ptr should always be big enough for a
+ * target_ulong even on 32 bit builds */
+QEMU_BUILD_BUG_ON(sizeof(target_ulong) > sizeof(run_on_cpu_data));
+
+/* We currently can't handle more than 16 bits in the MMUIDX bitmask.
+ */
+QEMU_BUILD_BUG_ON(NB_MMU_MODES > 16);
+#define ALL_MMUIDX_BITS ((1 << NB_MMU_MODES) - 1)
+
+/* flush_all_helper: run fn across all cpus
+ *
+ * If the wait flag is set then the src cpu's helper will be queued as
+ * "safe" work and the loop exited creating a synchronisation point
+ * where all queued work will be finished before execution starts
+ * again.
+ */
+static void flush_all_helper(CPUState *src, run_on_cpu_func fn,
+                             run_on_cpu_data d)
+{
+    CPUState *cpu;
+
+    CPU_FOREACH(cpu) {
+        if (cpu != src) {
+            async_run_on_cpu(cpu, fn, d);
+        }
+    }
+}
+
 /* statistics */
 int tlb_flush_count;

@@ -65,10 +100,22 @@ int tlb_flush_count;
 * flushing more entries than required is only an efficiency issue,
 * not a correctness issue.
 */
-void tlb_flush(CPUState *cpu)
+static void tlb_flush_nocheck(CPUState *cpu)
 {
    CPUArchState *env = cpu->env_ptr;

+    /* The QOM tests will trigger tlb_flushes without setting up TCG
+     * so we bug out here in that case.
+     */
+    if (!tcg_enabled()) {
+        return;
+    }
+
+    assert_cpu_is_self(cpu);
+    tlb_debug("(count: %d)\n", tlb_flush_count++);
+
+    tb_lock();
+
    memset(env->tlb_table, -1, sizeof(env->tlb_table));
    memset(env->tlb_v_table, -1, sizeof(env->tlb_v_table));
    memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache));
@@ -76,39 +123,117 @@ void tlb_flush(CPUState *cpu)
    env->vtlb_index = 0;
    env->tlb_flush_addr = -1;
    env->tlb_flush_mask = 0;
-    tlb_flush_count++;
+
+    tb_unlock();
+
+    atomic_mb_set(&cpu->pending_tlb_flush, 0);
 }

-static inline void v_tlb_flush_by_mmuidx(CPUState *cpu, va_list argp)
+static void tlb_flush_global_async_work(CPUState *cpu, run_on_cpu_data data)
+{
+    tlb_flush_nocheck(cpu);
+}
+
+void tlb_flush(CPUState *cpu)
+{
+    if (cpu->created && !qemu_cpu_is_self(cpu)) {
+        if (atomic_mb_read(&cpu->pending_tlb_flush) != ALL_MMUIDX_BITS) {
+            atomic_mb_set(&cpu->pending_tlb_flush, ALL_MMUIDX_BITS);
+            async_run_on_cpu(cpu, tlb_flush_global_async_work,
+                             RUN_ON_CPU_NULL);
+        }
+    } else {
+        tlb_flush_nocheck(cpu);
+    }
+}
+
+void tlb_flush_all_cpus(CPUState *src_cpu)
+{
+    const run_on_cpu_func fn = tlb_flush_global_async_work;
+    flush_all_helper(src_cpu, fn, RUN_ON_CPU_NULL);
+    fn(src_cpu, RUN_ON_CPU_NULL);
+}
+
+void tlb_flush_all_cpus_synced(CPUState *src_cpu)
+{
+    const run_on_cpu_func fn = tlb_flush_global_async_work;
+    flush_all_helper(src_cpu, fn, RUN_ON_CPU_NULL);
+    async_safe_run_on_cpu(src_cpu, fn, RUN_ON_CPU_NULL);
+}
+
+static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
 {
    CPUArchState *env = cpu->env_ptr;
+    unsigned long mmu_idx_bitmask = data.host_int;
+    int mmu_idx;

-    tlb_debug("start\n");
+    assert_cpu_is_self(cpu);

-    for (;;) {
-        int mmu_idx = va_arg(argp, int);
+    tb_lock();

-        if (mmu_idx < 0) {
-            break;
-        }
+    tlb_debug("start: mmu_idx:0x%04lx\n", mmu_idx_bitmask);

+    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
+
+        if (test_bit(mmu_idx, &mmu_idx_bitmask)) {
            tlb_debug("%d\n", mmu_idx);

            memset(env->tlb_table[mmu_idx], -1, sizeof(env->tlb_table[0]));
            memset(env->tlb_v_table[mmu_idx], -1, sizeof(env->tlb_v_table[0]));
        }
+    }

    memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache));
+
+    tlb_debug("done\n");
+
+    tb_unlock();
 }

-void tlb_flush_by_mmuidx(CPUState *cpu, ...)
+void tlb_flush_by_mmuidx(CPUState *cpu, uint16_t idxmap)
 {
-    va_list argp;
-    va_start(argp, cpu);
-    v_tlb_flush_by_mmuidx(cpu, argp);
-    va_end(argp);
+    tlb_debug("mmu_idx: 0x%" PRIx16 "\n", idxmap);
+
+    if (!qemu_cpu_is_self(cpu)) {
+        uint16_t pending_flushes = idxmap;
+        pending_flushes &= ~atomic_mb_read(&cpu->pending_tlb_flush);
+
+        if (pending_flushes) {
+            tlb_debug("reduced mmu_idx: 0x%" PRIx16 "\n", pending_flushes);
+
+            atomic_or(&cpu->pending_tlb_flush, pending_flushes);
+            async_run_on_cpu(cpu, tlb_flush_by_mmuidx_async_work,
+                             RUN_ON_CPU_HOST_INT(pending_flushes));
+        }
+    } else {
+        tlb_flush_by_mmuidx_async_work(cpu,
+                                       RUN_ON_CPU_HOST_INT(idxmap));
+    }
 }

+void tlb_flush_by_mmuidx_all_cpus(CPUState *src_cpu, uint16_t idxmap)
+{
+    const run_on_cpu_func fn = tlb_flush_by_mmuidx_async_work;
+
+    tlb_debug("mmu_idx: 0x%"PRIx16"\n", idxmap);
+
+    flush_all_helper(src_cpu, fn, RUN_ON_CPU_HOST_INT(idxmap));
+    fn(src_cpu, RUN_ON_CPU_HOST_INT(idxmap));
+}
+
+void tlb_flush_by_mmuidx_all_cpus_synced(CPUState *src_cpu,
+                                                       uint16_t idxmap)
+{
+    const run_on_cpu_func fn = tlb_flush_by_mmuidx_async_work;
+
+    tlb_debug("mmu_idx: 0x%"PRIx16"\n", idxmap);
+
+    flush_all_helper(src_cpu, fn, RUN_ON_CPU_HOST_INT(idxmap));
+    async_safe_run_on_cpu(src_cpu, fn, RUN_ON_CPU_HOST_INT(idxmap));
+}
+
+
+
 static inline void tlb_flush_entry(CPUTLBEntry *tlb_entry, target_ulong addr)
 {
    if (addr == (tlb_entry->addr_read &
@@ -121,12 +246,15 @@ static inline void tlb_flush_entry(CPUTLBEntry *tlb_entry, target_ulong addr)
    }
 }

-void tlb_flush_page(CPUState *cpu, target_ulong addr)
+static void tlb_flush_page_async_work(CPUState *cpu, run_on_cpu_data data)
 {
    CPUArchState *env = cpu->env_ptr;
+    target_ulong addr = (target_ulong) data.target_ptr;
    int i;
    int mmu_idx;

+    assert_cpu_is_self(cpu);
+
    tlb_debug("page :" TARGET_FMT_lx "\n", addr);

    /* Check if we need to flush due to large pages.  */
@@ -156,15 +284,62 @@ void tlb_flush_page(CPUState *cpu, target_ulong addr)
    tb_flush_jmp_cache(cpu, addr);
 }

-void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, ...)
+void tlb_flush_page(CPUState *cpu, target_ulong addr)
+{
+    tlb_debug("page :" TARGET_FMT_lx "\n", addr);
+
+    if (!qemu_cpu_is_self(cpu)) {
+        async_run_on_cpu(cpu, tlb_flush_page_async_work,
+                         RUN_ON_CPU_TARGET_PTR(addr));
+    } else {
+        tlb_flush_page_async_work(cpu, RUN_ON_CPU_TARGET_PTR(addr));
+    }
+}
+
+/* As we are going to hijack the bottom bits of the page address for a
+ * mmuidx bit mask we need to fail to build if we can't do that
+ */
+QEMU_BUILD_BUG_ON(NB_MMU_MODES > TARGET_PAGE_BITS_MIN);
+
+static void tlb_flush_page_by_mmuidx_async_work(CPUState *cpu,
+                                                run_on_cpu_data data)
 {
    CPUArchState *env = cpu->env_ptr;
-    int i, k;
-    va_list argp;
+    target_ulong addr_and_mmuidx = (target_ulong) data.target_ptr;
+    target_ulong addr = addr_and_mmuidx & TARGET_PAGE_MASK;
+    unsigned long mmu_idx_bitmap = addr_and_mmuidx & ALL_MMUIDX_BITS;
+    int page = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
+    int mmu_idx;
+    int i;

-    va_start(argp, addr);
+    assert_cpu_is_self(cpu);

-    tlb_debug("addr "TARGET_FMT_lx"\n", addr);
+    tlb_debug("page:%d addr:"TARGET_FMT_lx" mmu_idx:0x%lx\n",
+              page, addr, mmu_idx_bitmap);
+
+    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
+        if (test_bit(mmu_idx, &mmu_idx_bitmap)) {
+            tlb_flush_entry(&env->tlb_table[mmu_idx][page], addr);
+
+            /* check whether there are vltb entries that need to be flushed */
+            for (i = 0; i < CPU_VTLB_SIZE; i++) {
+                tlb_flush_entry(&env->tlb_v_table[mmu_idx][i], addr);
+            }
+        }
+    }
+
+    tb_flush_jmp_cache(cpu, addr);
+}
+
+static void tlb_check_page_and_flush_by_mmuidx_async_work(CPUState *cpu,
+                                                          run_on_cpu_data data)
+{
+    CPUArchState *env = cpu->env_ptr;
+    target_ulong addr_and_mmuidx = (target_ulong) data.target_ptr;
+    target_ulong addr = addr_and_mmuidx & TARGET_PAGE_MASK;
+    unsigned long mmu_idx_bitmap = addr_and_mmuidx & ALL_MMUIDX_BITS;
+
+    tlb_debug("addr:"TARGET_FMT_lx" mmu_idx: %04lx\n", addr, mmu_idx_bitmap);

    /* Check if we need to flush due to large pages.  */
    if ((addr & env->tlb_flush_mask) == env->tlb_flush_addr) {
@@ -172,33 +347,80 @@ void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, ...)
                  TARGET_FMT_lx "/" TARGET_FMT_lx ")\n",
                  env->tlb_flush_addr, env->tlb_flush_mask);

-        v_tlb_flush_by_mmuidx(cpu, argp);
-        va_end(argp);
-        return;
+        tlb_flush_by_mmuidx_async_work(cpu,
+                                       RUN_ON_CPU_HOST_INT(mmu_idx_bitmap));
+    } else {
+        tlb_flush_page_by_mmuidx_async_work(cpu, data);
    }
+}

-    addr &= TARGET_PAGE_MASK;
-    i = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
+void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, uint16_t idxmap)
+{
+    target_ulong addr_and_mmu_idx;

-    for (;;) {
-        int mmu_idx = va_arg(argp, int);
+    tlb_debug("addr: "TARGET_FMT_lx" mmu_idx:%" PRIx16 "\n", addr, idxmap);

-        if (mmu_idx < 0) {
-            break;
+    /* This should already be page aligned */
+    addr_and_mmu_idx = addr & TARGET_PAGE_MASK;
+    addr_and_mmu_idx |= idxmap;
+
+    if (!qemu_cpu_is_self(cpu)) {
+        async_run_on_cpu(cpu, tlb_check_page_and_flush_by_mmuidx_async_work,
+                         RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
+    } else {
+        tlb_check_page_and_flush_by_mmuidx_async_work(
+            cpu, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
    }
+}

-        tlb_debug("idx %d\n", mmu_idx);
+void tlb_flush_page_by_mmuidx_all_cpus(CPUState *src_cpu, target_ulong addr,
+                                       uint16_t idxmap)
+{
+    const run_on_cpu_func fn = tlb_check_page_and_flush_by_mmuidx_async_work;
+    target_ulong addr_and_mmu_idx;

-        tlb_flush_entry(&env->tlb_table[mmu_idx][i], addr);
+    tlb_debug("addr: "TARGET_FMT_lx" mmu_idx:%"PRIx16"\n", addr, idxmap);

-        /* check whether there are vltb entries that need to be flushed */
-        for (k = 0; k < CPU_VTLB_SIZE; k++) {
-            tlb_flush_entry(&env->tlb_v_table[mmu_idx][k], addr);
-        }
-    }
-    va_end(argp);
+    /* This should already be page aligned */
+    addr_and_mmu_idx = addr & TARGET_PAGE_MASK;
+    addr_and_mmu_idx |= idxmap;

-    tb_flush_jmp_cache(cpu, addr);
+    flush_all_helper(src_cpu, fn, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
+    fn(src_cpu, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
+}
+
+void tlb_flush_page_by_mmuidx_all_cpus_synced(CPUState *src_cpu,
+                                                            target_ulong addr,
+                                                            uint16_t idxmap)
+{
+    const run_on_cpu_func fn = tlb_check_page_and_flush_by_mmuidx_async_work;
+    target_ulong addr_and_mmu_idx;
+
+    tlb_debug("addr: "TARGET_FMT_lx" mmu_idx:%"PRIx16"\n", addr, idxmap);
+
+    /* This should already be page aligned */
+    addr_and_mmu_idx = addr & TARGET_PAGE_MASK;
+    addr_and_mmu_idx |= idxmap;
+
+    flush_all_helper(src_cpu, fn, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
+    async_safe_run_on_cpu(src_cpu, fn, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
+}
+
+void tlb_flush_page_all_cpus(CPUState *src, target_ulong addr)
+{
+    const run_on_cpu_func fn = tlb_flush_page_async_work;
+
+    flush_all_helper(src, fn, RUN_ON_CPU_TARGET_PTR(addr));
+    fn(src, RUN_ON_CPU_TARGET_PTR(addr));
+}
+
+void tlb_flush_page_all_cpus_synced(CPUState *src,
+                                                  target_ulong addr)
+{
+    const run_on_cpu_func fn = tlb_flush_page_async_work;
+
+    flush_all_helper(src, fn, RUN_ON_CPU_TARGET_PTR(addr));
+    async_safe_run_on_cpu(src, fn, RUN_ON_CPU_TARGET_PTR(addr));
 }

 /* update the TLBs so that writes to code in the virtual page 'addr'
@@ -216,36 +438,84 @@ void tlb_unprotect_code(ram_addr_t ram_addr)
    cpu_physical_memory_set_dirty_flag(ram_addr, DIRTY_MEMORY_CODE);
 }

-static bool tlb_is_dirty_ram(CPUTLBEntry *tlbe)
-{
-    return (tlbe->addr_write & (TLB_INVALID_MASK|TLB_MMIO|TLB_NOTDIRTY)) == 0;
-}

-void tlb_reset_dirty_range(CPUTLBEntry *tlb_entry, uintptr_t start,
+/*
+ * Dirty write flag handling
+ *
+ * When the TCG code writes to a location it looks up the address in
+ * the TLB and uses that data to compute the final address. If any of
+ * the lower bits of the address are set then the slow path is forced.
+ * There are a number of reasons to do this but for normal RAM the
+ * most usual is detecting writes to code regions which may invalidate
+ * generated code.
+ *
+ * Because we want other vCPUs to respond to changes straight away we
+ * update the te->addr_write field atomically. If the TLB entry has
+ * been changed by the vCPU in the mean time we skip the update.
+ *
+ * As this function uses atomic accesses we also need to ensure
+ * updates to tlb_entries follow the same access rules. We don't need
+ * to worry about this for oversized guests as MTTCG is disabled for
+ * them.
+ */
+
+static void tlb_reset_dirty_range(CPUTLBEntry *tlb_entry, uintptr_t start,
                           uintptr_t length)
 {
-    uintptr_t addr;
+#if TCG_OVERSIZED_GUEST
+    uintptr_t addr = tlb_entry->addr_write;

-    if (tlb_is_dirty_ram(tlb_entry)) {
-        addr = (tlb_entry->addr_write & TARGET_PAGE_MASK) + tlb_entry->addend;
+    if ((addr & (TLB_INVALID_MASK | TLB_MMIO | TLB_NOTDIRTY)) == 0) {
+        addr &= TARGET_PAGE_MASK;
+        addr += tlb_entry->addend;
        if ((addr - start) < length) {
            tlb_entry->addr_write |= TLB_NOTDIRTY;
        }
    }
-}
+#else
+    /* paired with atomic_mb_set in tlb_set_page_with_attrs */
+    uintptr_t orig_addr = atomic_mb_read(&tlb_entry->addr_write);
+    uintptr_t addr = orig_addr;

-static inline ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr)
-{
-    ram_addr_t ram_addr;
-
-    ram_addr = qemu_ram_addr_from_host(ptr);
-    if (ram_addr == RAM_ADDR_INVALID) {
-        fprintf(stderr, "Bad ram pointer %p\n", ptr);
-        abort();
+    if ((addr & (TLB_INVALID_MASK | TLB_MMIO | TLB_NOTDIRTY)) == 0) {
+        addr &= TARGET_PAGE_MASK;
+        addr += atomic_read(&tlb_entry->addend);
+        if ((addr - start) < length) {
+            uintptr_t notdirty_addr = orig_addr | TLB_NOTDIRTY;
+            atomic_cmpxchg(&tlb_entry->addr_write, orig_addr, notdirty_addr);
        }
-    return ram_addr;
+    }
+#endif
 }

+/* For atomic correctness when running MTTCG we need to use the right
+ * primitives when copying entries */
+static inline void copy_tlb_helper(CPUTLBEntry *d, CPUTLBEntry *s,
+                                   bool atomic_set)
+{
+#if TCG_OVERSIZED_GUEST
+    *d = *s;
+#else
+    if (atomic_set) {
+        d->addr_read = s->addr_read;
+        d->addr_code = s->addr_code;
+        atomic_set(&d->addend, atomic_read(&s->addend));
+        /* Pairs with flag setting in tlb_reset_dirty_range */
+        atomic_mb_set(&d->addr_write, atomic_read(&s->addr_write));
+    } else {
+        d->addr_read = s->addr_read;
+        d->addr_write = atomic_read(&s->addr_write);
+        d->addr_code = s->addr_code;
+        d->addend = atomic_read(&s->addend);
+    }
+#endif
+}
+
+/* This is a cross vCPU call (i.e. another vCPU resetting the flags of
+ * the target vCPU). As such care needs to be taken that we don't
+ * dangerously race with another vCPU update. The only thing actually
+ * updated is the target TLB entry ->addr_write flags.
+ */
 void tlb_reset_dirty(CPUState *cpu, ram_addr_t start1, ram_addr_t length)
 {
    CPUArchState *env;
@@ -283,6 +553,8 @@ void tlb_set_dirty(CPUState *cpu, target_ulong vaddr)
    int i;
    int mmu_idx;

+    assert_cpu_is_self(cpu);
+
    vaddr &= TARGET_PAGE_MASK;
    i = (vaddr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
@@ -337,11 +609,12 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
    target_ulong address;
    target_ulong code_address;
    uintptr_t addend;
-    CPUTLBEntry *te;
+    CPUTLBEntry *te, *tv, tn;
    hwaddr iotlb, xlat, sz;
    unsigned vidx = env->vtlb_index++ % CPU_VTLB_SIZE;
    int asidx = cpu_asidx_from_attrs(cpu, attrs);

+    assert_cpu_is_self(cpu);
    assert(size >= TARGET_PAGE_SIZE);
    if (size != TARGET_PAGE_SIZE) {
        tlb_add_large_page(env, vaddr, size);
@@ -371,41 +644,50 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,

    index = (vaddr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
    te = &env->tlb_table[mmu_idx][index];
-
    /* do not discard the translation in te, evict it into a victim tlb */
-    env->tlb_v_table[mmu_idx][vidx] = *te;
+    tv = &env->tlb_v_table[mmu_idx][vidx];
+
+    /* addr_write can race with tlb_reset_dirty_range */
+    copy_tlb_helper(tv, te, true);
+
    env->iotlb_v[mmu_idx][vidx] = env->iotlb[mmu_idx][index];

    /* refill the tlb */
    env->iotlb[mmu_idx][index].addr = iotlb - vaddr;
    env->iotlb[mmu_idx][index].attrs = attrs;
-    te->addend = addend - vaddr;
+
+    /* Now calculate the new entry */
+    tn.addend = addend - vaddr;
    if (prot & PAGE_READ) {
-        te->addr_read = address;
+        tn.addr_read = address;
    } else {
-        te->addr_read = -1;
+        tn.addr_read = -1;
    }

    if (prot & PAGE_EXEC) {
-        te->addr_code = code_address;
+        tn.addr_code = code_address;
    } else {
-        te->addr_code = -1;
+        tn.addr_code = -1;
    }
+
+    tn.addr_write = -1;
    if (prot & PAGE_WRITE) {
        if ((memory_region_is_ram(section->mr) && section->readonly)
            || memory_region_is_romd(section->mr)) {
            /* Write access calls the I/O callback.  */
-            te->addr_write = address | TLB_MMIO;
+            tn.addr_write = address | TLB_MMIO;
        } else if (memory_region_is_ram(section->mr)
                   && cpu_physical_memory_is_clean(
                        memory_region_get_ram_addr(section->mr) + xlat)) {
-            te->addr_write = address | TLB_NOTDIRTY;
+            tn.addr_write = address | TLB_NOTDIRTY;
        } else {
-            te->addr_write = address;
+            tn.addr_write = address;
        }
-    } else {
-        te->addr_write = -1;
    }
+
+    /* Pairs with flag setting in tlb_reset_dirty_range */
+    copy_tlb_helper(te, &tn, true);
+    /* atomic_mb_set(&te->addr_write, write_address); */
 }

 /* Add a new TLB entry, but without specifying the memory
@@ -452,6 +734,18 @@ static void report_bad_exec(CPUState *cpu, target_ulong addr)
    log_cpu_state_mask(LOG_GUEST_ERROR, cpu, CPU_DUMP_FPU | CPU_DUMP_CCOP);
 }

+static inline ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr)
+{
+    ram_addr_t ram_addr;
+
+    ram_addr = qemu_ram_addr_from_host(ptr);
+    if (ram_addr == RAM_ADDR_INVALID) {
+        error_report("Bad ram pointer %p", ptr);
+        abort();
+    }
+    return ram_addr;
+}
+
 /* NOTE: this function can trigger an exception */
 /* NOTE2: the returned address is not exactly the physical address: it
 * is actually a ram_addr_t (in system mode; the user mode emulation
@@ -475,15 +769,14 @@ tb_page_addr_t get_page_addr_code(CPUArchState *env1, target_ulong addr)
    pd = iotlbentry->addr & ~TARGET_PAGE_MASK;
    mr = iotlb_to_region(cpu, pd, iotlbentry->attrs);
    if (memory_region_is_unassigned(mr)) {
-        CPUClass *cc = CPU_GET_CLASS(cpu);
-
-        if (cc->do_unassigned_access) {
-            cc->do_unassigned_access(cpu, addr, false, true, 0, 4);
-        } else {
+        cpu_unassigned_access(cpu, addr, false, true, 0, 4);
+        /* The CPU's unassigned access hook might have longjumped out
+         * with an exception. If it didn't (or there was no hook) then
+         * we can't proceed further.
+         */
        report_bad_exec(cpu, addr);
        exit(1);
    }
-    }
    p = (void *)((uintptr_t)addr + env1->tlb_table[mmu_idx][page_index].addend);
    return qemu_ram_addr_from_host_nofail(p);
 }
@@ -495,6 +788,7 @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
    hwaddr physaddr = iotlbentry->addr;
    MemoryRegion *mr = iotlb_to_region(cpu, physaddr, iotlbentry->attrs);
    uint64_t val;
+    bool locked = false;

    physaddr = (physaddr & TARGET_PAGE_MASK) + addr;
    cpu->mem_io_pc = retaddr;
@@ -503,7 +797,16 @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
    }

    cpu->mem_io_vaddr = addr;
+
+    if (mr->global_locking) {
+        qemu_mutex_lock_iothread();
+        locked = true;
+    }
    memory_region_dispatch_read(mr, physaddr, &val, size, iotlbentry->attrs);
+    if (locked) {
+        qemu_mutex_unlock_iothread();
+    }
+
    return val;
 }

@@ -514,15 +817,23 @@ static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
    CPUState *cpu = ENV_GET_CPU(env);
    hwaddr physaddr = iotlbentry->addr;
    MemoryRegion *mr = iotlb_to_region(cpu, physaddr, iotlbentry->attrs);
+    bool locked = false;

    physaddr = (physaddr & TARGET_PAGE_MASK) + addr;
    if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu->can_do_io) {
        cpu_io_recompile(cpu, retaddr);
    }
-
    cpu->mem_io_vaddr = addr;
    cpu->mem_io_pc = retaddr;
+
+    if (mr->global_locking) {
+        qemu_mutex_lock_iothread();
+        locked = true;
+    }
    memory_region_dispatch_write(mr, physaddr, val, size, iotlbentry->attrs);
+    if (locked) {
+        qemu_mutex_unlock_iothread();
+    }
 }

 /* Return true if ADDR is present in the victim tlb, and has been copied
@@ -538,10 +849,13 @@ static bool victim_tlb_hit(CPUArchState *env, size_t mmu_idx, size_t index,
        if (cmp == page) {
            /* Found entry in victim tlb, swap tlb and iotlb.  */
            CPUTLBEntry tmptlb, *tlb = &env->tlb_table[mmu_idx][index];
+
+            copy_tlb_helper(&tmptlb, tlb, false);
+            copy_tlb_helper(tlb, vtlb, true);
+            copy_tlb_helper(vtlb, &tmptlb, true);
+
            CPUIOTLBEntry tmpio, *io = &env->iotlb[mmu_idx][index];
            CPUIOTLBEntry *vio = &env->iotlb_v[mmu_idx][vidx];
-
-            tmptlb = *tlb; *tlb = *vtlb; *vtlb = tmptlb;
            tmpio = *io; *io = *vio; *vio = tmpio;
            return true;
        }
--- a/crypto/block-luks.c
+++ b/crypto/block-luks.c
@@ -473,10 +473,10 @@ qcrypto_block_luks_load_key(QCryptoBlock *block,
     * then encrypted.
     */
    rv = readfunc(block,
+                  opaque,
                  slot->key_offset * QCRYPTO_BLOCK_LUKS_SECTOR_SIZE,
                  splitkey, splitkeylen,
-                  errp,
-                  opaque);
+                  errp);
    if (rv < 0) {
        goto cleanup;
    }
@@ -676,11 +676,10 @@ qcrypto_block_luks_open(QCryptoBlock *block,

    /* Read the entire LUKS header, minus the key material from
     * the underlying device */
-    rv = readfunc(block, 0,
+    rv = readfunc(block, opaque, 0,
                  (uint8_t *)&luks->header,
                  sizeof(luks->header),
-                  errp,
-                  opaque);
+                  errp);
    if (rv < 0) {
        ret = rv;
        goto fail;
@@ -1246,7 +1245,7 @@ qcrypto_block_luks_create(QCryptoBlock *block,
        QCRYPTO_BLOCK_LUKS_SECTOR_SIZE;

    /* Reserve header space to match payload offset */
-    initfunc(block, block->payload_offset, &local_err, opaque);
+    initfunc(block, opaque, block->payload_offset, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        goto error;
@@ -1268,11 +1267,10 @@ qcrypto_block_luks_create(QCryptoBlock *block,


    /* Write out the partition header and key slot headers */
-    writefunc(block, 0,
+    writefunc(block, opaque, 0,
              (const uint8_t *)&luks->header,
              sizeof(luks->header),
-              &local_err,
-              opaque);
+              &local_err);

    /* Delay checking local_err until we've byte-swapped */

@@ -1297,12 +1295,11 @@ qcrypto_block_luks_create(QCryptoBlock *block,

    /* Write out the master key material, starting at the
     * sector immediately following the partition header. */
-    if (writefunc(block,
+    if (writefunc(block, opaque,
                  luks->header.key_slots[0].key_offset *
                  QCRYPTO_BLOCK_LUKS_SECTOR_SIZE,
                  splitkey, splitkeylen,
-                  errp,
-                  opaque) != splitkeylen) {
+                  errp) != splitkeylen) {
        goto error;
    }

--- a/crypto/cipher.c
+++ b/crypto/cipher.c
@@ -63,18 +63,14 @@ static bool mode_need_iv[QCRYPTO_CIPHER_MODE__MAX] = {

 size_t qcrypto_cipher_get_block_len(QCryptoCipherAlgorithm alg)
 {
-    if (alg >= G_N_ELEMENTS(alg_key_len)) {
-        return 0;
-    }
+    assert(alg < G_N_ELEMENTS(alg_key_len));
    return alg_block_len[alg];
 }


 size_t qcrypto_cipher_get_key_len(QCryptoCipherAlgorithm alg)
 {
-    if (alg >= G_N_ELEMENTS(alg_key_len)) {
-        return 0;
-    }
+    assert(alg < G_N_ELEMENTS(alg_key_len));
    return alg_key_len[alg];
 }

--- a/crypto/ivgen-essiv.c
+++ b/crypto/ivgen-essiv.c
@@ -48,6 +48,7 @@ static int qcrypto_ivgen_essiv_init(QCryptoIVGen *ivgen,
                           &salt, &nhash,
                           errp) < 0) {
        g_free(essiv);
+        g_free(salt);
        return -1;
    }

--- a/default-configs/arm-softmmu.mak
+++ b/default-configs/arm-softmmu.mak
@@ -29,6 +29,7 @@ CONFIG_LAN9118=y
 CONFIG_SMC91C111=y
 CONFIG_ALLWINNER_EMAC=y
 CONFIG_IMX_FEC=y
+CONFIG_FTGMAC100=y
 CONFIG_DS1338=y
 CONFIG_PFLASH_CFI01=y
 CONFIG_PFLASH_CFI02=y
@@ -42,6 +43,8 @@ CONFIG_ARM11MPCORE=y
 CONFIG_A9MPCORE=y
 CONFIG_A15MPCORE=y

+CONFIG_ARM_V7M=y
+
 CONFIG_ARM_GIC=y
 CONFIG_ARM_GIC_KVM=$(CONFIG_KVM)
 CONFIG_ARM_TIMER=y
@@ -95,6 +98,8 @@ CONFIG_VERSATILE_PCI=y
 CONFIG_VERSATILE_I2C=y

 CONFIG_PCI_GENERIC=y
+CONFIG_VFIO_XGMAC=y
+CONFIG_VFIO_AMD_XGBE=y

 CONFIG_SDHCI=y
 CONFIG_INTEGRATOR_DEBUG=y
--- a/default-configs/i386-softmmu.mak
+++ b/default-configs/i386-softmmu.mak
@@ -39,7 +39,6 @@ CONFIG_TPM_TIS=$(CONFIG_TPM)
 CONFIG_MC146818RTC=y
 CONFIG_PCI_PIIX=y
 CONFIG_WDT_IB700=y
-CONFIG_XEN_I386=$(CONFIG_XEN)
 CONFIG_ISA_DEBUG=y
 CONFIG_ISA_TESTDEV=y
 CONFIG_VMPORT=y
@@ -59,3 +58,4 @@ CONFIG_I82801B11=y
 CONFIG_SMBIOS=y
 CONFIG_HYPERV_TESTDEV=$(CONFIG_KVM)
 CONFIG_PXB=y
+CONFIG_ACPI_VMGENID=y
--- a/default-configs/mips64el-softmmu.mak
+++ b/default-configs/mips64el-softmmu.mak
@@ -10,3 +10,6 @@ CONFIG_JAZZ=y
 CONFIG_G364FB=y
 CONFIG_JAZZ_LED=y
 CONFIG_VT82C686=y
+CONFIG_MIPS_BOSTON=y
+CONFIG_FITLOADER=y
+CONFIG_PCI_XILINX=y
--- a/default-configs/or1k-linux-user.mak
+++ b/default-configs/or1k-linux-user.mak
@@ -0,0 +1 @@
+# Default configuration for or1k-linux-user
--- a/default-configs/or1k-softmmu.mak
+++ b/default-configs/or1k-softmmu.mak
@@ -0,0 +1,4 @@
+# Default configuration for or1k-softmmu
+
+CONFIG_SERIAL=y
+CONFIG_OPENCORES_ETH=y
--- a/default-configs/or32-linux-user.mak
+++ b/default-configs/or32-linux-user.mak
@@ -1 +0,0 @@
-# Default configuration for or32-linux-user
--- a/default-configs/or32-softmmu.mak
+++ b/default-configs/or32-softmmu.mak
@@ -1,4 +0,0 @@
-# Default configuration for or32-softmmu
-
-CONFIG_SERIAL=y
-CONFIG_OPENCORES_ETH=y
--- a/default-configs/ppc-softmmu.mak
+++ b/default-configs/ppc-softmmu.mak
@@ -45,6 +45,7 @@ CONFIG_OPENPIC_KVM=$(and $(CONFIG_E500),$(CONFIG_KVM))
 CONFIG_PLATFORM_BUS=y
 CONFIG_ETSEC=y
 CONFIG_LIBDECNUMBER=y
+CONFIG_SM501=y
 # For PReP
 CONFIG_SERIAL_ISA=y
 CONFIG_MC146818RTC=y
--- a/Show More
+++ b/Show More
@@ -1 +1 @@
 .8.50
 .9.50
				`@@ -0,0 +1 @@`
				`# Default configuration for or1k-linux-user`
				`@@ -1 +0,0 @@`
				`# Default configuration for or32-linux-user`