Cleber and I are volunteering to review and queue patches for the
Python scripts and modules in scripts/.
I'm setting "M: Odd fixes" because not all scripts are actively
maintained.
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Message-Id: <20170915230744.22942-1-ehabkost@redhat.com>
[ehabkost: add tests/*.py too]
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Not all scripts using qemu.py configure the Python logging
module, and end up generating a "No handlers could be found for
logger" message instead of actual log messages.
To avoid requiring every script using qemu.py to configure
logging manually, call basicConfig() when creating a QEMUMachine
object. This won't affect scripts that already set up logging,
but will ensure that scripts that don't configure logging keep
working.
Reported-by: Kevin Wolf <kwolf@redhat.com>
Fixes: 4738b0a85a
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Message-Id: <20170921162234.847-1-ehabkost@redhat.com>
Reviewed-by: Cleber Rosa <crosa@redhat.com>
Acked-by: Lukáš Doktor <ldoktor@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
MIPS patches 2017-09-21
Changes:
QOMify MIPS cpu
Improve macro parenthesization
# gpg: Signature made Thu 21 Sep 2017 13:50:37 BST
# gpg: using RSA key 0x2238EB86D5F797C2
# gpg: Good signature from "Yongbok Kim <yongbok.kim@imgtec.com>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg: It is not certain that the signature belongs to the owner.
# Primary key fingerprint: 8600 4CF5 3415 A5D9 4CFA 2B5C 2238 EB86 D5F7 97C2
* remotes/yongbok/tags/mips-20170921:
mips: Improve macro parenthesization
mips: replace cpu_mips_init() with cpu_generic_init()
mips: MIPSCPU model subclasses
mips: call cpu_mips_realize_env() from mips_cpu_realizefn()
mips: split cpu_mips_realize_env() out of cpu_mips_init()
mips: introduce internal.h and cleanup cpu.h
mips: move hw/mips/cputimer.c to target/mips/
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Although none of the existing macro call-sites were broken,
it's always better to write macros that properly parenthesize
arguments that can be complex expressions, so that the intended
order of operations is not broken.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Yongbok Kim <yongbok.kim@imgtec.com>
Signed-off-by: Yongbok Kim <yongbok.kim@imgtec.com>
now cpu_mips_init() reimplements subset of cpu_generic_init()
tasks, so just drop it and use cpu_generic_init() directly.
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Hervé Poussineau <hpoussin@reactos.org>
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
[PMD: use internal.h instead of cpu.h]
Tested-by: James Hogan <james.hogan@imgtec.com>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Yongbok Kim <yongbok.kim@imgtec.com>
Register separate QOM types for each mips cpu model,
so it would be possible to reuse generic CPU creation
routines.
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
[PMD: use internal.h, use void* to hold cpu_def in MIPSCPUClass,
mark MIPSCPU abstract, address Eduardo Habkost review]
Tested-by: James Hogan <james.hogan@imgtec.com>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Yongbok Kim <yongbok.kim@imgtec.com>
This changes the order between cpu_mips_realize_env() and
cpu_exec_initfn(), but cpu_exec_initfn() don't have anything that
depends on cpu_mips_realize_env() being called first.
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Tested-by: Igor Mammedov <imammedo@redhat.com>
Tested-by: James Hogan <james.hogan@imgtec.com>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Yongbok Kim <yongbok.kim@imgtec.com>
This timer is a required part of the MIPS32/MIPS64 System Control coprocessor
(CP0). Moving it with the other architecture related files will allow an opaque
use of CPUMIPSState* in the next commit (introduce "internal.h").
also remove it from 'user' targets, remove an unnecessary include.
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Tested-by: Igor Mammedov <imammedo@redhat.com>
Tested-by: James Hogan <james.hogan@imgtec.com>
Acked-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Yongbok Kim <yongbok.kim@imgtec.com>
When a MSI interrupt is bound to a guest using
xc_domain_update_msi_irq (XEN_DOMCTL_bind_pt_irq) the interrupt is
left masked by default.
This causes problems with guests that first configure interrupts and
clean the per-entry MSIX table mask bit and afterwards enable MSIX
globally. In such scenario the Xen internal msixtbl handlers would not
detect the unmasking of MSIX entries because vectors are not yet
registered since MSIX is not enabled, and vectors would be left
masked.
Introduce a new flag in the gflags field to signal Xen whether a MSI
interrupt should be unmasked after being bound.
This also requires to track the mask register for MSI interrupts, so
QEMU can also notify to Xen whether the MSI interrupt should be bound
masked or unmasked
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Reported-by: Andreas Kinzler <hfp@posteo.de>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
Signed-off-by: Stefano Stabellini <sstabellini@kernel.org>
g_malloc0_n is available since glib-2.24. To allow build with older glib
versions use the generic g_new0, which is already used in many other
places in the code.
Fixes commit 3284fad728 ("xen-disk: add support for multi-page shared rings")
Signed-off-by: Olaf Hering <olaf@aepfle.de>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
Signed-off-by: Stefano Stabellini <sstabellini@kernel.org>
These patches fix regressions in 2.10
# gpg: Signature made Wed 20 Sep 2017 07:51:07 BST
# gpg: using DSA key 0x02FC3AEB0101DBC2
# gpg: Good signature from "Greg Kurz <groug@kaod.org>"
# gpg: aka "Greg Kurz <groug@free.fr>"
# gpg: aka "Greg Kurz <gkurz@linux.vnet.ibm.com>"
# gpg: aka "Gregory Kurz (Groug) <groug@free.fr>"
# gpg: aka "[jpeg image of size 3330]"
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: 2BD4 3B44 535E C0A7 9894 DBA2 02FC 3AEB 0101 DBC2
* remotes/gkurz/tags/for-upstream:
9pfs: check the size of transport buffer before marshaling
9pfs: fix name_to_path assertion in v9fs_complete_rename()
9pfs: fix readdir() for 9p2000.u
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Machine/CPU/NUMA queue, 2017-09-19
# gpg: Signature made Tue 19 Sep 2017 21:17:01 BST
# gpg: using RSA key 0x2807936F984DC5A6
# gpg: Good signature from "Eduardo Habkost <ehabkost@redhat.com>"
# Primary key fingerprint: 5A32 2FD5 ABC4 D3DB ACCF D1AA 2807 936F 984D C5A6
* remotes/ehabkost/tags/machine-next-pull-request:
MAINTAINERS: Update git URLs for my trees
hw/acpi-build: Fix SRAT memory building in case of node 0 without RAM
NUMA: Replace MAX_NODES with nb_numa_nodes in for loop
numa: cpu: calculate/set default node-ids after all -numa CLI options are parsed
arm: drop intermediate cpu_model -> cpu type parsing and use cpu type directly
pc: use generic cpu_model parsing
vl.c: convert cpu_model to cpu type and set of global properties before machine_init()
cpu: make cpu_generic_init() abort QEMU on error
qom: cpus: split cpu_generic_init() on feature parsing and cpu creation parts
hostmem-file: Add "discard-data" option
osdep: Define QEMU_MADV_REMOVE
vl: Clean up user-creatable objects when exiting
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
v9fs_do_readdir_with_stat() should check for a maximum buffer size
before an attempt to marshal gathered data. Otherwise, buffers assumed
as misconfigured and the transport would be broken.
The patch brings v9fs_do_readdir_with_stat() in conformity with
v9fs_do_readdir() behavior.
Signed-off-by: Jan Dakinevich <jan.dakinevich@gmail.com>
[groug, regression caused my commit 8d37de41ca # 2.10]
Signed-off-by: Greg Kurz <groug@kaod.org>
The third parameter of v9fs_co_name_to_path() must not contain `/'
character.
The issue is most likely related to 9p2000.u protocol only.
Signed-off-by: Jan Dakinevich <jan.dakinevich@gmail.com>
[groug, regression caused by commit f57f587857 # 2.10]
Signed-off-by: Greg Kurz <groug@kaod.org>
If the client is using 9p2000.u, the following occurs:
$ cd ${virtfs_shared_dir}
$ mkdir -p a/b/c
$ ls a/b
ls: cannot access 'a/b/a': No such file or directory
ls: cannot access 'a/b/b': No such file or directory
a b c
instead of the expected:
$ ls a/b
c
This is a regression introduced by commit f57f5878578a;
local_name_to_path() now resolves ".." and "." in paths,
and v9fs_do_readdir_with_stat()->stat_to_v9stat() then
copies the basename of the resulting path to the response.
With the example above, this means that "." and ".." are
turned into "b" and "a" respectively...
stat_to_v9stat() currently assumes it is passed a full
canonicalized path and uses it to do two different things:
1) to pass it to v9fs_co_readlink() in case the file is a symbolic
link
2) to set the name field of the V9fsStat structure to the basename
part of the given path
It only has two users: v9fs_stat() and v9fs_do_readdir_with_stat().
v9fs_stat() really needs 1) and 2) to be performed since it starts
with the full canonicalized path stored in the fid. It is different
for v9fs_do_readdir_with_stat() though because the name we want to
put into the V9fsStat structure is the d_name field of the dirent
actually (ie, we want to keep the "." and ".." special names). So,
we only need 1) in this case.
This patch hence adds a basename argument to stat_to_v9stat(), to
be used to set the name field of the V9fsStat structure, and moves
the basename logic to v9fs_stat().
Signed-off-by: Jan Dakinevich <jan.dakinevich@gmail.com>
(groug, renamed old name argument to path and updated changelog)
Signed-off-by: Greg Kurz <groug@kaod.org>
List the branches where I queue patches for Machine Core, NUMA,
Memory Backends, and X86. Update the NUMA section to list the
"machine-next" branch instead of "numa".
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Message-Id: <20170901153928.17058-1-ehabkost@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Currently, Using the fisrt node without memory on the machine makes
QEMU unhappy. With this example command line:
... \
-m 1024M,slots=4,maxmem=32G \
-numa node,nodeid=0 \
-numa node,mem=1024M,nodeid=1 \
-numa node,nodeid=2 \
-numa node,nodeid=3 \
Guest reports "No NUMA configuration found" and the NUMA topology is
wrong.
This is because when QEMU builds ACPI SRAT, it regards node 0 as the
default node to deal with the memory hole(640K-1M). this means the
node0 must have some memory(>1M), but, actually it can have no
memory.
Fix this problem by cut out the 640K hole in the same way the PCI
4G hole does.
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Dou Liyang <douly.fnst@cn.fujitsu.com>
Message-Id: <1504231805-30957-2-git-send-email-douly.fnst@cn.fujitsu.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
In QEMU, the number of the NUMA nodes is determined by parse_numa_opts().
Then, QEMU uses it for iteration, for example:
for (i = 0; i < nb_numa_nodes; i++)
However, in memory_region_allocate_system_memory(), it uses MAX_NODES
not nb_numa_nodes.
So, replace MAX_NODES with nb_numa_nodes to keep code consistency and
reduce the loop times.
Signed-off-by: Dou Liyang <douly.fnst@cn.fujitsu.com>
Message-Id: <1503387936-3483-1-git-send-email-douly.fnst@cn.fujitsu.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Calculating default node-ids for CPUs in possible_cpu_arch_ids()
is rather fragile since defaults calculation uses nb_numa_nodes but
callback might be potentially called early before all -numa CLI
options are parsed, which would lead to cpus assigned only upto
nb_numa_nodes at the time possible_cpu_arch_ids() is called.
Issue was introduced by
(7c88e65 numa: mirror cpu to node mapping in MachineState::possible_cpus)
and for example CLI:
-smp 4 -numa node,cpus=0 -numa node
would set props.node-id in possible_cpus array for every non
explicitly mapped CPU to the first node.
Issue is not visible to guest nor to mgmt interface due to
1) implictly mapped cpus are forced to the first node in
case of partial mapping
2) in case of default mapping possible_cpu_arch_ids() is
called after all -numa options are parsed (resulting
in correct mapping).
However it's fragile to rely on late execution of
possible_cpu_arch_ids(), therefore add machine specific
callback that returns node-id for CPU and use it to calculate/
set defaults at machine_numa_finish_init() time when all -numa
options are parsed.
Reported-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Message-Id: <1496314408-163972-1-git-send-email-imammedo@redhat.com>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Assorted s390x patches:
- introduce virtio-gpu-ccw, with virtio-gpu endian fixes
- lots of cleanup in the s390x code
- make device_add work for s390x cpus
- enable seccomp on s390x
- an ivshmem endian fix
- set the reserved DHCP client architecture id for netboot
- fixes in the css and pci support
# gpg: Signature made Tue 19 Sep 2017 17:39:45 BST
# gpg: using RSA key 0xDECF6B93C6F02FAF
# gpg: Good signature from "Cornelia Huck <conny@cornelia-huck.de>"
# gpg: aka "Cornelia Huck <huckc@linux.vnet.ibm.com>"
# gpg: aka "Cornelia Huck <cornelia.huck@de.ibm.com>"
# gpg: aka "Cornelia Huck <cohuck@kernel.org>"
# gpg: aka "Cornelia Huck <cohuck@redhat.com>"
# Primary key fingerprint: C3D0 D66D C362 4FF6 A8C0 18CE DECF 6B93 C6F0 2FAF
* remotes/cohuck/tags/s390x-20170919-v2: (38 commits)
MAINTAINERS/s390x: add terminal3270.c
virtio-ccw: Create a virtio gpu device for the ccw bus
virtio-gpu: Handle endian conversion
s390x/ccw: create s390 phb for compat reasons as well
configure: Allow --enable-seccomp on s390x, too
virtio-ccw: remove stale comments on endianness
s390x: allow CPU hotplug in random core-id order
s390x: generate sclp cpu information from possible_cpus
s390x: get rid of cpu_s390x_create()
s390x: get rid of cpu_states and use possible_cpus instead
s390x: implement query-hotpluggable-cpus
s390x: CPU hot unplug via device_del cannot work for now
s390x: allow cpu hotplug via device_add
s390x: print CPU definitions in sorted order
target/s390x: rename next_cpu_id to next_core_id
target/s390x: use "core-id" for cpu number/address/id handling
target/s390x: set cpu->id for linux user when realizing
s390x: allow only 1 CPU with TCG
target/s390x: use program_interrupt() in per_check_exception()
target/s390x: use trigger_pgm_exception() in s390_cpu_handle_mmu_fault()
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
d32bd032d8 ("s390x/ccw: create s390 phb conditionally") made
registering the s390 pci host bridge conditional on presense
of the zpci facility bit. Sadly, that breaks migration from
machines that did not use the cpu model (2.7 and previous).
Create the s390 phb for pre-cpu model machines as well: We can
tweak s390_has_feat() to always indicate the zpci facility bit
when no cpu model is available (on 2.7 and previous compat machines).
Fixes: d32bd032d8 ("s390x/ccw: create s390 phb conditionally")
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
We have two stale comments suggesting one should think about virtio
config space endianness a bit longer. We have just done that, and came to
the conclusion we are fine as is: it's the responsibility of the virtio
device and not of the transport (and that is how it works now). Putting
the responsibility into the transport isn't even possible, because the
transport would have to know about the config space layout of each
device.
Let us remove the stale comments.
Signed-off-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Suggested-by: Cornelia Huck <cohuck@redhat.com>
Message-Id: <20170914105535.47941-1-pasic@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
SCLP correctly indicates the core-id aka. CPU address for each available
CPU.
As the core-id corresponds to cpu_index, also a newly created kvm vcpu
gets assigned this core-id as vcpu id. So SIGP in the kernel works
correctly (it uses the vcpu id to lookup the correct CPU).
So there should be nothing hindering us from hotplugging CPUs in random
core-id order.
This now makes sure that the output from "query-hotpluggable-cpus"
is completely true. Until now, a specific order is implicit. Performance
vice, hotplugging CPUs in non-sequential order might not be the best thing
to do, as VCPU lookup inside KVM might be a little slower. But that
doesn't hinder us from supporting it.
next_core_id is now used by linux user only.
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20170913132417.24384-23-david@redhat.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
This is the first step to allow hot plugging of CPUs in a non-sequential
order. If a cpu is available ("plugged") can directly be decided by
looking at the cpu state pointer.
This makes sure, that really only cpus attached to the machine are
reported.
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20170913132417.24384-22-david@redhat.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
Now that there is only one user of cpu_s390x_create() left, make cpu
creation look like on x86.
- Perform the model/properties split and checks in s390_init_cpus()
- Parse features only once without having to remember if already parsed
- Pass only the typename to s390x_new_cpu()
- Use the typename of an existing CPU for hotplug via cpu-add
Acked-by: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20170913132417.24384-21-david@redhat.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
device_del on a CPU will currently do nothing. Let's emit an error
telling that this is will currently not work (there is no architecture
support on s390x). Error message copied from ppc.
(qemu) device_del cpu1
device_del cpu1
CPU hot unplug not supported on this machine
Reviewed-by: Matthew Rosato <mjrosato@linux.vnet.ibm.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20170913132417.24384-18-david@redhat.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
E.g. the following now works:
device_add host-s390-cpu,id=cpu1,core-id=1
The system will perform the same checks as when using cpu_add:
- If the core_id is already in use
- If the next sequential core_id isn't used
- If core-id >= max_cpu is specified
In addition, mixed CPU models are checked. E.g. if starting with
-cpu host and trying to hotplug "qemu-s390-cpu":
"Mixed CPU models are not supported on s390x."
Reviewed-by: Matthew Rosato <mjrosato@linux.vnet.ibm.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20170913132417.24384-17-david@redhat.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
Some time ago we discussed that using "id" as property name is not the
right thing to do, as it is a reserved property for other devices and
will not work with device_add.
Switch to the term "core-id" instead, and use it as an equivalent to
"CPU address" mentioned in the PoP. There is no such thing as cpu number,
so rename env.cpu_num to env.core_id. We use "core-id" as this is the
common term to use for device_add later on (x86 and ppc).
We can get rid of cpu->id now. Keep cpu_index and env->core_id in sync.
cpu_index was already implicitly used by e.g. cpu_exists(), so keeping
both in sync seems to be the right thing to do.
cpu_index will now no longer automatically get set via
cpu_exec_realizefn(). For now, we were lucky that both implicitly stayed
in sync.
Our new cpu property "core-id" can be a static property. Range checks can
be avoided by using the correct type and the "setting after realized"
check is done implicitly.
device_add will later need the reserved "id" property. Hotplugging a CPU
on s390x will then be: "device_add host-s390-cpu,id=cpu2,core-id=2".
Reviewed-by: Matthew Rosato <mjrosato@linux.vnet.ibm.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20170913132417.24384-14-david@redhat.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
Specifying more than 1 CPU (e.g. -smp 5) leads to SIGP errors (the
guest tries to bring these CPUs up but fails), because we don't support
multiple CPUs on s390x under TCG.
Let's bail out if more than 1 is specified, so we don't raise people's
hope.
Tested-by: Matthew Rosato <mjrosato@linux.vnet.ibm.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20170913132417.24384-12-david@redhat.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
Back then in the time of df1fe5bb49 ("s390: Virtual channel subsystem
support.", 2013-01-24) -EIO used to map to a channel-program check (via
the default label of the switch statement). Then 2dc95b4cac
("s390x/3270: 3270 data stream handling", 2016-04-01) came along
and that changed dramatically.
Let us roll back this undesired side effect, and go back to
channel-program check.
Signed-off-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Fixes: 2dc95b4cac "s390x/3270: 3270 data stream handling"
Message-Id: <20170908152446.14606-3-pasic@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
The architecture says that channel-data check is indicating that
an uncorrected storage (memory) error has been detected in regard
to the data residing in main storage (memory) that is currently
used for an I/O operation. The described detection is done using
the CBC technology.
The ccw interpretation code is however generating a channel-data check
effectively when the (device specific) ccw_cb returns -EFAULT. In case
of virtio-ccw devices this happens when mapping memory fails, or when a
NULL pointer is encountered. So this behavior is not architecture
conform.
Furthermore the best fit for these situations (null pointer, mapping a
piece of guest memory fails) from architectural perspective the condition
described as the channel subsystem refers to a location that is not
available, which when encountered shall result in a channel-program
check.
To fix this, all we have to do is to get rid of the switch case matching
-EFAULT: the default is generating a channel-program check.
Signed-off-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Message-Id: <20170908152446.14606-2-pasic@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
The "slow" ivshmem-tests currently fail when they are running on a
big endian host:
$ uname -m
ppc64
$ V=1 QTEST_QEMU_BINARY=x86_64-softmmu/qemu-system-x86_64 tests/ivshmem-test -m slow
/x86_64/ivshmem/single: OK
/x86_64/ivshmem/hotplug: OK
/x86_64/ivshmem/memdev: OK
/x86_64/ivshmem/pair: OK
/x86_64/ivshmem/server-msi: qemu-system-x86_64:
-device ivshmem-doorbell,chardev=chr0,vectors=2: server sent invalid ID message
Broken pipe
The problem is that the server side code in ivshmem_server_send_one_msg()
correctly translates all messages IDs into little endian 64-bit values,
but the client side code in the ivshmem_recv_msg() function does not swap
the byte order back. Fix it by passing the value through le64_to_cpu().
Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-Id: <1504100343-26607-1-git-send-email-thuth@redhat.com>
Tested-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
The guest uses the mpcifc instruction to register the aibvo of a zpci
device, which is the starting offset of indicators in the indicator
area and thus remains constant. Each msix vector is an offset from the
aibvo. When we map a msix route to an adapter route, we should not
modify the starting offset, but instead add the vector to the starting
offset to get the absolute offset in the specific route.
Signed-off-by: Yi Min Zhao <zyimin@linux.vnet.ibm.com>
Message-Id: <1504606380-49341-3-git-send-email-zyimin@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
PCIDevice pointer has been a parameter of kvm_arch_fixup_msi_route().
So we don't need to store zpci idx in msix message data to find out the
specific zpci device. Instead, we could use pci device id to find its
corresponding zpci device.
Signed-off-by: Yi Min Zhao <zyimin@linux.vnet.ibm.com>
Message-Id: <1504606380-49341-2-git-send-email-zyimin@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
We can use the drive_del test on s390x, too, to check that adding and
deleting also works fine with the virtio-ccw bus. But we have to make
sure that we use the devices with the "-ccw" suffix instead of the
"-pci" suffix for the virtio-ccw transport on s390x. Introduce a helper
function called qvirtio_get_dev_type() that returns the correct string
for the current architecture.
Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-Id: <1504190408-11143-1-git-send-email-thuth@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
The function ioinst_handle_xsch is presenting cc 2 when it's supposed to
present cc 1 and the other way around, because css_do_xsch has the error
codes mixed up. Because cc 1 has precedence over cc 2 we also have to
swap the two checks.
Let us fix this.
Signed-off-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Reported-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Message-Id: <20170831121828.85885-1-pasic@linux.vnet.ibm.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
The pixman submodule does not exist anymore, and its removal broke
docker-based tests. Fix it.
Cc: Fam Zheng <famz@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Using $(and ...) is dangerous here: It only works as long as the first
argument is set to 'y' or completely unset. It does not work if the
first argument is set to 'n' for example. Let's use the "land" make
function instead which has been written explicitely for this purpose.
Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-Id: <1505759538-15365-1-git-send-email-thuth@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
The script doesn't know about all possible types and learn them as
it parses the code. If it reaches a line with a type cast but the
type isn't known yet, it is misinterpreted as an identifier.
For example the following line:
foo = (hwaddr) -1;
results in the following false-positive to be reported:
ERROR: spaces required around that '-' (ctx:VxV)
Let's add this standard QEMU type to the list of pre-known types.
Signed-off-by: Greg Kurz <groug@kaod.org>
Message-Id: <150538015789.8149.10902725348939486674.stgit@bahia.lan>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Currently before submitting a series, devs should run checkpatch.pl
across each patch to be submitted. This can be automated using a
command such as:
git rebase -i master -x 'git show | ./scripts/checkpatch.pl -'
This is rather long winded to type, so this patch introduces a way
to tell checkpatch.pl to validate a series of GIT revisions.
There are now three modes it can operate in 1) check a patch 2) check a source
file, or 3) check a git branch.
If no flags are given, the mode is determined by checking the args passed to
the command. If the args contain a literal ".." it is treated as a GIT revision
list. If the args end in ".patch" or equal "-" it is treated as a patch file.
Otherwise it is treated as a source file.
This automatic guessing can be overridden using --[no-]patch --[no-]file or
--[no-]branch
For example to check a GIT revision list:
$ ./scripts/checkpatch.pl master..
total: 0 errors, 0 warnings, 297 lines checked
b886d352a2bf58f0996471fb3991a138373a2957 has no obvious style problems and is ready for submission.
total: 0 errors, 0 warnings, 182 lines checked
2a731f9a9ce145e0e0df6d42dd2a3ce4dfc543fa has no obvious style problems and is ready for submission.
total: 0 errors, 0 warnings, 102 lines checked
11844169bcc0c8ed4449eb3744a69877ed329dd7 has no obvious style problems and is ready for submission.
If a genuine patch filename contains the characters '..' it is
possible to force interpretation of the arg as a patch
$ ./scripts/checkpatch.pl --patch master..
will force it to load a patch file called "master..", or equivalently
$ ./scripts/checkpatch.pl --no-branch master..
will simply turn off guessing of GIT revision lists.
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-Id: <20170913091000.9005-1-berrange@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
All definitions related to Hyper-V emulation are now taken from the QEMU
own header, so the one imported from the kernel is no longer needed.
Unfortunately it's included by kvm_para.h.
So, until this is fixed in the kernel, teach the header harvesting
script to substitute kernel's hyperv.h with a dummy.
Signed-off-by: Roman Kagan <rkagan@virtuozzo.com>
Message-Id: <20170713201522.13765-3-rkagan@virtuozzo.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
The definitions for Hyper-V emulation are currently taken from a header
imported from the Linux kernel.
However, as these describe a third-party protocol rather than a kernel
API, it probably wasn't a good idea to publish it in the kernel uapi.
This patch introduces a header that provides all the necessary
definitions, superseding the one coming from the kernel.
The new header supports (temporary) coexistence with the kernel one.
The constants explicitly named in the Hyper-V specification (e.g. msr
numbers) are defined in a non-conflicting way. Other constants and
types have got new names.
While at this, the protocol data structures are defined in a more
conventional way, without bitfields, enums, and excessive unions.
The code using this stuff is adjusted, too; it can now be built both
with and without the kernel header in the tree.
Signed-off-by: Roman Kagan <rkagan@virtuozzo.com>
Message-Id: <20170713201522.13765-2-rkagan@virtuozzo.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Starting with Windows Server 2012 and Windows 8, if
CPUID.40000005.EAX contains a value of -1, Windows assumes specific
limit to the number of VPs. In this case, Windows Server 2012
guest VMs may use more than 64 VPs, up to the maximum supported
number of processors applicable to the specific Windows
version being used.
https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/reference/tlfs
For compatibility, Let's introduce a new property for X86CPU,
named "x-hv-max-vps" as Eduardo's suggestion, and set it
to 0x40 before machine 2.10.
(The "x-" prefix indicates that the property is not supposed to
be a stable user interface.)
Signed-off-by: Gonglei <arei.gonglei@huawei.com>
Message-Id: <1505143227-14324-1-git-send-email-arei.gonglei@huawei.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Convert any remaining uses of fprintf(stderr, "warning:"...
to use warn_report() instead. This helps standardise on a single
method of printing warnings to the user.
All of the warnings were changed using this command:
find ./* -type f -exec sed -i 's|fprintf(.*".*warning[,:] |warn_report("|Ig' {} +
The #include lines and chagnes to the test Makefile were manually
updated to allow the code to compile.
Signed-off-by: Alistair Francis <alistair.francis@xilinx.com>
Message-Id: <2c94ac3bb116cc6b8ebbcd66a254920a69665515.1503077821.git.alistair.francis@xilinx.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
This test provides its own mocks, so do not use the "standard"
stubs in libqemustub.a or the event loop implementation in
libqemuutil.a.
This is required on OS X, which otherwise brings in qemu-timer.o,
async.o and main-loop.o from libqemuutil.a.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Convert all the multi-line uses of fprintf(stderr, "warning:"..."\n"...
to use warn_report() instead. This helps standardise on a single
method of printing warnings to the user.
All of the warnings were changed using these commands:
find ./* -type f -exec sed -i \
'N; {s|fprintf(.*".*warning[,:] \(.*\)\\n"\(.*\));|warn_report("\1"\2);|Ig}' \
{} +
find ./* -type f -exec sed -i \
'N;N; {s|fprintf(.*".*warning[,:] \(.*\)\\n"\(.*\));|warn_report("\1"\2);|Ig}' \
{} +
find ./* -type f -exec sed -i \
'N;N;N; {s|fprintf(.*".*warning[,:] \(.*\)\\n"\(.*\));|warn_report("\1"\2);|Ig}' \
{} +
find ./* -type f -exec sed -i \
'N;N;N;N {s|fprintf(.*".*warning[,:] \(.*\)\\n"\(.*\));|warn_report("\1"\2);|Ig}' \
{} +
find ./* -type f -exec sed -i \
'N;N;N;N;N {s|fprintf(.*".*warning[,:] \(.*\)\\n"\(.*\));|warn_report("\1"\2);|Ig}' \
{} +
find ./* -type f -exec sed -i \
'N;N;N;N;N;N {s|fprintf(.*".*warning[,:] \(.*\)\\n"\(.*\));|warn_report("\1"\2);|Ig}' \
{} +
find ./* -type f -exec sed -i \
'N;N;N;N;N;N;N; {s|fprintf(.*".*warning[,:] \(.*\)\\n"\(.*\));|warn_report("\1"\2);|Ig}' \
{} +
Indentation fixed up manually afterwards.
Some of the lines were manually edited to reduce the line length to below
80 charecters. Some of the lines with newlines in the middle of the
string were also manually edit to avoid checkpatch errrors.
The #include lines were manually updated to allow the code to compile.
Several of the warning messages can be improved after this patch, to
keep this patch mechanical this has been moved into a later patch.
Signed-off-by: Alistair Francis <alistair.francis@xilinx.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Kevin Wolf <kwolf@redhat.com>
Cc: Max Reitz <mreitz@redhat.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Igor Mammedov <imammedo@redhat.com>
Cc: Peter Maydell <peter.maydell@linaro.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Anthony Perard <anthony.perard@citrix.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: Aurelien Jarno <aurelien@aurel32.net>
Cc: Yongbok Kim <yongbok.kim@imgtec.com>
Cc: Cornelia Huck <cohuck@redhat.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Alexander Graf <agraf@suse.de>
Cc: Jason Wang <jasowang@redhat.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Gerd Hoffmann <kraxel@redhat.com>
Acked-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Message-Id: <5def63849ca8f551630c6f2b45bcb1c482f765a6.1505158760.git.alistair.francis@xilinx.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Flatview will make sure that we can only end up in this function with
memory sections that correspond to exactly one slot. So we don't
have to iterate multiple times. There won't be overlapping slots but
only matching slots.
Properly align the section and look up the corresponding slot. This
heavily simplifies this function.
We can now get rid of kvm_lookup_overlapping_slot().
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20170911174933.20789-7-david@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
The way flatview handles memory sections, we will never have overlapping
memory sections in kvm.
address_space_update_topology_pass() will make sure that we will only
get called for
a) an existing memory section for which we only update parameters
(log_start, log_stop).
b) an existing memory section we want to delete (region_del)
c) a brand new memory section we want to add (region_add)
We cannot have overlapping memory sections in kvm as we will first remove
the overlapping sections and then add the ones without conflicts.
Therefore we can remove the complexity for handling prefix and suffix
slots.
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20170911174933.20789-5-david@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
We already require DESTROY_MEMORY_REGION_WORKS, JOIN_MEMORY_REGIONS_WORKS
was added just half a year later.
In addition, with flatview overlapping memory regions are first
removed before adding the changed one. So we can't really detect joining
memory regions this way.
Let's just get rid of this special handling.
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20170911174933.20789-2-david@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
While loading kernel via multiboot-v1 image, (flags & 0x00010000)
indicates that multiboot header contains valid addresses to load
the kernel image. These addresses are used to compute kernel
size and kernel text offset in the OS image. Validate these
address values to avoid an OOB access issue.
This is CVE-2017-14167.
Reported-by: Thomas Garnier <thgarnie@google.com>
Signed-off-by: Prasad J Pandit <pjp@fedoraproject.org>
Message-Id: <20170907063256.7418-1-ppandit@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
SunOS declares struct queue in <netinet/in.h>.
This fixes build on SmartOS (Joyent).
Patch cherry-picked from pkgsrc by jperkin (Joyent).
Signed-off-by: Kamil Rytarowski <n54@gmx.com>
Message-Id: <20170903163304.17919-1-n54@gmx.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
As of kernel commit eb82feea59d6 ("KVM: hyperv: support HV_X64_MSR_TSC_FREQUENCY
and HV_X64_MSR_APIC_FREQUENCY"), KVM supports two new MSRs which are required
for nested Hyper-V to read timestamps with RDTSC + TSC page.
This commit makes QEMU advertise the MSRs with CPUID.40000003H:EAX[11] and
CPUID.40000003H:EDX[8] as specified in the Hyper-V TLFS and experimentally
verified on a Hyper-V host. The feature is enabled with the existing hv-time CPU
flag, and only if the TSC frequency is stable across migrations and known.
Signed-off-by: Ladi Prosek <lprosek@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Message-Id: <20170807085703.32267-5-lprosek@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
there are 2 use cases to deal with:
1: fixed CPU models per board/soc
2: boards with user configurable cpu_model and fallback to
default cpu_model if user hasn't specified one explicitly
For the 1st
drop intermediate cpu_model parsing and use const cpu type
directly, which replaces:
typename = object_class_get_name(
cpu_class_by_name(TYPE_ARM_CPU, cpu_model))
object_new(typename)
with
object_new(FOO_CPU_TYPE_NAME)
or
cpu_generic_init(BASE_CPU_TYPE, "my cpu model")
with
cpu_create(FOO_CPU_TYPE_NAME)
as result 1st use case doesn't have to invoke not necessary
translation and not needed code is removed.
For the 2nd
1: set default cpu type with MachineClass::default_cpu_type and
2: use generic cpu_model parsing that done before machine_init()
is run and:
2.1: drop custom cpu_model parsing where pattern is:
typename = object_class_get_name(
cpu_class_by_name(TYPE_ARM_CPU, cpu_model))
[parse_features(typename, cpu_model, &err) ]
2.2: or replace cpu_generic_init() which does what
2.1 does + create_cpu(typename) with just
create_cpu(machine->cpu_type)
as result cpu_name -> cpu_type translation is done using
generic machine code one including parsing optional features
if supported/present (removes a bunch of duplicated cpu_model
parsing code) and default cpu type is defined in an uniform way
within machine_class_init callbacks instead of adhoc places
in boadr's machine_init code.
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
Message-Id: <1505318697-77161-6-git-send-email-imammedo@redhat.com>
Reviewed-by: Alistair Francis <alistair.francis@xilinx.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
define default CPU type in generic way in pc_machine_class_init()
and let common machine code to handle cpu_model parsing
Patch also introduces TARGET_DEFAULT_CPU_TYPE define for 2 purposes:
* make foo_machine_class_init() look uniform on every target
* use define in [bsd|linux]-user targets to pick default
cpu type
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-Id: <1505318697-77161-5-git-send-email-imammedo@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
All machines that support user specified cpu_model either call
cpu_generic_init() or cpu_class_by_name()/CPUClass::parse_features
to parse feature string and to get CPU type to create.
Which leads to code duplication and hard-codding default CPU model
within machine_foo_init() code. Which makes it impossible to
get CPU type before machine_init() is run.
So instead of setting default CPUs models and doing parsing in
target specific machine_foo_init() in various ways, provide
a generic data driven cpu_model parsing before machine_init()
is called.
in follow up per target patches, it will allow to:
* define default CPU type in consistent/generic manner
per machine type and drop custom code that fallbacks
to default if cpu_model is NULL
* drop custom features parsing in targets and do it
in centralized way.
* for cases of
cpu_generic_init(TYPE_BASE/DEFAULT_CPU, "some_cpu")
replace it with
cpu_create(machine->cpu_type) || cpu_create(TYPE_FOO)
depending if CPU type is user settable or not.
not doing useless parsing and clearly documenting where
CPU model is user settable or fixed one.
Patch allows machine subclasses to define default CPU type
per machine class at class_init() time and if that is set
generic code will parse cpu_model into a MachineState::cpu_type
which will be used to create CPUs for that machine instance
and allows gradual per board conversion.
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Message-Id: <1505318697-77161-4-git-send-email-imammedo@redhat.com>
Acked-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Almost every user of cpu_generic_init() checks for
returned NULL and then reports failure in a custom way
and aborts process.
Some users assume that call can't fail and don't check
for failure, though they should have checked for it.
In either cases cpu_generic_init() failure is fatal,
so instead of checking for failure and reporting
it various ways, make cpu_generic_init() report
errors in consistent way and terminate QEMU on failure.
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-Id: <1505318697-77161-3-git-send-email-imammedo@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Complete the transition by renaming this header, which was
shared by block/iscsi.c and the SCSI emulation code.
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
The new option can be used to indicate that the file contents can
be destroyed and don't need to be flushed to disk when QEMU exits
or when the memory backend object is removed.
Internally, it will trigger a madvise(MADV_REMOVE) call when the
memory backend is removed.
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Message-Id: <20170824192315.5897-4-ehabkost@redhat.com>
[ehabkost: fixup: improved documentation]
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Tested-by: Zack Cornelius <zack.cornelius@kove.net>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Move more knowledge of SG_IO out of hw/scsi/scsi-generic.c, for
reusability.
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Move more knowledge of sense data format out of hw/scsi/scsi-bus.c
for reusability.
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
util/scsi.c includes some SCSI code that is shared by block/iscsi.c and
hw/scsi, but the introduction of the persistent reservation helper
will add many more instances of this. There is also include/block/scsi.h,
which actually is not part of the core block layer.
The persistent reservation manager will also need a home. A scsi/
directory provides one for both the aforementioned shared code and
the PR manager code.
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
After introducing the scsi/ subdirectory, there will be a scsi_build_sense
function that is the same as scsi_req_build_sense but without needing
a SCSIRequest. The existing scsi_build_sense function gets in the way,
remove it.
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Since Linux switched to blk-mq as the default in Linux commit
5c279bd9e406 ("scsi: default to scsi-mq"), virtio-scsi LUNs consume
about 10x as much guest kernel memory.
This commit allows you to choose the virtqueue size for each
virtio-scsi-pci controller like this:
-device virtio-scsi-pci,id=scsi,virtqueue_size=16
The default is still 128 as before. Using smaller virtqueue_size
allows many more disks to be added to small memory virtual machines.
For a 1 vCPU, 500 MB, no swap VM I observed:
With scsi-mq enabled (upstream kernel): 175 disks
-"- ditto -"- virtqueue_size=64: 318 disks
-"- ditto -"- virtqueue_size=16: 775 disks
With scsi-mq disabled (kernel before 5c279bd9e406): 1755 disks
Note that to have any effect, this requires a kernel patch:
https://lkml.org/lkml/2017/8/10/689
Signed-off-by: Richard W.M. Jones <rjones@redhat.com>
Message-Id: <20170810165255.20865-1-rjones@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
The SSE4.1 phminposuw instruction finds the minimum 16-bit element in
the source vector, putting the value of that element in the low 16
bits of the destination vector, the index of that element in the next
three bits and zeroing the rest of the destination. The helper for
this operation fills the destination from high to low, meaning that
when the source and destination are the same register, the minimum
source element can be overwritten before it is copied to the
destination. This patch fixes it to fill the destination from low to
high instead, so the minimum source element is always copied first.
This fixes one gcc test failure in my GCC 6-based testing (and so
concludes the present sequence of patches, as I don't have any further
gcc test failures left in that testing that I attribute to QEMU bugs).
Signed-off-by: Joseph Myers <joseph@codesourcery.com>
Message-Id: <alpine.DEB.2.20.1708111422580.11919@digraph.polyomino.org.uk>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
One of the cases of the SSE4.2 pcmpestri / pcmpestrm / pcmpistri /
pcmpistrm instructions does a substring search. The implementation of
this case in the pcmpxstrx helper is incorrect. The operation in this
case is a search for a string (argument d to the helper) in another
string (argument s to the helper); if a copy of d at a particular
position would run off the end of s, the resulting output bit should
be 0 whether or not the strings match in the region where they
overlap, but the QEMU implementation was wrongly comparing only up to
the point where s ends and counting it as a match if an initial
segment of d matched a terminal segment of s. Here, "run off the end
of s" means that some byte of d would overlap some byte outside of s;
thus, if d has zero length, it is considered to match everywhere,
including after the end of s. This patch fixes the implementation to
correspond with the proper instruction semantics. This fixes four gcc
test failures in my GCC 6-based testing.
Signed-off-by: Joseph Myers <joseph@codesourcery.com>
Message-Id: <alpine.DEB.2.20.1708102139310.8101@digraph.polyomino.org.uk>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
The SSE4.1 packusdw instruction combines source and destination
vectors of signed 32-bit integers into a single vector of unsigned
16-bit integers, with unsigned saturation. When the source and
destination are the same register, this means each 32-bit element of
that register is used twice as an input, to produce two of the 16-bit
output elements, and so if the operation is carried out
element-by-element in-place, no matter what the order in which it is
applied to the elements, the first element's operation will overwrite
some future input. The helper for packssdw avoids this issue by
computing the result in a local temporary and copying it to the
destination at the end; this patch fixes the packusdw helper to do
likewise. This fixes three gcc test failures in my GCC 6-based
testing.
Signed-off-by: Joseph Myers <joseph@codesourcery.com>
Message-Id: <alpine.DEB.2.20.1708100023050.9262@digraph.polyomino.org.uk>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
It turns out that my recent fix to set rip_offset when emulating some
SSE4.1 instructions needs generalizing to cover a wider class of
instructions. Specifically, every instruction in the sse_op_table7
table, coming from various instruction set extensions, has an 8-bit
immediate operand that comes after any memory operand, and so needs
rip_offset set for correctness if there is a memory operand that is
rip-relative, and my patch only set it for a subset of those
instructions. This patch moves the rip_offset setting to cover the
wider class of instructions, so fixing 9 further gcc testsuite
failures in my GCC 6-based testing. (I do not know whether there
might be still further classes of instructions missing this setting.)
Signed-off-by: Joseph Myers <joseph@codesourcery.com>
Message-Id: <alpine.DEB.2.20.1708082350340.23380@digraph.polyomino.org.uk>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
The SSE4.1 pmovsx* and pmovzx* instructions take packed 1-byte, 2-byte
or 4-byte inputs and sign-extend or zero-extend them to a wider vector
output. The associated helpers for these instructions do the
extension on each element in turn, starting with the lowest. If the
input and output are the same register, this means that all the input
elements after the first have been overwritten before they are read.
This patch makes the helpers extend starting with the highest element,
not the lowest, to avoid such overwriting. This fixes many GCC test
failures (161 in the gcc testsuite in my GCC 6-based testing) when
testing with a default CPU setting enabling those instructions.
Signed-off-by: Joseph Myers <joseph@codesourcery.com>
Message-Id: <alpine.DEB.2.20.1708082018390.23380@digraph.polyomino.org.uk>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
The current FIS printing routines dump the FIS to screen. adjust this
such that it dumps to buffer instead, then use this ability to have
FIS dump mechanisms via trace-events instead of compiled defines.
Signed-off-by: John Snow <jsnow@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-id: 20170901001502.29915-9-jsnow@redhat.com
Signed-off-by: John Snow <jsnow@redhat.com>
Create a new enum so that we can name the IRQ bits, which will make debugging
them a little nicer if we can print them out. Not handled in this patch, but
this will make it possible to get a nice debug printf detailing exactly which
status bits are set, as it can be multiple at any given time.
As a consequence of this patch, it is no longer possible to set multiple IRQ
codes at once, but nothing was utilizing this ability anyway.
Signed-off-by: John Snow <jsnow@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20170901001502.29915-8-jsnow@redhat.com
Signed-off-by: John Snow <jsnow@redhat.com>
Remove the DEBUG_IDE preprocessor definition with something more
appropriately flexible, using the trace-events subsystem.
This will be less prone to bitrot and will more effectively allow
us to target just the functions we care about.
Signed-off-by: John Snow <jsnow@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20170901001502.29915-2-jsnow@redhat.com
Signed-off-by: John Snow <jsnow@redhat.com>
QEMU currently aborts with an assertion message when the user is trying
to remove a dscm1xxxx again:
$ aarch64-softmmu/qemu-system-aarch64 -S -M integratorcp -nographic
QEMU 2.9.93 monitor - type 'help' for more information
(qemu) device_add dscm1xxxx,id=xyz
(qemu) device_del xyz
**
ERROR:qemu/qdev-monitor.c:872:qdev_unplug: assertion failed: (hotplug_ctrl)
Aborted (core dumped)
Looks like this device has to be wired up in code and is not meant
to be hot-pluggable, so let's mark it with user_creatable = false.
Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Message-id: 1503543783-17192-1-git-send-email-thuth@redhat.com
Signed-off-by: John Snow <jsnow@redhat.com>
Fixes read after freeing error reported
https://lists.gnu.org/archive/html/qemu-devel/2017-08/msg04243.html
Message-Id: <59a56959-ca12-ea75-33fa-ff07eba1b090@redhat.com>
ich9-ahci device creates ide buses and attaches them as QOM children
at realize time, however it forgets to properly clean them up
at unrealize time and frees memory containing these children,
with following call-chain:
qdev_device_add()
object_property_set_bool('realized', true)
device_set_realized()
...
pci_qdev_realize() -> pci_ich9_ahci_realize() -> ahci_realize()
...
s->dev = g_new0(AHCIDevice, ports);
...
AHCIDevice *ad = &s->dev[i];
ide_bus_new(&ad->port, sizeof(ad->port), qdev, i, 1);
^^^ creates bus in memory allocated by above gnew()
and adds it as child propety to ahci device
...
hotplug_handler_plug(); -> goto post_realize_fail;
pci_qdev_unrealize() -> pci_ich9_uninit() -> ahci_uninit()
...
g_free(s->dev);
^^^ free memory that holds children busses
return with error from device_set_realized()
As result later when qdev_device_add() tries to unparent ich9-ahci
after failed device_set_realized(),
object_unparent() -> object_property_del_child()
iterates over existing QOM children including buses added by
ide_bus_new() and tries to unparent them, which causes access to
freed memory where they where located.
Reported-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Tested-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Message-id: 1503938085-169486-1-git-send-email-imammedo@redhat.com
Signed-off-by: John Snow <jsnow@redhat.com>
It's not even clear what the interface REG and VAL32 were supposed to mean.
All uses had REG = 0 and VAL32 was the bitset assigned to the destination.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
This fixes building for ppc64 on ppc32 (changed in 5964fca8a1):
tcg/ppc/tcg-target.inc.c: In function 'tb_target_set_jmp_target':
include/qemu/compiler.h:86:30: error: static assertion failed: \
"not expecting: sizeof(*(uint64_t *)jmp_addr) > ATOMIC_REG_SIZE"
QEMU_BUILD_BUG_ON(sizeof(*ptr) > ATOMIC_REG_SIZE); \
^
tcg/ppc/tcg-target.inc.c:1377:9: note: in expansion of macro 'atomic_set'
atomic_set((uint64_t *)jmp_addr, pair);
^
Suggested-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-Id: <20170911204936.5020-1-f4bug@amsat.org>
[rth: Added commentary requested by pmm.]
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Python queue, 2017-09-15
# gpg: Signature made Sat 16 Sep 2017 00:14:01 BST
# gpg: using RSA key 0x2807936F984DC5A6
# gpg: Good signature from "Eduardo Habkost <ehabkost@redhat.com>"
# Primary key fingerprint: 5A32 2FD5 ABC4 D3DB ACCF D1AA 2807 936F 984D C5A6
* remotes/ehabkost/tags/python-next-pull-request:
qemu.py: include debug information on launch error
qemu.py: improve message on negative exit code
qemu.py: use os.path.null instead of /dev/null
qemu.py: avoid writing to stdout/stderr
qemu.py: fix is_running() return before first launch()
qtest.py: Few pylint/style fixes
qmp.py: Avoid overriding a builtin object
qmp.py: Avoid "has_key" usage
qmp.py: Use object-based class for QEMUMonitorProtocol
qmp.py: Couple of pylint/style fixes
qemu.py: Use custom exceptions rather than Exception
qemu.py: Simplify QMP key-conversion
qemu.py: Use iteritems rather than keys()
qemu|qtest: Avoid dangerous arguments
qemu.py: Pylint/style fixes
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
When launching a VM, if an exception happens and the VM is not
initiated, it might be useful to see the qemu command line and
the qemu command output.
This patch creates that message. Notice that self._iolog needs to be
cleaned up in the beginning of the launch() to make sure we will not
expose the qemu log from a previous launch if the current one fails.
Signed-off-by: Amador Pahim <apahim@redhat.com>
Message-Id: <20170901112829.2571-6-apahim@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
The current message shows 'self._args', which contains only part of the
options used in the Qemu command line.
This patch makes the qemu full args list an instance variable and then
uses it in the negative exit code message.
Message was moved outside the 'if is_running' block to make sure it will
be logged if the VM finishes before the call to shutdown().
Signed-off-by: Amador Pahim <apahim@redhat.com>
Message-Id: <20170901112829.2571-5-apahim@redhat.com>
[ehabkost: removed superfluous parenthesis]
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
This module should not write directly to stdout/stderr. Instead, it
should either raise exceptions or just log the messages and let the
callers handle them and decide what to do. For example, scripts could
choose to send the log messages stderr or/and write them to a file if
verbose or debugging mode is enabled.
This patch replaces the writes to stderr by an exception in the
send_fd_scm() when _socket_scm_helper is not set or not present. In the
same method, the subprocess Popen will now redirect the stdout/stderr to
logging.debug instead of writing to system stderr. As consequence, since
the Popen.communicate() is now used (in order to get the stdout), the
further call to wait() became redundant and was replaced by
Popen.returncode.
The shutdown() message on negative exit code will now be logged
to logging.warn instead of written to system stderr.
Signed-off-by: Amador Pahim <apahim@redhat.com>
Message-Id: <20170901112829.2571-3-apahim@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
is_running() returns None when called before the first time we
call launch():
>>> import qemu
>>> vm = qemu.QEMUMachine('qemu-system-x86_64')
>>> vm.is_running()
>>>
It should return False instead. This patch fixes that.
For consistence, this patch removes the parenthesis from the
second clause as it's not really needed.
Signed-off-by: Amador Pahim <apahim@redhat.com>
Message-Id: <20170901112829.2571-2-apahim@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
The "id" is a builtin method to get object's identity and should not be
overridden. This might bring some issues in case someone was directly
calling "cmd(..., id=id)" but I haven't found such usage on brief search
for "cmd\(.*id=".
Signed-off-by: Lukáš Doktor <ldoktor@redhat.com>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
Message-Id: <20170818142613.32394-10-ldoktor@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
No actual code changes, just initializing attributes earlier to avoid
AttributeError on early introspection, a few pylint/style fixes and
docstring clarifications.
Signed-off-by: Lukáš Doktor <ldoktor@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-Id: <20170818142613.32394-7-ldoktor@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
The naked Exception should not be widely used. It makes sense to be a
bit more specific and use better-suited custom exceptions. As a benefit
we can store the full reply in the exception in case someone needs it
when catching the exception.
Signed-off-by: Lukáš Doktor <ldoktor@redhat.com>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
Message-Id: <20170818142613.32394-6-ldoktor@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
The list object is mutable in python and potentially might modify other
object's arguments when used as default argument. Reproducer:
>>> vm1 = QEMUMachine("qemu")
>>> vm2 = QEMUMachine("qemu")
>>> vm1._wrapper.append("foo")
>>> print vm2._wrapper
['foo']
In this case the `args` is actually copied so it would be safe to keep
it, but it's not a good practice to keep it. The same issue applies in
inherited qtest module.
Signed-off-by: Lukáš Doktor <ldoktor@redhat.com>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Message-Id: <20170818142613.32394-3-ldoktor@redhat.com>
Reviewed-by: Cleber Rosa <crosa@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
pull-seccomp-20170915
# gpg: Signature made Fri 15 Sep 2017 09:21:15 BST
# gpg: using RSA key 0xDF32E7C0F0FFF9A2
# gpg: Good signature from "Eduardo Otubo (Senior Software Engineer) <otubo@redhat.com>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg: It is not certain that the signature belongs to the owner.
# Primary key fingerprint: D67E 1B50 9374 86B4 0723 DBAB DF32 E7C0 F0FF F9A2
* remotes/otubo/tags/pull-seccomp-20170915:
buildsys: Move seccomp cflags/libs to per object
seccomp: add resourcecontrol argument to command line
seccomp: add spawn argument to command line
seccomp: add elevateprivileges argument to command line
seccomp: add obsolete argument to command line
seccomp: changing from whitelist to blacklist
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
ppc patch queue 2017-09-15
Here's the current batch of accumulated ppc patches. These are all
pretty simple bugfixes or cleanups, no big new features here.
# gpg: Signature made Fri 15 Sep 2017 04:50:00 BST
# gpg: using RSA key 0x6C38CACA20D9B392
# gpg: Good signature from "David Gibson <david@gibson.dropbear.id.au>"
# gpg: aka "David Gibson (Red Hat) <dgibson@redhat.com>"
# gpg: aka "David Gibson (ozlabs.org) <dgibson@ozlabs.org>"
# gpg: aka "David Gibson (kernel.org) <dwg@kernel.org>"
# Primary key fingerprint: 75F4 6586 AE61 A66C C44E 87DC 6C38 CACA 20D9 B392
* remotes/dgibson/tags/ppc-for-2.11-20170915:
ppc/kvm: use kvm_vm_check_extension() in kvmppc_is_pr()
spapr_events: use QTAILQ_FOREACH_SAFE() in spapr_clear_pending_events()
spapr_cpu_core: cleaning up qdev_get_machine() calls
spapr_pci: don't create 64-bit MMIO window if we don't need to
spapr_pci: convert sprintf() to g_strdup_printf()
spapr_cpu_core: fail gracefully with non-pseries machine types
xics: fix several error leaks
vfio, spapr: Fix levels calculation
spapr_pci: handle FDT creation errors with _FDT()
spapr_pci: use the common _FDT() helper
spapr: fix CAS-generated reset
ppc/xive: fix OV5_XIVE_EXPLOIT bits
spapr: only update SDR1 once per-cpu during CAS
spapr_pci: use g_strdup_printf()
spapr_pci: drop useless check in spapr_populate_pci_child_dt()
spapr_pci: drop useless check in spapr_phb_vfio_get_loc_code()
hw/ppc/spapr.c: cleaning up qdev_get_machine() calls
net: Add SunGEM device emulation as found on Apple UniNorth
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Right now, function trace_event_set_vcpu_state_dynamic() asynchronously enables
events in the case a vCPU is executing TCG code. If the vCPU is being created
this makes some events like "guest_cpu_enter" to not be traced.
Signed-off-by: Lluís Vilanova <vilanova@ac.upc.edu>
Reviewed-by: Emilio G. Cota <cota@braap.org>
Message-id: 150525662577.19850.13767570977540117247.stgit@frigg.lan
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Like many other libraries, libseccomp cflags and libs should only apply
to the building of necessary objects. Do so in the usual way with the
help of per object variables.
Signed-off-by: Fam Zheng <famz@redhat.com>
This patch adds [,resourcecontrol=deny] to `-sandbox on' option. It
blacklists all process affinity and scheduler priority system calls to
avoid any bigger of the process.
Signed-off-by: Eduardo Otubo <otubo@redhat.com>
This patch adds [,spawn=deny] argument to `-sandbox on' option. It
blacklists fork and execve system calls, avoiding Qemu to spawn new
threads or processes.
Signed-off-by: Eduardo Otubo <otubo@redhat.com>
This patch introduces the new argument
[,elevateprivileges=allow|deny|children] to the `-sandbox on'. It allows
or denies Qemu process to elevate its privileges by blacklisting all
set*uid|gid system calls. The 'children' option will let forks and
execves run unprivileged.
Signed-off-by: Eduardo Otubo <otubo@redhat.com>
This patch introduces the argument [,obsolete=allow] to the `-sandbox on'
option. It allows Qemu to run safely on old system that still relies on
old system calls.
Signed-off-by: Eduardo Otubo <otubo@redhat.com>
This patch changes the default behavior of the seccomp filter from
whitelist to blacklist. By default now all system calls are allowed and
a small black list of definitely forbidden ones was created.
Signed-off-by: Eduardo Otubo <otubo@redhat.com>
hmp() passes its string argument through the sprintf() family;
with a proper attribute, gcc -Wformat warns us when we do something
dangerous like passing a non-constant format string. Fortunately,
all our strings were safe, but checking whether the string can
contain an unintended % is easy to avoid and therefore worth doing.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
Prior to commit 063c23d9, we were tracking a list of parallel
qtest objects, in order to safely clean up a SIGABRT handler
only after the last connection quits. But when we switched to
more of glib's infrastructure, the list became dead code that
is never assigned to.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
Assertions should be separate from the side effects, since in
theory, g_assert() can be disabled (in practice, we can't really
ever do that).
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
Back when the test was introduced, in commit 62c39b307, the
test was set up to run qemu-ga directly on the host performing
the test, and defaults to limiting itself to safe commands. At
the time, it was envisioned that setting QGA_TEST_SIDE_EFFECTING
in the environment could cover a few more commands, while noting
the potential danger of those side effects running in the host.
But this has NEVER been tested: if you enable the environment
variable, the test WILL fail. One obvious reason: if you are not
running as root, you'll probably get a permission failure when
trying to freeze the file systems, or when changing system time.
Less obvious: if you run the test as root (wow, you're brave), you
could end up hanging if the test tries to log things to a
temporarily frozen filesystem. But the cutest reason of all: if
you get past the above hurdles, the test uses invalid JSON in
test_qga_fstrim() (missing '' around the dictionary key 'minimum'),
and will thus fail an assertion in qmp_fd().
Rather than leave this untested time-bomb in place, rip it out.
Hopefully, as originally envisioned, we can find an opportunity
to test an actual sandboxed guest where the guest-agent has
full permissions and will not unduly affect the host running
the test - if so, 'git revert' can be used if desired, for
salvaging any useful parts of this attempt.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
Broken with commit b4ba67d9a7 ("libqos: Change PCI accessors to take
opaque BAR handle") a while ago, but nobody noticed since the tests are
not run by default: The msix_pba_bar is not correctly initialized
anymore if bir_pba has the same value as bir_table. With this fix,
"make check SPEED=slow" should work fine again.
Fixes: b4ba67d9a7
Tested-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
The user can currently still cause an abort() if running certain tests
(like the prom-env-test) without setting the QTEST_QEMU_BINARY first.
A similar problem has been fixed with commit 7c933ad61b
already, but forgot to also take care of the qtest_get_arch() function,
so let's introduce a proper wrapper around getenv("QTEST_QEMU_BINARY")
that can be used in both locations now.
Buglink: https://bugs.launchpad.net/qemu/+bug/1713434
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
The problem with puv3 has been fixed with 0ac241bcf9
('unicore32: abort when entering "x 0" on the monitor') and the problem
with tricore_testboard has been fixed with b190f477e2
('qemu-system-tricore: segfault when entering "x 0" on the monitor').
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
A lot of tests provide code for adding and removing a device via the
device_add and device_del QMP commands. Maintaining this code in so many
places is cumbersome and error-prone (some of the code parts check the
responses for device deletion in an incorrect way, for example, we've got
to deal with both, error code and DEVICE_DEL event here). So let's provide
some proper generic functions for adding and removing a device instead.
The code for correctly unplugging a device has been taken from a patch
from Peter Xu.
Reviewed-by: Peter Xu <peterx@redhat.com>
Tested-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
If the host has both KVM PR and KVM HV loaded and we pass:
-machine pseries,accel=kvm,kvm-type=PR
the kvmppc_is_pr() returns false instead of true. Since the helper
is mostly used as fallback, it doesn't have any real impact with
recent kernels. A notable exception is the workaround to allow
migration between compatible hosts with different PVRs (eg, POWER8
and POWER8E), since KVM still doesn't provide a way to check if a
specific PVR is supported (see commit c363a37a45 for details).
According to the official KVM API documentation [1], KVM_PPC_GET_PVINFO
is "vm ioctl", but we check it as a global ioctl. The following function
in KVM is hence called with kvm == NULL and considers we're in HV mode.
int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
{
int r;
/* Assume we're using HV mode when the HV module is loaded */
int hv_enabled = kvmppc_hv_ops ? 1 : 0;
if (kvm) {
/*
* Hooray - we know which VM type we're running on. Depend on
* that rather than the guess above.
*/
hv_enabled = is_kvmppc_hv_enabled(kvm);
}
Let's use kvm_vm_check_extension() to fix the issue.
[1] https://www.kernel.org/doc/Documentation/virtual/kvm/api.txt
Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
QTAILQ_FOREACH_SAFE() must be used when removing the current element
inside the loop block.
This fixes a user-after-free error introduced by commit 5625817423
and reported by Coverity (CID 1381017).
Signed-off-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
This patch removes the qdev_get_machine() calls that are made
in spapr_cpu_core.c in situations where we can get an existing
pointer for the MachineState by either passing it as an argument
to the function or by using other already available pointers.
Credits to Daniel Henrique Barboza for the idea and the changelog
text.
Signed-off-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
When running a pseries-2.2 or older machine type, we get the following
lines in info mtree:
address-space: memory
...
ffffffffffffffff-ffffffffffffffff (prio 0, i/o): alias
pci@800000020000000.mmio64-alias @pci@800000020000000.mmio
ffffffffffffffff-ffffffffffffffff
address-space: cpu-memory
...
ffffffffffffffff-ffffffffffffffff (prio 0, i/o): alias
pci@800000020000000.mmio64-alias @pci@800000020000000.mmio
ffffffffffffffff-ffffffffffffffff
The same thing occurs when running a pseries-2.7 with
-global spapr-pci-host-bridge.mem_win_size=2147483648
This happens because we always create a 64-bit MMIO window, even if
we didn't explicitely requested it (ie, mem64_win_size == 0) and the
32-bit window is below 2GiB. It doesn't seem to have an impact on the
guest though because spapr_populate_pci_dt() doesn't advertise the
bogus windows when mem64_win_size == 0.
Since these memory regions don't induce any state, we can safely
choose to not create them when their address is equal to -1,
without breaking migration from existing setups.
Signed-off-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Since commit 7cca3e466e ("ppc: spapr: Move VCPU ID calculation into
sPAPR"), QEMU aborts when started with a *-spapr-cpu-core device and
a non-pseries machine.
Let's rely on the already existing call to object_dynamic_cast() instead
of using the SPAPR_MACHINE() macro.
Signed-off-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
If object_property_get_link() fails then it allocates an error, which
must be freed before returning. The error_get_pretty() function is
merely an accessor to the error message and doesn't free anything.
The error.h header indicates how to do it right:
* Pass an existing error to the caller with the message modified:
* error_propagate(errp, err);
* error_prepend(errp, "Could not frobnicate '%s': ", name);
Signed-off-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
The existing tries to round up the number of pages but @pages is always
calculated as the rounded up value minus one which makes ctz64() always
return 0 and have create.levels always set 1.
This removes wrong "-1" and allows having more than 1 levels. This becomes
handy for >128GB guests with standard 64K pages as this requires blocks
with zone order 9 and the popular limit of CONFIG_FORCE_MAX_ZONEORDER=9
means that only blocks up to order 8 are allowed.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
libfdt failures when creating the FDT should cause QEMU to terminate.
Let's use the _FDT() macro which does just that instead of propagating
the error to the caller. spapr_populate_pci_child_dt() no longer needs
to return a value in this case.
Note that, on the way, this get rids of the following nonsensical lines:
g_assert(!ret);
if (ret) {
Signed-off-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
All other users in hw/ppc already consider an error when building
the FDT to be fatal, even on hotplug paths. There's no valid reason
for spapr_pci to behave differently. So let's used the common _FDT()
helper which terminates QEMU when libfdt fails.
Signed-off-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
The OV5_MMU_RADIX_300 requires special handling in the CAS negotiation
process. It is cleared from the option vector of the guest before
evaluating the changes and re-added later. But, when testing for a
possible CAS reset :
spapr->cas_reboot = spapr_ovec_diff(ov5_updates,
ov5_cas_old, spapr->ov5_cas);
the bit OV5_MMU_RADIX_300 will each time be seen as removed from the
previous OV5 set, hence generating a reset loop.
Fix this problem by also clearing the same bit in the ov5_cas_old set.
Signed-off-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
On POWER9, the Client Architecture Support (CAS) negotiation process
determines whether the guest operates in XIVE Legacy compatibility or
in XIVE exploitation mode. Now that we have initial guest support for
the XIVE interrupt controller, let's fix the bits definition which have
evolved in the latest specs.
The platform advertises the XIVE Exploitation Mode support using the
property "ibm,arch-vec-5-platform-support-vec-5", byte 23 bits 0-1 :
- 0b00 XIVE legacy mode Only
- 0b01 XIVE exploitation mode Only
- 0b10 XIVE legacy or exploitation mode
The OS asks for XIVE Exploitation Mode support using the property
"ibm,architecture-vec-5", byte 23 bits 0-1:
- 0b00 XIVE legacy mode Only
- 0b01 XIVE exploitation mode Only
Signed-off-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Commit b55d295e3e added the possibility to support HPT resizing with KVM.
In the case of PR, we need to pass the userspace address of the HPT to KVM
using the SDR1 slot.
This is handled by kvmppc_update_sdr1() which uses CPU_FOREACH() to update
all CPUs. It is hence not needed to call kvmppc_update_sdr1() for each CPU.
Signed-off-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Building strings with g_strdup_printf() instead of snprintf() is
a QEMU common practice.
Signed-off-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
spapr_phb_get_loc_code() either returns a non-null pointer, or aborts
if g_strdup_printf() failed to allocate memory.
Signed-off-by: Greg Kurz <groug@kaod.org>
[dwg: Grammatical fix to commit message]
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
g_strdup_printf() either returns a non-null pointer, or aborts if it
failed to allocate memory.
Signed-off-by: Greg Kurz <groug@kaod.org>
[dwg: Grammatical fix to commit message]
Acked-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
This patch removes the qdev_get_machine() calls that are made in
spapr.c in situations where we can get an existing pointer for
the MachineState by either passing it as an argument to the function
or by using other already available pointers.
The following changes were made:
- spapr_node0_size: static function that is called two times:
at spapr_setup_hpt_and_vrma and ppc_spapr_init. In both cases we can
pass an existing MachineState pointer to it.
- spapr_build_fdt: MachineState pointer can be retrieved from
the existing sPAPRMachineState pointer.
- spapr_boot_set: the opaque in the first arg is a sPAPRMachineState
pointer as we can see inside ppc_spapr_init:
qemu_register_boot_set(spapr_boot_set, spapr);
We can get a MachineState pointer from it.
- spapr_machine_device_plug and spapr_machine_device_unplug_request: the
MachineState, sPAPRMachineState, MachineClass and sPAPRMachineClass pointers
can all be retrieved from the HotplugHandler pointer.
Signed-off-by: Daniel Henrique Barboza <danielhb@linux.vnet.ibm.com>
Reviewed-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
This adds a simplistic emulation of the Sun GEM ethernet controller
found in Apple ASICs.
Currently we only support the Apple UniNorth 1.x variant, but the
other Apple or Sun variants should mostly be a matter of adding
PCI IDs options.
We have a very primitive emulation of a single Broadcom 5201 PHY
which is supported by the MacOS driver.
This model brings out-of-the-box networking to MacOS 9, and all
versions of OS X I tried with the mac99 platform.
Further improvements from Mark:
- Remove sungem.h file, moving constants into sungem.c as required
- Switch to using tracepoints for debugging
- Split register blocks into separate memory regions
- Use arrays in SunGEMState to hold register values
- Add state-saving support
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Previously when single stepping through ERET instruction via GDB
would result in debugger entering the "next" PC after ERET instruction.
When debugging in kernel mode, this will also cause unintended behavior,
because debugger will try to access memory from EL0 point of view.
Signed-off-by: Jaroslaw Pelczar <j.pelczar@samsung.com>
Message-id: 001c01d32895$483027f0$d89077d0$@samsung.com
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Add a machine level virtualization property. This defaults to false and can be
set to true using this machine command line argument:
-machine xlnx-zcu102,virtualization=on
This follows what the ARM virt machine does.
This property only applies to the ZCU102 machine. The EP108 machine does
not have this property.
Signed-off-by: Alistair Francis <alistair.francis@xilinx.com>
Reviewed-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Add a machine level secure property. This defaults to false and can be
set to true using this machine command line argument:
-machine xlnx-zcu102,secure=on
This follows what the ARM virt machine does.
This property only applies to the ZCU102 machine. The EP108 machine does
not have this property.
Signed-off-by: Alistair Francis <alistair.francis@xilinx.com>
Reviewed-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
In preperation for future work let's manually create the Xilnx machines.
This will allow us to set properties for the machines in the future.
Signed-off-by: Alistair Francis <alistair.francis@xilinx.com>
Reviewed-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
The EP108 is a early access development board. Now that silicon is in
production people have access to the ZCU102. Let's rename the internal
QEMU files and variables to use the ZCU102.
There is no functional change here as the EP108 is still a valid board
option.
Signed-off-by: Alistair Francis <alistair.francis@xilinx.com>
Reviewed-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
In the v7M and v8M ARM ARM, the magic exception return values are
referred to as EXC_RETURN values, and in QEMU we use V7M_EXCRET_*
constants to define bits within them. Rename the 'type' variable
which holds the exception return value in do_v7m_exception_exit()
to excret, making it clearer that it does hold an EXC_RETURN value.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Alistair Francis <alistair.francis@xilinx.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1505137930-13255-8-git-send-email-peter.maydell@linaro.org
The exception-return magic values get some new bits in v8M, which
makes some bit definitions for them worthwhile.
We don't use the bit definitions for the switch on the low bits
which checks the return type for v7M, because this is defined
in the v7M ARM ARM as a set of valid values rather than via
per-bit checks.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Alistair Francis <alistair.francis@xilinx.com>
Message-id: 1505137930-13255-7-git-send-email-peter.maydell@linaro.org
In several places we were unconditionally applying the
nvic_gprio_mask() to a priority value. This is incorrect
if the priority is one of the fixed negative priority
values (for NMI and HardFault), so don't do it.
This bug would have caused both NMI and HardFault to be
considered as the same priority and so NMI wouldn't
correctly preempt HardFault.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1505137930-13255-5-git-send-email-peter.maydell@linaro.org
HMP pull 2017-09-14
# gpg: Signature made Thu 14 Sep 2017 15:57:30 BST
# gpg: using RSA key 0x0516331EBC5BFDE7
# gpg: Good signature from "Dr. David Alan Gilbert (RH2) <dgilbert@redhat.com>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg: It is not certain that the signature belongs to the owner.
# Primary key fingerprint: 45F5 C71B 4A0C B7FB 977A 9FA9 0516 331E BC5B FDE7
* remotes/dgilbert/tags/pull-hmp-20170914:
hmp: introduce 'info memory_size_summary' command
qmp: introduce query-memory-size-summary command
hmp: extend "info numa" with hotplugged memory information
tests/hmp: test "none" machine with memory
dump: do not dump non-existent guest memory
hmp: fix "dump-quest-memory" segfault (arm)
hmp: fix "dump-quest-memory" segfault (ppc)
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
It does not really make sense to dump memory that is not there.
Moreover, that fixes a segmentation fault when calling dump-guest-memory
with no filter for a machine with no memory defined.
New behaviour is:
(qemu) dump-guest-memory /dev/null
dump: no guest memory to dump
(qemu) dump-guest-memory /dev/null 0 4096
dump: no guest memory to dump
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
Tested-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Peter Xu <peterx@redhat.com>
Message-Id: <20170913142036.2469-4-lvivier@redhat.com>
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Running QEMU with
qemu-system-aarch64 -M none -nographic -m 256
and executing
dump-guest-memory /dev/null 0 8192
results in segfault
Fix by checking if we have CPU, and exit with
error if there is no CPU:
(qemu) dump-guest-memory /dev/null
this feature or command is not currently supported
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Greg Kurz <groug@kaod.org>
Message-Id: <20170913142036.2469-3-lvivier@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Running QEMU with
qemu-system-ppc64 -M none -nographic -m 256
and executing
dump-guest-memory /dev/null 0 8192
results in segfault
Fix by checking if we have CPU, and exit with
error if there is no CPU:
(qemu) dump-guest-memory /dev/null
this feature or command is not currently supported
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Message-Id: <20170913142036.2469-2-lvivier@redhat.com>
Acked-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Nowdays we use libusb for usb-host, so we don't have different code
for linux vs. bsd any more. So there is little reason to have the
HOST_USB variable, we can just write things directly into the Makefile
and avoid a pointless indirection.
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Message-id: 20170908111217.21985-2-kraxel@redhat.com
The existing XHCI code reads the Event Ring Segment Table Base Address
Register (ERSTBA) every time when it is changed. However zero is its
default state so one would think that zero there means it is not in use.
This adds a check for ERSTBA in addition to the existing check for
the Event Ring Segment Table Size Register (ERSTSZ).
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Message-id: 20170911065606.40600-1-aik@ozlabs.ru
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Some termcaps (found using SLES11SP1) use [? sequences. According to man
console_codes (http://linux.die.net/man/4/console_codes) the question mark
is a nop and should simply be ignored.
This patch does exactly that, rendering screen output readable when
outputting guest serial consoles to the graphical console emulator.
Signed-off-by: Alexander Graf <agraf@suse.de>
Message-id: 20170829113818.42482-1-agraf@suse.de
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
virtio-gpu can trigger the assert added by commit "6905b93447 console:
add same surface replace pre-condition" in multihead setups (where
surface can be NULL for secondary displays). Allow surface being NULL.
Fixes: 6905b93447
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-id: 20170906142109.2685-1-kraxel@redhat.com
Remove pixman switches from configure, should not be needed any more,
configure can figure by itself whenever pixman is needed or not.
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Message-id: 20170905140116.28181-3-kraxel@redhat.com
Drop pixman submodule and support for the "internal" pixman build.
pixman should be reasonably well established meanwhile so we don't
need the fallback submodule any more. While being at it also drop
some #ifdefs for pixman versions older than what we require in
configure anyway.
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Message-id: 20170905140116.28181-2-kraxel@redhat.com
Don't reset window layout information (passed via virtio_gpu_ui_info) on
device reset, so the user interface window layout will be kept intact
over reboots. The head size and position was commented out already, so
this patch just drops the dead code. Additionally the enabled head mask
must be kept so multihead setups work properly too.
Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1460595
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-id: 20170906142058.2460-1-kraxel@redhat.com
GCC 4.7.2 on SunOS reports that the values assigned to array members are not
real constants:
target/m68k/fpu_helper.c:32:5: error: initializer element is not constant
target/m68k/fpu_helper.c:32:5: error: (near initialization for 'fpu_rom[0]')
rules.mak:66: recipe for target 'target/m68k/fpu_helper.o' failed
Convert the array to make_floatx80_init() to fix it.
Replace floatx80_pi-like constants with make_floatx80_init() as they are
defined as make_floatx80().
This fixes build on SmartOS (Joyent).
Signed-off-by: Kamil Rytarowski <n54@gmx.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-Id: <20170904212306.3020-1-n54@gmx.com>
Signed-off-by: Laurent Vivier <laurent@vivier.eu>
ppc patch queue 2017-09-08
This is the first batch of ppc related patches for qemu-2.11, and it's
accumulated quite a few things. Includes:
* A cleanup to handling of ppc cpu models from Igor
* First parts of fixes to handling of guest vs. host SMT modes from
Sam Bobroff
* Preliminary patches towards supporting the Sam460 board from
Balaton Zoltan
* Several fixes for hotplug logic
* Assorted other fixes and cleanups
# gpg: Signature made Fri 08 Sep 2017 06:28:42 BST
# gpg: using RSA key 0x6C38CACA20D9B392
# gpg: Good signature from "David Gibson <david@gibson.dropbear.id.au>"
# gpg: aka "David Gibson (Red Hat) <dgibson@redhat.com>"
# gpg: aka "David Gibson (ozlabs.org) <dgibson@ozlabs.org>"
# gpg: aka "David Gibson (kernel.org) <dwg@kernel.org>"
# Primary key fingerprint: 75F4 6586 AE61 A66C C44E 87DC 6C38 CACA 20D9 B392
* remotes/dgibson/tags/ppc-for-2.11-20170908: (40 commits)
ppc: spapr: Move VCPU ID calculation into sPAPR
ppc: remove non implemented cpu models
ppc: drop caching ObjectClass from PowerPCCPUAlias
ppc: simplify cpu model lookup by PVR
ppc: replace inter-function cyclic dependency/recurssion with 2 simple lookups
ppc: make cpu alias point only to real cpu models
ppc: make cpu_model translation to type consistent
ppc: use macros to make cpu type name from string literal
target/ppc: Remove old STATUS file
PPC: KVM: Support machine option to set VSMT mode
spapr: fallback to raw mode if best compat mode cannot be set during CAS
hw/nvram/spapr_nvram: Device can not be created by the users
hw/ppc/spapr_cpu_core: Add a proper check for spapr machine
ppc4xx: Export ECB and PLB emulation
ppc4xx_i2c: Move to hw/i2c
ppc4xx_i2c: QOMify
ppc4xx: Split off 4xx I2C emulation from ppc405_uc to its own file
ppc4xx: Make MAL emulation more generic
ppc4xx: Move MAL from ppc405_uc to ppc4xx_devs
spapr_iommu: Realloc guest visible TCE table when hot(un)plugging vfio-pci
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
The callback is called on select.
Furthermore, the next patch introduced a new callback, so rename the
function type with a generic name.
Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Add a new slot_reserved_mask bitmask to PCIBus indicating whether or not each
PCI slot on the bus is reserved. Ensure that it is initialised to zero to
maintain the existing behaviour that all slots are available by default, and
add the additional check with appropriate error reporting to
do_pci_register_device().
Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Reviewed-by: Marcel Apfelbaum <marcel@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
This compat property sole function is to prevent the device from being
instantiated. Instead of requiring an extra compat property, check if
fw_cfg has DMA enabled.
fw_cfg is a built-in device that is initialized very early by the
machine init code. We have at least one other device that also
assumes fw_cfg_find() can be safely used on realize: pvpanic.
This has the additional benefit of handling other cases properly, like:
$ qemu-system-x86_64 -device vmgenid -machine none
qemu-system-x86_64: -device vmgenid: vmgenid requires DMA write support in fw_cfg, which this machine type does not provide
$ qemu-system-x86_64 -device vmgenid -machine pc-i440fx-2.9 -global fw_cfg.dma_enabled=off
qemu-system-x86_64: -device vmgenid: vmgenid requires DMA write support in fw_cfg, which this machine type does not provide
$ qemu-system-x86_64 -device vmgenid -machine pc-i440fx-2.6 -global fw_cfg.dma_enabled=on
[boots normally]
Suggested-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
Reviewed-by: Ben Warren <ben@skyportsystems.com>
Reviewed-by: Laszlo Ersek <lersek@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Moved vmgenid from uncategorized to misc category in QEMU help menu
Signed-off-by: Yoni Bettan <ybettan@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
In vtd_switch_address_space() we did the memory region switch, however
it's possible that the caller of it has not taken the BQL at all. Make
sure we have it.
CC: Paolo Bonzini <pbonzini@redhat.com>
CC: Jason Wang <jasowang@redhat.com>
CC: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
To enable hotplugging of a newly created pcie-pci-bridge,
we need to tell firmware (e.g. SeaBIOS) to reserve
additional buses or IO/MEM/PREF space for pcie-root-port.
Additional bus reservation allows us to hotplug pcie-pci-bridge into this root port.
The number of buses and IO/MEM/PREF space to reserve are provided to the device via
a corresponding property, and to the firmware via new PCI capability.
The properties' default values are -1 to keep default behavior unchanged.
Signed-off-by: Aleksandr Bezzubikov <zuban32s@gmail.com>
Reviewed-by: Marcel Apfelbaum <marcel@redhat.com>
Tested-by: Marcel Apfelbaum <marcel@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
On PCI init PCI bridges may need some extra info about bus number,
IO, memory and prefetchable memory to reserve. QEMU can provide this
with a special vendor-specific PCI capability.
Signed-off-by: Aleksandr Bezzubikov <zuban32s@gmail.com>
Reviewed-by: Marcel Apfelbaum <marcel@redhat.com>
Tested-by: Marcel Apfelbaum <marcel@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Introduce a new PCIExpress-to-PCI Bridge device,
which is a hot-pluggable PCI Express device and
supports devices hot-plug with SHPC.
This device is intended to replace the DMI-to-PCI Bridge.
Signed-off-by: Aleksandr Bezzubikov <zuban32s@gmail.com>
Reviewed-by: Marcel Apfelbaum <marcel@redhat.com>
Tested-by: Marcel Apfelbaum <marcel@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
This reverts commit 153eba4726.
This patch prevents PCI passthrough hotplug on Xen. Even if the Xen tool
stack prepares its own ACPI tables, we still rely on QEMU for hotplug
ACPI notifications.
The original issue is fixed by the two previous patch:
hw/acpi: Limit hotplug to root bus on legacy mode
hw/acpi: Move acpi_set_pci_info to pcihp
Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
HW part of ACPI PCI hotplug in QEMU depends on ACPI_PCIHP_PROP_BSEL
being set on a PCI bus that supports ACPI hotplug. It should work
regardless of the source of ACPI tables (QEMU generator/legacy SeaBIOS/Xen).
So move ACPI_PCIHP_PROP_BSEL initialization into HW ACPI implementation
part from QEMU's ACPI table generator.
To do PCI passthrough with Xen, the property ACPI_PCIHP_PROP_BSEL needs
to be set, but this was done only when ACPI tables are built which is
not needed for a Xen guest. The need for the property starts with commit
"pc: pcihp: avoid adding ACPI_PCIHP_PROP_BSEL twice"
(f0c9d64a68).
Adding find_i440fx into stubs so that mips-softmmu target can be built.
Reported-by: Sander Eikelenboom <linux@eikelenboom.it>
Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
vhost registers a MemoryListener where it adds and removes references
to MemoryRegions as the MemoryRegionSections pass through. The
region_add callback is invoked for each existing section when the
MemoryListener is registered, but unregistering the MemoryListener
performs no reciprocal region_del callback. It's therefore the
owner of the MemoryListener's responsibility to cleanup any persistent
changes, such as these memory references, after unregistering.
The consequence of this bug is that if we have both a vhost device
and a vfio device, the vhost device will reference any mmap'd MMIO of
the vfio device via this MemoryListener. If the vhost device is then
removed, those references remain outstanding. If we then attempt to
remove the vfio device, it never gets finalized and the only way to
release the kernel file descriptors is to terminate the QEMU process.
Fixes: dfde4e6e1a ("memory: add ref/unref calls")
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: qemu-stable@nongnu.org # v1.6.0+
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
# gpg: Signature made Fri 08 Sep 2017 03:00:34 BST
# gpg: using RSA key 0xEF04965B398D6211
# gpg: Good signature from "Jason Wang (Jason Wang on RedHat) <jasowang@redhat.com>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg: It is not certain that the signature belongs to the owner.
# Primary key fingerprint: 215D 46F4 8246 689E C77F 3562 EF04 965B 398D 6211
* remotes/jasowang/tags/net-pull-request:
colo-compare: Update the COLO document to add the IOThread configuration
colo-compare: Use IOThread to Check old packet regularly and Process pactkets of the primary
qemu-iothread: IOThread supports the GMainContext event loop
net/colo-compare.c: Fix comments and scheme
net/colo-compare.c: Adjust net queue pop order for performance
net/colo-compare.c: Optimize unpredictable tcp options comparison
e1000: Rename the SEC symbol to SEQEC
net/socket: Improve -net socket error reporting
net/net: Convert parse_host_port() to Error
net/socket: Convert several helper functions to Error
net/socket: Don't treat odd socket type as SOCK_STREAM
MAINTAINERS: Update mail address for COLO Proxy
net: rtl8139: do not use old_mmio accesses
net/rocker: Fix the unusual macro name
net/rocker: Convert to realize()
net/rocker: Plug memory leak in pci_rocker_init()
net/rocker: Remove the dead error handling
net/filter-rewriter.c: Fix rewirter checksum bug when use virtio-net
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
TCG constant pools
# gpg: Signature made Thu 07 Sep 2017 23:35:45 BST
# gpg: using RSA key 0x64DF38E8AF7E215F
# gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg: It is not certain that the signature belongs to the owner.
# Primary key fingerprint: 7A48 1E78 868B 4DB6 A85A 05C0 64DF 38E8 AF7E 215F
* remotes/rth/tags/pull-tcg-20170907: (23 commits)
tcg/ppc: Use constant pool for movi
tcg/ppc: Look for shifted constants
tcg/ppc: Change TCG_REG_RA to TCG_REG_TB
tcg/arm: Use constant pool for call
tcg/arm: Use constant pool for movi
tcg/arm: Extract INSN_NOP
tcg/arm: Code rearrangement
tcg/arm: Tighten tlb indexing offset test
tcg/arm: Improve tlb load for armv7
tcg/sparc: Use constant pool for movi
tcg/sparc: Introduce TCG_REG_TB
tcg/aarch64: Use constant pool for movi
tcg/s390: Use constant pool for cmpi
tcg/s390: Use constant pool for xori
tcg/s390: Use constant pool for ori
tcg/s390: Use constant pool for andi
tcg/s390: Use constant pool for movi
tcg/s390: Fix sign of patch_reloc addend
tcg/s390: Introduce TCG_REG_TB
tcg/i386: Store out-of-range call targets in constant pool
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Conversion to TranslatorOps
# gpg: Signature made Thu 07 Sep 2017 19:42:48 BST
# gpg: using RSA key 0x64DF38E8AF7E215F
# gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg: It is not certain that the signature belongs to the owner.
# Primary key fingerprint: 7A48 1E78 868B 4DB6 A85A 05C0 64DF 38E8 AF7E 215F
* remotes/rth/tags/pull-pa-20170907:
target/hppa: Convert to TranslatorOps
target/hppa: Convert to DisasContextBase
target/hppa: Convert to DisasJumpType
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Queued target/alpha patches
# gpg: Signature made Thu 07 Sep 2017 19:17:22 BST
# gpg: using RSA key 0x64DF38E8AF7E215F
# gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg: It is not certain that the signature belongs to the owner.
# Primary key fingerprint: 7A48 1E78 868B 4DB6 A85A 05C0 64DF 38E8 AF7E 215F
* remotes/rth/tags/pull-axp-20170907:
target/alpha: Switch to do_transaction_failed() hook
target/alpha: Convert to TranslatorOps
target/alpha: Convert to DisasContextBase
target/alpha: Convert to DisasJumpType
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Remove the task which check old packet in the comparing thread,
then use IOthread context timer to handle it.
Process pactkets in the IOThread which arrived over the socket.
we use iothread_get_g_main_context to create a new g_main_loop in
the IOThread.then the packets from the primary and the secondary
are processed in the IOThread.
Finally remove the colo-compare thread using the IOThread instead.
Reviewed-by: Zhang Chen<zhangchen.fnst@cn.fujitsu.com>
Signed-off-by: Wang Yong <wang.yong155@zte.com.cn>
Signed-off-by: Wang Guang <wang.guang55@zte.com.cn>
Signed-off-by: Jason Wang <jasowang@redhat.com>
IOThread uses AioContext event loop and does not run a GMainContext.
Therefore,chardev cannot work in IOThread,such as the chardev is
used for colo-compare packets reception.
This patch makes the IOThread run the GMainContext event loop,
chardev and IOThread can work together.
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Wang Yong <wang.yong155@zte.com.cn>
Signed-off-by: Wang Guang <wang.guang55@zte.com.cn>
Signed-off-by: Jason Wang <jasowang@redhat.com>
The packet_enqueue() use g_queue_push_tail() to
enqueue net packet, so it is more efficent way use
g_queue_pop_head() to get packet for compare.
That will improve the success rate of comparison.
In my test the performance of ftp put 1000M file
will increase 10%
Signed-off-by: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
When network is busy, some tcp options(like sack) will unpredictable
occur in primary side or secondary side. it will make packet size
not same, but the two packet's payload is identical. colo just
care about packet payload, so we skip the option field.
Signed-off-by: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
SunOS defines SEC in <sys/time.h> as 1 (commonly used time symbols).
This fixes build on SmartOS (Joyent).
Patch cherry-picked from pkgsrc by jperkin (Joyent).
Signed-off-by: Kamil Rytarowski <n54@gmx.com>
Reviewed-by: Dmitry Fleytman <dmitry@daynix.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
When -net socket fails, it first reports a specific error, then
a generic one, like this:
$ ./x86_64-softmmu/qemu-system-x86_64 -net socket,mcast=230.0.0.1:1234,listen
qemu-system-x86_64: -net socket,mcast=230.0.0.1:1234,listen: exactly one of listen=, connect=, mcast= or udp= is required
qemu-system-x86_64: -net socket,mcast=230.0.0.1:1234,listen: Device 'socket' could not be initialized
Convert net_socket_*_init() to Error to get rid of the superfluous second
error message. After the patch, the effect like this:
$ ./x86_64-softmmu/qemu-system-x86_64 -net socket,mcast=230.0.0.1:1234,listen
qemu-system-x86_64: -net socket,mcast=230.0.0.1:1234,listen: exactly one of listen=, connect=, mcast= or udp= is requireda
This also fixes a few silent failures to report an error.
Cc: jasowang@redhat.com
Cc: armbru@redhat.com
Cc: berrange@redhat.com
Signed-off-by: Mao Zhongyi <maozy.fnst@cn.fujitsu.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
In net_socket_fd_init(), the 'default' case is odd: it warns,
then continues as if the socket type was SOCK_STREAM. The
comment explains "this could be a eg. a pty", but that makes
no sense. If @fd really was a pty, getsockopt() would fail
with ENOTSOCK. If @fd was a socket, but neither SOCK_DGRAM nor
SOCK_STREAM. It should not be treated as if it was SOCK_STREAM.
Turn this case into an Error. If there is a genuine reason to
support something like SOCK_RAW, it should be explicitly
handled.
Cc: jasowang@redhat.com
Cc: armbru@redhat.com
Cc: berrange@redhat.com
Cc: armbru@redhat.com
Cc: eblake@redhat.com
Suggested-by: Markus Armbruster <armbru@redhat.com>
Suggested-by: Daniel P. Berrange <berrange@redhat.com>
Signed-off-by: Mao Zhongyi <maozy.fnst@cn.fujitsu.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
My Fujitsu mail account will be disabled soon, update the mail info
to my private mail.
Signed-off-by: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Both io and memory use the same mmio functions in the rtl8139 device.
This patch removes the separate MemoryRegionOps and old_mmio accessors
for memory, and replaces it with an alias to the io memory region.
Signed-off-by: Matt Parker <mtparkr@gmail.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
The rocker device still implements the old PCIDeviceClass .init()
instead of the new .realize(). All devices need to be converted to
.realize().
.init() reports errors with fprintf() and return 0 on success, negative
number on failure. Meanwhile, when -device rocker fails, it first report
a specific error, then a generic one, like this:
$ x86_64-softmmu/qemu-system-x86_64 -device rocker,name=qemu-rocker
rocker: name too long; please shorten to at most 9 chars
qemu-system-x86_64: -device rocker,name=qemu-rocker: Device initialization failed
Now, convert it to .realize() that passes errors to its callers via its
errp argument. Also avoid the superfluous second error message. After
the patch, effect like this:
$ x86_64-softmmu/qemu-system-x86_64 -device rocker,name=qemu-rocker
qemu-system-x86_64: -device rocker,name=qemu-rocker: name too long; please shorten to at most 9 chars
Cc: jasowang@redhat.com
Cc: jiri@resnulli.us
Cc: armbru@redhat.com
Cc: f4bug@amsat.org
Signed-off-by: Mao Zhongyi <maozy.fnst@cn.fujitsu.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Memory allocation functions like world_alloc, desc_ring_alloc etc,
they are all wrappers around g_malloc, g_new etc. But g_malloc and
similar functions doesn't return null. Because they ignore the fact
that g_malloc() of 0 bytes returns null. So error checks for these
allocation failure are superfluous. Now, remove them entirely.
Cc: jasowang@redhat.com
Cc: jiri@resnulli.us
Cc: armbru@redhat.com
Cc: f4bug@amsat.org
Signed-off-by: Mao Zhongyi <maozy.fnst@cn.fujitsu.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Move the calculation of a CPU's VCPU ID out of the generic PPC code
(ppc_cpu_realizefn()) and into sPAPR specific code
(spapr_cpu_core_realize()) where it belongs.
Unfortunately, due to the way things are ordered, we still need to
default the VCPU ID in ppc_cpu_realizfn() but at least doing that
doesn't require any interaction with sPAPR.
Signed-off-by: Sam Bobroff <sam.bobroff@au1.ibm.com>
Reviewed-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Remove cpu models that aren't implemented and are not
compiled/tested since they are under TODO ifdef
which isn't defined in sources.
If someone really needs a removed model he/she should add
as regular one with corresponding implementation.
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Caching there practically doesn't give any benefits
and that at slow path druring querying supported CPU list.
But it introduces non conventional path of where from
comes used CPU type name (kvm_ppc_register_host_cpu_type).
Taking in account that kvm_ppc_register_host_cpu_type()
fixes up models the aliases point to, it's sufficient to
make ppc_cpu_class_by_name() translate cpu alias to
correct cpu type name.
So drop PowerPCCPUAlias::oc field + ppc_cpu_class_by_alias()
and let ppc_cpu_class_by_name() do conversion to cpu type name,
which simplifies code a little bit saving ~20LOC and trouble
wondering why ppc_cpu_class_by_alias() is necessary.
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
previous patches cleaned up cpu model/alias naming which
allows to simplify cpu model/alias to cpu type lookup a bit
byt removing recurssion and dependency of ppc_cpu_class_by_name() /
ppc_cpu_class_by_alias() on each other.
Besides of simplifying code it reduces it by ~15LOC.
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
alias pointing to another alias forces lookup code to
do recurrsive translation till real cpu model is reached.
Drop this nonsence and make each alias point to cpu model
that has corresponding CPU type. It will allow to drop
recurrsion in cpu model translation code and actually
make ppc_cpu_aliases[] content use PowerPCCPUAlias
fields properly
(i.e. alias goes into .alias and model goes into .model)
While at it add TODO defines around aliases that point to
cpu models excluded by the same TODO defines.
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Acked-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
PPC handles -cpu FOO rather incosistently,
i.e. it does case-insensitive matching of FOO to
a CPU type (see: ppc_cpu_compare_class_name) but
handles alias names as case-sensitive, as result:
# qemu-system-ppc64 -M mac99 -cpu g3
qemu-system-ppc64: unable to find CPU model ' kN�U'
# qemu-system-ppc64 -cpu 970MP_V1.1
qemu-system-ppc64: Unable to find sPAPR CPU Core definition
while
# qemu-system-ppc64 -M mac99 -cpu G3
# qemu-system-ppc64 -cpu 970MP_v1.1
start up just fine.
Considering we can't take case-insensitive matching away,
make it case-insensitive for all alias/type/core_type
lookups.
As side effect it allows to remove duplicate core types
which are the same except of using different cased letters in name.
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Replace
  "-" TYPE_POWERPC_CPU
when composing a CPU type name from a CPU model string literal,
and the same pattern in format strings, with the
POWERPC_CPU_TYPE_SUFFIX and POWERPC_CPU_TYPE_NAME(model)
macros, like we do on x86.
Later, POWERPC_CPU_TYPE_NAME() will be used to define the default
CPU type per machine type, and as a bonus it gives a consistent,
easily greppable pattern across all the other targets that I'm
planning to treat the same way.
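For illustration, a minimal sketch of what such suffix macros typically look
like, modeled on the x86 ones (the exact header and expansion in the tree may
differ):

    /* compose a CPU QOM type name from a CPU model string literal */
    #define POWERPC_CPU_TYPE_SUFFIX "-" TYPE_POWERPC_CPU
    #define POWERPC_CPU_TYPE_NAME(model) model POWERPC_CPU_TYPE_SUFFIX

    /* e.g. POWERPC_CPU_TYPE_NAME("970mp_v1.1") concatenates the model string
     * with whatever TYPE_POWERPC_CPU expands to */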
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
The target/ppc/STATUS file saw its last real update 10 years
ago, so the information in there is no longer up to date. Since
nobody seems to care about this file, let's simply remove it.
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
KVM now allows writing to KVM_CAP_PPC_SMT which has previously been
read only. Doing so causes KVM to act, for that VM, as if the host's
SMT mode was the given value. This is particularly important on Power
9 systems because their default value is 1, but they are able to
support values up to 8.
This patch introduces a way to control this capability via a new
machine property called VSMT ("Virtual SMT"). If the value is not set
on the command line a default is chosen that is, when possible,
compatible with legacy systems.
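For illustration, assuming the property is exposed in lower case as "vsmt" on
the pseries machine type, it could be set like this:

    $ qemu-system-ppc64 -machine pseries,vsmt=4 ...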
Note that the initialization of KVM_CAP_PPC_SMT has changed slightly
because it has changed (in KVM) from a global capability to a
VM-specific one. This won't cause a problem on older KVMs because VM
capabilities fall back to global ones.
Signed-off-by: Sam Bobroff <sam.bobroff@au1.ibm.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
KVM PR doesn't allow setting a compat mode. This causes ppc_set_compat_all()
to fail and we return H_HARDWARE to the guest right away.
This is excessive: even if we favor compat mode since commit 152ef803ce,
we should at least fall back to raw mode if the guest supports it.
This patch modifies cas_check_pvr() so that it also reports whether the real
PVR was found in the table supplied by the guest. Note that this only
makes sense if raw mode isn't explicitly disabled (i.e. the user didn't
set the machine "max-cpu-compat" property). If this is the case, we can
simply ignore ppc_set_compat_all() failures and let the guest run in raw
mode.
Signed-off-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Trying to add a spapr-nvram device currently aborts QEMU like this:
$ ppc64-softmmu/qemu-system-ppc64 -device spapr-nvram
qemu-system-ppc64: hw/ppc/spapr_rtas.c:407: spapr_rtas_register:
Assertion `!rtas_table[token].name' failed.
Aborted (core dumped)
This NVRAM device registers RTAS calls during its realize function
and thus can only be used once - and that's internally from spapr.c.
So let's mark the device with user_creatable = false so that
users cannot crash their QEMU this way.
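A minimal sketch of how a device is typically marked this way in its
class_init (the function name and surrounding assignments are illustrative,
not the literal spapr_nvram.c hunk):

    static void my_device_class_init(ObjectClass *klass, void *data)
    {
        DeviceClass *dc = DEVICE_CLASS(klass);

        /* registers RTAS calls in realize, so it must only be created
         * internally by the machine code, never via -device/device_add */
        dc->user_creatable = false;
    }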
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
QEMU currently crashes when the user tries to add a spapr-cpu-core
on a non-pseries machine:
$ qemu-system-ppc64 -S -machine ppce500,accel=tcg \
-device POWER5+_v2.1-spapr-cpu-core
hw/ppc/spapr_cpu_core.c:178:spapr_cpu_core_realize_child:
Object 0x55cee1f55160 is not an instance of type spapr-machine
Aborted (core dumped)
So let's add a proper check for the correct machine type here,
with a friendlier error message.
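A sketch of the kind of check added, hedged as to the exact wording and the
surrounding realize code:

    static void spapr_cpu_core_realize(DeviceState *dev, Error **errp)
    {
        /* refuse to realize on anything but a pseries machine */
        if (!object_dynamic_cast(qdev_get_machine(), TYPE_SPAPR_MACHINE)) {
            error_setg(errp, "spapr-cpu-core needs a pseries machine");
            return;
        }
        /* normal realize work continues here */
    }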
Reported-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Make these device models available outside ppc405_uc.c for reuse in
460EX emulation. They are left in their current place for now because
they are used mostly unchanged and I'm not sure these correctly model
the components in 440 SoCs (but they seem to be good enough). These
functions could be moved in a subsequent clean up series when this is
confirmed.
Signed-off-by: BALATON Zoltan <balaton@eik.bme.hu>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
This device appears in other SoCs as well, not just in 405 ones, and
subsequent patches will modify it, so move it out of ppc405_uc.c in
preparation.
Signed-off-by: BALATON Zoltan <balaton@eik.bme.hu>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
This replaces g_malloc() with spapr_tce_alloc_table(), as this is
the standard way of allocating tables, and it allows moving the table
back to KVM when unplugging a VFIO PCI device while VFIO TCE acceleration
support is not present in KVM.
Although spapr_tce_alloc_table() is expected to fail with EBUSY
if called when the previous fd is not closed yet, in practice we will not
see this because cap_spapr_vfio is false at the moment.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Some OSes don't populate the TSIZE field when using a fixed-size TLB, which
results in a 1KB TLB. When the TLB is a fixed-size TLB, the TSIZE field
should be ignored.
Fix this wrong behaviour for MAV 2.0.
Signed-off-by: KONRAD Frederic <frederic.konrad@adacore.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
This fixes booke206_tlbnps() for MAV 2.0 by checking the MMUCFG register and
returning the right tlbnps directly instead of computing it from a
non-existent field.
Signed-off-by: KONRAD Frederic <frederic.konrad@adacore.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
The concept of a VCPU ID that differs from the CPU's index
(cpu->cpu_index) exists only within sPAPR machines, so move the
functions ppc_get_vcpu_id() and ppc_get_cpu_by_vcpu_id() into spapr.c
and rename them appropriately.
Signed-off-by: Sam Bobroff <sam.bobroff@au1.ibm.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
This field actually records the VCPU ID used by KVM and, although the
value is also used in the device tree, it is primarily the VCPU ID,
so rename it as such.
Signed-off-by: Sam Bobroff <sam.bobroff@au1.ibm.com>
[dwg: Updated comment missed in cpu.h]
Reviewed-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
The e500 platform code uses the function ppc_get_vcpu_dt_id() to get
an id to put in its device tree. Which seems like it makes sense, but
ppc_get_vcpu_dt_id() is actually badly named - it only differs from
cpu_index in cases where you're running on KVM HV and the host's
number of threads differs from the guest's. Since KVM HV only supports
PAPR, not e500, it doesn't make sense to use it here.
Simply use the cpu_index instead (which is 'i' in this context
because qemu_get_cpu(i) returns the cpu with cpu_index == i).
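For illustration, a hypothetical helper showing the substitution (the name
e500_cpu_dt_id is not from the tree):

    /* with KVM HV out of the picture, the id written to the device tree is
     * just the cpu_index, which equals the loop index 'i' */
    static int e500_cpu_dt_id(int i)
    {
        CPUState *cpu = qemu_get_cpu(i);

        assert(cpu && cpu->cpu_index == i);
        return cpu->cpu_index;
    }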
Signed-off-by: Sam Bobroff <sam.bobroff@au1.ibm.com>
Reviewed-by: Greg Kurz <groug@kaod.org>
[dwg: Rewrote commit message]
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
When hot-unplugging a PHB, all its PCI DRC connectors get unrealized. This
patch adds an unrealize method to the physical DRC class, in order to undo
registrations performed in realize_physical().
Signed-off-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
This memory region should be owned by the PHB. This ensures the PHB
cannot be finalized as long as the region is guest visible, or
used by a CPU or a device.
Signed-off-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Passing a stack allocated buffer of arbitrary length to snprintf()
without checking the return value can cause the resultant strings
to be silently truncated.
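One common way to avoid the problem, sketched here for illustration rather
than as the literal change ('name' and 'addr' are placeholder variables):

    /* before: silent truncation is possible if buf is too small */
    char buf[32];
    snprintf(buf, sizeof(buf), "%s@%" PRIx64, name, addr);

    /* after: let glib allocate exactly as much as is needed */
    char *str = g_strdup_printf("%s@%" PRIx64, name, addr);
    /* ... use str ... */
    g_free(str);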
Signed-off-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Passing a stack allocated buffer of arbitrary length to snprintf()
without checking the return value can cause the resultant strings
to be silently truncated.
Signed-off-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Passing a null priority to memory_region_add_subregion_overlap() is
strictly equivalent to calling memory_region_add_subregion().
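In other words (sketch), the following two calls behave identically:

    memory_region_add_subregion_overlap(parent, offset, mr, 0);
    memory_region_add_subregion(parent, offset, mr);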
Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
This patch is a follow-up to the discussion of the patch
"hw/ppc: disable hotplug before CAS is completed" that can be
found at [1].
At this moment, we do not support CPU/memory hotplug in early
boot stages, before CAS. When a hotplug occurs, the event is logged
in an internal RTAS event log queue and an IRQ pulse is fired. In
regular conditions, the guest handles the interrupt by executing
check_exception, fetching the generated hotplug event and enabling
the device for use.
In early boot, this IRQ isn't caught (SLOF does not handle hotplug
events), leaving the event in the rtas event log queue. If the guest
executes check_exception due to another hotplug event, the re-assertion
of the IRQ ends up de-queuing the first hotplug event as well. In short,
a device hotplugged before CAS is considered coldplugged by SLOF.
This leads to device misbehavior and, in some cases, a guest kernel
Oops when trying to unplug the device.
A proper fix would be to treat every device hotplugged before CAS
as a coldplugged device. This is not trivial to do with the current
code base though - the FDT is written in guest memory at
ppc_spapr_reset and can't be retrieved without adding extra state
(fdt_size for example) that would need to be managed and migrated. Adding
the hotplugged DT in the middle of CAS negotiation via the updated DT
tree works for CPU devices, but panics the guest kernel at boot. Additional
analysis would be necessary for LMBs and PCI devices. There are
questions to be settled at the QEMU/SLOF/kernel level about how we can make
this change in a sustainable way.
With Linux guests, a fix would be the kernel executing check_exception
at boot time, de-queueing the events that happened in early boot and
processing them. However, even if/when the newer kernels start
fetching these events at boot time, we need to take care of older
kernels that won't be doing that.
This patch works around the situation by issuing a CAS reset if a hotplugged
device is detected during CAS:
- the DRC conditions that warrant a CAS reset are the same as those that
trigger a DRC migration - the DRC must have a device attached and
the DRC state must not be equal to its ready_state. With that in mind, this
patch makes use of 'spapr_drc_needed' to determine if a CAS reset
is needed.
- in the middle of CAS negotiations, the function
'spapr_hotplugged_dev_before_cas' goes through all the DRCs to see
if any of them requires a reset, using spapr_drc_needed (see the sketch
below). If so, spapr_h_cas_compose_response() returns 1, which sets
spapr->cas_reboot to true, causing the machine to reboot.
No changes are made for coldplug devices.
[1] http://lists.nongnu.org/archive/html/qemu-devel/2017-08/msg02855.html
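For illustration, a rough sketch of the detection described above, reusing the
helper names from this commit (the exact iteration helpers and signatures in
the tree may differ):

    static int spapr_drc_hotplugged(Object *child, void *opaque)
    {
        /* A DRC that vmstate considers "needed" has a device attached and is
         * not in its ready_state, i.e. the device was hotplugged. */
        if (object_dynamic_cast(child, TYPE_SPAPR_DR_CONNECTOR) &&
            spapr_drc_needed(child)) {
            return 1;   /* stop the walk: a CAS reset is required */
        }
        return 0;
    }

    static bool spapr_hotplugged_dev_before_cas(void)
    {
        return object_child_foreach_recursive(object_get_root(),
                                              spapr_drc_hotplugged, NULL) != 0;
    }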
Signed-off-by: Daniel Henrique Barboza <danielhb@linux.vnet.ibm.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
The sPAPR machine isn't cleaning up the pending events QTAILQ on
machine reboot. This allows unprocessed hotplug/epow events
to persist in the queue after reset and, when the IRQs are reasserted in
check_exception later on, these stale events end up being processed by the OS.
This patch implements a new function called 'spapr_clear_pending_events'
that clears the pending_events QTAILQ. This helper is then called
from ppc_spapr_reset to clear the event queue, preventing
old/stale events from persisting after a reset.
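A minimal sketch of such a helper, assuming the event log entries are linked
through a 'next' field and own no other allocations (the real structure may
differ):

    static void spapr_clear_pending_events(sPAPRMachineState *spapr)
    {
        sPAPREventLogEntry *entry, *next_entry;

        QTAILQ_FOREACH_SAFE(entry, &spapr->pending_events, next, next_entry) {
            QTAILQ_REMOVE(&spapr->pending_events, entry, next);
            g_free(entry);
        }
    }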
Signed-off-by: Daniel Henrique Barboza <danielhb@linux.vnet.ibm.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
This patch makes a small fix to 'spapr_drc_needed' to change how we detect
whether a DRC has a device attached. Previously it used dr_entity_sense for
this, which works for physical DRCs.
However, for logical DRCs, it didn't cover the case where a logical DRC has
a drc->dev but the state is LOGICAL_UNUSABLE (e.g. a hotplugged CPU before
CAS). In this case, the dr_entity_sense of this DRC returns UNUSABLE and the
code considered that there was no device attached, making spapr_drc_needed
return 'false' when in fact we would like to migrate the DRC.
Checking drc->dev instead works for all DRC types.
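In other words, roughly (sketch, not the literal diff):

    /* before: presence inferred from the sense state, which reports UNUSABLE
     * for a hotplugged-but-not-yet-enabled logical DRC */
    /* after: simply check whether a device is attached */
    bool has_dev = (drc->dev != NULL);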
Signed-off-by: Daniel Henrique Barboza <danielhb@linux.vnet.ibm.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
At this point the conversion is a wash. Loading of TB+ofs is
smaller, but the actual return address from exit_tb is larger.
There are a few more insns required to transition between TBs.
But the expectation is that accesses to the constant pool will
on the whole be smaller.
Signed-off-by: Richard Henderson <rth@twiddle.net>
Move constants before all of the functions.
Move tcg_out_<format> functions before all
of the others. No functional change.
Signed-off-by: Richard Henderson <rth@twiddle.net>
We are not going to use ldrd for loading the comparator
for 32-bit guests, so don't limit cmp_off to 8 bits then.
This eliminates one insn in the tlb load for some guests.
Signed-off-by: Richard Henderson <rth@twiddle.net>
Use UBFX to avoid limitation on CPU_TLB_BITS. Since we're dropping
the initial shift, we need to replace the page masking. We can use
MOVW+BIC to do this without shifting. The result is the same size
as the armv6 path with one less conditional instruction.
Signed-off-by: Richard Henderson <rth@twiddle.net>
We were passing in -2 instead of +2, but then ignoring
the actual contents of addend in the calculation.
Signed-off-by: Richard Henderson <rth@twiddle.net>
Already it saves 2 bytes per call, but also the constant pool
entry may well be shared across multiple calls.
Signed-off-by: Richard Henderson <rth@twiddle.net>
A new shared header tcg-pool.inc.c adds new_pool_label,
for registering a tcg_target_ulong to be emitted after
the generated code, plus relocation data to install a
pointer to the data.
A new pointer is added to the TCGContext, so that we
dump the constant pool as data, not code.
Signed-off-by: Richard Henderson <rth@twiddle.net>
Dispense with TCGBackendData, as it has never been used for more than
holding a single pointer. Use a define in the cpu/tcg-target.h to
signal requirement for TCGLabelQemuLdst, so that we can drop the no-op
tcg-be-null.h stubs. Rename tcg-be-ldst.h to tcg-ldst.inc.c.
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Richard Henderson <rth@twiddle.net>
Replace the USE_DIRECT_JUMP ifdef with a TCG_TARGET_HAS_direct_jump
boolean test. Replace the tb_set_jmp_target1 ifdef with an unconditional
function tb_target_set_jmp_target.
While we're touching all backends, add a parameter for tb->tc_ptr;
we're going to need it shortly for some backends.
Move tb_set_jmp_target and tb_add_jump from exec-all.h to cpu-exec.c.
This opens the possibility for TCG_TARGET_HAS_direct_jump to be
a runtime decision -- based on host cpu capabilities, the size of
code_gen_buffer, or a future debugging switch.
Signed-off-by: Richard Henderson <rth@twiddle.net>
Switch the alpha target from the old unassigned_access hook
to the new do_transaction_failed hook. This allows us to
resolve a ??? in the old hook implementation.
The only part of the alpha target that does physical
memory accesses is reading the page table -- add a
TODO comment there to the effect that we should handle
bus faults on page table walks. (Since the palcode
doesn't actually do anything useful on a bus fault anyway
it's a bit moot for now.)
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Message-Id: <1502196172-13818-1-git-send-email-peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Nobody has mentioned AIX host support on the mailing list for years,
and we have no test systems for it so it is most likely broken.
We've advertised in configure for two releases now that we plan
to drop support for this host OS, and have had no complaints.
Drop the AIX host support code.
We can also drop the now-unused AIX version of sys_cache_info().
Note that the _CALL_AIX define used in the PPC tcg backend is
also used for Linux PPC64, and so that code should not be removed.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Laurent Vivier <laurent@vivier.eu>
Reviewed-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 1504545540-8002-1-git-send-email-peter.maydell@linaro.org
nbd patches for 2017-09-06
- Daniel P. Berrange: [0/2] Fix / skip recent iotests with LUKS driver
- Eric Blake: [0/3] nbd: Use common read/write-all qio functions
# gpg: Signature made Wed 06 Sep 2017 16:17:55 BST
# gpg: using RSA key 0xA7A16B4A2527436A
# gpg: Good signature from "Eric Blake <eblake@redhat.com>"
# gpg: aka "Eric Blake (Free Software Programmer) <ebb9@byu.net>"
# gpg: aka "[jpeg image of size 6874]"
# Primary key fingerprint: 71C2 CC22 B1C4 6029 27D2 F3AA A7A1 6B4A 2527 436A
* remotes/ericb/tags/pull-nbd-2017-09-06:
nbd: Use new qio_channel_*_all() functions
io: Add new qio_channel_read{, v}_all_eof functions
io: Yield rather than wait when already in coroutine
iotests: blacklist 194 with the luks driver
iotests: rewrite 192 to use _launch_qemu to fix LUKS support
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
target-arm:
* cleanups converting to DEFINE_PROP_LINK
* allwinner-a10: mark as not user-creatable
* initial patches working towards ARMv8M support
* implement generating aborts on memory transaction failures
* make BXJ behave correctly (ie not UNDEF) on ARMv6-and-later
# gpg: Signature made Thu 07 Sep 2017 14:26:07 BST
# gpg: using RSA key 0x3C2525ED14360CDE
# gpg: Good signature from "Peter Maydell <peter.maydell@linaro.org>"
# gpg: aka "Peter Maydell <pmaydell@gmail.com>"
# gpg: aka "Peter Maydell <pmaydell@chiark.greenend.org.uk>"
# Primary key fingerprint: E1A5 C593 CD41 9DE2 8E83 15CF 3C25 25ED 1436 0CDE
* remotes/pmaydell/tags/pull-target-arm-20170907: (31 commits)
target/arm: Add Jazelle feature
target/arm: Implement new do_transaction_failed hook
hw/arm: Set ignore_memory_transaction_failures for most ARM boards
boards.h: Define new flag ignore_memory_transaction_failures
target/arm: Implement BXNS, and banked stack pointers
target/arm: Move regime_is_secure() to target/arm/internals.h
target/arm: Make CFSR register banked for v8M
target/arm: Make MMFAR banked for v8M
target/arm: Make CCR register banked for v8M
target/arm: Make MPU_CTRL register banked for v8M
target/arm: Make MPU_RNR register banked for v8M
target/arm: Make MPU_RBAR, MPU_RLAR banked for v8M
target/arm: Make MPU_MAIR0, MPU_MAIR1 registers banked for v8M
target/arm: Make VTOR register banked for v8M
nvic: Add NS alias SCS region
target/arm: Make CONTROL register banked for v8M
target/arm: Make FAULTMASK register banked for v8M
target/arm: Make PRIMASK register banked for v8M
target/arm: Make BASEPRI register banked for v8M
target/arm: Add MMU indexes for secure v8M
...
# Conflicts:
# target/arm/translate.c
migration pull 2017-09-06
# gpg: Signature made Wed 06 Sep 2017 19:39:23 BST
# gpg: using RSA key 0x0516331EBC5BFDE7
# gpg: Good signature from "Dr. David Alan Gilbert (RH2) <dgilbert@redhat.com>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg: It is not certain that the signature belongs to the owner.
# Primary key fingerprint: 45F5 C71B 4A0C B7FB 977A 9FA9 0516 331E BC5B FDE7
* remotes/dgilbert/tags/pull-migration-20170906a:
migration: dump str in migrate_set_state trace
snapshot/tests: Try loadvm twice
migration: Reset rather than destroy main_thread_load_event
runstate/migrate: Two more transitions
host-utils: Simplify pow2ceil()
host-utils: Proactively fix pow2floor(), switch to unsigned
xbzrle: Drop unused cache_resize()
migration: Report when bdrv_inactivate_all fails
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
tcg generic translate loop v15
# gpg: Signature made Wed 06 Sep 2017 17:02:31 BST
# gpg: using RSA key 0x64DF38E8AF7E215F
# gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg: It is not certain that the signature belongs to the owner.
# Primary key fingerprint: 7A48 1E78 868B 4DB6 A85A 05C0 64DF 38E8 AF7E 215F
* remotes/rth/tags/pull-tgt-20170906: (32 commits)
target/arm: Perform per-insn cross-page check only for Thumb
target/arm: Split out thumb_tr_translate_insn
target/arm: Move ss check to init_disas_context
target/arm: [a64] Move page and ss checks to init_disas_context
target/arm: [tcg] Port to generic translation framework
target/arm: [tcg,a64] Port to disas_log
target/arm: [tcg] Port to disas_log
target/arm: [tcg,a64] Port to tb_stop
target/arm: [tcg] Port to tb_stop
target/arm: [tcg,a64] Port to translate_insn
target/arm: [tcg] Port to translate_insn
target/arm: [tcg,a64] Port to breakpoint_check
target/arm: [tcg,a64] Port to insn_start
target/arm: [tcg] Port to insn_start
target/arm: [tcg] Port to tb_start
target/arm: [tcg,a64] Port to init_disas_context
target/arm: [tcg] Port to init_disas_context
target/arm: [tcg] Port to DisasContextBase
target/i386: [tcg] Port to generic translation framework
target/i386: [tcg] Port to disas_log
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This adds a feature bit indicating support of the (trivial) Jazelle
implementation if ARM_FEATURE_V6 is set or if the processor is arm926
or arm1026. This fixes the issue that any BXJ instruction will
result in an illegal_op. BXJ instructions will now check if the
architecture supports ARM_FEATURE_JAZELLE.
Signed-off-by: Portia Stephens <portia.stephens@xilinx.com>
Reviewed-by: Alistair Francis <alistair.francis@xilinx.com>
Message-id: 20170905211232.11092-1-portia.stephens@xilinx.com
[PMM: edited commit message and comment text a bit]
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Set the MachineClass flag ignore_memory_transaction_failures
for almost all ARM boards. This means they retain the legacy
behaviour that accesses to unimplemented addresses will RAZ/WI
rather than aborting, when a subsequent commit adds support
for external aborts.
The exceptions are:
* virt -- we know that guests won't try to prod devices
that we don't describe in the device tree or ACPI tables
* mps2 -- this board was written to use unimplemented-device
for all the ranges with devices we don't yet handle
New boards should not set the flag, but instead be written
like the mps2.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Alistair Francis <alistair.francis@xilinx.com>
Message-id: 1504626814-23124-3-git-send-email-peter.maydell@linaro.org
For the Xilinx boards:
Reviewed-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
Define a new MachineClass field ignore_memory_transaction_failures.
If this flag is true then the CPU will ignore memory transaction
failures which should cause the CPU to take an exception due to an
access to an unassigned physical address; the transaction will
instead return zero (for a read) or be ignored (for a write). This
should be set only by legacy board models which rely on the old
RAZ/WI behaviour for handling devices that QEMU does not yet model.
New board models should instead use "unimplemented-device" for all
memory ranges where the guest will attempt to probe for a device that
QEMU doesn't implement and a stub device is required.
We need this for ARM boards, where we're about to implement support for
generating external aborts on memory transaction failures. Too many
of our legacy board models rely on the RAZ/WI behaviour and we
would break currently working guests when their "probe for device"
code provoked an external abort rather than a RAZ.
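A sketch of how a legacy board would opt back into the old behaviour in its
class_init (the function names here are illustrative):

    static void my_legacy_board_class_init(ObjectClass *oc, void *data)
    {
        MachineClass *mc = MACHINE_CLASS(oc);

        mc->desc = "Some legacy ARM board";
        mc->init = my_legacy_board_init;
        /* keep RAZ/WI for unassigned addresses instead of external aborts */
        mc->ignore_memory_transaction_failures = true;
    }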
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Alistair Francis <alistair.francis@xilinx.com>
Message-id: 1504626814-23124-2-git-send-email-peter.maydell@linaro.org
Implement the BXNS v8M instruction, which is like BX but will do a
jump-and-switch-to-NonSecure if the branch target address has bit 0
clear.
This is the first piece of code which implements "switch to the
other security state", so the commit also includes the code to
switch the stack pointers around, which is the only complicated
part of switching security state.
BLXNS is more complicated than just "BXNS but set the link register",
so we leave it for a separate commit.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1503414539-28762-21-git-send-email-peter.maydell@linaro.org
Make the CCR register banked if v8M security extensions are enabled.
This is slightly more complicated than the other "add banking"
patches because there is one bit in the register which is not
banked. We keep the live data in the NS copy of the register,
and adjust it on register reads and writes. (Since we don't
currently implement the behaviour that the bit controls, there
is nowhere else that needs to care.)
This patch includes the enforcement of the bits which are newly
RES1 in ARMv8M.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Message-id: 1503414539-28762-17-git-send-email-peter.maydell@linaro.org
Make the MPU registers MPU_MAIR0 and MPU_MAIR1 banked if v8M security
extensions are enabled.
We can freely add more items to vmstate_m_security without
breaking migration compatibility, because no CPU currently
has the ARM_FEATURE_M_SECURITY bit enabled and so this
subsection is not yet used by anything.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1503414539-28762-14-git-send-email-peter.maydell@linaro.org
For v8M the range 0xe002e000..0xe002efff is an alias region which
for secure accesses behaves like a NonSecure access to the main
SCS region. (For nonsecure accesses including when the security
extension is not implemented, it is RAZ/WI.)
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Message-id: 1503414539-28762-11-git-send-email-peter.maydell@linaro.org
Make the FAULTMASK register banked if v8M security extensions are enabled.
Note that we do not yet implement the functionality of the new
AIRCR.PRIS bit (which allows the effect of the NS copy of FAULTMASK to
be restricted).
This patch includes the code to determine for v8M which copy
of FAULTMASK should be updated on exception exit; further
changes will be required to the exception exit code in general
to support v8M, so this is just a small piece of that.
The v8M ARM ARM introduces a notation where individual paragraphs
are labelled with R (for rule) or I (for information) followed
by a random group of subscript letters. In comments where we want
to refer to a particular part of the manual we use this convention,
which should be more stable across document revisions than using
section or page numbers.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1503414539-28762-9-git-send-email-peter.maydell@linaro.org
As the first step in implementing ARM v8M's security extension:
* add a new feature bit ARM_FEATURE_M_SECURITY
* add the CPU state field that indicates whether the CPU is
currently in the secure state
* add a migration subsection for this new state
(we will add the Secure copies of banked register state
to this subsection in later patches)
* add a #define for the one new-in-v8M exception type
* make the CPU debug log print S/NS status
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1503414539-28762-4-git-send-email-peter.maydell@linaro.org
As part of ARMv8M, we need to add support for the PMSAv8 MPU
architecture.
PMSAv8 differs from PMSAv7 both in register/data layout (for instance
using base and limit registers rather than base and size) and also in
behaviour (for example it does not have subregions); rather than
trying to wedge it into the existing PMSAv7 code and data structures,
we define separate ones.
This commit adds the data structures which hold the state for a
PMSAv8 MPU and the register interface to it. The implementation of
the MPU behaviour will be added in a subsequent commit.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1503414539-28762-2-git-send-email-peter.maydell@linaro.org
QEMU currently exits unexpectedly when the user accidentally
tries to do something like this:
$ aarch64-softmmu/qemu-system-aarch64 -S -M integratorcp -nographic
QEMU 2.9.93 monitor - type 'help' for more information
(qemu) device_add allwinner-a10
Unsupported NIC model: smc91c111
Exiting just due to a "device_add" should not happen. Looking closer
at the realize and instance_init functions of this device also
reveals that it is using serial_hds and nd_table directly there, so
this device is clearly not creatable by the user and should be marked
accordingly.
Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
Message-id: 1503416789-32080-1-git-send-email-thuth@redhat.com
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Rather than open-coding our own read/write-all functions, we
can make use of the recently-added qio code. It slightly
changes the error message in one of the iotests.
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20170905191114.5959-4-eblake@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Some callers want to distinguish between clean EOF (no bytes read)
vs. a short read (at least one byte read, but EOF encountered
before reaching the desired length), as it allows clients the
ability to do a graceful shutdown when a server shuts down at
defined safe points in the protocol, rather than treating all
shutdown scenarios as an error due to EOF. However, we don't want
to require all callers to have to check for early EOF. So add
another wrapper function that can be used by the callers that care
about the distinction.
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20170905191114.5959-3-eblake@redhat.com>
Acked-by: Daniel P. Berrange <berrange@redhat.com>
The new qio_channel_{read,write}{,v}_all functions are documented
as yielding until data is available. When used on a blocking
channel, this yield is done via qio_channel_wait() which spawns
a nested event loop under the hood (so it is that secondary loop
which yields as needed); but if we are already in a coroutine (at
which point QIO_CHANNEL_ERR_BLOCK is only possible if we are a
non-blocking channel), we want to yield the current coroutine
instead of spawning a nested event loop.
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20170905191114.5959-2-eblake@redhat.com>
Acked-by: Daniel P. Berrange <berrange@redhat.com>
[commit message updated]
Signed-off-by: Eric Blake <eblake@redhat.com>
ARM is a fixed-length ISA and we can compute the page crossing
condition exactly once during init_disas_context.
Reviewed-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
We need not check for ARM vs Thumb state in order to dispatch
disassembly of every instruction.
Tested-by: Emilio G. Cota <cota@braap.org>
Reviewed-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
Since AArch64 uses a fixed-width ISA, we can pre-compute the number of
insns remaining on the page. Also, we can check for single-step once.
Reviewed-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
Reviewed-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Lluís Vilanova <vilanova@ac.upc.edu>
Message-Id: <150002073981.22386.9870422422367410100.stgit@frigg.lan>
[rth: Moved max_insns adjustment from tb_start to init_disas_context.
Removed pc_next return from translate_insn.
Removed tcg_check_temp_count from generic loop.
Moved gen_io_end to exactly match gen_io_start.
Use qemu_log instead of error_report for temporary leaks.
Moved TB size/icount assignments before disas_log.]
Signed-off-by: Richard Henderson <rth@twiddle.net>
There's nothing magic about the exception that we generate in order
to execute the magic kernel page. We can and should allow gdb to
set a breakpoint at this location.
Reviewed-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
Fold DISAS_EXC and DISAS_TB_JUMP into DISAS_NORETURN.
In both cases all following code is dead. In the first
case because we have exited the TB via exception; in the
second case because we have exited the TB via goto_tb
and its associated machinery.
Reviewed-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
This target is not sophisticated in its use of cleanups at the
end of the translation loop. For the most part, any condition
that exits the TB is dealt with by emitting the exiting opcode
right then and there. Therefore the only is_jmp indicator that
is needed is DISAS_NORETURN.
For two stack segment modifying cases, we have not yet exited
the TB (therefore DISAS_NORETURN feels wrong), but intend to exit.
The caller of gen_movl_seg_T0 currently checks for any non-zero
value, therefore DISAS_TOO_MANY seems acceptable for that usage.
Reviewed-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
This will allow some amount of cleanup to happen before
switching the backends over to enum DisasJumpType.
Reviewed-by: Emilio G. Cota <cota@braap.org>
Reviewed-by: Lluís Vilanova <vilanova@ac.upc.edu>
Signed-off-by: Richard Henderson <rth@twiddle.net>
This allows LOAD HALFWORD IMMEDIATE ON CONDITION,
eliminating one insn in some common cases.
Acked-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Richard Henderson <rth@twiddle.net>
This allows using a 3-operand insn form for some arithmetic,
logicals and shifts.
Acked-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Richard Henderson <rth@twiddle.net>
Use a switch instead of searching a table.
Acked-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
migration_incoming_state_destroy doesn't really destroy, it cleans up.
After a loadvm it's called, but the loadvm command can be run twice,
and so destroying an init-once mutex breaks on the second loadvm.
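The shape of the fix, sketched (the field name follows the commit subject;
'mis' stands for the current MigrationIncomingState):

    /* reset the once-initialized event so a second 'loadvm' can reuse it */
    qemu_event_reset(&mis->main_thread_load_event);
    /* ...instead of qemu_event_destroy(&mis->main_thread_load_event); */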
Reported-by: Stafford Horne <shorne@gmail.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Message-Id: <20170825141940.20740-2-dgilbert@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Tested-by: Stafford Horne <shorne@gmail.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
There's a race if someone does a 'stop' near the end of migrate;
the migration process goes through two runstates:
'finish migrate'
'postmigrate'
If the user issues a 'stop' between the two we end up with invalid
state transitions.
Add the transitions as valid.
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Message-Id: <20170804175011.21944-1-dgilbert@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
The function's stated contract is simple enough: "round down to the
nearest power of 2". This suggests the domain is the representable numbers
>= 1, because 1 is the smallest power of two.
The implementation doesn't check for domain errors, but returns
garbage instead:
* For negative arguments, pow2floor() returns -2^63, which is not even
a power of two, let alone the nearest one.
What sort of works is passing *unsigned* arguments >= 2^63. The
implicit conversion to signed is implementation defined, but
commonly yields the (negative) two's complement. pow2floor() then
returns -2^63. Callers that convert that back to unsigned get the
correct value 2^63.
* For a zero argument, pow2floor() shifts right by 64. Undefined
behavior. Common actual behavior is to shift by 0, yielding -2^63.
Fix by switching from int64_t to uint64_t and amending the contract to
map zero to zero.
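One possible shape of the fixed helper, for illustration (the in-tree
implementation may differ in detail):

    static inline uint64_t pow2floor(uint64_t value)
    {
        if (!value) {
            /* avoid the undefined 64-bit shift; the new contract maps 0 to 0 */
            return 0;
        }
        /* keep only the most significant set bit */
        return 0x8000000000000000ull >> clz64(value);
    }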
Callers are fine with that:
* memory_access_size()
This function makes no sense unless the argument is positive and the
return value fits into int.
* raw_refresh_limits()
Passes an int between 1 and BDRV_REQUEST_MAX_BYTES.
* iscsi_refresh_limits()
Passes an integer between 0 and INT_MAX, converts the result to
uint32_t. Passing zero would be undefined behavior, but commonly
yield zero. The patch gives us the zero without the undefined
behavior.
* cache_init()
Passes a positive int64_t argument.
* xbzrle_cache_resize()
Passes a positive int64_t argument (>= TARGET_PAGE_SIZE, actually).
* spapr_node0_size()
Passes a positive uint64_t argument, and converts the result to
hwaddr, i.e. uint64_t.
* spapr_populate_memory()
Passes a positive hwaddr argument, and converts the result to
hwaddr.
Cc: Juan Quintela <quintela@redhat.com>
Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
Cc: Eric Blake <eblake@redhat.com>
Cc: Peter Maydell <peter.maydell@linaro.org>
Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Message-Id: <1501148776-16890-3-git-send-email-armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
If the bdrv_inactivate_all fails near the end of the migration,
the migration will fail and often the only diagnostics in the log
are an I/O error which you can't distinguish from an error on
the socket connection.
Add an error so we know when it's actually a block problem.
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Message-Id: <20170822170212.27347-1-dgilbert@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
After calling qcow2_inactivate(), all qcow2 caches must be flushed, but this
may not happen, because the last call to qcow2_store_persistent_dirty_bitmaps()
can lead to marking the l2/refcount cache as dirty.
Let's move qcow2_store_persistent_dirty_bitmaps() before the cache flushing
to fix it.
Cc: qemu-stable@nongnu.org
Signed-off-by: Pavel Butsykin <pbutsykin@virtuozzo.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
block/throttle.c uses existing I/O throttle infrastructure inside a
block filter driver. I/O operations are intercepted in the filter's
read/write coroutines, and referred to block/throttle-groups.c
The driver can be used with the syntax
-drive driver=throttle,file.filename=foo.qcow2,throttle-group=bar
which registers the throttle filter node with the ThrottleGroup 'bar'. The
given group must be created beforehand with object-add or -object.
Reviewed-by: Alberto Garcia <berto@igalia.com>
Signed-off-by: Manos Pitsidianakis <el13635@mail.ntua.gr>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Currently, we cannot use mttcg for running strong memory model guests
on weak memory model hosts due to missing ordering semantics.
We implicitly generate fence instructions for stronger guests if an
ordering mismatch is detected. We generate fences only for the orders
for which fence instructions are necessary, for example a fence is not
necessary between a store and a subsequent load on x86 since its
absence in the guest binary tells us that ordering need not be
ensured. Also note that if we find multiple subsequent fence
instructions in the generated IR, we combine them in the TCG
optimization pass.
This patch allows us to boot an x86 guest on ARM64 hosts using mttcg.
Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>
Message-Id: <20170829063313.10237-4-bobby.prani@gmail.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Just make sure that nr_tables is size_t not int.
Once there, do the assert in the right place and be sure that we don't
have a division by zero.
Suggested-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: Juan Quintela <quintela@redhat.com>
Tested-by: Cleber Rosa <crosa@redhat.com>
--
Drop the s/g_new0/g_malloc0/ change.
Avoid division by zero with assert (danp)
We were using -1 instead of the real size because the functions check
which is bigger, the size in bytes or the size of the iov. Recent gccs
barf at this.
Signed-off-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Tested-by: Cleber Rosa <crosa@redhat.com>
--
Remove comments about this feature.
Fix missing -1.
We threatened to remove ia64 as host in v2.9.0. Its time has now come.
There are still some usages of defined(__ia64__) throughout the source
code that would be triggered if one were to enable TCI on an ia64 host.
Leave those alone for now.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
tests/vhost-user-test keeps failing on build-system since Aug 15:
ERROR:tests/vhost-user-test.c:835:test_flags_mismatch: child process (/i386/vhost-user/flags-mismatch/subprocess [4836]) failed unexpectedly
...
ERROR:tests/vhost-user-test.c:807:test_connect_fail: child process (/x86_64/vhost-user/connect-fail/subprocess [58910]) failed unexpectedly
Suggested-by: Peter Maydell <peter.maydell@linaro.org>
Suggested-by: Daniel P. Berrange <berrange@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-id: 20170905180602.28698-1-f4bug@amsat.org
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This reverts commit 206a0fc75d.
The linux-headers directory is for kernel headers which we keep in
sync with the upstream kernel via scripts/update-linux-headers.sh, so
we shouldn't be applying our code cleanups to it.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
ThrottleGroup is converted to an object. This will allow the future
throttle block filter driver easy creation and configuration of throttle
groups in QMP and via the CLI.
A new QAPI struct, ThrottleLimits, is introduced to provide a shared
struct for all throttle configuration needs in QMP.
ThrottleGroups can be created via CLI as
-object throttle-group,id=foo,x-iops-total=100,x-..
where x-* are individual limit properties. Since we can't add non-scalar
properties in -object this interface must be used instead. However,
setting these properties must be disabled after initialization because
certain combinations of limits are forbidden and thus configuration
changes should be done in one transaction. The individual properties
will go away when support for non-scalar values in CLI is implemented
and thus are marked as experimental.
ThrottleGroup also has a `limits` property that uses the ThrottleLimits
struct. It can be used to create ThrottleGroups or set the
configuration in existing groups as follows:
{ "execute": "object-add",
"arguments": {
"qom-type": "throttle-group",
"id": "foo",
"props" : {
"limits": {
"iops-total": 100
}
}
}
}
{ "execute" : "qom-set",
"arguments" : {
"path" : "foo",
"property" : "limits",
"value" : {
"iops-total" : 99
}
}
}
This also means a group's configuration can be fetched with qom-get.
Signed-off-by: Manos Pitsidianakis <el13635@mail.ntua.gr>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Some trivial fixes/cleanup and a fix to cause QEMU to error out gracefully
instead of aborting.
# gpg: Signature made Tue 05 Sep 2017 16:57:19 BST
# gpg: using DSA key 0x02FC3AEB0101DBC2
# gpg: Good signature from "Greg Kurz <groug@kaod.org>"
# gpg: aka "Greg Kurz <groug@free.fr>"
# gpg: aka "Greg Kurz <gkurz@linux.vnet.ibm.com>"
# gpg: aka "Gregory Kurz (Groug) <groug@free.fr>"
# gpg: aka "[jpeg image of size 3330]"
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: 2BD4 3B44 535E C0A7 9894 DBA2 02FC 3AEB 0101 DBC2
* remotes/gkurz/tags/for-upstream:
virtfs: error out gracefully when mandatory suboptions are missing
9pfs: local: clarify fchmodat_nofollow() implementation
fsdev: fix memory leak in main()
9pfs: avoid sign conversion error simplifying the code
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
We internally convert -virtfs to -fsdev/-device. If the user doesn't
provide the path or security_model suboptions, and the fsdev backend
requires them, we hit an assertion when populating the internal -fsdev
option:
util/qemu-option.c:547: opt_set: Assertion `opt->str' failed.
Aborted (core dumped)
Let's test the suboption presence on the command line before trying
to set it in the internal -fsdev option, and let the backend code
error out gracefully (ie, like it already does when the user passes
-fsdev on the command line).
Reported-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Since fchmodat(2) on Linux doesn't support AT_SYMLINK_NOFOLLOW, we have to
implement it using workarounds. There are two different ways, depending on
whether the system supports O_PATH or not.
In the case O_PATH is supported, we rely on the behaviour of openat(2)
when passing O_NOFOLLOW | O_PATH and the file is a symbolic link. Even
though openat_file() already adds O_NOFOLLOW to the flags, this patch makes
it explicit that we need both flags to obtain the expected
behaviour.
This is only cleanup, no functional change.
Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Move the CoMutex and CoQueue inits inside throttle_group_register_tgm()
which is called whenever a ThrottleGroupMember is initialized. There's
no need for them to be separate.
Reviewed-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Manos Pitsidianakis <el13635@mail.ntua.gr>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
timer_cb() needs to know about the current AioContext of the throttle
request that is woken up. In order to make ThrottleGroupMember backend
agnostic, this information is stored in an aio_context field instead of
accessing it from BlockBackend.
Reviewed-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Manos Pitsidianakis <el13635@mail.ntua.gr>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
This commit eliminates the 1:1 relationship between BlockBackend and
throttle group state. Users will be able to create multiple throttle
nodes, each with its own throttle group state, in the future. The
throttle group state cannot be per-BlockBackend anymore, it must be
per-throttle node. This is done by gathering ThrottleGroup membership
details from BlockBackendPublic into ThrottleGroupMember and refactoring
existing code to use the structure.
Reviewed-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Manos Pitsidianakis <el13635@mail.ntua.gr>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
The TLS I/O channel test had mistakenly used && instead
of || when checking for handshake completion. As a
result it could terminate the handshake process before
it had actually completed. This was harmless before but
changes in GNUTLS 3.6.0 exposed this bug and caused the
test suite to fail.
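The essence of the fix, sketched with illustrative flag names (only the loop
condition is shown; the surrounding do-loop body is elided):

    /* before: the loop could stop as soon as EITHER side had completed */
    } while (!client_done && !server_done);

    /* after: keep pumping until BOTH sides have completed the handshake */
    } while (!client_done || !server_done);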
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
These functions wait until they are able to read / write the full
requested data buffer(s).
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
The non-blocking connect mechanism is obsolete, and it doesn't
work well for inet connections, because it calls getaddrinfo
first and getaddrinfo blocks on DNS lookups. Since commits
e65c67e4 & d984464e, the non-blocking connect for migration goes
through QIOChannel in a different manner (using a thread), and
nobody uses this old non-blocking connect anymore.
Any newly written code which needs a non-blocking connect should
use the QIOChannel code, so we can drop NonBlockingConnectHandler
as a concept entirely.
Suggested-by: Daniel P. Berrange <berrange@redhat.com>
Signed-off-by: Cao jin <caoj.fnst@cn.fujitsu.com>
Signed-off-by: Mao Zhongyi <maozy.fnst@cn.fujitsu.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
@rpath and @sock_name are not freed and leaked.
[groug, not really leaked since the program exits just after that. But it
is always good practice to free allocated memory]
Signed-off-by: Zhipeng Lu <lu.zhipeng@zte.com.cn>
Signed-off-by: Greg Kurz <groug@kaod.org>
(note this is how other functions also handle the errors).
hw/9pfs/9p.c:948:18: warning: Loss of sign in implicit conversion
offset = err;
^~~
Reported-by: Clang Static Analyzer
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Greg Kurz <groug@kaod.org>
Switch from atexit.register() to a more elegant idiom of declaring
resources in a with statement:
    with FilePath('monitor.sock') as monitor_path, \
         VM() as vm:
        ...
The files and VMs will be automatically cleaned up whether the test
passes or fails.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20170824072202.26818-4-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
The scratch/ (TEST_DIR) directory is not automatically cleaned up after
test execution. It is the responsibility of tests to remove any files
they create.
A nice way of doing this is to declare files at the beginning of the
test and automatically remove them with a context manager:
    with iotests.FilePath('test.img') as img_path:
        qemu_img(...)
        qemu_io(...)
    # img_path is guaranteed to be deleted here
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20170824072202.26818-3-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
There are a number of ways to ensure that the QEMU process is shut down
when the test ends, including atexit.register(), try: finally:, or
unittest.teardown() methods. All of these require extra code and the
programmer must remember to add vm.shutdown().
A nice solution is context managers:
    with VM(binary) as vm:
        ...
    # vm is guaranteed to be shut down here
Cc: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
Message-id: 20170824072202.26818-2-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
As future sun4u PCI topologies place the ebus containing the in-built devices
behind a PCI bridge, add a busA property to the PBM PCI bridge that is then
used to allow IO accesses by default.
This allows early fw_cfg/NVRAM/serial access to occur even before OpenBIOS
has had a chance to configure the PCI bridges.
Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Rather than referring to the PCI busses as bus2 and bus3, refer to them as
busA and busB as per the documentation. Also replace the long bus names with
the shorter pciA and pciB aliases (to make it easier to attach additional
devices to either from the command line).
Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
In order to wire up the ebus PCI address spaces differently, we need
access to the underlying PCIDevice.
Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Omitting the check for whether bdrv_getlength() and bdrv_truncate()
failed meant that it was theoretically possible to return an
incorrect offset to the caller. More likely, conditions for either
of these functions to fail would also cause one of our other calls
(such as bdrv_pread() or bdrv_pwrite_sync()) to also fail, but
auditing that we are safe is difficult compared to just patching
things to always forward on the error rather than ignoring it.
Use osdep.h macros instead of open-coded rounding while in the
area.
Reported-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
The old signature has an ambiguous meaning for a return of 0:
either no allocation was requested or necessary, or an error
occurred (but any errno associated with the error is lost to
the caller, which then has to assume EIO).
Better is to follow the example of qcow2, by changing the
signature to have a separate return value that cleanly
distinguishes between failure and success, along with a
parameter that cleanly holds a 64-bit value. Then update all
callers.
While auditing that all return paths return a negative errno
(rather than -1), I also simplified places where we can pass
NULL rather than a local Error that just gets thrown away.
Suggested-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
bdrv_co_get_block_status_from_file() and
bdrv_co_get_block_status_from_backing() set *file to bs->file and
bs->backing respectively, so that bdrv_co_get_block_status() can recurse
to them. Future block drivers won't have to duplicate code to implement
this.
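A sketch of what such a helper looks like, hedged as to the exact flags and
signature used in the tree:

    int64_t coroutine_fn bdrv_co_get_block_status_from_file(BlockDriverState *bs,
                                                            int64_t sector_num,
                                                            int nb_sectors,
                                                            int *pnum,
                                                            BlockDriverState **file)
    {
        /* report the data as raw in bs->file at the same offset, so the
         * generic code recurses into the child node */
        *pnum = nb_sectors;
        *file = bs->file->bs;
        return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID |
               (sector_num << BDRV_SECTOR_BITS);
    }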
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Manos Pitsidianakis <el13635@mail.ntua.gr>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Now that bdrv_truncate is passed to bs->file by default, remove the
callback from block/blkdebug.c and set is_filter to true. is_filter also gives
access to other callbacks that are forwarded automatically to bs->file for
filters.
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Manos Pitsidianakis <el13635@mail.ntua.gr>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
This function is not used anywhere, so remove it.
Markus Armbruster adds:
The i82078 floppy device model used to call bdrv_media_changed() to
implement its media change bit when backed by a host floppy. This
went away in 21fcf36 "fdc: simplify media change handling".
Probably broke host floppy media change. Host floppy pass-through
was dropped in commit f709623. bdrv_media_changed() has never been
used for anything else. Remove it.
(Source is Message-ID: <87y3ruaypm.fsf@dusky.pond.sub.org>)
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Manos Pitsidianakis <el13635@mail.ntua.gr>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
The following functions fail if bs->drv is a filter and does not
implement them:
bdrv_probe_blocksizes
bdrv_probe_geometry
bdrv_truncate
bdrv_has_zero_init
bdrv_get_info
Instead, the call should be passed to bs->file if it exists, to allow
filter drivers to support those methods without implementing them. This
commit makes `drv->is_filter = true` imply that these callbacks will be
forwarded to bs->file by default, so disabling support for these
functions must be done explicitly.
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Manos Pitsidianakis <el13635@mail.ntua.gr>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
target-arm:
* collection of M profile cleanups and minor bugfixes
* loader: handle ELF files with overlapping zero-init data
* virt: allow PMU instantiation with userspace irqchip
* wdt_aspeed: Add support for the reset width register
* cpu: Define new cpu_transaction_failed() hook
* Mark some SoC devices as not user-creatable
* arm: Fix aa64 ldp register writeback
* arm_gicv3_kvm: Fix compile warning
# gpg: Signature made Mon 04 Sep 2017 17:20:40 BST
# gpg: using RSA key 0x3C2525ED14360CDE
# gpg: Good signature from "Peter Maydell <peter.maydell@linaro.org>"
# gpg: aka "Peter Maydell <pmaydell@gmail.com>"
# gpg: aka "Peter Maydell <pmaydell@chiark.greenend.org.uk>"
# Primary key fingerprint: E1A5 C593 CD41 9DE2 8E83 15CF 3C25 25ED 1436 0CDE
* remotes/pmaydell/tags/pull-target-arm-20170904-2: (33 commits)
arm_gicv3_kvm: Fix compile warning
target/arm: Fix aa64 ldp register writeback
hw/arm/digic: Mark device with user_creatable = false
hw/arm/aspeed_soc: Mark devices as user_creatable = false
target/arm: Allow deliver_fault() caller to specify EA bit
target/arm: Factor out fault delivery code
cputlb: Support generating CPU exceptions on memory transaction failures
cpu: Define new cpu_transaction_failed() hook
memory.h: Move MemTxResult type to memattrs.h
aspeed_soc: Propagate silicon-rev to watchdog
watchdog: wdt_aspeed: Add support for the reset width register
target/arm/kvm: pmu: improve error handling
hw/arm/virt: allow pmu instantiation with userspace irqchip
target/arm/kvm: pmu: split init and set-irq stages
hw/arm/virt: add pmu interrupt state
hw/arm: use defined type name instead of hard-coded string
loader: Ignore zero-sized ELF segments
loader: Handle ELF files with overlapping zero-initialized data
nvic: Implement "user accesses BusFault" SCS region behaviour
armv7m_nvic.h: Move from include/hw/arm to include/hw/intc
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Fix the following warning:
/home/pranith/qemu/hw/intc/arm_gicv3_kvm.c:296:17: warning: logical not is only applied to the left hand side of this bitwise operator [-Wlogical-not-parentheses]
if (!c->gicr_ctlr & GICR_CTLR_ENABLE_LPIS) {
^ ~
/home/pranith/qemu/hw/intc/arm_gicv3_kvm.c:296:17: note: add parentheses after the '!' to evaluate the bitwise operator first
if (!c->gicr_ctlr & GICR_CTLR_ENABLE_LPIS) {
^
/home/pranith/qemu/hw/intc/arm_gicv3_kvm.c:296:17: note: add parentheses around left hand side expression to silence this warning
if (!c->gicr_ctlr & GICR_CTLR_ENABLE_LPIS) {
^
This logic error meant we were not setting the PTZ
bit when we should -- luckily as the comment suggests
this wouldn't have had any effects beyond making GIC
initialization take a little longer.
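The fix is presumably the parenthesization the compiler suggests, so the
register bit is tested before the logical not is applied:

    if (!(c->gicr_ctlr & GICR_CTLR_ENABLE_LPIS)) {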
Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>
Message-id: 20170829173226.7625-1-bobby.prani@gmail.com
Cc: qemu-stable@nongnu.org
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
QEMU currently shows some unexpected behavior when the user tries to
do a "device_add digic" on an unrelated ARM machine like integratorcp
in "-nographic" mode (the device_add command does not immediately
return to the monitor prompt), and trying to "device_del" the device
later results in a "qemu/qdev-monitor.c:872:qdev_unplug: assertion
failed: (hotplug_ctrl)" error condition.
Looking at the realize function of the device, it uses serial_hds
directly and this means that the device can not be added a second
time, so let's simply mark it with "user_creatable = false" now.
Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
QEMU currently aborts if the user is accidentally trying to
do something like this:
$ aarch64-softmmu/qemu-system-aarch64 -S -M integratorcp -nographic
QEMU 2.9.93 monitor - type 'help' for more information
(qemu) device_add ast2400
Unexpected error in error_set_from_qdev_prop_error()
at hw/core/qdev-properties.c:1032:
Aborted (core dumped)
The ast2400 SoC devices are clearly not creatable by the user since
they are using the serial_hds and nd_table arrays directly in their
realize function, so mark them with user_creatable = false.
Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
For external aborts, we will want to be able to specify the EA
(external abort type) bit in the syndrome field. Allow callers of
deliver_fault() to do that by adding a field to ARMMMUFaultInfo which
we use when constructing the syndrome values.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
We currently have some similar code in tlb_fill() and in
arm_cpu_do_unaligned_access() for delivering a data abort or prefetch
abort. We're also going to want to do the same thing to handle
external aborts. Factor out the common code into a new function
deliver_fault().
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Acked-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
Call the new cpu_transaction_failed() hook at the places where
CPU generated code interacts with the memory system:
io_readx()
io_writex()
get_page_addr_code()
Any access from C code (e.g. via cpu_physical_memory_rw(),
address_space_rw(), ld/st_*_phys()) will *not* trigger CPU exceptions
via cpu_transaction_failed(). Handling of transaction failures for
this kind of call should be done by using a function which returns a
MemTxResult and treating the failure case appropriately in the
calling code.
In an ideal world we would not generate CPU exceptions for
instruction fetch failures in get_page_addr_code() but instead wait
until the code translation process tried a load and it failed;
however that change would require too great a restructuring and
redesign to attempt at this point.
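As an illustration of the call-site shape (a simplified sketch, not the
exact cputlb.c code), the io read path ends up doing something like:

    MemTxResult r;

    r = memory_region_dispatch_read(mr, physaddr, &val, size,
                                    iotlbentry->attrs);
    if (r != MEMTX_OK) {
        /* Let the CPU turn the failed transaction into a guest exception. */
        cpu_transaction_failed(cpu, physaddr, addr, size, MMU_DATA_LOAD,
                               mmu_idx, iotlbentry->attrs, r, retaddr);
    }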
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
Currently we have a rather half-baked setup for allowing CPUs to
generate exceptions on accesses to invalid memory: the CPU has a
cpu_unassigned_access() hook which the memory system calls in
unassigned_mem_write() and unassigned_mem_read() if the current_cpu
pointer is non-NULL. This was originally designed before we
implemented the MemTxResult type that allows memory operations to
report a success or failure code, which is why the hook is called
right at the bottom of the memory system. The major problem with
this is that it means that the hook can be called even when the
access was not actually done by the CPU: for instance if the CPU
writes to a DMA engine register which causes the DMA engine to begin
a transaction which has been set up by the guest to operate on
invalid memory, then this will cause the CPU to take an exception
incorrectly. Another minor problem is that currently if a device
returns a transaction error then this won't turn into a CPU exception
at all.
The right way to do this is to allow the CPU to respond
to memory system transaction failures at the point where the
CPU specific code calls into the memory system.
Define a new QOM CPU method and utility function
cpu_transaction_failed() which is called in these cases.
The functionality here overlaps with the existing
cpu_unassigned_access() because individual target CPUs will
need some work to convert them to the new system. When this
transition is complete we can remove the old cpu_unassigned_access()
code.
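Roughly, the new hook is a CPUClass member along these lines (a sketch;
see the actual header for the authoritative declaration):

    /* In CPUClass: */
    void (*do_transaction_failed)(CPUState *cpu, hwaddr physaddr, vaddr addr,
                                  unsigned size, MMUAccessType access_type,
                                  int mmu_idx, MemTxAttrs attrs,
                                  MemTxResult response, uintptr_t retaddr);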
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
Move the MemTxResult type to memattrs.h. We're going to want to
use it in cpu/qom.h, which doesn't want to include all of
memory.h. In practice MemTxResult and MemTxAttrs are pretty
closely linked since both are used for the new-style
read_with_attrs and write_with_attrs callbacks, so memattrs.h
is a reasonable home for this rather than creating a whole
new header file for it.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
Reviewed-by: Alistair Francis <alistair.francis@xilinx.com>
This is required to configure differences in behaviour between the
AST2400 and AST2500 watchdog IPs.
Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
Reviewed-by: Cédric Le Goater <clg@kaod.org>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
The reset width register controls how the pulse on the SoC's WDTRST{1,2}
pins behaves. A pulse is emitted if the external reset bit is set in
WDT_CTRL. On the AST2500 WDT_RESET_WIDTH can consume magic bit patterns
to configure push-pull/open-drain and active-high/active-low
behaviours and thus needs some special handling in the write path.
As some of the capabilities depend on the SoC version a silicon-rev
property is introduced, which is used to guard version-specific
behaviour.
Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
Reviewed-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
If a KVM PMU init or set-irq attr call fails we just silently stop
the PMU DT node generation. The only way they could fail, though,
is if the attr's respective KVM has-attr call fails. But that should
never happen if KVM advertises the PMU capability, because both
attrs have been available since the capability was introduced. Let's
just abort if this should-never-happen stuff does happen, because,
if it does, then something is obviously horribly wrong.
Signed-off-by: Andrew Jones <drjones@redhat.com>
Reviewed-by: Christoffer Dall <cdall@linaro.org>
Message-id: 1500471597-2517-5-git-send-email-drjones@redhat.com
[PMM: change kvm32.c kvm_arm_pmu_init() to the new API too]
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
For embedded systems, notably ARM, one common use of ELF
file segments is that the 'physical addresses' represent load addresses
and the 'virtual addresses' execution addresses, such that
the load addresses are packed into ROM or flash, and the
relocation and zero-initialization of data is done at runtime.
This means that the 'memsz' in the segment header represents
the runtime size of the segment, but the size that needs to
be loaded is only the 'filesz'. In particular, paddr+memsz
may overlap with the next segment to be loaded, as in this
example:
0x70000001 off 0x00007f68 vaddr 0x00008150 paddr 0x00008150 align 2**2
filesz 0x00000008 memsz 0x00000008 flags r--
LOAD off 0x000000f4 vaddr 0x00000000 paddr 0x00000000 align 2**2
filesz 0x00000124 memsz 0x00000124 flags r--
LOAD off 0x00000218 vaddr 0x00000400 paddr 0x00000400 align 2**3
filesz 0x00007d58 memsz 0x00007d58 flags r-x
LOAD off 0x00007f70 vaddr 0x20000140 paddr 0x00008158 align 2**3
filesz 0x00000a80 memsz 0x000022f8 flags rw-
LOAD off 0x000089f0 vaddr 0x20002438 paddr 0x00008bd8 align 2**0
filesz 0x00000000 memsz 0x00004000 flags rw-
LOAD off 0x000089f0 vaddr 0x20000000 paddr 0x20000000 align 2**0
filesz 0x00000000 memsz 0x00000140 flags rw-
where the segment at paddr 0x8158 has a memsz of 0x2258 and
would overlap with the segment at paddr 0x8bd8 if QEMU's loader
tried to honour it. (At runtime the segments will not overlap
since their vaddrs are more widely spaced than their paddrs.)
Currently if you try to load an ELF file like this with QEMU then
it will fail with an error "rom: requested regions overlap",
because we create a ROM image for each segment using the memsz
as the size.
Support ELF files using this scheme, by truncating the
zero-initialized part of the segment if it would overlap another
segment. This will retain the existing loader behaviour for
all ELF files we currently accept, and also accept ELF files
which only need 'filesz' bytes to be loaded.
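A simplified sketch of the truncation (the variable names here are
illustrative, not the actual loader code):

    /* Clamp segment i's zero-initialized tail so it stops where any other
     * segment that starts inside it begins; never truncate the file data. */
    if (mem_size > file_size) {
        for (j = 0; j < num_segments; j++) {
            if (j != i &&
                paddr[j] >= paddr[i] + file_size &&
                paddr[j] < paddr[i] + mem_size) {
                mem_size = paddr[j] - paddr[i];
            }
        }
    }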
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1502116754-18867-2-git-send-email-peter.maydell@linaro.org
The ARMv7M architecture specifies that most of the addresses in the
PPB region (which includes the NVIC, systick and system registers)
are not accessible to unprivileged accesses, which should
BusFault with a few exceptions:
* the STIR is configurably user-accessible
* the ITM (which we don't implement at all) is always
user-accessible
Implement this by switching the register access functions
to the _with_attrs scheme that lets us distinguish user
mode accesses.
This allows us to pull the handling of the CCR.USERSETMPEND
flag up to the level where we can make it generate a BusFault
as it should for non-permitted accesses.
Note that until the core ARM CPU code implements turning
MEMTX_ERROR into a BusFault the registers will continue to
act as RAZ/WI to user accesses.
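A sketch of the _with_attrs shape (is_ppb_user_accessible() is a
hypothetical helper standing in for the real per-register checks):

    static MemTxResult nvic_sysreg_read(void *opaque, hwaddr addr,
                                        uint64_t *data, unsigned size,
                                        MemTxAttrs attrs)
    {
        if (attrs.user && !is_ppb_user_accessible(addr)) {
            /* Unprivileged access to a privileged-only register: BusFault. */
            return MEMTX_ERROR;
        }
        /* ... normal register read ... */
        return MEMTX_OK;
    }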
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1501692241-23310-16-git-send-email-peter.maydell@linaro.org
For M profile the XPSR is a similar but not identical format to the
A profile CPSR/SPSR. (For instance the Thumb bit is in a different
place.) For guest accesses we make the M profile code go through
xpsr_read() and xpsr_write() which handle the different layout.
However for migration we use cpsr_read() and cpsr_write() to
marshal state into and out of the migration data stream. This
is pretty confusing and works more by luck than anything else.
Make M profile migration use xpsr_read() and xpsr_write() instead.
The most complicated part of this is handling the possibility
that the migration source is an older QEMU which hands us a
CPSR format value; helpfully we can always tell the two apart.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1501692241-23310-11-git-send-email-peter.maydell@linaro.org
We currently store the M profile CPU register state PRIMASK and
FAULTMASK in the daif field of the CPU state in its I and F
bits. This is a legacy from the original implementation, which
tried to share the cpu_exec_interrupt code between A profile
and M profile. We've since separated out the two cases because
they are significantly different, so now there is no common
code between M and A profile which looks at env->daif: all the
uses are either in A-only or M-only code paths. Sharing the state
fields now is just confusing, and will make things awkward
when we implement v8M, where the PRIMASK and FAULTMASK
registers are banked between security states.
Switch M profile over to using v7m.faultmask and v7m.primask
fields for these registers.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1501692241-23310-10-git-send-email-peter.maydell@linaro.org
Tighten up the T32 decoder in the places where new v8M instructions
will be:
* TT/TTT/TTA/TTAT are in what was nominally LDREX/STREX r15, ...
which is UNPREDICTABLE:
make the UNPREDICTABLE behaviour be to UNDEF
* BXNS/BLXNS are distinguished from BX/BLX via the low 3 bits,
which in previous architectural versions are SBZ:
enforce the SBZ via UNDEF rather than ignoring it, and move
the "ARCH(5)" UNDEF case up so we don't leak a TCG temporary
* SG is in the encoding which would be LDRD/STRD with rn = r15;
this is UNPREDICTABLE and we currently UNDEF:
move this check further up the code so that we don't leak
TCG temporaries in the UNDEF case and have a better place
to put the SG decode.
This means that if a v8M binary is accidentally run on v7M
or if a test case hits something that we haven't implemented
yet the behaviour will be obvious (UNDEF) rather than obscure
(plough on treating it as a different instruction).
In the process, add some comments about the instruction patterns
at these points in the decode. Our Thumb and ARM decoders are
very difficult to understand currently, but gradually adding
comments like this should help to clarify what exactly has
been decoded when.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1501692241-23310-5-git-send-email-peter.maydell@linaro.org
Currently get_phys_addr() has PMSAv7 handling before the
"is translation disabled?" check, and then PMSAv5 after it.
Tidy this up by making the PMSAv5 code handle the "MPU disabled"
case itself, so that we have all the PMSA code in one place.
This will make adding the PMSAv8 code slightly cleaner, and
also means that pre-v7 PMSA cores benefit from the MPU lookup
logging that the PMSAv7 codepath had.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1501692241-23310-4-git-send-email-peter.maydell@linaro.org
M profile cores can never trap on WFI or WFE instructions. Check for
M profile in check_wfx_trap() to ensure this.
The existing code will do the right thing for v7M cores because
the hcr_el2 and scr_el3 registers will be all-zeroes and so we
won't attempt to trap, but when we start setting ARM_FEATURE_V8
for v8M cores the v8A handling of SCTLR.nTWE and .nTWI will not
give the right results.
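The check amounts to an early exit along these lines (sketch):

    static inline int check_wfx_trap(CPUARMState *env, bool is_wfe)
    {
        /* M profile cores can never trap WFI/WFE. */
        if (arm_feature(env, ARM_FEATURE_M)) {
            return 0;
        }
        /* ... existing SCTLR.nTWE/nTWI, HCR and SCR handling ... */
    }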
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1501692241-23310-3-git-send-email-peter.maydell@linaro.org
QAPI patches for 2017-09-01
# gpg: Signature made Mon 04 Sep 2017 12:30:31 BST
# gpg: using RSA key 0x3870B400EB918653
# gpg: Good signature from "Markus Armbruster <armbru@redhat.com>"
# gpg: aka "Markus Armbruster <armbru@pond.sub.org>"
# Primary key fingerprint: 354B C8B3 D7EB 2A6B 6867 4E5F 3870 B400 EB91 8653
* remotes/armbru/tags/pull-qapi-2017-09-01-v3: (47 commits)
qapi: drop the sentinel in enum array
qapi: Change data type of the FOO_lookup generated for enum FOO
qapi: Convert indirect uses of FOO_lookup[...] to qapi_enum_lookup()
qapi: Mechanically convert FOO_lookup[...] to FOO_str(...)
qapi: Generate FOO_str() macro for QAPI enum FOO
qapi: Avoid unnecessary use of enum lookup table's sentinel
qapi: Use qapi_enum_parse() in input_type_enum()
crypto: Use qapi_enum_parse() in qcrypto_block_luks_name_lookup()
quorum: Use qapi_enum_parse() in quorum_open()
block: Use qemu_enum_parse() in blkdebug_debug_breakpoint()
hmp: Use qapi_enum_parse() in hmp_migrate_set_parameter()
hmp: Use qapi_enum_parse() in hmp_migrate_set_capability()
tpm: Clean up model registration & lookup
tpm: Clean up driver registration & lookup
qapi: Drop superfluous qapi_enum_parse() parameter max
qapi: Update qapi-code-gen.txt examples to match current code
qapi-schema: Improve section headings
qapi-schema: Move queries from common.json to qapi-schema.json
qapi-schema: Make block-core.json self-contained
qapi-schema: Fold event.json back into qapi-schema.json
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Currently, a FOO_lookup is an array of strings terminated by a NULL
sentinel.
A future patch will generate enums with "holes". NULL-termination
will cease to work then.
To prepare for that, store the length in the FOO_lookup by wrapping it
in a struct and adding a member for the length.
The sentinel will be dropped next.
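The generated code then looks roughly like this (illustrative enum Foo;
the exact generated names may differ):

    typedef struct QEnumLookup {
        const char *const *array;
        int size;
    } QEnumLookup;

    const QEnumLookup Foo_lookup = {
        .array = (const char *const[]) {
            [FOO_ONE] = "one",
            [FOO_TWO] = "two",
            [FOO__MAX] = NULL,  /* sentinel, dropped by the next patch */
        },
        .size = FOO__MAX,
    };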
Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-Id: <20170822132255.23945-13-marcandre.lureau@redhat.com>
[Basically redone]
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Message-Id: <1503564371-26090-16-git-send-email-armbru@redhat.com>
[Rebased]
Currently, the FOO_lookup[] generated for QAPI enum types are
terminated by a NULL sentinel.
A future patch will generate enums with "holes". NULL-termination
will cease to work then.
To prepare for that, replace "have we reached the sentinel?"
predicates by "have we reached the FOO__MAX value?" predicates.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Message-Id: <1503564371-26090-12-git-send-email-armbru@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
The error message on invalid blkdebug events changes from
qemu-system-x86_64: LOCATION: Invalid event name "VALUE"
to
qemu-system-x86_64: LOCATION: invalid parameter value: VALUE
Slight degradation, but the message is sub-par even before the patch.
When complaining about a parameter value, both parameter name and
value should be mentioned, as the value may well not be unique. Left
for another day.
Also left is the error message's unhelpful location: it points to the
config=FILENAME rather than into that file.
Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-Id: <20170822132255.23945-11-marcandre.lureau@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
[Rebased, commit message rewritten]
Cc: Kevin Wolf <kwolf@redhat.com>
Cc: Max Reitz <mreitz@redhat.com>
Cc: qemu-block@nongnu.org
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Message-Id: <1503564371-26090-8-git-send-email-armbru@redhat.com>
We have a strict separation between enum TpmModel and tpm_models[]:
* TpmModel may have any number of members. It just happens to have one.
* tpm_register_model() uses the first empty slot in tpm_models[].
If you register more than tpm_models[] has space,
tpm_register_model() fails. Its caller silently ignores the
failure.
Registering the same TpmModel more than once has no effect other than
wasting tpm_models[] slots: tpm_model_is_registered() is happy with
the first one it finds.
Since we only ever register one model, and tpm_models[] has space for
just that one, this contraption even works.
Turn tpm_models[] into a straight map from enum TpmModel to bool. Much
simpler.
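The result is roughly (sketch):

    static bool tpm_models[TPM_MODEL__MAX];

    void tpm_register_model(enum TpmModel model)
    {
        tpm_models[model] = true;
    }

    bool tpm_model_is_registered(enum TpmModel model)
    {
        return tpm_models[model];
    }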
Cc: Stefan Berger <stefanb@us.ibm.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Message-Id: <1503564371-26090-5-git-send-email-armbru@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
[Commit message typo fixed]
We have a strict separation between enum TpmType and be_drivers[]:
* TpmType may have any number of members. It just happens to have one.
* tpm_register_driver() uses the first empty slot in be_drivers[].
If you register more than be_drivers[] has space,
tpm_register_driver() fails. Its caller silently ignores the
failure.
If you register more than one with a given TpmType,
tpm_display_backend_drivers() will show all of them, but
tpm_driver_find_by_type() and tpm_get_backend_driver() will find
only the one that registered first.
Since we only ever register one driver, and be_drivers[] has space for
just that one, this contraption even works.
Turn be_drivers[] into a straight map from enum TpmType to driver.
Much simpler, and has a decent chance to actually work should we ever
acquire additional drivers.
While there, use qapi_enum_parse() in tpm_get_backend_driver().
Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-Id: <20170822132255.23945-8-marcandre.lureau@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
[Rebased, superfluous initializer dropped, commit message rewritten]
Cc: Stefan Berger <stefanb@us.ibm.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Message-Id: <1503564371-26090-4-git-send-email-armbru@redhat.com>
The generated QEMU QMP reference is now structured as follows:
1.1 Introduction
1.2 Stability Considerations
1.3 Common data types
1.4 Socket data types
1.5 VM run state
1.6 Cryptography
1.7 Block devices
1.7.1 Block core (VM unrelated)
1.7.2 QAPI block definitions (vm unrelated)
1.8 Character devices
1.9 Net devices
1.10 Rocker switch device
1.11 TPM (trusted platform module) devices
1.12 Remote desktop
1.12.1 Spice
1.12.2 VNC
1.13 Input
1.14 Migration
1.15 Transactions
1.16 Tracing
1.17 QMP introspection
1.18 Miscellanea
Section "1.18 Miscellanea" is still too big: it documents 134 symbols.
Section "1.7.1 Block core (VM unrelated)" is also rather big: 128
symbols. All the others are of reasonable size.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Message-Id: <1503602048-12268-17-git-send-email-armbru@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Except for block-core.json, the sub-schemas are self-contained: if
they use a symbol defined in another sub-schema, they include that
sub-schema. To check, feed the sub-schema to qapi2texi (or any other
QAPI generator) along with the pragma from qapi-schema.json.
Fix up things to make block-core.json self-contained, too.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Message-Id: <1503602048-12268-15-git-send-email-armbru@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Bug: section "Rocker switch device" starts with the rocker stuff, but
then has unrelated stuff, like ReplayMode, xen-load-devices-state, ...
Cause: rocker.json is included in the middle of section "QMP commands".
Fix: include it in a sane place, namely next to the other sub-schemas.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-Id: <1503602048-12268-4-git-send-email-armbru@redhat.com>
Bug: introspection documentation is in section "Tracing commands".
Cause: sub-schema qapi/introspect.json lacks a section header, and
therefore goes into whatever section precedes its include.
Fix: add a section header.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-Id: <1503602048-12268-3-git-send-email-armbru@redhat.com>
Documentation generated with qapi2texi.py is in source order, with
included sub-schemas inserted at the first include directive
(subsequent include directives have no effect). To get a sane and
stable order, it's best to include each sub-schema just once, or
include it first in qapi-schema.json. Document that.
While there, drop a few redundant comments.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-Id: <1503602048-12268-2-git-send-email-armbru@redhat.com>
We check that all members of the QLit list are also in the QList. We
neglect to check the other direction. Fix that.
While there, use QLIST_FOREACH_ENTRY() to simplify the code and break
the loop on the first mismatch.
Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-Id: <20170825105913.4060-13-marcandre.lureau@redhat.com>
[Commit message improved]
Signed-off-by: Markus Armbruster <armbru@redhat.com>
We check that all members of the QLit dictionary are also in the
QDict. We neglect to check the other direction.
Comparing the number of members suffices, because QDict can't
contain duplicate members, and putting duplicates in a QLit is a
programming error.
Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-Id: <20170825105913.4060-12-marcandre.lureau@redhat.com>
[Commit message improved]
Signed-off-by: Markus Armbruster <armbru@redhat.com>
compare_litqobj_to_qobj() lacks a qlit_ prefix. Moreover, "compare"
suggests -1, 0, +1 for less than, equal and greater than. The
function actually returns non-zero for equal, zero for unequal.
Rename to qlit_equal_qobject().
Its return type will be cleaned up in the next patch.
Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Message-Id: <20170825105913.4060-6-marcandre.lureau@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
A command is a query if it has no side effect and yields a result.
Such commands are typically named query-FOO, but there are exceptions.
The basic idea is to find candidates with query-qmp-schema, filter out
the ones that aren't queries with an explicit blacklist, and test the
remaining ones against a QEMU with no special arguments.
The current blacklist is just add-fd.
The test can't do queries with arguments, because it knows nothing
about the arguments. No coverage for query-cpu-model-baseline,
query-cpu-model-comparison, query-cpu-model-expansion, query-rocker,
query-rocker-ports, query-rocker-of-dpa-flows, and
query-rocker-of-dpa-groups.
Most tested commands are expected to succeed. The test does not check
the return value then.
query-balloon and query-vm-generation-id are expected to fail because
they need a virtio-balloon / vmgenid device to succeed, and this test
is too dumb to set one up. Could be addressed later.
query-acpi-ospm-status and query-hotpluggable-cpus are expected to
fail because they require features provided only by special machine
types, and this test is too dumb to set that up. Could also be
addressed later.
Several commands may either be functional or stubs that always fail,
depending on build configuration. Ideally, the stubs shouldn't be in
query-qmp-schema, but that requires QAPI schema compile-time
configuration, which we don't have, yet. Until we do, we need to
figure out whether a command is a stub. When a suitable
CONFIG_FOO preprocessor symbol is available, use that. Else,
simply blacklist the command for now.
We get basic test coverage for the following commands, except as
noted:
qom-list-types
query-acpi-ospm-status (expected to fail)
query-balloon (expected to fail)
query-block
query-block-jobs
query-blockstats
query-chardev
query-chardev-backends
query-command-line-options
query-commands
query-cpu-definitions (blacklisted for now)
query-cpus
query-dump
query-dump-guest-memory-capability
query-events
query-fdsets
query-gic-capabilities (blacklisted for now)
query-hotpluggable-cpus (expected to fail)
query-iothreads
query-kvm
query-machines
query-memdev
query-memory-devices
query-mice
query-migrate
query-migrate-cache-size
query-migrate-capabilities
query-migrate-parameters
query-name
query-named-block-nodes
query-pci (blacklisted for now)
query-qmp-schema
query-rx-filter
query-spice
query-status
query-target
query-tpm
query-tpm-models
query-tpm-types
query-uuid
query-version
query-vm-generation-id (expected to fail)
query-vnc
query-vnc-servers
query-xen-replication-status
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Message-Id: <1502461148-10154-1-git-send-email-armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[Typos in code under #ifndef and in the commit message fixed]
The test-io-channel-tls test was mistakenly using two of the
same directories as test-crypto-tlssession. This causes a
sporadic failure when using make -j$BIGNUM.
Reported-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
GNUTLS 3.6.0 marked SHA1 as untrusted for certificates.
Unfortunately the gnutls_x509_crt_sign() method we are
using to create certificates in the test suite is fixed
to always use SHA1. We must switch to a different method
and explicitly ask for SHA256.
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
$ make check-speed
tests/benchmark-crypto-hash.c: In function 'test_hash_speed':
tests/benchmark-crypto-hash.c:44:5: error: format '%ld' expects argument of type 'long int', but argument 2 has type 'size_t' [-Werror=format=]
g_print("Testing chunk_size %ld bytes ", chunk_size);
^
tests/benchmark-crypto-hash.c: In function 'main':
tests/benchmark-crypto-hash.c:62:9: error: format '%lu' expects argument of type 'long unsigned int', but argument 4 has type 'size_t' [-Werror=format=]
snprintf(name, sizeof(name), "/crypto/hash/speed-%lu", i);
^
cc1: all warnings being treated as errors
rules.mak:66: recipe for target 'tests/benchmark-crypto-hash.o' failed
make: *** [tests/benchmark-crypto-hash.o] Error 1
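The usual fix for size_t format warnings like these is the %zu conversion
specifier, e.g.:

    g_print("Testing chunk_size %zu bytes ", chunk_size);
    snprintf(name, sizeof(name), "/crypto/hash/speed-%zu", i);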
Reviewed-by: Longpeng(Mike) <longpeng@huawei.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
cpu_tilegx_init() always falls back to TYPE_TILEGX_CPU object
regardless of cpu_model. Put fallback logic into
tilegx_cpu_class_by_name() which would translate any cpu_model
into TYPE_TILEGX_CPU class and replace cpu_tilegx_init()
with cpu_generic_init().
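The fallback then lives in the class_by_name hook, roughly:

    static ObjectClass *tilegx_cpu_class_by_name(const char *cpu_model)
    {
        /* Any cpu_model resolves to the one and only TileGX CPU class. */
        return object_class_by_name(TYPE_TILEGX_CPU);
    }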
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
Message-Id: <1503592308-93913-15-git-send-email-imammedo@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
cpu_nios2_init() always falls back to TYPE_NIOS2_CPU object
regardless of cpu_model. Put fallback logic into
nios2_cpu_class_by_name() which would translate any cpu_model
into TYPE_NIOS2_CPU class and replace cpu_nios2_init()
with cpu_generic_init().
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-Id: <1503592308-93913-14-git-send-email-imammedo@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
cpu_mb_init() always falls back to TYPE_MICROBLAZE_CPU object
regardless of cpu_model. Put fallback logic into
mb_cpu_class_by_name() which would translate any cpu_model
into TYPE_MICROBLAZE_CPU class and replace cpu_mb_init()
with cpu_generic_init().
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-Id: <1503592308-93913-13-git-send-email-imammedo@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Drop the custom cpu_hppa_init() in favor of cpu_generic_init().
To make cpu_generic_init() work, all we need is to provide a
cc->class_by_name callback that resolves any cpu_model
to the sole TYPE_HPPA_CPU, matching current behaviour.
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Acked-by: Richard Henderson <rth@twiddle.net>
Message-Id: <1503592308-93913-11-git-send-email-imammedo@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
cpu_alpha_init() used to provide a default fallback if an invalid
(i.e. non-existent) cpu_model was provided.
The dp264 machine provides its own default, so the sole user of the
fallback is the [bsd|linux]-user targets, which specify the 'any' cpu
model that falls back to "ev67" in cpu_alpha_init(). Push fallback handling
into alpha_cpu_class_by_name() and replace cpu_alpha_init() with
cpu_generic_init().
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Acked-by: Richard Henderson <rth@twiddle.net>
Message-Id: <1503592308-93913-10-git-send-email-imammedo@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
cpu_s390x_init() is used only by *-user targets, indirectly
via the cpu_init() macro, and has a hack to assign ids to created
cpus (I'm not sure if 'id' really matters to *-user emulation).
So, to be on the safe side, instead of having a custom wrapper to do the
numbering, replace it with cpu_generic_init() and use
S390CPUClass::next_cpu_id, which can serve the same purpose as the
static variable, and move cpu->id initialization to s390_cpu_initfn
for the CONFIG_USER_ONLY use case.
PS:
The ifdef is ugly, but it allows us to hide an s390x detail that isn't
set by *-user targets and to reuse the generic cpu creation utility
for both machine and user emulation.
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Acked-by: Cornelia Huck <cohuck@redhat.com>
Message-Id: <1504185578-80843-1-git-send-email-imammedo@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
With features converted to properties, we can use the same
approach as x86 for feature parsing and drop the legacy
approach that manipulated the CPU instance directly.
New sparc_cpu_parse_features() will allow only +-feat
and explicitly disable feat=on|off syntax for now.
With that in place and sparc_cpu_parse_features() providing
generic CPUClass::parse_features callback, the cpu_sparc_init()
will do the same job as cpu_generic_init() so replace content
of cpu_sparc_init() with it.
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Acked-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Tested-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-Id: <1503672460-109436-1-git-send-email-imammedo@redhat.com>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
SPARCCPU::env was initialized from previously set properties
(with help of sparc_cpu_parse_features) in cpu_sparc_register().
However, there is no reason to keep it there, as this task is
typically done at realize time. So move the post-properties
initialization into sparc_cpu_realizefn, which brings
cpu_sparc_init() closer to cpu_generic_init().
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Tested-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-Id: <1503592308-93913-6-git-send-email-imammedo@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
SPARC is the last target that uses the legacy way of parsing
and initializing cpu features. Drop the legacy approach and
convert features to properties so that SPARC can at a minimum
benefit from the generic cpu_generic_init(), sharing the
+-feat parser with x86.
PS:
the main purpose is to remove the legacy way of cpu creation as
a blocker for unifying cpu creation code across targets.
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
Tested-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-Id: <1503592308-93913-5-git-send-email-imammedo@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Since commit 9262685b ("cpu: Factor out cpu_generic_init()"),
features parsed by it were truncated to only the 1st feature
after the CPU name, due to the fact that
featurestr = strtok(NULL, ",");
cc->parse_features(cpu, featurestr, &err);
would extract exactly one feature, and the parse_features() callback
would parse it and only it, leaving the rest of the features ignored.
Reuse the approach from the custom x86 implementation, i.e. replace the
strtok() token parsing with g_strsplit(), splitting the feature string
into two parts (name and feature list) and passing the latter to the
parse_features() callback.
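Sketch of the intended split (simplified; error handling omitted and the
parse_features() callback signature abbreviated):

    gchar **model_pieces = g_strsplit(cpu_model, ",", 2);

    oc = cc->class_by_name(model_pieces[0]);
    if (oc && model_pieces[1]) {
        /* The complete feature list reaches the target's parser unchanged. */
        cc->parse_features(object_class_get_name(oc), model_pieces[1], &err);
    }
    g_strfreev(model_pieces);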
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Message-Id: <1503592308-93913-2-git-send-email-imammedo@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Add a new base CPU model called 'EPYC' to model processors from AMD EPYC
family (which includes EPYC 76xx, 75xx, 74xx, 73xx and 72xx).
The following feature bits have been added/removed compared to Opteron_G5:
Added: monitor, movbe, rdrand, mmxext, ffxsr, rdtscp, cr8legacy, osvw,
fsgsbase, bmi1, avx2, smep, bmi2, rdseed, adx, smap, clfshopt, sha
xsaveopt, xsavec, xgetbv1, arat
Removed: xop, fma4, tbm
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: Tom Lendacky <Thomas.Lendacky@amd.com>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Message-Id: <20170815170051.127257-1-brijesh.singh@amd.com>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
The helper can be used for CPU object lookup using the CPU's
arch-specific ID (the one returned by CPUClass::get_arch_id()).
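A sketch of such a helper (essentially what the common CPU code provides):

    CPUState *cpu_by_arch_id(int64_t id)
    {
        CPUState *cpu;

        CPU_FOREACH(cpu) {
            CPUClass *cc = CPU_GET_CLASS(cpu);

            if (cc->get_arch_id(cpu) == id) {
                return cpu;
            }
        }
        return NULL;
    }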
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
[Yi Wang: Added documentation comments]
Signed-off-by: Yi Wang <wang.yi59@zte.com.cn>
Signed-off-by: Yun Liu <liu.yunh@zte.com.cn>
[ehabkost: extracted cpu_by_arch_id() to a separate patch]
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
The errp argument is ignored by all implementations of the
method, and user_creatable_del() would break if any
implementation set an error (because it calls error_setg(errp) if
the function returns false). Remove the unused parameter.
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Message-Id: <20170829220337.23427-1-ehabkost@redhat.com>
Reviewed-by: Gonglei <arei.gonglei@huawei.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
The chunk size sanity check in qxl_render_cursor works for
SPICE_CURSOR_TYPE_ALPHA cursors only. So support for
SPICE_CURSOR_TYPE_MONO cursors must be broken for ages without anyone
noticing. Most likely it simply isn't used any more by guest drivers.
Drop the dead code.
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Message-id: 20170828123933.30323-2-kraxel@redhat.com
Instead pass around the address (aka offset into vga memory).
Add vga_read_* helper functions which apply vbe_size_mask to
the address, to make sure the address stays within the valid
range, similar to the cirrus blitter fixes (commits ffaf857778
and 026aeffcb4).
Impact: DoS for privileged guest users. qemu crashes with
a segfault, when hitting the guard page after vga memory
allocation, while reading vga memory for display updates.
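The helpers are essentially mask-and-read wrappers, e.g. (sketch):

    static inline uint8_t vga_read_byte(VGACommonState *vga, uint32_t addr)
    {
        return vga->vram_ptr[addr & vga->vbe_size_mask];
    }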
Fixes: CVE-2017-13672
Cc: P J P <ppandit@redhat.com>
Reported-by: David Buchanan <d@vidbuchanan.co.uk>
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Message-id: 20170828122906.18993-1-kraxel@redhat.com
vga display update mis-calculated the region for the dirty bitmap
snapshot in case split screen mode is used. This can trigger an
assert in cpu_physical_memory_snapshot_get_dirty().
Impact: DoS for privileged guest users.
Fixes: CVE-2017-13673
Fixes: fec5e8c92b
Cc: P J P <ppandit@redhat.com>
Reported-by: David Buchanan <d@vidbuchanan.co.uk>
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Message-id: 20170828123307.15392-1-kraxel@redhat.com
The conflict check added by commit c0644771 ("qapi: Reject
alternates that can't work with keyval_parse()") doesn't work
with the following declaration:
{ 'alternate': 'Alt',
'data': { 'one': 'bool',
'two': 'str' } }
It crashes with:
Traceback (most recent call last):
File "./scripts/qapi-types.py", line 295, in <module>
schema = QAPISchema(input_file)
File "/home/ehabkost/rh/proj/virt/qemu/scripts/qapi.py", line 1468, in __init__
self.exprs = check_exprs(parser.exprs)
File "/home/ehabkost/rh/proj/virt/qemu/scripts/qapi.py", line 958, in check_exprs
check_alternate(expr, info)
File "/home/ehabkost/rh/proj/virt/qemu/scripts/qapi.py", line 830, in check_alternate
% (name, key, types_seen[qtype]))
KeyError: 'QTYPE_QSTRING'
This happens because the previously-seen conflicting member
('one') can't be found at types_seen[qtype], but at
types_seen['QTYPE_BOOL'].
Fix the bug by moving the error check to the same loop that adds
new items to types_seen, raising an exception if types_seen[qt]
is already set.
Add two additional test cases that can detect the bug.
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Message-Id: <20170717180926.14924-1-ehabkost@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
# gpg: Signature made Thu 31 Aug 2017 11:29:33 BST
# gpg: using RSA key 0xDAE8E10975969CE5
# gpg: Good signature from "Marc-André Lureau <marcandre.lureau@redhat.com>"
# gpg: aka "Marc-André Lureau <marcandre.lureau@gmail.com>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg: It is not certain that the signature belongs to the owner.
# Primary key fingerprint: 87A9 BD93 3F87 C606 D276 F62D DAE8 E109 7596 9CE5
* remotes/elmarco/tags/tidy-pull-request: (29 commits)
eepro100: replace g_malloc()+memcpy() with g_memdup()
test-iov: replace g_malloc()+memcpy() with g_memdup()
i386: replace g_malloc()+memcpy() with g_memdup()
i386: introduce ELF_NOTE_SIZE macro
decnumber: use DIV_ROUND_UP
kvm: use DIV_ROUND_UP
i386/dump: use DIV_ROUND_UP
ppc: use DIV_ROUND_UP
msix: use DIV_ROUND_UP
usb-hub: use DIV_ROUND_UP
q35: use DIV_ROUND_UP
piix: use DIV_ROUND_UP
virtio-serial: use DIV_ROUND_UP
console: use DIV_ROUND_UP
monitor: use DIV_ROUND_UP
virtio-gpu: use DIV_ROUND_UP
vga: use DIV_ROUND_UP
ui: use DIV_ROUND_UP
vnc: use DIV_ROUND_UP
vvfat: use DIV_ROUND_UP
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
# gpg: Signature made Thu 31 Aug 2017 09:21:49 BST
# gpg: using RSA key 0x9CA4ABB381AB73C8
# gpg: Good signature from "Stefan Hajnoczi <stefanha@redhat.com>"
# gpg: aka "Stefan Hajnoczi <stefanha@gmail.com>"
# Primary key fingerprint: 8695 A8BF D3F9 7CDA AC35 775A 9CA4 ABB3 81AB 73C8
* remotes/stefanha/tags/block-pull-request:
qcow2: allocate cluster_cache/cluster_data on demand
qemu-doc: Add UUID support in initiator name
tests: migration/guestperf Python 2.6 argparse compatibility
docker.py: Python 2.6 argparse compatibility
scripts: add argparse module for Python 2.6 compatibility
misc: Remove unused Error variables
oslib-posix: Print errors before aborting on qemu_alloc_stack()
throttle: Test the valid range of config values
throttle: Make burst_length 64bit and add range checks
throttle: Make LeakyBucket.avg and LeakyBucket.max integer types
throttle: Remove throttle_fix_bucket() / throttle_unfix_bucket()
throttle: Make throttle_is_valid() a bit less verbose
throttle: Update the throttle_fix_bucket() documentation
throttle: Fix wrong variable name in the header documentation
nvme: Fix get/set number of queues feature, again
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
First batch of s390x patches:
- 2.11 compat machine
- support the new --s390-pgste linker option, making it possible to
avoid enabling the global vm.allocate_pgste sysctl if all pieces
are in place
- correctly identify some devices as not hotpluggable
- clean up some tests and enable them for s390x
- wire up the diag288 watchdog in tcg
- clean up dependencies on CONFIG_PCI, making it possible to disable
it by hand
- lots of cleanup in target/s390x/
- fix alignment of the ccw1 structure in the s390-ccw bios
- and some more bugfixes
# gpg: Signature made Wed 30 Aug 2017 17:40:34 BST
# gpg: using RSA key 0xDECF6B93C6F02FAF
# gpg: Good signature from "Cornelia Huck <conny@cornelia-huck.de>"
# gpg: aka "Cornelia Huck <huckc@linux.vnet.ibm.com>"
# gpg: aka "Cornelia Huck <cornelia.huck@de.ibm.com>"
# gpg: aka "Cornelia Huck <cohuck@kernel.org>"
# gpg: aka "Cornelia Huck <cohuck@redhat.com>"
# Primary key fingerprint: C3D0 D66D C362 4FF6 A8C0 18CE DECF 6B93 C6F0 2FAF
* remotes/cohuck/tags/s390x-20170830: (44 commits)
s390x/pci: fixup trap_msix()
pc-bios/s390-ccw.img: update image
s390-ccw: Fix alignment for CCW1
s390x/s390-stattrib: Mark the storage attribute as not user_creatable
target/s390x: cleanup cpu.h
s390x/kvm: move KVM declarations and stubs to separate files
s390x: avoid calling kvm_ functions outside of target/s390x/
target/s390x: move a couple of functions to cpu.c
target/s390x: introduce internal.h
target/s390x: move get_per_in_range() to misc_helper.c
target/s390x: move s390_do_cpu_reset() to diag.c
target/s390x: move psw_key_valid() to mem_helper.c
target/s390x: move cpu_mmu_idx_to_asc() to excp_helper.c
target/s390x: move cc_name() to helper.c
target/s390x: move gtod_*() declarations to s390-virtio.h
s390x: drop inclusion of sysemu/kvm.h from some files
s390x/cpumodel: factor out determination of default model name
target/s390x: no need to pass kvm_state to savevm_gtod handlers
target/s390x: simplify gs_allowed()
target/s390x: simplify ri_allowed()
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
I found these patterns via grepping the source tree. I don't have a
coccinelle script for it!
Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
I found these patterns via grepping the source tree. I don't have a
coccinelle script for it!
Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
Reviewed-by: Richard Henderson <rth@twiddle.net>
The if_fastq and if_batchq contain not only packets, but queues of packets
for the same socket. When sofree frees a socket, it thus has to clear ifq_so
from all the packets from the queues, not only the first.
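Sketch of the queue walk (simplified; sofree() calls it for both if_fastq
and if_batchq):

    static void soqfree(struct socket *so, struct quehead *qfh)
    {
        struct mbuf *ifq;

        for (ifq = (struct mbuf *) qfh->qh_link;
             (struct quehead *) ifq != qfh;
             ifq = ifq->ifq_next) {
            if (ifq->ifq_so == so) {
                struct mbuf *ifm;

                ifq->ifq_so = NULL;
                /* Also clear the packets queued behind it for this socket. */
                for (ifm = ifq->ifs_next; ifm != ifq; ifm = ifm->ifs_next) {
                    ifm->ifq_so = NULL;
                }
            }
        }
    }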
Signed-off-by: Samuel Thibault <samuel.thibault@ens-lyon.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Cc: qemu-stable@nongnu.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Add nbd_co_request, to remove code duplications in
nbd_client_co_{pwrite,pread,...} functions. Also this is
needed for further refactoring.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20170804151440.320927-8-vsementsov@virtuozzo.com>
[eblake: make nbd_co_request a wrapper, rather than merging two
existing functions]
Signed-off-by: Eric Blake <eblake@redhat.com>
Rename nbd_recv_coroutines_enter_all to nbd_recv_coroutines_wake_all,
as it most probably just adds all recv coroutines into co_queue_wakeup,
rather than entering them directly.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20170804151440.320927-9-vsementsov@virtuozzo.com>
[eblake: tweak commit message]
Signed-off-by: Eric Blake <eblake@redhat.com>
Fix nbd_send_request to return int, as it returns a return value
of nbd_write (which is int), and the only user of nbd_send_request's
return value (nbd_co_send_request) considers it as int too.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20170804151440.320927-5-vsementsov@virtuozzo.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
Refactor nbd_receive_reply to return 1 on success, 0 on EOF (when no
data was read), and <0 for other cases, because the returned size of
the read data is not actually used.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20170804151440.320927-4-vsementsov@virtuozzo.com>
[eblake: tweak function comments]
Signed-off-by: Eric Blake <eblake@redhat.com>
Refactor nbd_read_eof to return 1 on success, 0 on EOF (when no
data was read), and <0 for other cases, because the returned size of
the read data is not actually used.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20170804151440.320927-3-vsementsov@virtuozzo.com>
[eblake: tweak function comments, rebase to test 083 enhancements]
Signed-off-by: Eric Blake <eblake@redhat.com>
083 only tests TCP. Some failures might be specific to UNIX domain
sockets.
A few adjustments are necessary:
1. Generating a port number and waiting for server startup is
TCP-specific. Use the new nbd-fault-injector.py startup protocol to
fetch the address. This is a little more elegant because we don't
need netstat anymore.
2. The NBD filter does not work for the UNIX domain sockets URIs we
generate and must be extended.
3. Run all tests twice: once for TCP and once for UNIX domain sockets.
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-Id: <20170829122745.14309-4-stefanha@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
Currently 083 waits for the nbd-fault-injector.py server to start up by
looping until netstat shows the TCP listen socket.
The startup protocol can be simplified by passing a 0 port number to
nbd-fault-injector.py. The kernel will allocate a port in bind(2) and
the final port number can be printed by nbd-fault-injector.py.
This should make it slightly nicer and less TCP-specific to wait for
server startup. This patch changes nbd-fault-injector.py, the next one
will rewrite server startup in 083.
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-Id: <20170829122745.14309-3-stefanha@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
The following segfault is encountered if the NBD server closes the UNIX
domain socket immediately after negotiation:
Program terminated with signal SIGSEGV, Segmentation fault.
#0 aio_co_schedule (ctx=0x0, co=0xd3c0ff2ef0) at util/async.c:441
441 QSLIST_INSERT_HEAD_ATOMIC(&ctx->scheduled_coroutines,
(gdb) bt
#0 0x000000d3c01a50f8 in aio_co_schedule (ctx=0x0, co=0xd3c0ff2ef0) at util/async.c:441
#1 0x000000d3c012fa90 in nbd_coroutine_end (bs=bs@entry=0xd3c0fec650, request=<optimized out>) at block/nbd-client.c:207
#2 0x000000d3c012fb58 in nbd_client_co_preadv (bs=0xd3c0fec650, offset=0, bytes=<optimized out>, qiov=0x7ffc10a91b20, flags=0) at block/nbd-client.c:237
#3 0x000000d3c0128e63 in bdrv_driver_preadv (bs=bs@entry=0xd3c0fec650, offset=offset@entry=0, bytes=bytes@entry=512, qiov=qiov@entry=0x7ffc10a91b20, flags=0) at block/io.c:836
#4 0x000000d3c012c3e0 in bdrv_aligned_preadv (child=child@entry=0xd3c0ff51d0, req=req@entry=0x7f31885d6e90, offset=offset@entry=0, bytes=bytes@entry=512, align=align@entry=1, qiov=qiov@entry=0x7ffc10a91b20, f
+lags=0) at block/io.c:1086
#5 0x000000d3c012c6b8 in bdrv_co_preadv (child=0xd3c0ff51d0, offset=offset@entry=0, bytes=bytes@entry=512, qiov=qiov@entry=0x7ffc10a91b20, flags=flags@entry=0) at block/io.c:1182
#6 0x000000d3c011cc17 in blk_co_preadv (blk=0xd3c0ff4f80, offset=0, bytes=512, qiov=0x7ffc10a91b20, flags=0) at block/block-backend.c:1032
#7 0x000000d3c011ccec in blk_read_entry (opaque=0x7ffc10a91b40) at block/block-backend.c:1079
#8 0x000000d3c01bbb96 in coroutine_trampoline (i0=<optimized out>, i1=<optimized out>) at util/coroutine-ucontext.c:79
#9 0x00007f3196cb8600 in __start_context () at /lib64/libc.so.6
The problem is that nbd_client_init() uses
nbd_client_attach_aio_context() -> aio_co_schedule(new_context,
client->read_reply_co). Execution of read_reply_co is deferred to a BH
which doesn't run until later.
In the mean time blk_co_preadv() can be called and nbd_coroutine_end()
calls aio_wake() on read_reply_co. At this point in time
read_reply_co's ctx isn't set because it has never been entered yet.
This patch simplifies the nbd_co_send_request() ->
nbd_co_receive_reply() -> nbd_coroutine_end() lifecycle to just
nbd_co_send_request() -> nbd_co_receive_reply(). The request is "ended"
if an error occurs at any point. Callers no longer have to invoke
nbd_coroutine_end().
This cleanup also eliminates the segfault because we don't call
aio_co_schedule() to wake up s->read_reply_co if sending the request
failed. It is only necessary to wake up s->read_reply_co if a reply was
received.
Note this only happens with UNIX domain sockets on Linux. It doesn't
seem possible to reproduce this with TCP sockets.
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-Id: <20170829122745.14309-2-stefanha@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
This is the follow-up patch that was discussed[*] as part of feedback to
qemu-iotest 194.
Changes in this patch:
- Supply 'job-id' parameter to `drive-mirror` invocation.
- Once migration completes, issue QMP `block-job-cancel` command on
the source QEMU to gracefully complete `drive-mirror` operation.
- Once the BLOCK_JOB_COMPLETED event is emitted, stop the NBD server
on the destination QEMU.
- Check for both the events: MIGRATION and BLOCK_JOB_COMPLETED.
With the above, the test will also be (almost) in sync with the
procedure outlined in the document 'live-block-operations.rst'[+]
(section: "QMP invocation for live storage migration with
``drive-mirror`` + NBD").
[*] https://lists.nongnu.org/archive/html/qemu-devel/2017-08/msg04820.html
-- qemu-iotests: add 194 non-shared storage migration test
[+] https://git.qemu.org/gitweb.cgi?p=qemu.git;a=blob;f=docs/interop/live-block-operations.rst
Signed-off-by: Kashyap Chamarthy <kchamart@redhat.com>
Message-Id: <20170829165058.8229-1-kchamart@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
Most qcow2 files are uncompressed so it is wasteful to allocate (32 + 1)
* cluster_size + 512 bytes upfront. Allocate s->cluster_cache and
s->cluster_data when the first read operation is performed on a
compressed cluster.
The buffers are freed in .bdrv_close(). .bdrv_open() no longer has any
code paths that can allocate these buffers, so remove the free functions
in the error code path.
This patch can result in significant memory savings when many qcow2
disks are attached or backing file chains are long:
Before 12.81% (1,023,193,088B)
After 5.36% (393,893,888B)
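Sketch of the on-demand allocation in the compressed-cluster read path
(simplified):

    if (!s->cluster_data) {
        /* one extra sector for alignment of the decompressed data */
        s->cluster_data = qemu_try_blockalign(bs->file->bs,
                                              QCOW_MAX_CRYPT_CLUSTERS
                                              * s->cluster_size + 512);
        if (!s->cluster_data) {
            return -ENOMEM;
        }
    }
    if (!s->cluster_cache) {
        s->cluster_cache = g_malloc(s->cluster_size);
    }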
Reported-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Tested-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20170821135530.32344-1-stefanha@redhat.com
Cc: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Let's do it just like the other architectures. Introduce kvm-stub.c
for stubs and kvm_s390x.h for the declarations.
Change license to GPL2+ and keep copyright notice.
As we are dropping the sysemu/kvm.h include from cpu.h, fix up includes.
Suggested-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20170818114353.13455-18-david@redhat.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
cpu.h should only contain what really has to be accessed outside of
target/s390x/. Add internal.h which can only be used inside target/s390x/.
Move everything that isn't fast enough to run away and restructure it
right away. We'll move all kvm_* stuff later.
Minor style fixes to avoid checkpatch warning to:
- struct Lowcore: "{" goes into same line as typedef
- struct LowCore: add spaces around "-" in array length calculations
- time2tod() and tod2time(): move "{" to separate line
- get_per_atmid(): add space between ")" and "?". Move cases by one char.
- get_per_atmid(): drop extra parenthesis around (1 << 6)
Change license of new file to GPL2+ and keep copyright notice.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20170818114353.13455-15-david@redhat.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
No need for kvm_enabled() as this function is only called from KVM and
there is no reason why it shouldn't be allowed for tcg. It is simply not
available under tcg.
Also, there is no need to check for the machine type anymore. Just like
ri_enabled(), we can directly use the stored flag, which results in
"true" for the "none" machine.
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20170818114353.13455-5-david@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
QEMU currently aborts if the user tries to create a skey device:
$ s390x-softmmu/qemu-system-s390x -nographic -device s390-skeys-qemu
qemu-system-s390x: hw/s390x/s390-skeys.c:30: s390_get_skeys_device:
Assertion `ss' failed.
Aborted (core dumped)
The storage key devices are only meant to be instantiated one time,
internally. They cannot be used by the user, so mark them with
user_creatable = false.
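The usual QOM way to do that is a one-line change in the device's class_init; a sketch assuming QEMU's qdev headers (the function name below is illustrative):

    static void s390_skeys_class_init(ObjectClass *oc, void *data)
    {
        DeviceClass *dc = DEVICE_CLASS(oc);

        /* internal-only device, instantiated once by the machine code */
        dc->user_creatable = false;
    }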
Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-Id: <1503569328-22197-1-git-send-email-thuth@redhat.com>
Reviewed-by: Claudio Imbrenda <imbrenda@linux.vnet.ibm.com>
Reviewed-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
VIRTIO_PCI should properly depend on CONFIG_PCI.
With this change, we can switch off pci for s390x by removing
'CONFIG_PCI=y' from the default config.
Reviewed-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
If we do not provide zpci, pci reconfiguration via sclp is not available
either. I/O adapter configuration, however, should always be present.
Rename the values that refer to I/O adapter configuration (instead of only
pci) to make things clearer.
Move length checking of the sccb for I/O adapter configuration into the
common sclp code (out of the pci code). This also fixes an issue that
the pci code would refer to a field in the sccb before checking whether
it was actually long enough.
Check for the adapter type in the sccb and return unrecognized adapter
type if the guest tries to issue I/O adapter configure/deconfigure for
a type other than pci or for pci if the zpci facility is not provided.
Reviewed-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Reviewed-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
Don't create the s390 pci host bridge if we do not provide the zpci
facility.
Reviewed-by: Thomas Huth <thuth@redhat.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
The nt2 event class is pci-only - don't look for events if pci is
not in the active cpu model.
Reviewed-by: Thomas Huth <thuth@redhat.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
The msi routing code in kvm calls some pci functions: provide
some stubs to enable builds without pci.
Also, to make this more obvious, guard them via a pci_available boolean
(which also can be reused in other places).
Fixes: e1d4fb2de ("kvm-irqchip: x86: add msi route notify fn")
Fixes: 767a554a0 ("kvm-all: Pass requester ID to MSI routing functions")
Reviewed-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
Nothing in fsdev/ or hw/9pfs/ depends on pci; it should rather depend
on CONFIG_VIRTFS and CONFIG_VIRTIO/CONFIG_XEN only.
Acked-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
KVM guests on s390 need a different page table layout than normal
processes (2kb page table + 2kb page status extensions vs 2kb page table
only). As of today this has to be enabled via the vm.allocate_pgste
sysctl.
Newer kernels (>= 4.12) on s390 check for an S390_PGSTE program header
and enable the pgste page table extensions in that case. This makes the
vm.allocate_pgste sysctl unnecessary. We enable this program header for
the s390 system emulation (qemu-system-s390x) if:
- we build on s390, for the s390 system emulation
- the linker supports --s390-pgste (binutils >= 2.29)
- KVM is enabled
This will allow distributions to disable the global vm.allocate_pgste
sysctl, which will improve the page table allocation for non-KVM
processes as only 2kb chunks are necessary.
Cc: Christian Ehrhardt <christian.ehrhardt@canonical.com>
Cc: Alexander Graf <agraf@suse.de>
Cc: Dan Horak <dhorak@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Acked-by: Janosch Frank <frankja@linux.vnet.ibm.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Message-Id: <1503483383-199649-1-git-send-email-borntraeger@de.ibm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
QEMU currently aborts when the user tries to hot-unplug a diag288
device:
$ qemu-system-s390x -nographic -nodefaults -S -monitor stdio
QEMU 2.9.92 monitor - type 'help' for more information
(qemu) device_add diag288,id=x
(qemu) device_del x
**
ERROR:qemu/qdev-monitor.c:872:qdev_unplug: assertion failed: (hotplug_ctrl)
Aborted (core dumped)
The device is not designed as hot-pluggable (it should only be used
via the "-watchdog" parameter), so let's simply remove the possibility
to hotplug it and prevent users from running into this ugly situation.
Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-Id: <1502892528-22618-1-git-send-email-thuth@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
While the PoP is silent on the issue, z/VM documentation states
that unknown diagnose codes trigger a specification exception.
We already do that when running with kvm, so change tcg to do so
as well.
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
Re-using the boot_sector code buffer from x86 for other architectures
is not very nice, especially if we add more architectures later. It's
also ugly that the test uses a huge pre-initialized array at all - the
size of the executable grows considerably due to this array. So let's use a
separate buffer for each architecture instead, allocated from the heap,
so that we really just use the memory that we need.
Suggested-by: Michael Tsirkin <mst@redhat.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Message-Id: <1502431076-22849-2-git-send-email-thuth@redhat.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
The s390-ipl device can not be created by the user, since it is meant only
to be instantiated once internally to load the ROMs and kernel. If the user
tries to do a "device_add s390-ipl" via the monitor later, QEMU aborts with
a "ROM images must be loaded at startup" error message.
Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-Id: <1502861458-30270-1-git-send-email-thuth@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
The minimum Python version supported by QEMU is 2.6. The argparse
standard library module was only added in Python 2.7. Many scripts
would like to use argparse because it supports command-line
sub-commands.
This patch adds argparse. See the top of argparse.py for details.
Suggested-by: Daniel P. Berrange <berrange@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Acked-by: John Snow <jsnow@redhat.com>
Message-id: 20170825155732.15665-2-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
There are a few cases where we're passing an Error pointer to a function
only to discard it immediately afterwards without checking it. In
these cases we can simply remove the variable and pass NULL instead.
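Illustrative before/after (do_something() is a placeholder for the affected callees):

    /* before: an Error object is collected only to be thrown away unchecked */
    Error *local_err = NULL;
    do_something(arg, &local_err);
    error_free(local_err);

    /* after: callers that do not care about the failure simply pass NULL */
    do_something(arg, NULL);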
Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20170829120836.16091-1-berto@igalia.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
If QEMU is running on a system that's out of memory and mmap()
fails, QEMU aborts with no error message at all, making it hard
to debug the reason for the failure.
Add perror() calls that will print error information before
aborting.
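A self-contained sketch of the pattern (the actual call sites are QEMU's memory allocation helpers; this just shows reporting errno before aborting):

    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/mman.h>

    static void *alloc_anon_memory(size_t size)
    {
        void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (ptr == MAP_FAILED) {
            perror("mmap");   /* prints strerror(errno) before we give up */
            abort();
        }
        return ptr;
    }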
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Tested-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-id: 20170829212053.6003-1-ehabkost@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
LeakyBucket.burst_length is defined as an unsigned integer but the
code never checks for overflows and it only makes sure that the value
is not 0.
In practice this means that the user can set something like
throttling.iops-total-max-length=4294967300 even though it is larger than
UINT_MAX, and the final value after casting to unsigned int will be 4.
This patch changes the data type to uint64_t. This does not increase
the storage size of LeakyBucket, and allows us to assign the value
directly from qemu_opt_get_number() or BlockIOThrottle and then do the
checks directly in throttle_is_valid().
The value of burst_length does not have a specific upper limit,
but since the bucket size is defined by max * burst_length we have
to prevent overflows. Instead of going for UINT64_MAX or something
similar this patch reuses THROTTLE_VALUE_MAX, which allows I/O bursts
of 1 GiB/s for 10 days in a row.
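A sketch of the resulting validation, using the shape described above (the constant's value here is chosen to match the '1 GiB/s for 10 days' figure; QEMU's actual limit is THROTTLE_VALUE_MAX):

    #include <stdbool.h>
    #include <stdint.h>

    #define VALUE_MAX 1000000000000000ULL   /* roughly 1 GiB/s for 10 days */

    typedef struct LeakyBucket {
        uint64_t avg;
        uint64_t max;
        uint64_t burst_length;   /* now uint64_t instead of unsigned int */
    } LeakyBucket;

    static bool bucket_is_valid(const LeakyBucket *bkt)
    {
        if (!bkt->burst_length || bkt->burst_length > VALUE_MAX) {
            return false;
        }
        /* the bucket size is max * burst_length, so reject overflow too */
        if (bkt->max && bkt->burst_length > VALUE_MAX / bkt->max) {
            return false;
        }
        return true;
    }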
Signed-off-by: Alberto Garcia <berto@igalia.com>
Message-id: 1b2e3049803f71cafb2e1fa1be4fb47147a0d398.1503580370.git.berto@igalia.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Both the throttling limits set with the throttling.iops-* and
throttling.bps-* options and their QMP equivalents defined in the
BlockIOThrottle struct are integer values.
Those limits are also reported in the BlockDeviceInfo struct and they
are integers there as well.
Therefore there's no reason to store them internally as double and do
the conversion every time we're setting or querying them, so this patch
uses uint64_t for those types. Let's also use an unsigned type because
we don't allow negative values anyway.
LeakyBucket.level and LeakyBucket.burst_level do however remain double
because their value changes depending on the fraction of time elapsed
since the previous I/O operation.
Signed-off-by: Alberto Garcia <berto@igalia.com>
Message-id: f29b840422767b5be2c41c2dfdbbbf6c5f8fedf8.1503580370.git.berto@igalia.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
The throttling code can change internally the value of bkt->max if it
hasn't been set by the user. The problem with this is that if we want
to retrieve the original value we have to undo this change first. This
is ugly and unnecessary: this patch removes the throttle_fix_bucket()
and throttle_unfix_bucket() functions completely and moves the logic
to throttle_compute_wait().
Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Manos Pitsidianakis <el13635@mail.ntua.gr>
Message-id: 5b0b9e1ac6eb208d709eddc7b09e7669a523bff3.1503580370.git.berto@igalia.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
The way the throttling algorithm works is that requests start being
throttled once the bucket level exceeds the burst limit. When we get
there the bucket leaks at the level set by the user (bkt->avg), and
that leak rate is what prevents guest I/O from exceeding the desired
limit.
If we don't allow bursts (i.e. bkt->max == 0) then we can start
throttling requests immediately. The problem with keeping the
threshold at 0 is that it only allows one request at a time, and as
soon as there's a bit of I/O from the guest every other request will
be throttled and performance will suffer considerably. That can even
make the guest unable to reach the throttle limit if that limit is
high enough, and that happens regardless of the block scheduler used
by the guest.
Increasing that threshold gives flexibility to the guest, allowing it
to perform short bursts of I/O before being throttled. Increasing the
threshold too much does not make a difference in the long run (because
it's the leak rate that defines the actual throughput) but it does
allow the guest to perform longer initial bursts and exceed the
throttle limit for a short while.
A burst value of bkt->avg / 10 allows the guest to perform 100ms'
worth of I/O at the target rate without being throttled.
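In code form, the threshold selection described above amounts to something like this (a sketch, not the literal throttle_compute_wait() body):

    #include <stdint.h>

    /* bucket level above which requests start to be throttled */
    static double burst_threshold(uint64_t avg, uint64_t max)
    {
        if (max == 0) {
            return (double)avg / 10;   /* ~100ms of I/O at the target rate */
        }
        return (double)max;
    }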
Signed-off-by: Alberto Garcia <berto@igalia.com>
Message-id: 31aae6645f0d1fbf3860fb2b528b757236f0c0a7.1503580370.git.berto@igalia.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
The number of queues returned by the admin command should:
1) Only count the non-admin queues.
2) Be zero-based, meaning that '0 == one non-admin queue',
'1 == two non-admin queues', and so forth.
Because our `num_queues` means the number of queues _plus_ the admin
queue, the right calculation for the number returned from the admin
command is `num_queues - 2`, combining the two requirements mentioned.
The issue was discovered by reducing num_queues from 64 to 8 and running
a Linux VM with an SMP parameter larger than that (e.g. 22). It tries to
utilize all queues, and therefore fails with an invalid queue number
when trying to queue I/Os on the last queue.
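The resulting arithmetic, as a sketch (the layout comment reflects the NVMe 'Number of Queues' feature: submission queues in the low 16 bits, completion queues in the high 16 bits):

    #include <stdint.h>

    /* num_queues includes the admin queue, hence "- 2": one to exclude the
     * admin queue and one because the returned count is zero-based */
    static uint32_t nvme_num_queues_result(uint32_t num_queues)
    {
        uint32_t n = num_queues - 2;
        return n | (n << 16);
    }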
Signed-off-by: Dan Aloni <dan@kernelim.com>
CC: Alex Friedman <alex@e8storage.com>
CC: Keith Busch <keith.busch@intel.com>
CC: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
The following scenario leads to an assertion failure in
qio_channel_yield():
1. Request coroutine calls qio_channel_yield() successfully when sending
would block on the socket. It is now yielded.
2. nbd_read_reply_entry() calls nbd_recv_coroutines_enter_all() because
nbd_receive_reply() failed.
3. Request coroutine is entered and returns from qio_channel_yield().
Note that the socket fd handler has not fired yet so
ioc->write_coroutine is still set.
4. Request coroutine attempts to send the request body with nbd_rwv()
but the socket would still block. qio_channel_yield() is called
again and assert(!ioc->write_coroutine) is hit.
The problem is that nbd_read_reply_entry() does not distinguish between
request coroutines that are waiting to receive a reply and those that
are not.
This patch adds a per-request bool receiving flag so
nbd_read_reply_entry() can avoid spurious aio_wake() calls.
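A sketch of the idea with illustrative type and field names (the wake operation is passed in so the snippet stays self-contained; in QEMU it would be the coroutine wake-up call):

    #include <stdbool.h>

    typedef struct Coroutine Coroutine;

    typedef struct NBDClientRequest {
        Coroutine *coroutine;
        bool receiving;   /* set only while waiting for a reply */
    } NBDClientRequest;

    static void wake_receiving_requests(NBDClientRequest *reqs, int n,
                                        void (*wake)(Coroutine *))
    {
        for (int i = 0; i < n; i++) {
            /* a coroutine that yielded on a blocked send is left alone,
             * so its qio_channel write state stays consistent */
            if (reqs[i].coroutine && reqs[i].receiving) {
                wake(reqs[i].coroutine);
            }
        }
    }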
Reported-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-Id: <20170822125113.5025-1-stefanha@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Tested-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
In the ->inactivate() callbacks, permissions are updated, which
typically involves a recursive check of the whole graph. Setting
BDRV_O_INACTIVE right before doing that creates a state that
bdrv_is_writable() returns false, which causes permission update
failure.
Reorder them so the flag is updated after calling the function. Note
that this doesn't break the assert in bdrv_child_cb_inactivate() because
for any specific BDS, we still update its flags first before calling
->inactivate() on it one level deeper in the recursion.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Fam Zheng <famz@redhat.com>
Message-Id: <20170823134242.12080-5-famz@redhat.com>
Tested-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
These two conditions correspond to the mirror job's source and target,
which need to be allowed as they are part of the non-shared storage
migration workflow: failing to inactivate either will result in a
failure during migration completion.
Signed-off-by: Fam Zheng <famz@redhat.com>
Message-Id: <20170823134242.12080-3-famz@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
[eblake: improve comment grammar]
Signed-off-by: Eric Blake <eblake@redhat.com>
The 'm->numa_auto_assign_ram = numa_legacy_auto_assign_ram;' line
was supposed to be in pc_i440fx_2_9_machine_options() (see commit
3bfe5716 "numa: equally distribute memory on nodes"), but the
merge commit adb354dd ("Merge remote-tracking branch
'mst/tags/for_upstream' into staging") moved it to the
pc_i440fx_2_10_machine_options().
Move the line back to pc_i440fx_2_9_machine_options().
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Message-id: 20170818190943.23858-1-ehabkost@redhat.com
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
travis builds fail at HEAD at rc3 master with
block/nbd-client.c: In function ‘nbd_read_reply_entry’:
block/nbd-client.c:110:8: error: ‘ret’ may be used uninitialized in this function [-Werror=uninitialized]
fix it by initializing 'ret' to 0
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
ppc patch queue 2017-08-23
This is identical to the pull request from yesterday (20170822),
except that a bug in one patch is fixed so that it doesn't break TCG
on a ppc host.
Last minute ppc related fixes for qemu-2.10. I'm not sure if these
are critical enough to prompt another rc, but I'm submitting them for
consideration.
First, is Cornelia's fix for 480bc11e6 which meant "make check" would
always fail on a ppc host. Tracking that down delayed submission of
the rest of these patches, sorry.
The rest are all fairly important bugfixes for qemu crashes or guest
behaviour regression on ppc. Patches 2-4 specifically are fixes for
regressions from qemu-2.9, caused by the compatibility mode and
hotplug handling cleanups for the pseries machine type.
# gpg: Signature made Wed 23 Aug 2017 01:31:47 BST
# gpg: using RSA key 0x6C38CACA20D9B392
# gpg: Good signature from "David Gibson <david@gibson.dropbear.id.au>"
# gpg: aka "David Gibson (Red Hat) <dgibson@redhat.com>"
# gpg: aka "David Gibson (ozlabs.org) <dgibson@ozlabs.org>"
# gpg: aka "David Gibson (kernel.org) <dwg@kernel.org>"
# Primary key fingerprint: 75F4 6586 AE61 A66C C44E 87DC 6C38 CACA 20D9 B392
* remotes/dgibson/tags/ppc-for-2.10-20170823:
hw/ppc/spapr_iommu: Fix crash when removing the "spapr-tce-table" device
hw/ppc/spapr_rtc: Mark the RTC device with user_creatable = false
hw/ppc/spapr: Fix segfault when instantiating a 'pc-dimm' without 'memdev'
spapr: Allow configure-connector to be called multiple times
ppc: fix ppc_set_compat() with KVM PR
target/ppc: 'PVR != host PVR' in KVM_SET_SREGS workaround
boot-serial-test: prefer tcg accelerator
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
QEMU currently aborts unexpectedly when the user tries to add and
remove a "spapr-tce-table" device:
$ qemu-system-ppc64 -nographic -S -nodefaults -monitor stdio
QEMU 2.9.92 monitor - type 'help' for more information
(qemu) device_add spapr-tce-table,id=x
(qemu) device_del x
**
ERROR:qemu/qdev-monitor.c:872:qdev_unplug: assertion failed: (hotplug_ctrl)
Aborted (core dumped)
The device should not be accessible to the users at all, it's just
used internally, so mark it with user_creatable = false.
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
QEMU currently aborts unexpectedly when a user tries to do something
like this:
$ qemu-system-ppc64 -nographic -S -nodefaults -monitor stdio
QEMU 2.9.92 monitor - type 'help' for more information
(qemu) device_add spapr-rtc,id=spapr-rtc
(qemu) device_del spapr-rtc
**
ERROR:qemu/qdev-monitor.c:872:qdev_unplug: assertion failed: (hotplug_ctrl)
Aborted (core dumped)
The RTC device is not meant to be hot-pluggable - it's an internal
device only and it should not even be possible to create it a
second time with the "-device" parameter, so let's mark this
with "user_creatable = false".
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
QEMU currently crashes when trying to use a 'pc-dimm' on the pseries
machine without specifying its 'memdev' property. This happens because
pc_dimm_get_memory_region() does not check whether the 'memdev' property
has properly been set by the user. Looking closer at this function, it's
also obvious that it is using &error_abort to call another function - and
this is bad in a function that is used in the hot-plugging calling chain
since this can also cause QEMU to exit unexpectedly.
So let's fix these issues in a proper way now: Add a "Error **errp"
parameter to pc_dimm_get_memory_region() which we use in case the 'memdev'
property has not been set by the user, and which we can use instead of
the &error_abort, and change the callers of get_memory_region() to make
use of this "errp" parameter for proper error checking.
Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
In case of in-kernel memory hot unplug, when the guest is not able
to remove all the LMBs that are requested for removal, it will add back
any LMBs that have been successfully removed. The DR Connectors of
these LMBs wouldn't have been unconfigured and hence the addition of
these LMBs will result in configure-connector call being issued on
LMB DR connectors that are already in configured state. Such
configure-connector calls will fail resulting in a DIMM which is
partially unplugged.
This however worked till recently before we overhauled the DRC
implementation in QEMU. Commit 9d4c0f4f0a: "spapr: Consolidate
DRC state variables" is the first commit where this problem shows up
as per git bisect.
Ideally the guest shouldn't be issuing a configure-connector call on an
already configured DR connector. However for now, work around this in
QEMU by allowing configure-connector to be called multiple times for
all types of DR connectors.
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
[dwg: Corrected buglet that would have initialized fdt pointers ready
for reading on a device not present at reset]
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
When running in KVM PR mode, kvmppc_set_compat() always fails because the
current PR implementation doesn't handle KVM_REG_PPC_ARCH_COMPAT. Now that
the machine code unconditionally calls ppc_set_compat_all() at reset time
to restore the compat mode default value (commit 66d5c492dd), it is
impossible to start a guest with PR:
qemu-system-ppc64: Unable to set CPU compatibility mode in KVM:
Invalid argument
A tentative patch [1] was recently sent by Suraj to address the issue, but
it would prevent the compat mode from being turned off on reset. And we really
don't want to explicitly check for KVM PR. During the patch's review,
David suggested that we should only call the KVM ioctl() if the compat
PVR changes. This allows at least to run with KVM PR, provided no compat
mode is requested from the command line (which should be the case when
running PR nested). This is what this patch does.
While here, we also fix the side effect where KVM would fail but we would
change the CPU state in QEMU anyway.
[1] http://patchwork.ozlabs.org/patch/782039/
Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Commit d5fc133eed ("ppc: Rework CPU compatibility testing
across migration") changed the way cpu_post_load behaves with
the PVR setting, causing an unexpected bug in KVM-HV migrations
between hosts that are compatible (POWER8 and POWER8E, for example).
Even with pvr_match() returning true, the guest freezes right after
cpu_post_load. The reason is that the guest kernel can't handle a
PVR value other than the running host's in KVM_SET_SREGS.
In [1] the possibility was discussed of a new KVM capability
that would indicate that the guest kernel can handle a different
PVR in KVM_SET_SREGS. Even if such a feature is implemented, there is
still the problem with older kernels that will not have this capability
and will fail to migrate.
This patch implements a workaround for that scenario. If running
with KVM, check if the guest kernel does not have the capability
(named here as 'cap_ppc_pvr_compat'). If it doesn't, call
kvmppc_is_pr() to see if the guest is running in KVM-HV. If all this
happens, set env->spr[SPR_PVR] to the same value as the current
host PVR. This ensures that we allow migrations with 'close enough'
PVRs to still work in KVM-HV but also makes the code ready for
this new KVM capability when it is done.
A new function called 'kvmppc_pvr_workaround_required' was created
to encapsulate the conditions said above and to avoid calling too
many kvm.c internals inside cpu_post_load.
[1] https://lists.gnu.org/archive/html/qemu-ppc/2017-06/msg00503.html
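Roughly, the conditions above combine as follows (a sketch; cap_ppc_pvr_compat stands for the not-yet-existing capability described in the message, the rest are existing QEMU/KVM helpers):

    static bool kvmppc_pvr_workaround_required(PowerPCCPU *cpu)
    {
        if (!kvm_enabled()) {
            return false;              /* TCG never issues KVM_SET_SREGS */
        }
        if (cap_ppc_pvr_compat) {
            return false;              /* kernel accepts a differing PVR */
        }
        /* only KVM-HV rejects a PVR other than the host's */
        return !kvmppc_is_pr(kvm_state);
    }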
Signed-off-by: Daniel Henrique Barboza <danielhb@linux.vnet.ibm.com>
[dwg: Fix for the case of using TCG on a PPC host]
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Prefer to use the tcg accelerator if it is available: this is our only
real smoke test for tcg, and fast enough to use it for that.
Fixes: 480bc11e6 ("boot-serial-test: fallback to kvm accelerator")
Reported-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
The mmio-interface device is not something we want to allow
users to create on the command line:
* it is intended as an implementation detail of the memory
subsystem, which gets created and deleted by that
subsystem on demand; it makes no sense to create it
by hand on the command line
* it uses a pointer property 'host_ptr' which can't be
set on the command line
Mark the device as not user_creatable to avoid confusion.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Message-id: 1502807418-9994-1-git-send-email-peter.maydell@linaro.org
Reviewed-by: Thomas Huth <thuth@redhat.com>
We are not providing the required single-copy atomic semantics for
the 64-bit operation that is the 32-bit paired load.
At the same time, leave the entire 64-bit value in cpu_exclusive_val
and stop writing to cpu_exclusive_high. This means that we do not
have to re-assemble the 64-bit quantity when it comes time to store.
At the same time, drop a redundant temporary and perform all loads
directly into the cpu_exclusive_* globals.
Tested-by: Alistair Francis <alistair.francis@xilinx.com>
Reviewed-by: Alistair Francis <alistair.francis@xilinx.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20170815145714.17635-3-richard.henderson@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
When we switched NBD to use coroutines for qemu 2.9 (in particular,
commit a12a712a), we introduced a regression: if a server sends us
garbage (such as a corrupted magic number), we quit the read loop
but do not stop sending further queued commands, resulting in the
client hanging when it never reads the response to those additional
commands. In qemu 2.8, we properly detected that the server is no
longer reliable, and cancelled all existing pending commands with
EIO, then tore down the socket so that all further command attempts
get EPIPE.
Restore the proper behavior of quitting (almost) all communication
with a broken server: Once we know we are out of sync or otherwise
can't trust the server, we must assume that any further incoming
data is unreliable and therefore end all pending commands with EIO,
and quit trying to send any further commands. As an exception, we
still (try to) send NBD_CMD_DISC to let the server know we are going
away (in part, because it is easier to do that than to further
refactor nbd_teardown_connection, and in part because it is the
only command where we do not have to wait for a reply).
Based on a patch by Vladimir Sementsov-Ogievskiy.
A malicious server can be created with the following hack,
followed by setting NBD_SERVER_DEBUG to a non-zero value in the
environment when running qemu-nbd:
| --- a/nbd/server.c
| +++ b/nbd/server.c
| @@ -919,6 +919,17 @@ static int nbd_send_reply(QIOChannel *ioc, NBDReply *reply, Error **errp)
| stl_be_p(buf + 4, reply->error);
| stq_be_p(buf + 8, reply->handle);
|
| + static int debug;
| + static int count;
| + if (!count++) {
| + const char *str = getenv("NBD_SERVER_DEBUG");
| + if (str) {
| + debug = atoi(str);
| + }
| + }
| + if (debug && !(count % debug)) {
| + buf[0] = 0;
| + }
| return nbd_write(ioc, buf, sizeof(buf), errp);
| }
Reported-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20170814213426.24681-1-eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
As in the case of nbd_export_new(), bdrv_invalidate_cache() can be
called when migration is still in progress. In this case we are not
ready to tighten the shared permissions fenced by blk->disable_perm.
Defer to a VM state change handler.
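A sketch of the deferral (handler name and the BlockBackend field are taken from the description above; qemu_add_vm_change_state_handler() is the existing QEMU hook):

    static void blk_vm_state_changed(void *opaque, int running, RunState state)
    {
        BlockBackend *blk = opaque;

        if (running && blk->disable_perm) {
            blk->disable_perm = false;
            /* ... now re-apply the originally requested permissions ... */
        }
    }

    /* registered once per BlockBackend:
     *     qemu_add_vm_change_state_handler(blk_vm_state_changed, blk);
     */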
Signed-off-by: Fam Zheng <famz@redhat.com>
Message-Id: <20170815130740.31229-4-famz@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
The 093 throttling test submits twice as many requests as the throttle
limit in order to ensure that we reach the limit. The remaining
requests are left in-flight at the end of each test iteration.
Commit 452589b6b4 ("vl.c/exit: pause cpus
before closing block devices") exposed a hang in 093. This happens
because requests are still in flight when QEMU terminates but
QEMU_CLOCK_VIRTUAL time is frozen. bdrv_drain_all() hangs forever since
throttled requests cannot complete.
Step the clock at the end of each test iteration so in-flight requests
actually finish. This solves the hang and is cleaner than leaving requests
in-flight.
Note that this could also be "fixed" by disabling throttling when drives
are closed in QEMU. That approach has two issues:
1. We must drain requests before disabling throttling, so the hang
cannot be easily avoided!
2. Any time QEMU disables throttling internally there is a chance that
malicious users can abuse the code path to bypass throttling limits.
Therefore it makes more sense to fix the test case than to modify QEMU.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-Id: <20170815130502.8736-1-stefanha@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
The simpletrace.py script can pretty-print flight recorder ring buffers.
These are not full simpletrace binary trace files but just the end of a
trace file. There is no header and the event ID mapping information is
often unavailable since the ring buffer may have filled up and discarded
event ID mapping records.
The simpletrace.stp script that generates ring buffer traces uses the
same trace-events-all input file as simpletrace.py. Therefore both
scripts have the same global ordering of trace events. A dynamic event
ID mapping isn't necessary: just use the trace-events-all file as the
reference for how event IDs are numbered.
It is now possible to analyze simpletrace.stp ring buffers again using:
$ ./simpletrace.py trace-events-all path/to/ring-buffer
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170815084430.7128-3-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
This is a partial revert of commit
7f1b588f20 ("trace: emit name <-> ID
mapping in simpletrace header"), which broke the SystemTap flight
recorder because event mapping records may not be present in the ring
buffer when the trace is analyzed. This means simpletrace.py
--no-header does not know the event ID mapping needed to pretty-print
the trace.
Instead of numbering events dynamically, use a static event ID mapping
as dictated by the event order in the trace-events-all file.
The simpletrace.py script also uses trace-events-all so the next patch
will fix the simpletrace.py --no-header option to take advantage of this
knowledge.
Cc: Daniel P. Berrange <berrange@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170815084430.7128-2-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
# gpg: Signature made Tue 15 Aug 2017 11:50:36 BST
# gpg: using RSA key 0xCA35624C6A9171C6
# gpg: Good signature from "Fam Zheng <famz@redhat.com>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg: It is not certain that the signature belongs to the owner.
# Primary key fingerprint: 5003 7CB7 9706 0F76 F021 AD56 CA35 624C 6A91 71C6
* remotes/famz/tags/build-and-test-pull-request:
docker: add centos7 image
docker: install more packages on CentOS to extend code coverage
docker: add Xen libs to centos6 image
docker: use one package per line in CentOS config
Makefile: Let "make check-help" work without running ./configure
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Currently if you do "make check-help" in a fresh checkout, only an error
is printed which is not nice:
$ make check-help V=1
cc -nostdlib -o check-help.mo
cc: fatal error: no input files
compilation terminated.
rules.mak:115: recipe for target 'check-help.mo' failed
make: *** [check-help.mo] Error 1
Move the config-host.mak condition into the body of
tests/Makefile.include and always include the rule for check-help.
Reported-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Fam Zheng <famz@redhat.com>
Message-Id: <20170810085025.14076-1-famz@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Tested-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Fam Zheng <famz@redhat.com>
This reverts a change that replaced the "rm -f" command with the
undefined variable RM (expected to be set by make), and causes the
"make clean" command to fail for a s390 target:
make[1]: Entering directory '/usr/src/qemu/build/pc-bios/s390-ccw'
rm -f *.timestamp
*.o *.d *.img *.elf *~ *.a
/bin/sh: *.o: command not found
Makefile:39: recipe for target 'clean' failed
make[1]: *** [clean] Error 127
make[1]: Leaving directory '/usr/src/qemu/build/pc-bios/s390-ccw'
Makefile:489: recipe for target 'clean' failed
make: *** [clean] Error 1
Fixes: 3e4415a751 ("pc-bios/s390-ccw: Add core files for the network
bootloading program")
Signed-off-by: Eric Farman <farman@linux.vnet.ibm.com>
Message-Id: <20170814204450.24118-2-farman@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
# gpg: Signature made Mon 14 Aug 2017 13:32:10 BST
# gpg: using RSA key 0xEF04965B398D6211
# gpg: Good signature from "Jason Wang (Jason Wang on RedHat) <jasowang@redhat.com>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg: It is not certain that the signature belongs to the owner.
# Primary key fingerprint: 215D 46F4 8246 689E C77F 3562 EF04 965B 398D 6211
* remotes/jasowang/tags/net-pull-request:
qemu-doc: Mention host_net_add/-remove in the deprecation chapter
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
The two HMP commands host_net_add and -remove have recently been
marked as deprecated, too, so we should now mention them in the
chapter of deprecated features.
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
QEMU currently abort()s if the user tries to specify the mmio_interface
device without parameters:
x86_64-softmmu/qemu-system-x86_64 -nographic -device mmio_interface
qemu-system-x86_64: /home/thuth/devel/qemu/util/error.c:57: error_setv:
Assertion `*errp == ((void *)0)' failed.
Aborted (core dumped)
This happens because the realize function is trying to set the errp
twice in this case. After setting an error, the realize function
should immediately return instead.
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
The check script contains a commented out root user requirement,
probably because of its xfstests heritage. This requirement doesn't
apply to qemu-iotests, so it better be gone.
Signed-off-by: Cleber Rosa <crosa@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
The variables FULL_MKFS_OPTIONS and FULL_MOUNT_OPTIONS are commented
out, never used, and even refer to functions that do not exist. The last
time these were touched was around 8 years ago, so I guess it's safe
to assume outputting such information on test execution is no longer on
the radar.
Signed-off-by: Cleber Rosa <crosa@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
Although this function is used, its implementation does nothing
besides echoing a variable name. There's no need to wrap this
functionality in a function, and based on the one usage it has, it's
not even required to adhere to a convention or code style.
Signed-off-by: Cleber Rosa <crosa@redhat.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
While Andrew S. Tanenbaum has a point by saying "Never underestimate the
bandwidth of a station wagon full of tapes hurtling down the highway",
we don't support that way of transportation in QEMU yet, so replace the
typo with the correct word "vlan".
Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
While Andrew S. Tanenbaum has a point by saying "Never underestimate the
bandwidth of a station wagon full of tapes hurtling down the highway",
we don't support that way of transportation in QEMU yet, so replace the
typo with the correct word "vlan".
Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-Id: <1502365466-19432-1-git-send-email-thuth@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Currently, at least x86_64 and s390x support building with --disable-tcg.
Instead of forcing tcg (which causes the test to fail on such builds),
allow kvm to be used as well.
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
185 can sometimes produce wrong output like this:
185 2s ... - output mismatch (see 185.out.bad)
--- /work/src/qemu/master/tests/qemu-iotests/185.out 2017-07-14 \
15:14:29.520343805 +0300
+++ 185.out.bad 2017-08-07 16:51:02.231922900 +0300
@@ -37,7 +37,7 @@
{"return": {}}
{"return": {}}
{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, \
"event": "SHUTDOWN", "data": {"guest": false}}
-{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, \
"event": "BLOCK_JOB_CANCELLED", "data": {"device": "disk", \
"len": 4194304, "offset": 4194304, "speed": 65536, "type": \
"mirror"}}
+{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, \
"event": "BLOCK_JOB_CANCELLED", "data": {"device": "disk", \
"len": 0, "offset": 0, "speed": 65536, "type": "mirror"}}
=== Start backup job and exit qemu ===
Failures: 185
Failed 1 of 1 tests
This is because, under heavy load, the quit can happen before the first
iteration of the mirror request has occurred. To make sure we've had
time to iterate, let's just add a sleep for 0.5 seconds before quitting.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
It is reported that on Windows Subsystem for Linux, ofd operations fail
with -EINVAL. In other words, a QEMU binary built with system headers that
export F_OFD_SETLK doesn't necessarily run in an environment that
actually supports it:
$ qemu-system-aarch64 ... -drive file=test.vhdx,if=none,id=hd0 \
-device virtio-blk-pci,drive=hd0
qemu-system-aarch64: -drive file=test.vhdx,if=none,id=hd0: Failed to unlock byte 100
qemu-system-aarch64: -drive file=test.vhdx,if=none,id=hd0: Failed to unlock byte 100
qemu-system-aarch64: -drive file=test.vhdx,if=none,id=hd0: Failed to lock byte 100
As a matter of fact this is not WSL specific. It can happen when running
a QEMU compiled against a newer glibc on an older kernel, such as in
a containerized environment.
Let's do a runtime check to cope with that.
Reported-by: Andrew Baumann <Andrew.Baumann@microsoft.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
A build time check of OFD lock is not sufficient and can cause image open
errors when the runtime environment doesn't support it.
Add a helper function to probe it at runtime, additionally. Also provide
a qemu_has_ofd_lock() for callers to check the status.
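A self-contained sketch of such a runtime probe (QEMU caches the result behind qemu_has_ofd_lock(); here the probe simply asks the kernel whether F_OFD_GETLK works on a given descriptor):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdbool.h>
    #include <string.h>
    #include <unistd.h>

    static bool ofd_lock_supported(int fd)
    {
    #ifdef F_OFD_GETLK
        struct flock fl;

        memset(&fl, 0, sizeof(fl));
        fl.l_whence = SEEK_SET;
        fl.l_type = F_WRLCK;
        /* EINVAL here means the running kernel predates OFD locks even
         * though the headers we were built against define them */
        return fcntl(fd, F_OFD_GETLK, &fl) == 0;
    #else
        (void)fd;
        return false;
    #endif
    }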
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
It's been #if 0'd since its introduction in 2006, commit 585f8587.
We can revive dead code if we need it, but in the meantime, it has
bit-rotted (for example, not checking for failure in bdrv_getlength()).
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Commit b43671f8 accidentally broke run_test.sh within tests/multiboot,
due to a subtle change in whitespace.
These two commands produce the same output (at least, for sane $IFS
of space-tab-newline):
echo -e "...$@..."
echo -e "...$*..."
But that's only because echo inserts spaces between multiple arguments
(the $@ case), while the $* form gives a single argument to echo with
the spaces already present.
But when converting to printf %b, there are no automatic spaces between
multiple arguments, so we HAVE to use $*.
It doesn't help that run_test.sh isn't part of 'make check'.
Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Just a single fix for an annoying regression introduced in 2.9 when fixing
CVE-2016-9602.
# gpg: Signature made Thu 10 Aug 2017 13:40:28 BST
# gpg: using DSA key 0x02FC3AEB0101DBC2
# gpg: Good signature from "Greg Kurz <groug@kaod.org>"
# gpg: aka "Greg Kurz <groug@free.fr>"
# gpg: aka "Greg Kurz <gkurz@linux.vnet.ibm.com>"
# gpg: aka "Gregory Kurz (Groug) <groug@free.fr>"
# gpg: aka "[jpeg image of size 3330]"
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: 2BD4 3B44 535E C0A7 9894 DBA2 02FC 3AEB 0101 DBC2
* remotes/gkurz/tags/for-upstream:
9pfs: local: fix fchmodat_nofollow() limitations
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This function has to ensure it doesn't follow a symlink that could be used
to escape the virtfs directory. This could be easily achieved if fchmodat()
on linux honored the AT_SYMLINK_NOFOLLOW flag as described in POSIX, but
it doesn't. There was an attempt to implement a new fchmodat2() syscall
with the correct semantics:
https://patchwork.kernel.org/patch/9596301/
but it didn't gain much momentum. Also it was suggested to look at an O_PATH
based solution in the first place.
The current implementation covers most use-cases, but it notably fails if:
- the target path has access rights equal to 0000 (openat() returns EPERM),
=> once you've done chmod(0000) on a file, you can never chmod() again
- the target path is UNIX domain socket (openat() returns ENXIO)
=> bind() of UNIX domain sockets fails if the file is on 9pfs
The solution is to use O_PATH: openat() now succeeds in both cases, and we
can ensure the path isn't a symlink with fstat(). The associated entry in
"/proc/self/fd" can hence be safely passed to the regular chmod() syscall.
The previous behavior is kept for older systems that don't have O_PATH.
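A self-contained sketch of the O_PATH approach (the 9pfs code has more around it, such as the pre-O_PATH fallback, but this is the core idea):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/stat.h>
    #include <unistd.h>

    static int chmod_nofollow(int dirfd, const char *name, mode_t mode)
    {
        char proc_path[64];
        struct stat st;
        int ret = -1;
        /* O_PATH succeeds even for mode-0000 files and UNIX sockets */
        int fd = openat(dirfd, name, O_PATH | O_NOFOLLOW);

        if (fd < 0) {
            return -1;
        }
        /* refuse to operate on symlinks */
        if (fstat(fd, &st) == 0 && !S_ISLNK(st.st_mode)) {
            snprintf(proc_path, sizeof(proc_path), "/proc/self/fd/%d", fd);
            ret = chmod(proc_path, mode);
        }
        close(fd);
        return ret;
    }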
Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Eric Blake <eblake@redhat.com>
Tested-by: Zhi Yong Wu <zhiyong.wu@ucloud.cn>
Acked-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
ppc patch queue 2017-08-09
This series contains a number of bugfixes for ppc and related
machines, for the qemu-2.10 release. Some are true regressions,
others are serious enough and non-invasive enough to fix that it's
worth putting in 2.10 this late.
# gpg: Signature made Wed 09 Aug 2017 07:31:33 BST
# gpg: using RSA key 0x6C38CACA20D9B392
# gpg: Good signature from "David Gibson <david@gibson.dropbear.id.au>"
# gpg: aka "David Gibson (Red Hat) <dgibson@redhat.com>"
# gpg: aka "David Gibson (ozlabs.org) <dgibson@ozlabs.org>"
# gpg: aka "David Gibson (kernel.org) <dwg@kernel.org>"
# Primary key fingerprint: 75F4 6586 AE61 A66C C44E 87DC 6C38 CACA 20D9 B392
* remotes/dgibson/tags/ppc-for-2.10-20170809:
spapr: Fix bug in h_signal_sys_reset()
spapr_drc: abort if object_property_add_child() fails
target/ppc: Add stub implementation of the PSSCR
target/ppc: Implement TIDR
ppc: fix double-free in cpu_post_load()
booke206: fix MAS update on tlb miss
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
pc, vhost: fixes for rc3
Fix up bugs and warnings in tests. Revert an experimental commit that I
put in by mistake: harmless but useless.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
# gpg: Signature made Wed 09 Aug 2017 02:23:17 BST
# gpg: using RSA key 0x281F0DB8D28D5469
# gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>"
# gpg: aka "Michael S. Tsirkin <mst@redhat.com>"
# Primary key fingerprint: 0270 606B 6F3C DF3D 0B17 0970 C350 3912 AFBE 8E67
# Subkey fingerprint: 5D09 FD08 71C8 F85B 94CA 8A0D 281F 0DB8 D28D 5469
* remotes/mst/tags/for_upstream:
libqtest: always set up signal handler for SIGABRT
libvhost-user: quit when no more data received
net: fix -netdev socket,fd= for UDP sockets
Revert "cpu: add APIs to allocate/free CPU environment"
acpi-test: update expected DSDT files
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
The unicast case in h_signal_sys_reset() seems to be broken:
rather than selecting the target CPU, it looks like it will pick
either the first CPU or fail to find one at all.
Fix it by using the search function rather than open coding the
search.
This was found by inspection; the code appears to be unused because
the Linux kernel only uses the broadcast target.
Signed-off-by: Sam Bobroff <sam.bobroff@au1.ibm.com>
Reviewed-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
object_property_add_child() can only fail in two cases:
- the child already has a parent, which shouldn't happen since the DRC was
allocated a few lines above
- the parent already has a child with the same name, which would mean the
caller tries to create a DRC that already exists
In both cases, this is a QEMU bug and we should abort.
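In code, that simply means passing &error_abort (sketch; the owner object and child name are whatever the DRC creation code already uses):

    /* any failure here is a QEMU programming error, so abort loudly */
    object_property_add_child(owner, child_name, OBJECT(drc), &error_abort);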
Signed-off-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
The PSSCR register added in POWER9 controls certain power saving mode
behaviours. Mostly, it's not relevant to TCG; however, because qemu
doesn't know about it yet, it doesn't synchronize the state with KVM,
and thus it doesn't get migrated.
To fix that, this adds a minimal stub implementation of the register.
This isn't complete, even to the extent that an implementation is
possible in TCG, just enough to get migration working. We need to
come back later and at least properly filter the various fields in the
register based on privilege level.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Cédric Le Goater <clg@kaod.org>
Reviewed-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Thomas Huth <thuth@redhat.com>
This adds a trivial implementation of the TIDR register added in
POWER9. This isn't particularly important to qemu directly - it's
used by accelerator modules that we don't emulate.
However, since qemu isn't aware of it, its state is not synchronized
with KVM and therefore not migrated, which can be a problem.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Cédric Le Goater <clg@kaod.org>
Reviewed-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Thomas Huth <thuth@redhat.com>
When running nested with KVM PR, ppc_set_compat() fails and QEMU crashes
because of "double free or corruption (!prev)". The crash happens because
error_report_err() has already called error_free().
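The pattern behind the crash, as a sketch (ppc_set_compat() per the message; the point is that error_report_err() already frees the Error):

    Error *local_err = NULL;

    ppc_set_compat(cpu, compat_pvr, &local_err);
    if (local_err) {
        error_report_err(local_err);   /* reports *and* frees local_err */
        /* calling error_free(local_err) again here is the double free */
    }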
Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
When a tlb instruction miss happens, rw is set to 0 at the bottom
of cpu_ppc_handle_mmu_fault, which causes the MAS update function to miss
the SAS and TS bits in MAS6 and MAS1 in booke206_update_mas_tlb_miss.
Just calling booke206_update_mas_tlb_miss with rw = 2 solves the issue.
Signed-off-by: KONRAD Frederic <frederic.konrad@adacore.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Currently abort handlers only work for the first test function
in a testcase, because the list of abort handlers is not properly
cleared when qtest_quit() is called.
qtest_quit() only deletes the kill_qemu_hook but doesn't completely
clear the abrt_hooks list. The effect is that abrt_hooks.is_setup is
never set to false and in a following test the abrt_hooks list is not
initialized and setup_sigabrt_handler() is not called.
One way to solve this is to clear the list in qtest_quit(), but
that means only asserts between qtest_start and qtest_quit will
be caught by the abort handler.
We can make abort handlers work in all cases if we always setup the
signal handler for SIGABRT in qtest_init.
Signed-off-by: Jens Freimann <jfreimann@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
End processing of messages when VHOST_USER_NONE
is received.
Without this we run into a vubr_panic() call and get
"PANIC: Unhandled request: 0"
Signed-off-by: Jens Freimann <jfreiman@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
This patch fixes -netdev socket,fd= for UDP sockets
Currently -netdev socket,fd=<...> results in
qemu: error: specified mcastaddr "127.0.0.1" (0x7f000001) does not
contain a multicast address
qemu-system-x86_64: -netdev
socket,id=n1,fd=3: Device 'socket' could not be initialized
To fix these we need to allow specifying multicast and fd arguments
for the same netdev. With this the user can specify "-netdev
fd=3,mcast=<IP:port>"
Cc: Jason Wang <jasowang@redhat.com>
Fixes: 3d830459b1
Signed-off-by: Jens Freimann <jfreimann@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
These days, many programs are including a bug-reporting address,
or better yet, a link to the project web site, at the tail of
their --help output. However, we were not very consistent at
doing so: only qemu-nbd and qemu-ga mentioned anything, with the
latter pointing to an individual person instead of the project.
Add a new #define that sets up a uniform string, mentioning both
bug reporting instructions and overall project details, and which
a downstream vendor could tweak if they want bugs to go to a
downstream database. Then use it in all of our binaries which
have --help output.
The canned text intentionally references http:// instead of https://
because our https website currently causes certificate errors in
some browsers. That can be tweaked later once we have resolved the
web site issues.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-Id: <20170803163353.19558-5-eblake@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
'amend' and 'create' were not listed alphabetically; hoist them
earlier. Separate the @end table block to make it easier to
copy-and-paste the addition of future sub-commands.
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20170803163353.19558-2-eblake@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Block layer patches for 2.10.0-rc2
# gpg: Signature made Tue 08 Aug 2017 14:56:15 BST
# gpg: using RSA key 0x7F09B272C88F2FD6
# gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>"
# Primary key fingerprint: DC3D EB15 9A9A F95D 3D74 56FE 7F09 B272 C88F 2FD6
* remotes/kevin/tags/for-upstream:
block/nfs: fix mutex assertion in nfs_file_close()
qemu-iotests: Test reopen between read-only and read-write
qemu-io: Allow reopen read-write
block: Set BDRV_O_ALLOW_RDWR during rw reopen
block: Allow reopen rw without BDRV_O_ALLOW_RDWR
block: Fix order in bdrv_replace_child()
parallels: drop check that bdrv_truncate() is working
parallels: respect error code of bdrv_getlength() in allocate_clusters()
block: respect error code from bdrv_getlength in handle_aiocb_write_zeroes
vmdk: Fix error handling/reporting of vmdk_check
block/null: Remove 'filename' option
block: drop bdrv_set_key from BlockDriver
block/vhdx: check error return of bdrv_truncate()
block/vhdx: check error return of bdrv_flush()
block/vhdx: check for offset overflow to bdrv_truncate()
block/vhdx: check error return of bdrv_getlength()
quorum: Set sectors-count to 0 when reporting a flush error
qemu-iotests/109: Fix lock race condition
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Commit c096358e74 introduced assertion
checks for when qemu_mutex() functions are called without the
corresponding qemu_mutex_init() having initialized the mutex.
This uncovered a latent bug in qemu's nfs driver - in
nfs_client_close(), the NFSClient structure is overwritten with zeros,
prior to the mutex being destroyed.
Go ahead and destroy the mutex in nfs_client_close(), and change where
we call qemu_mutex_init() so that it is correctly balanced.
There are also a couple of memory leaks obscured by the memset, so this
fixes those as well.
Finally, we should be able to get rid of the memset(), as it isn't
necessary.
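A sketch of the corrected teardown order (illustrative; the real nfs_client_close() also tears down the libnfs context and its AIO handlers):

    static void nfs_client_close(NFSClient *client)
    {
        /* ... destroy the nfs context, free url/path, remove fd handlers ... */
        qemu_mutex_destroy(&client->mutex);
        /* the old memset(client, 0, sizeof(*client)) is gone: it clobbered
         * the mutex before it could be destroyed and hid the leaks above */
    }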
Cc: qemu-stable@nongnu.org
Signed-off-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Peter Lieven <pl@kamp.de>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
This serves as a regression test for the bugs that were just fixed for
bdrv_reopen() between read-only and read-write mode.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
This allows qemu-iotests to test the switch between read-only and
read-write mode for block devices.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reopening an image should be consistent with opening it, so we should
set BDRV_O_ALLOW_RDWR for any image that is reopened read-write like in
bdrv_open_inherit().
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
BDRV_O_ALLOW_RDWR is a flag that tells whether qemu can internally
reopen a node read-write temporarily because the user requested
read-write for the top-level image, but qemu decided that read-only is
enough for this node (a backing file).
bdrv_reopen() is different, it is also used for cases where the user
changed their mind and wants to update the options. There is no reason
to forbid making a node read-write in that case.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Commit 8ee03995 refactored the code incorrectly and broke the release of
permissions on the old BDS. Instead of changing the permissions to the
new required values after removing the old BDS from the list of
children, it only re-obtains the permissions it already had.
Change the order of operations so that the old BDS is removed again
before calculating the new required permissions.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
This would actually be strange and error-prone. If truncate() fails
nowadays, there is something fatally wrong. Let's check for that during
the actual work.
The only fallback case is when the file is not zero initialized. In this
case we should switch to preallocation via fallocate().
Signed-off-by: Denis V. Lunev <den@openvz.org>
CC: Markus Armbruster <armbru@redhat.com>
CC: Kevin Wolf <kwolf@redhat.com>
CC: Max Reitz <mreitz@redhat.com>
CC: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
The original idea behind the code in question was the following: we have
failed to write zeroes with fallocate(FALLOC_FL_ZERO_RANGE) as the simplest
approach and via fallocate(FALLOC_FL_PUNCH_HOLE)/fallocate(0). Our only
remaining chance is that the request lies beyond the end of the file. Thus
we should calculate the file length and respect the error code from that
operation.
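A loose sketch of the intended fallback order, in plain POSIX calls rather
than the actual file-posix code:
    if (fallocate(fd, FALLOC_FL_ZERO_RANGE, start, len) == 0) {
        return 0;                          /* the simplest approach worked */
    }
    if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, start, len) == 0 &&
        fallocate(fd, 0, start, len) == 0) {
        return 0;                          /* punch hole + reallocate worked */
    }
    /* Last chance: the request may lie entirely beyond EOF, where the file
     * reads back as zeroes anyway.  Respect the error from the length query
     * instead of silently assuming success. */
    off_t end = lseek(fd, 0, SEEK_END);
    if (end < 0) {
        return -errno;
    }
    return start >= end ? 0 : -ENOTSUP;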
Signed-off-by: Denis V. Lunev <den@openvz.org>
CC: Markus Armbruster <armbru@redhat.com>
CC: Kevin Wolf <kwolf@redhat.com>
CC: Max Reitz <mreitz@redhat.com>
CC: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Errors from the callees must be captured and propagated to our caller,
ensure this for both find_extent() and bdrv_getlength().
Reported-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
This option was only added to allow 'null-co://' and 'null-aio://' as
filenames; its value never served any actual purpose and was ignored.
Nevertheless it was accepted as '-drive driver=null,filename=foo'.
The correct way to enable the protocol prefixes (without adding a
useless -drive option) is to implement .bdrv_parse_filename, which is
what this patch does.
Technically, this is an incompatible change, but the null block driver
is only used for benchmarking, testing and debugging, and an option
without effect isn't likely to be used by anyone anyway, so no bad
effects are to be expected.
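The resulting hook is tiny; roughly (a sketch modelled on how other
drivers implement .bdrv_parse_filename, not the exact block/null.c code):
    static void null_co_parse_filename(const char *filename, QDict *options,
                                       Error **errp)
    {
        /* Exists only so that the protocol prefix is accepted; nothing from
         * the filename needs to be turned into options. */
        if (strcmp(filename, "null-co://")) {
            error_setg(errp, "The only allowed filename for this driver is "
                       "'null-co://'");
        }
    }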
Reported-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
VHDX uses uint64_t types for most offsets, following the VHDX spec.
However, bdrv_truncate() takes an int64_t value for the truncating
offset. Check for overflow before calling bdrv_truncate().
While we are here, replace the bit shifting with QEMU_ALIGN_UP as well.
N.B.: For a compliant image this is not an issue, as the maximum VHDX
image size is defined per the spec to be 64TB.
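Sketch of the check (desired_size and block_size are placeholder names,
not the exact vhdx.c variables):
    uint64_t new_file_size = QEMU_ALIGN_UP(desired_size, block_size);
    if (new_file_size > INT64_MAX) {
        error_setg(errp, "Image size overflows the int64_t offset of bdrv_truncate()");
        return -EINVAL;
    }
    /* from here on it is safe to pass (int64_t)new_file_size to bdrv_truncate() */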
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Calls to bdrv_getlength() were not checking for error. In vhdx.c, this
can lead to truncating an image file, so it is a definite bug. In
vhdx-log.c, the path for improper behavior is less clear, but it is best
to check in any case.
Some minor code movement of the log_guid initialization, as well.
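The shape of the fix is the usual one (a sketch, not the exact hunks):
    int64_t file_size = bdrv_getlength(bs->file->bs);
    if (file_size < 0) {
        /* propagate the error instead of truncating with a bogus size */
        error_setg_errno(errp, -file_size, "Could not determine image size");
        return file_size;
    }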
Reported-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Jeff Cody <jcody@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
The QUORUM_REPORT_BAD event has fields to report the sector in which
the error was detected and the number of affected sectors starting
from that one. This is important for read and write errors, but not
for flush errors.
For flush errors the current code reports the total size of the disk
image. That is however not useful information in this case. Moreover,
the bdrv_getlength() call can fail, and there's no good way of
handling that failure.
Since we're reporting useless information and we cannot even guarantee
to do it in a consistent way, this patch changes the code to report 0
instead in all cases.
Reported-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
A race condition is currently present between the clean up attempt of
the QEMU process and the execution of qemu-img. The actual (bad)
output is:
-Warning: Image size mismatch!
-Images are identical.
+qemu-img: Could not open '<build_dir>/tests/qemu-iotests/scratch/t.raw': Failed to get "consistent read" lock
+Is another process using the image?
A KILL signal is sent to the QEMU process, but qemu-img may begin to
run before the QEMU process is really gone. qemu-img will then
attempt to open the TEST_IMG file before it can secure a lock on it.
This attempts a more graceful shutdown, and waits for the QEMU process
to exit.
Signed-off-by: Cleber Rosa <crosa@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
virtio: fix for rc2
It turns out there's a way to set up SHPC on Q35: just put
a PCI-to-PCI bridge behind a DMI-to-PCI one. Our _OSC is
thus incorrect.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
# gpg: Signature made Mon 07 Aug 2017 22:39:20 BST
# gpg: using RSA key 0x281F0DB8D28D5469
# gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>"
# gpg: aka "Michael S. Tsirkin <mst@redhat.com>"
# Primary key fingerprint: 0270 606B 6F3C DF3D 0B17 0970 C350 3912 AFBE 8E67
# Subkey fingerprint: 5D09 FD08 71C8 F85B 94CA 8A0D 281F 0DB8 D28D 5469
* remotes/mst/tags/for_upstream:
cpu: add APIs to allocate/free CPU environment
hw/i386: allow SHPC for Q35 machine
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
When emulating various SSE4.1 instructions such as pinsrd, the address
of a memory operand is computed without allowing for the 8-bit
immediate operand located after the memory operand, meaning that the
memory operand uses the wrong address in the case where it is
rip-relative. This patch adds the required rip_offset setting for
those instructions, so fixing some GCC test failures (13 in the gcc
testsuite in my GCC 6-based testing) when testing with a default CPU
setting enabling those instructions.
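In target/i386/translate.c terms the fix is essentially this fragment
(a sketch of the pattern, not the exact hunk):
    /* pinsr* and friends: the modrm memory operand is followed by an imm8,
     * so account for it before computing a RIP-relative address */
    if (mod != 3) {
        s->rip_offset = 1;
        gen_lea_modrm(env, s, modrm);
    }
    val = cpu_ldub_code(env, s->pc++);   /* the trailing immediate */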
Signed-off-by: Joseph Myers <joseph@codesourcery.com>
Message-Id: <alpine.DEB.2.20.1708080041391.28702@digraph.polyomino.org.uk>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
The LUN0 emulation is just that, an emulation for a non-existing
LUN0. So we should be returning LUN_NOT_SUPPORTED for any request
coming from any other LUN.
And we should be aborting unhandled commands with INVALID OPCODE,
not LUN NOT SUPPORTED.
Signed-off-by: Hannes Reinecke <hare@suse.com>
Message-Id: <1501835795-92331-4-git-send-email-hare@suse.de>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Building QEMU on fedora26 with the latest gcc package fails:
CC ppc64-softmmu/target/ppc/kvm.o
In file included from include/sysemu/hw_accel.h:16:0,
from target/ppc/kvm.c:31:
target/ppc/kvm.c: In function ‘kvmppc_booke_watchdog_enable’:
include/sysemu/kvm.h:449:35: error: ‘args_tmp[i]’ may be used uninitialized
in this function [-Werror=maybe-uninitialized]
cap.args[i] = args_tmp[i]; \
^
target/ppc/kvm.c: In function ‘kvmppc_set_papr’:
include/sysemu/kvm.h:449:35: error: ‘args_tmp[i]’ may be used uninitialized
in this function [-Werror=maybe-uninitialized]
cc1: all warnings being treated as errors
$ rpm -q gcc
gcc-7.1.1-3.fc26.ppc64le
The compiler should obviously optimize this code away when no extra
argument is passed to kvm_vm_enable_cap() and kvm_vcpu_enable_cap(),
but it doesn't. This bug should be fixed one day in gcc, but we can
also change our code pattern so that we don't hit the issue anymore.
We work around this by using memcpy() instead of open-coding the copy.
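Roughly, the per-element copy inside the macro becomes a bounded memcpy
(a sketch of the idea, not the literal kvm.h diff):
    /* before: gcc 7 cannot prove args_tmp[i] is initialized */
    for (i = 0; i < (int)ARRAY_SIZE(args_tmp) && i < ARRAY_SIZE(cap.args); i++) {
        cap.args[i] = args_tmp[i];
    }
    /* after: a size the compiler can reason about */
    size_t n = MIN(ARRAY_SIZE(args_tmp), ARRAY_SIZE(cap.args));
    if (n) {
        memcpy(cap.args, args_tmp, n * sizeof(cap.args[0]));
    }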
Signed-off-by: Greg Kurz <groug@kaod.org>
Message-Id: <150210580404.1343.7325713896658799315.stgit@bahia.lan>
Acked-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
This reverts commit a59629fcc6.
This is not needed anymore because the IOThread mutex is no longer
"magic" (it need not kick the CPU thread), and also because
fork callbacks are only enabled at the very beginning of
QEMU's execution.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Because of -daemonize, system mode QEMU sometimes needs to fork() and
keep RCU enabled in the child. However, there is a possible deadlock
with synchronize_rcu:
- the CPU thread is inside a RCU critical section and wants to take
the BQL in order to do MMIO
- the monitor thread, which owns the BQL, calls rcu_init_lock,
which tries to take the rcu_sync_lock
- the call_rcu thread has taken rcu_sync_lock in synchronize_rcu, but
synchronize_rcu needs the CPU thread to end the critical section
before returning.
This cannot happen for user-mode emulation, because it does not have
a BQL.
To fix it, assume that system mode QEMU only forks in preparation for
exec (except when daemonizing) and disable pthread_atfork as soon as
the double fork has happened.
Reported-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Tested-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Unmask previously masked SHPC feature in _OSC method.
Signed-off-by: Aleksandr Bezzubikov <zuban32s@gmail.com>
Reviewed-by: Marcel Apfelbaum <marcel@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
There are trace probes in bdrv_co_readv|writev, however, the
block drivers are being gradually moved over to using the
bdrv_co_preadv|pwritev functions instead. As a result some
block drivers miss the current probes. Move the probes
into bdrv_co_preadv|pwritev instead, so that they are triggered
by more (all?) I/O code paths.
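Conceptually the probes now sit at the top of the common entry points,
roughly like this (a sketch, not the exact block/io.c hunk):
    int coroutine_fn bdrv_co_preadv(BdrvChild *child, int64_t offset,
                                    unsigned int bytes, QEMUIOVector *qiov,
                                    BdrvRequestFlags flags)
    {
        trace_bdrv_co_preadv(child->bs, offset, bytes, flags);
        /* ... existing request alignment and forwarding code ... */
    }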
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170804105036.11879-1-berrange@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
MIPS patches 2017-08-03
Changes:
KVM T&E segment support for TCG
malta: leave space for the bootmap after the initrd
Apply CP0.PageMask before writing into TLB entry
Fix fallout from indirect branch optimisation
# gpg: Signature made Thu 03 Aug 2017 15:32:59 BST
# gpg: using RSA key 0x2238EB86D5F797C2
# gpg: Good signature from "Yongbok Kim <yongbok.kim@imgtec.com>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg: It is not certain that the signature belongs to the owner.
# Primary key fingerprint: 8600 4CF5 3415 A5D9 4CFA 2B5C 2238 EB86 D5F7 97C2
* remotes/yongbok/tags/mips-20170803:
target/mips: Fix RDHWR CC with icount
target/mips: Drop redundant gen_io_start/stop()
target/mips: Use BS_EXCP where interrupts are expected
target-mips: apply CP0.PageMask before writing into TLB entry
mips: Add KVM T&E segment support for TCG
mips: Improve segment defs for KVM T&E guests
mips/malta: leave space for the bootmap after the initrd
target-mips: Don't stop on [d]mtc0 DESAVE/KScratch
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
virtio: fix for rc2
Looks like the constant stream of additions of vhost-user devices is a
problem for some people who are concerned about external connections
from qemu. A per-device flag seems like an overkill, but a single
configure flag seems like a sane way to support that, and it looks like
we need to do it before the release.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
# gpg: Signature made Thu 03 Aug 2017 13:57:57 BST
# gpg: using RSA key 0x281F0DB8D28D5469
# gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>"
# gpg: aka "Michael S. Tsirkin <mst@redhat.com>"
# Primary key fingerprint: 0270 606B 6F3C DF3D 0B17 0970 C350 3912 AFBE 8E67
# Subkey fingerprint: 5D09 FD08 71C8 F85B 94CA 8A0D 281F 0DB8 D28D 5469
* remotes/mst/tags/for_upstream:
build-sys: add --disable-vhost-user
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
For a 64-bit ILP32 host, aligning to sizeof(long) is not enough.
Guess the minimum for any host is 8, as that covers uint64_t.
Qemu doesn't use a host long double or host vectors, except in
extremely limited circumstances.
Fixes a bus error for a sparc v8plus host.
Signed-off-by: Richard Henderson <rth@twiddle.net>
Patch 85aa80813d changed the IF emitting the TST instruction,
but failed to change the ?: converting CMP to CMPEQ, so the
result of the TST is ignored.
Signed-off-by: Richard Henderson <rth@twiddle.net>
Learn to compile out vhost-user (net, scsi & upcoming users). Keep it
enabled by default on non-win32, that is assumed to be POSIX. Fail if
trying to enable it on win32.
When trying to make a vhost-user netdev, it gives the following error:
-netdev vhost-user,id=foo,chardev=chr-test: Parameter 'type' expects a netdev backend type
And similar error with the HMP/QMP monitors.
While at it, rename CONFIG_VHOST_NET_TEST to CONFIG_VHOST_USER_NET_TEST,
since it's a vhost-user-specific variable.
Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
While parsing the dhcp options string in 'dhcp_decode', if an option's
length 'len' appeared towards the end of the 'bp_vend' array, the ensuing
read could lead to an OOB memory access issue. Add a check to avoid it.
This is CVE-2017-11434.
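The added check is of this shape (a sketch; 'p' walks the options and
'p_end' bounds bp_vend, which is how slirp/bootp.c structures the loop):
    while (p < p_end) {
        tag = *p++;
        if (tag == RFC1533_PAD) {
            continue;
        }
        if (tag == RFC1533_END || p >= p_end) {
            break;
        }
        len = *p++;
        if (p + len > p_end) {    /* option claims more bytes than remain */
            break;
        }
        /* ... handle the option, then ... */
        p += len;
    }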
Reported-by: Reno Robert <renorobert@gmail.com>
Signed-off-by: Prasad J Pandit <pjp@fedoraproject.org>
Signed-off-by: Samuel Thibault <samuel.thibault@ens-lyon.org>
With "-netdev user,id=net0,dns=1.2.3.4"
error was:
qemu-system-i386: -netdev user,id=net0,dns=1.2.3.4: Device 'user' could not be initialized
Error is now:
qemu-system-i386: -netdev user,id=net0,dns=1.2.3.4: DNS doesn't belong to network
Signed-off-by: Hervé Poussineau <hpoussin@reactos.org>
Signed-off-by: Samuel Thibault <samuel.thibault@ens-lyon.org>
With pseries machine type a negative core-id is not managed properly:
-1 gives an inaccurate error message ("core -1 already populated"),
-2 crashes QEMU (core dump).
As a negative value seems to be invalid for any architecture,
instead of checking this in spapr_core_pre_plug() I think it's better
to check it in the generic part, core_prop_set_core_id().
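Sketch of the generic check (close to, but not necessarily identical to,
the final hw/cpu/core.c hunk):
    static void core_prop_set_core_id(Object *obj, Visitor *v, const char *name,
                                      void *opaque, Error **errp)
    {
        CPUCore *core = CPU_CORE(obj);
        Error *local_err = NULL;
        int64_t value;
        visit_type_int(v, name, &value, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        if (value < 0) {
            error_setg(errp, "Invalid core id %" PRId64, value);
            return;
        }
        core->core_id = value;
    }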
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Message-Id: <20170802103259.25940-1-lvivier@redhat.com>
Reviewed-by: Greg Kurz <groug@kaod.org>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
RDHWR CC reads the CPU timer like MFC0 CP0_Count, so with icount enabled
it must set can_do_io while it calls the helper to avoid the "Bad icount
read" error. It should also break out of the translation loop to ensure
that timer interrupts are immediately handled.
Fixes: 2e70f6efa8 ("Add instruction counter.")
Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Aurelien Jarno <aurelien@aurel32.net>
Cc: Yongbok Kim <yongbok.kim@imgtec.com>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Yongbok Kim <yongbok.kim@imgtec.com>
DMTC0 CP0_Cause does a redundant gen_io_start() and gen_io_end() pair,
even though this is done for all DMTC0 operations outside of the switch
statement. Remove these redundant calls.
Fixes: 5dc5d9f055 ("mips: more fixes to the MIPS interrupt glue logic")
Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Yongbok Kim <yongbok.kim@imgtec.com>
Cc: Aurelien Jarno <aurelien@aurel32.net>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Yongbok Kim <yongbok.kim@imgtec.com>
Commit e350d8ca3a ("target/mips: optimize indirect branches") made
indirect branches able to directly find the next TB and jump straight to
it without breaking out of translated code and going around the main
execution loop. This breaks the assumption in target/mips/translate.c
that BS_STOP is sufficient to cause pending interrupts to be handled,
since interrupts are only checked in the main loop.
Fix a few of these assumptions by using gen_save_pc to update the saved
PC and using BS_EXCP instead of BS_STOP:
- [D]MFC0 CP0_Count may trigger a timer interrupt which should be
immediately handled.
- [D]MTC0 CP0_Cause may trigger an interrupt (but in fact translation
was only even being stopped in the DMTC0 case).
- [D]MTC0 CP0_<any> when icount is used, since it is assumed it could
potentially cause interrupts.
- EI may trigger an interrupt which was pending. I specifically hit
this case when running KVM nested in mipsel-softmmu. A timer
interrupt while the 2nd guest was executing is caught by KVM which
switches back to the normal Linux exception base and re-enables
interrupts with EI. Since the above commit QEMU doesn't leave
translated code until the nested KVM has already restored the KVM
exception base and returned to the 2nd guest, at which point it is
too late to check for pending interrupts and it gets stuck in an
infinite loop of unhandled interrupts.
Something similar was needed for ARM in commit b29fd33db5
("target/arm: use DISAS_EXIT for eret handling").
Fixes: e350d8ca3a ("target/mips: optimize indirect branches")
Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Aurelien Jarno <aurelien@aurel32.net>
Cc: Yongbok Kim <yongbok.kim@imgtec.com>
Cc: Richard Henderson <rth@twiddle.net>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Yongbok Kim <yongbok.kim@imgtec.com>
MIPS KVM trap & emulate guest kernels have a different segment layout
compared with traditional MIPS kernels, to allow both the user and
kernel code to run from the user address segment without repeatedly
trapping to KVM.
QEMU currently supports this layout only for KVM, but it's sometimes
useful to be able to run these kernels in QEMU on a PC, so enable it for
TCG too.
This also paves the way for MIPS KVM VZ support (which uses the normal
virtual memory layout) by abstracting whether user mode kernel segments
are in use.
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Yongbok Kim <yongbok.kim@imgtec.com>
Cc: Aurelien Jarno <aurelien@aurel32.net>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: kvm@vger.kernel.org
Reviewed-by: Richard Henderson <rth@twiddle.net>
[Yongbok Kim:
minor change]
Signed-off-by: Yongbok Kim <yongbok.kim@imgtec.com>
Improve the segment definitions used by get_physical_address() to yield
target_ulong types, e.g. 0xffffffff80000000 instead of 0x80000000. This
is in preparation for enabling emulation of MIPS KVM T&E segments in TCG
MIPS targets, which unlike KVM could potentially have 64-bit
target_ulong. In such a case the offset guest KSEG0 address ends up at
e.g. 0x000000008xxxxxxx instead of 0xffffffff8xxxxxxx.
This also allows the casts to int32_t that force sign extension to be
removed, which removes any confusion due to relational comparison of
unsigned (target_ulong) and signed (int32_t) types.
Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Yongbok Kim <yongbok.kim@imgtec.com>
Cc: Aurelien Jarno <aurelien@aurel32.net>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: kvm@vger.kernel.org
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Yongbok Kim <yongbok.kim@imgtec.com>
Since commit 9768e2abf7 the initrd is loaded at the end of low
memory to avoid a clash with the kernel relocation when KASLR is used.
However this in turn conflicts with the bootmap memory that the kernel
tries to place after the initrd, but still in low memory.
whole usable physical address space. The machine can have at most 2GiB
of memory, 256MiB of low memory mapped at 0x00000000, and 1792MiB of
high memory mapped at 0x90000000. The biggest bootmap therefore
corresponds to the addresses 0x00000000 -> 0xffffffff, which at 1 bit
per 4KiB page corresponds to 128KiB in memory.
Therefore reserve 128KiB after the initrd.
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
Tested-by: Yongbok Kim <yongbok.kim@imgtec.com>
Signed-off-by: Yongbok Kim <yongbok.kim@imgtec.com>
Writing to the MIPS DESAVE register (and now the KScratch registers)
will stop translation, supposedly due to risk of execution mode
switches. However these registers are basically RW scratch registers
with no side effects so there is no risk of them triggering execution
mode changes.
Drop the bstate = BS_STOP for these registers for both mtc0 and dmtc0.
Fixes: 7a387fffce ("Add MIPS32R2 instructions, and generally straighten out the instruction decoding. This is also the first percent towards MIPS64 support.")
Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Aurelien Jarno <aurelien@aurel32.net>
Cc: Yongbok Kim <yongbok.kim@imgtec.com>
Reviewed-by: Yongbok Kim <yongbok.kim@imgtec.com>
Signed-off-by: Yongbok Kim <yongbok.kim@imgtec.com>
testchardev2 is not a valid chardev id here. Use testchardev1
instead which has been created with chardev-add right before
the 'chardev-send-break' line.
And while we're at it, add the test-hmp.c file to the MAINTAINERS
file, too.
Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Message-id: 1501149097-19071-1-git-send-email-thuth@redhat.com
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Migration pull 2017-08-02
Just minor fixes for 2.10
# gpg: Signature made Wed 02 Aug 2017 14:55:21 BST
# gpg: using RSA key 0x0516331EBC5BFDE7
# gpg: Good signature from "Dr. David Alan Gilbert (RH2) <dgilbert@redhat.com>"
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: 45F5 C71B 4A0C B7FB 977A 9FA9 0516 331E BC5B FDE7
* remotes/dgilbert/tags/pull-migration-20170802a:
io: fix qio_channel_socket_accept err handling
migration: fix comment disorder in RAMState
migration: fix small leaks
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
When accept fails, we should set up errp with the reason. More
importantly, the caller may assume errp is non-NULL when an error happens,
and not setting errp may crash QEMU.
At the same time, move the trace_qio_channel_socket_accept_fail() after
the if check on EINTR. Two reasons:
1. when EINTR happened, it's not really a fault (we should just try
again), so we should not log with an "accept failure".
2. trace_*() functions may overwrite errno, and then the old errno will be
missing. We need to either check errno before the trace_*() calls, or
preserve the errno.
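The error path therefore ends up looking roughly like this (a sketch of
the io/channel-socket.c flow, not the literal diff):
    retry:
        cioc->fd = qemu_accept(ioc->fd, (struct sockaddr *)&cioc->remoteAddr,
                               &cioc->remoteAddrLen);
        if (cioc->fd < 0) {
            if (errno == EINTR) {
                goto retry;                 /* not a failure, just try again */
            }
            error_setg_errno(errp, errno, "Unable to accept connection");
            trace_qio_channel_socket_accept_fail(ioc);
            goto error;
        }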
Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <1501666880-10159-3-git-send-email-peterx@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Spotted thanks to valgrind and tests/device-introspect-test:
==11711== 1 bytes in 1 blocks are definitely lost in loss record 6 of 14,537
==11711== at 0x4C2EB6B: malloc (vg_replace_malloc.c:299)
==11711== by 0x1E0CDBD8: g_malloc (gmem.c:94)
==11711== by 0x1E0E696E: g_strdup (gstrfuncs.c:363)
==11711== by 0x695693: migration_instance_init (migration.c:2226)
==11711== by 0x717C4B: object_init_with_type (object.c:344)
==11711== by 0x717E80: object_initialize_with_type (object.c:375)
==11711== by 0x7182EB: object_new_with_type (object.c:483)
==11711== by 0x718328: object_new (object.c:493)
==11711== by 0x4B8A29: qmp_device_list_properties (qmp.c:542)
==11711== by 0x4A9561: qmp_marshal_device_list_properties (qmp-marshal.c:1425)
==11711== by 0x819D4A: do_qmp_dispatch (qmp-dispatch.c:104)
==11711== by 0x819E82: qmp_dispatch (qmp-dispatch.c:131)
Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-Id: <20170801160419.14180-1-marcandre.lureau@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
pc, acpi, virtio: fixes, test speedup for rc1
Some fixes all over the place. Notably vhost-user gained a new message
to set endian-ness. Borderline for 2.10 but seems to be the only way to
fix legacy guests. Also pc tests are run on kvm now. Not a fix at all
but doesn't touch qemu itself, so I merged it since I had to run these a
lot and I just got tired of waiting for these to finish.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
# gpg: Signature made Tue 01 Aug 2017 22:36:47 BST
# gpg: using RSA key 0x281F0DB8D28D5469
# gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>"
# gpg: aka "Michael S. Tsirkin <mst@redhat.com>"
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: 0270 606B 6F3C DF3D 0B17 0970 C350 3912 AFBE 8E67
# Subkey fingerprint: 5D09 FD08 71C8 F85B 94CA 8A0D 281F 0DB8 D28D 5469
* remotes/mst/tags/for_upstream:
pc: acpi: force FADT rev1 for 440fx based machine types
pc: make 'pc.rom' readonly when machine has PCI enabled
vhost-user: fix watcher need be removed when vhost-user hotplug
tests/bios-tables-test: Compiler warning fix
accel: cleanup error output
intel_iommu: use access_flags for iotlb
intel_iommu: fix iova for pt
vhost-user: fix legacy cross-endian configurations
vhost: fix a memory leak
tests: switch pxe and vm gen id tests to use kvm
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
w2k used to boot on QEMU until the FADT revision was bumped to rev3
(commit 77af8a2b "hw/i386: Use Rev3 FADT (ACPI 2.0) instead of Rev1 to
improve guest OS support.").
Keep the PC machine at rev1 to remain compatible, and Q35 at rev3,
where w2k isn't supported anyway, so that OSX can run as well.
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Tested-by: John Arbuckle <programmingkidx@gmail.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Looking at the BIOS ROM mapping in QEMU, it seems that only isapc
(i.e. a machine without PCI) requires the ROM to be mapped as RW;
in all other cases the BIOS is mapped as RO. Do the same for the option
ROM 'pc.rom' when the machine has PCI enabled.
As a useful side effect, the pc.rom MemoryRegion stops being
put in the vhost memory map (it is filtered out by vhost_section()),
which reduces the number of entries by 1.
Coincidentally it fixes the migration failure reported in
"[PATCH V2] vhost: fix a migration failed because of vhost region merge",
where the following destination CLI, with /sys/module/vhost/parameters/max_mem_regions = 8,
export DIMMSCOUNT=6
QEMU -enable-kvm \
-netdev type=tap,id=guest0,vhost=on,script=no,vhostforce \
-device virtio-net-pci,netdev=guest0 \
-m 256,slots=256,maxmem=2G \
`i=0; while [ $i -lt $DIMMSCOUNT ]; do echo \
"-object memory-backend-ram,id=m$i,size=128M \
-device pc-dimm,id=d$i,memdev=m$i"; i=$(($i + 1)); \
done`
will fail to start up with the error:
"-device pc-dimm,id=d5,memdev=m5: a used vhost backend has no free memory slots left"
while it's possible to add the 6th DIMM via hotplug on the source.
The issue is caused by the fact that the number of entries in the vhost
map is bigger by 1 entry when -device is processed than after the guest
boots up, and that offending entry belongs to 'pc.rom'. Vhost has no
reason to do I/O in a ROM range, so making it RO hides the region from
vhost and makes the number of entries in the vhost memory map at
-device/machine_done time match the number of entries after the guest
boots.
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Reported-by: Peng Hao <peng.hao2@zte.com.cn>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
"nc" is freed after hotplug vhost-user, but the watcher is not removed.
The QEMU crash when the watcher access the "nc" when socket disconnects.
Program received signal SIGSEGV, Segmentation fault.
#0 object_get_class (obj=obj@entry=0x2) at qom/object.c:750
#1 0x00007f9bb4180da1 in qemu_chr_fe_disconnect (be=<optimized out>) at chardev/char-fe.c:372
#2 0x00007f9bb40d1100 in net_vhost_user_watch (chan=<optimized out>, cond=<optimized out>, opaque=<optimized out>) at net/vhost-user.c:188
#3 0x00007f9baf97f99a in g_main_context_dispatch () from /usr/lib64/libglib-2.0.so.0
#4 0x00007f9bb41d7ebc in glib_pollfds_poll () at util/main-loop.c:213
#5 os_host_main_loop_wait (timeout=<optimized out>) at util/main-loop.c:261
#6 main_loop_wait (nonblocking=nonblocking@entry=0) at util/main-loop.c:515
#7 0x00007f9bb3e266a7 in main_loop () at vl.c:1917
#8 main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at vl.c:4786
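The fix boils down to dropping the watch together with the netclient
(a sketch; 's' stands for the vhost-user per-netclient state):
    if (s->watch) {
        g_source_remove(s->watch);   /* stop the GSource from firing on a freed nc */
        s->watch = 0;
    }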
Signed-off-by: Yunjian Wang <wangyunjian@huawei.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
gcc 7.1.1 in fedora 26 moans about the:
tables = g_new0(uint32_t, tables_nr)
because it can't convince itself that tables_nr is positive.
This is fallout from g_assert_cmpint no longer necessarily being
no-return; replace it with a plain g_assert.
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Marcel Apfelbaum <marcel@redhat.com>
Only emit "XXX accelerator not found", if there are not
further accelerators listed. eg
accel=kvm:tcg
doesn't print a "KVM accelerator not found" warning
when it falls back to tcg, but a
accel=kvm
prints a warning, since no fallback is given.
Suggested-by: Daniel P. Berrange <berrange@redhat.com>
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Tested-by: Thomas Huth <thuth@redhat.com>
Read and write permissions were cached separately in the IOTLB entry.
Let's merge them into access_flags.
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
IOMMUTLBEntry.iova is returned incorrectly on one PT path (though we
mostly cannot really trigger this path, and even if we do, we are mostly
discarding this value, so it didn't break anything). Fix it by
converting VTD_PAGE_MASK into the correct definition
VTD_PAGE_MASK_4K, then remove VTD_PAGE_MASK.
Fixes: b93130 ("intel_iommu: cleanup vtd_{do_}iommu_translate()")
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Currently, vhost-user does not implement any means of notifying the
backend about guest endianness. This commit introduces a new message
called VHOST_USER_SET_VRING_ENDIAN which is analogous to the ioctl()
called VHOST_SET_VRING_ENDIAN used for kernel vhost backends. Such a
message is necessary for backends supporting legacy (pre-1.0) virtio
devices running in big-endian guests.
Signed-off-by: Felipe Franciosi <felipe@nutanix.com>
Signed-off-by: Mike Cui <cui@nutanix.com>
When skipping implicit nodes in bdrv_block_device_info(), we know that
bs0 is always non-NULL; initially, because it's taken from a BdrvChild
and a BdrvChild never has a NULL bs, and after the first iteration
because implicit nodes always have a backing file.
Remove the NULL check and add an assertion that the implicit node does
indeed have a backing file.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
qemu-iotests 059 left a whole lot of image files behind in the scratch
directory because VMDK creates additional files for extents, and cleaning
them up requires the original image to be intact (the cleanup parses
qemu-img info output to find all extent files), but the test overwrote
the image many times, as it does for all other image formats.
In addition, _use_sample_img overwrites the TEST_IMG variable, causing
new images created afterwards to reuse the name of the sample file
rather than the usual t.IMGFMT.
This patch adds an intermediate _cleanup_test_img after each subtest
that created an image file with additional extent files, and also after
each use of a sample image. _cleanup_test_img is also changed so that it
resets TEST_IMG after a sample image is cleaned up.
Note that this test was failing before this commit and continues to do
so after it. This failure was introduced in commit 9877860 ('block/vmdk:
Report failures in vmdk_read_cid()') and needs to be dealt with
separately.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
qemu-iotests 063 left t.raw.raw1 behind in the scratch directory because
it used the wrong suffix. Make sure to clean it up after completing the
test.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
qemu-iotests 162 left qemu-nbd.pid behind in the scratch directory, and
potentially a file called '42' in the current directory. Make sure to
clean it up after completing the tests.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
qemu-iotests 153 left t.qcow2.c behind in the scratch directory. Make
sure to clean it up after completing the tests.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
qemu-iotests 141 attempted to use brace expansion to remove all images
with a single command. However, for this to work, the braces shouldn't
be quoted.
With this fix, the tests correctly cleans up its scratch images.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
qemu-iotests 074 and 179 left a blkdebug.conf behind in the scratch
directory. Make sure to clean up after completing the tests.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
qemu-iotests 041 left quorum_snapshot.img and target.img behind in the
scratch directory. Make sure to clean up after completing the tests.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
bdrv_open_driver() is called in two places, bdrv_new_open_driver() and
bdrv_open_common(). In the latter, failure cleanup is in its caller,
bdrv_open_inherit(), which unrefs the bs->file of the failed driver open
if it exists.
Let's move the bs->file cleanup to bdrv_open_driver() to take care of
all callers and do not set bs->drv to NULL unless the driver's open
function failed. When bs is destroyed by removing its last reference, it
calls bdrv_close() which checks bs->drv to perform the needed cleanups
and also call the driver's close function. Since it cleans up options
and opaque, we must take care not to leave dangling pointers.
The error paths in bdrv_open_driver() are now two:
If open fails, drv->bdrv_close() should not be called. Unref the child
if it exists, free what we allocated and set bs->drv to NULL. Return the
error and let callers free their stuff.
If open succeeds but we fail after, return the error and let callers
unref and delete their bs, while cleaning up their allocations.
Signed-off-by: Manos Pitsidianakis <el13635@mail.ntua.gr>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
In some error paths it is possible to QDECREF a freed dangling
explicit_options, resulting in a heap overflow crash. For example
bdrv_open_inherit()'s fail path unrefs it, then calls bdrv_unref which calls
bdrv_close which also unrefs it.
Signed-off-by: Manos Pitsidianakis <el13635@mail.ntua.gr>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
The new test 190 ensures we don't regress back to an infinite loop when
measuring the size of a 2T+ qcow2 image. I did not append to test 178,
because that test is also designed to run with format 'raw'; also, this
gives us some coverage of the measure command under the quick group.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
We had a bug for multiple releases where dirty-bitmap count was
documented in bytes but reported in sectors; enhance the testsuite
to add coverage of DirtyBitmapInfo to ensure we do not regress again.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Without redirecting qemu's stderr to stdout, _filter_qemu will not apply
to warnings. This results in $QEMU_PROG not being replaced by QEMU_PROG
which is not great if your qemu executable is not called
qemu-system-x86_64 (e.g. qemu-system-i386).
Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
On one hand, the _make_test_img invocation for creating the target image
was missing a -u because its backing file is not supposed to exist at
that point.
On the other hand, nobody noticed probably because the backing file is
created later on and _cleanup failed to remove it: The quotation marks
were misplaced so bash tried to delete a file literally called
"$TEST_IMG{,.target}..." instead of performing brace expansion. Thus, the
files stayed around after the first run and qemu-img create did not
complain about a missing backing file on any run but the first.
Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
In some cases, the guest can observe the wrong ordering of UIP and
interrupts. This can happen if the VCPU exit is timed like this:
            iothread                      VCPU
            ... wait for interrupt ...
  t-100ns                                 read register A
  t         wake up, take BQL
  t+100ns                                 update_in_progress
                                              return false
                                          return UIP=0
            trigger interrupt
The interrupt is late; the VCPU expected the falling edge of UIP to
happen after the interrupt. update_in_progress is already trying to
cover this case by latching UIP if the timer is going to fire soon,
and the fix is documented in the commit message for commit 56038ef623
("RTC: Update the RTC clock only when reading it", 2012-09-10). It
cannot be tested with qtest, because its timing of interrupts vs. reads
is exact.
However, the implementation was incorrect because cmos_ioport_read
cleared UIP in register A instead of leaving that to rtc_update_timer.
Fixing the implementation of cmos_ioport_read to match the commit message,
however, breaks the "uip-stuck" test case from the previous patch.
To fix it, skip update timer optimizations if UIP has been latched.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
qemu_savevm_state_cleanup() takes about 300ms in my RAM migration tests
with an 8U24G VM (20G actually occupied); the main cost comes from the
KVM_SET_USER_MEMORY_REGION ioctl when mem.memory_size = 0 in
kvm_set_user_memory_region. In kmod, the main cost is
kvm_zap_obsolete_pages, which traverses the active_mmu_pages list to
zap the unsync sptes.
It can be optimized by delaying memory_global_dirty_log_stop to the next
vm_start.
Changes v2->v3:
- NULL VMChangeStateHandler if it is deleted and protect the scenario
of nested invocations of memory_global_dirty_log_start/stop [Paolo]
Changes v1->v2:
- create a VMChangeStateHandler in memory.c to reduce the coupling [Paolo]
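A condensed sketch of the idea in memory.c (names follow the description
above but are not guaranteed to match the final patch exactly):
    static VMChangeStateEntry *vmstate_change;
    static void memory_vm_change_state_handler(void *opaque, int running,
                                               RunState state)
    {
        if (running) {
            memory_global_dirty_log_do_stop();   /* the expensive part */
            qemu_del_vm_change_state_handler(vmstate_change);
            vmstate_change = NULL;
        }
    }
    void memory_global_dirty_log_stop(void)
    {
        if (!runstate_is_running()) {
            if (!vmstate_change) {               /* defer to the next vm_start */
                vmstate_change = qemu_add_vm_change_state_handler(
                    memory_vm_change_state_handler, NULL);
            }
            return;
        }
        memory_global_dirty_log_do_stop();
    }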
Signed-off-by: Jay Zhou <jianjay.zhou@huawei.com>
Message-Id: <1501237733-2736-1-git-send-email-jianjay.zhou@huawei.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Clang static analyzer reports a memory leak. Actually, the allocated
memory escapes here:
record->attribute_list[record->attributes].pair = data;
but clang is correct that the memory might leak if len is zero. We
know it isn't; assert that it is the case.
The craziness doesn't end there. The memory is freed by
bt_l2cap_sdp_close_ch:
g_free(sdp->service_list[i].attribute_list->pair);
which actually should have been written like this:
g_free(sdp->service_list[i].attribute_list[0].pair);
The attribute_list is sorted with qsort; but indeed the first
entry of attribute_list should point to "data" even after the qsort,
because the first record has id SDP_ATTR_RECORD_HANDLE, whose
numeric value is zero.
But hang on. The qsort function is
static int sdp_attributeid_compare(
        const struct sdp_service_attribute_s *a,
        const struct sdp_service_attribute_s *b)
{
    return (int) b->attribute_id - a->attribute_id;
}
but no one ever writes attribute_id. So it only works if qsort is
stable, and who knows what else is broken, but we can fix it by
setting attribute_id in the while loop.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Commit 04bf2526ce (exec: use
qemu_ram_ptr_length to access guest ram) started using qemu_ram_ptr_length
instead of qemu_map_ram_ptr, but when used with Xen, the behavior of
the two functions is different. They both call xen_map_cache, but one with
"lock", meaning the mapping of guest memory is never released
implicitly, and the second one without, meaning the mapping can be
released later, when needed.
In the context of address_space_{read,write}_continue, the pointer to those
mappings should not be locked because it is used immediately and never
used again.
The lock parameter makes it explicit in which context qemu_ram_ptr_length
is called.
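At the call sites this simply reads (a sketch):
    /* address_space_{read,write}_continue: the mapping is used once, right
     * here, so do not take a persistent lock on the Xen map cache entry */
    ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
    memcpy(ptr, buf, l);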
Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
Message-Id: <20170726165326.10327-1-anthony.perard@citrix.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
QEMU calls kvm_get_vcpu_events, and the kernel always returns
sipi_vector as 0, which is never valid when reporting to user space.
But when QEMU then calls kvm_put_vcpu_events, it will set sipi_vector
in the kernel to 0. This will accidentally modify sipi_vector when the
value in the kernel is not 0.
Signed-off-by: Peng Hao <peng.hao2@zte.com.cn>
Reviewed-by: Liu Yi <liu.yi24@zte.com.cn>
Message-Id: <1500047256-8911-1-git-send-email-peng.hao2@zte.com.cn>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
The deprecation of features in QEMU is totally ad hoc currently,
with no way for the user to get a list of what is deprecated
in each release. This adds an appendix to the doc that records
when each deprecation was made and provides text explaining
what to use instead, if anything.
Since there has been no formal policy around removal of deprecated
features in the past, any deprecations prior to 2.10.0 are to be
treated as if they had been made at the 2.10.0 release. Thus the
earliest that existing deprecations will be deleted is the start
of the 2.12.0 cycle.
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-Id: <20170725113638.7019-1-berrange@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Only emit "XXX accelerator not found", if there are not
further accelerators listed. eg
accel=kvm:tcg
doesn't print a "KVM accelerator not found" warning
when it falls back to tcg, but a
accel=kvm
prints a warning, since no fallback is given.
Suggested-by: Daniel P. Berrange <berrange@redhat.com>
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Message-Id: <20170717144527.24534-1-lvivier@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
There's a rare segfault on exit if the guest is accessing
I/O during exit.
It's always hitting the atomic_inc(&bs->in_flight) with a NULL
bs. This call was added recently in 99723548, but I don't see it
as the cause.
Flip vl.c around so we pause the cpus before closing the block devices,
that way we shouldn't have anything trying to access them when
they're gone.
This was originally Red Hat bz https://bugzilla.redhat.com/show_bug.cgi?id=1451015
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reported-by: Cong Li <coli@redhat.com>
--
This is a very rare race, I'll leave it running in a loop to see if
we hit anything else and to check this really fixes it.
I do worry if there are other cases that can trigger this - e.g.
hot-unplug or ejecting a CD.
Message-Id: <20170713190116.21608-1-dgilbert@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
We are malloc'ing a QString and spending CPU cycles on converting a
QObject to string, just for the sake of sticking the string in the trace
message. Wasted when we aren't tracing. Avoid that.
[Commit message and description suggested by Markus Armbruster to
provide more detail about the rationale for this patch.
Use trace_event_get_state_backends() instead of trace_event_get_state()
to honor DTrace/UST backend dstates.
--Stefan]
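The guarded version looks roughly like this (a sketch, using the
monitor's QMP command path as the example):
    if (trace_event_get_state_backends(TRACE_HANDLE_QMP_COMMAND)) {
        QString *req_json = qobject_to_json(req);   /* only pay for it when tracing */
        trace_handle_qmp_command(mon, qstring_get_str(req_json));
        QDECREF(req_json);
    }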
Signed-off-by: Denis V. Lunev <den@openvz.org>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20170725143923.11241-1-den@openvz.org
CC: Stefan Hajnoczi <stefanha@redhat.com>
CC: Lluís Vilanova <vilanova@ac.upc.edu>
CC: Dr. David Alan Gilbert <dgilbert@redhat.com>
CC: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
The only exceptions are groups of numbers separated by the symbols
'.', ' ', ':', '/', like 'ab.09.7d'.
This patch is made by the following:
> find . -name trace-events | xargs python script.py
where script.py is the following python script:
=========================
#!/usr/bin/env python
import sys
import re
import fileinput
rhex = '%[-+ *.0-9]*(?:[hljztL]|ll|hh)?(?:x|X|"\s*PRI[xX][^"]*"?)'
rgroup = re.compile('((?:' + rhex + '[.:/ ])+' + rhex + ')')
rbad = re.compile('(?<!0x)' + rhex)
files = sys.argv[1:]
for fname in files:
    for line in fileinput.input(fname, inplace=True):
        arr = re.split(rgroup, line)
        for i in range(0, len(arr), 2):
            arr[i] = re.sub(rbad, '0x\g<0>', arr[i])
        sys.stdout.write(''.join(arr))
=========================
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Acked-by: Cornelia Huck <cohuck@redhat.com>
Message-id: 20170731160135.12101-5-vsementsov@virtuozzo.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
In trace format '#' flag of printf is forbidden. Fix it to '0x%'.
This patch is created by the following:
check that we have a problem
> find . -name trace-events | xargs grep '%#' | wc -l
56
check that there are no cases with additional printf flags before '#'
> find . -name trace-events | xargs grep "%[-+ 0'I]+#" | wc -l
0
check that there are no wrong usage of '#' and '0x' together
> find . -name trace-events | xargs grep '0x%#' | wc -l
0
fix the problem
> find . -name trace-events | xargs sed -i 's/%#/0x%/g'
[Eric Blake noted that xargs grep '%[-+ 0'I]+#' should be xargs grep
"%[-+ 0'I]+#" instead so the shell quoting is correct.
--Stefan]
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20170731160135.12101-3-vsementsov@virtuozzo.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Code that checks dstate is unaware of SystemTap and LTTng UST dstate, so
the following trace event will not fire when solely enabled by SystemTap
or LTTng UST:
    if (trace_event_get_state(TRACE_MY_EVENT)) {
        str = g_strdup_printf("Expensive string to generate ...",
                              ...);
        trace_my_event(str);
        g_free(str);
    }
Add trace_event_get_state_backends() to fetch backend dstate. Those
backends that use QEMU dstate fetch it as part of
generate_h_backend_dstate().
Update existing trace_event_get_state() callers to use
trace_event_get_state_backends() instead.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20170731140718.22010-3-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
QEMU keeps track of trace event enabled/disabled state and provides
monitor commands to inspect and modify the "dstate". SystemTap and
LTTng UST maintain independent enabled/disabled states for each trace
event, the other backends rely on QEMU dstate.
Introduce a new per-event macro that combines backend-specific dstate
like this:
    #define TRACE_MY_EVENT_BACKEND_DSTATE() ( \
        QEMU_MY_EVENT_ENABLED() || /* SystemTap */ \
        tracepoint_enabled(qemu, my_event) /* LTTng UST */ || \
        false)
This will be used to extend trace_event_get_state() in the next patch.
[Daniel Berrange pointed out that QEMU_MY_EVENT_ENABLED() must be true
by default, not false. This way events will fire even if the DTrace
implementation does not implement the SystemTap semaphores feature.
Ubuntu Precise uses lttng-ust-dev 2.0.2 which does not have
tracepoint_enabled(), so we need a compatibility wrapper to keep Travis
builds passing.
--Stefan]
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20170731140718.22010-2-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
fixup! trace: add TRACE_<event>_BACKEND_DSTATE()
The simpletrace compatibility code for systemtap creates a
function and some global variables for mapping to event ID
numbers. We generate multiple -simpletrace.stp files though,
one per target and systemtap considers functions & variables
to be globally scoped, not per file. So if trying to use the
simpletrace compat probes, systemtap will complain:
# stap -e 'probe qemu.system.arm.simpletrace.visit_type_str { print( "hello")}'
semantic error: conflicting global variables: identifier 'event_name_to_id_map' at /usr/share/systemtap/tapset/qemu-aarch64-simpletrace.stp:3:8
source: global event_name_to_id_map
^
identifier 'event_name_to_id_map' at /usr/share/systemtap/tapset/qemu-system-arm-simpletrace.stp:3:8
source: global event_name_to_id_map
^
WARNING: cross-file global variable reference to identifier 'event_name_to_id_map' at /usr/share/systemtap/tapset/qemu-system-arm-simpletrace.stp:3:8 from: identifier 'event_name_to_id_map' at /usr/share/systemtap/tapset/qemu-aarch64-simpletrace.stp:8:21
source: if (!([name] in event_name_to_id_map)) {
^
WARNING: cross-file global variable reference to identifier 'event_next_id' at /usr/share/systemtap/tapset/qemu-system-arm-simpletrace.stp:4:8 from: identifier 'event_next_id' at :9:38
source: event_name_to_id_map[name] = event_next_id
^
We already have a string used to prefix probe names, so just
replace '.' with '_' to get a function / variable name prefix
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170728133657.5525-1-berrange@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
target-arm queue:
* fix broken properties on MPS2 SCC device
* fix MPU trace handling of write vs exec
* fix MPU M profile bugs:
- not handling system space or PPB region correctly
- not resetting state
- not migrating MPU_RNR
# gpg: Signature made Mon 31 Jul 2017 13:21:40 BST
# gpg: using RSA key 0x3C2525ED14360CDE
# gpg: Good signature from "Peter Maydell <peter.maydell@linaro.org>"
# gpg: aka "Peter Maydell <pmaydell@gmail.com>"
# gpg: aka "Peter Maydell <pmaydell@chiark.greenend.org.uk>"
# Primary key fingerprint: E1A5 C593 CD41 9DE2 8E83 15CF 3C25 25ED 1436 0CDE
* remotes/pmaydell/tags/pull-target-arm-20170731:
hw/mps2_scc: fix incorrect properties
target/arm: Migrate MPU_RNR register state for M profile cores
target/arm: Move PMSAv7 reset into arm_cpu_reset() so M profile MPUs get reset
target/arm: Rename cp15.c6_rgnr to pmsav7.rnr
target/arm: Don't allow guest to make System space executable for M profile
target/arm: Don't do MPU lookups for addresses in M profile PPB region
target/arm: Correct MPU trace handling of write vs execute
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This reverts commit bc658e4a2e.
Some versions of gcc warn about this:
linux-user/syscall.c: In function ‘do_ioctl_rt’:
linux-user/syscall.c:5577:37: error: ‘host_rt_dev_ptr’ may be used uninitialized in this function [-Werror=uninitialized]
and in particular the Travis builds fail; they use
gcc (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3.
Revert the change to fix the travis builds.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
The PMSAv7 region number register is migrated for R profile
cores using the cpreg scheme, but M profile doesn't use
cpregs, and so we weren't migrating the MPU_RNR register state
at all. Fix that by adding a migration subsection for the
M profile case.
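The subsection follows the usual VMState pattern, roughly (a sketch;
names and field spellings may differ slightly from the final patch):
    static bool pmsav7_rnr_needed(void *opaque)
    {
        ARMCPU *cpu = opaque;
        return arm_feature(&cpu->env, ARM_FEATURE_M);
    }
    static const VMStateDescription vmstate_pmsav7_rnr = {
        .name = "cpu/pmsav7-rnr",
        .version_id = 1,
        .minimum_version_id = 1,
        .needed = pmsav7_rnr_needed,
        .fields = (VMStateField[]) {
            VMSTATE_UINT32(env.pmsav7.rnr, ARMCPU),
            VMSTATE_END_OF_LIST()
        }
    };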
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-id: 1501153150-19984-6-git-send-email-peter.maydell@linaro.org
When the PMSAv7 implementation was originally added it was for R profile
CPUs only, and reset was handled using the cpreg .resetfn hooks.
Unfortunately for M profile cores this doesn't work, because they do
not register any cpregs. Move the reset handling into arm_cpu_reset(),
where it will work for both R profile and M profile cores.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-id: 1501153150-19984-5-git-send-email-peter.maydell@linaro.org
Almost all of the PMSAv7 state is in the pmsav7 substruct of
the ARM CPU state structure. The exception is the region
number register, which is in cp15.c6_rgnr. This exception
is a bit odd for M profile, which otherwise generally does
not store state in the cp15 substruct.
Rename cp15.c6_rgnr to pmsav7.rnr accordingly.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-id: 1501153150-19984-4-git-send-email-peter.maydell@linaro.org
The M profile PMSAv7 specification says that if the address being looked
up is in the PPB region (0xe0000000 - 0xe00fffff) then we do not use
the MPU regions but always use the default memory map. Implement this
(we were previously behaving like an R profile PMSAv7, which does not
special case this).
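The check itself is small; a sketch, not the exact get_phys_addr_pmsav7()
hunk ('use_default_memory_map' is a placeholder for however the lookup
is short-circuited):
    /* 0xe0000000..0xe00fffff is the M profile PPB: always use the default map */
    if (arm_feature(env, ARM_FEATURE_M) &&
        extract32(address, 20, 12) == 0xe00) {
        use_default_memory_map = true;   /* skip the MPU region scan entirely */
    }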
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-id: 1501153150-19984-2-git-send-email-peter.maydell@linaro.org
Correct an off-by-one bug in the PMSAv7 MPU tracing where it would print
a write access as "reading", an insn fetch as "writing", and a read
access as "execute".
Since we have an MMUAccessType enum now, we can make the code clearer
in the process by using that rather than the raw 0/1/2 values.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-id: 1500906792-18010-1-git-send-email-peter.maydell@linaro.org
When this file was rewritten/renamed in fdee2025dd,
a reference path was not updated.
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
With the move of some docs/ to docs/devel/ on ac06724a71,
a reference path was not updated.
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
With the move of some docs/ to docs/devel/ on ac06724a71,
no references were updated.
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
With the move of some docs/ to docs/devel/ on ac06724a71,
a couple of references were not updated.
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
With the move of some docs to docs/interop on ac06724a71,
a couple of references were not updated.
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
With the move of some docs to docs/interop on d59157ea05,
a reference path was not updated.
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
With the move of some docs to docs/interop on d59157e, a couple of
references were not updated.
Signed-off-by: Cleber Rosa <crosa@redhat.com>
[PMD: fixed a typo and another reference of docs/interop/qmp-spec.txt]
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
thunk.c:91:32: warning: Call to 'malloc' has an allocation size of 0 bytes
se->field_offsets[i] = malloc(nb_fields * sizeof(int));
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Reported-by: Clang Static Analyzer
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
linux-user/syscall.c:1627:35: warning: 1st function call argument is an uninitialized value
target_saddr->sa_family = tswap16(addr->sa_family);
^~~~~~~~~~~~~~~~~~~~~~~~
linux-user/syscall.c:1629:25: warning: The left operand of '==' is a garbage value
if (addr->sa_family == AF_NETLINK && len >= sizeof(struct sockaddr_nl)) {
~~~~~~~~~~~~~~~ ^
Reported-by: Clang Static Analyzer
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Laurent Vivier <laurent@vivier.eu>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
not hit since 2009! :)
linux-user/elfload.c:1102:20: warning: Out of bound memory access (access exceeds upper limit of memory block)
(*regs[i]) = tswap32(env->gregs[i]);
~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~
Reported-by: Clang Static Analyzer
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Laurent Vivier <laurent@vivier.eu>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
So we have sizeof(struct in6_address) != sizeof(uintptr_t)
and Clang > Coverity on this, see 4555ca6816 :)
net/eth.c:426:30: warning: The code calls sizeof() on a pointer type. This can produce an unexpected result
return bytes_read == sizeof(dst_addr);
^ ~~~~~~~~~~
net/eth.c:475:34: warning: The code calls sizeof() on a pointer type. This can produce an unexpected result
return bytes_read == sizeof(src_addr);
^ ~~~~~~~~~~
Reported-by: Clang Static Analyzer
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Dmitry Fleytman <dmitry@daynix.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
Extract the (correct) cleanup code into a new function vnc_free_addresses(),
then use it to remove the memory leaks.
Reported-by: Clang Static Analyzer
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
Screwed up in commit 3a55fc0f, v2.6.0.
If qemu_chr_fe_read_all() returns -EINTR, the do {} loop continues and the
n accumulator used to complete reads up to sizeof(msg) is decremented by 4 (the
value of EINTR on Linux).
To avoid that, use simpler if() statements and continue if EINTR occurred.
hw/misc/ivshmem.c:650:14: warning: Loss of sign in implicit conversion
} while (n < sizeof(msg));
^
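For illustration only (not the actual ivshmem code), a minimal
self-contained sketch of the accumulation pattern described above,
assuming a POSIX read()-style call that sets errno: an interrupted read
is retried instead of having the error folded into the byte count:

    #include <errno.h>
    #include <stddef.h>
    #include <unistd.h>

    /* Read exactly len bytes from fd, retrying on EINTR. */
    static int read_all(int fd, void *buf, size_t len)
    {
        size_t n = 0;

        while (n < len) {
            ssize_t ret = read(fd, (char *)buf + n, len - n);
            if (ret < 0) {
                if (errno == EINTR) {
                    continue;   /* retry; do NOT add the error to n */
                }
                return -1;      /* real error */
            }
            if (ret == 0) {
                return -1;      /* EOF before the full message arrived */
            }
            n += ret;
        }
        return 0;
    }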
Reported-by: Clang Static Analyzer
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
since a negative value means it errored.
hw/core/loader.c:149:9: warning: Loss of sign in implicit conversion
if (size > max_sz) {
^~~~
hw/core/loader.c:171:9: warning: Loss of sign in implicit conversion
if (size > memory_region_size(mr)) {
^~~~
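A minimal sketch of the check pattern implied above (helper and variable
names are hypothetical, not the actual loader.c code): test the signed
error case before comparing against the unsigned limit:

    #include <stdint.h>
    #include <sys/types.h>

    /* A negative size means the lookup errored; check that first. */
    static int check_image_size(ssize_t size, uint64_t max_sz)
    {
        if (size < 0) {
            return -1;              /* size lookup failed */
        }
        if ((uint64_t)size > max_sz) {
            return -1;              /* image too large for the region */
        }
        return 0;
    }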
Reported-by: Clang Static Analyzer
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Alistair Francis <alistair.francis@xilinx.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
This allows a one-liner from a fresh repository clone, e.g.:
./configure && make -j check-qtest-aarch64
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
Starting QEMU with "qemu-system-tricore -nographic -M tricore_testboard -S"
and entering "x 0" at the monitor prompt leads to a segmentation fault.
This happens because tricore_cpu_get_phys_page_debug() is not implemented
yet; this patch is a temporary workaround to avoid the crash.
Signed-off-by: Eduardo Otubo <otubo@redhat.com>
Tested-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
If slirp is disabled, it will fail with:
qemu-system-x86_64: -netdev user,id=qtest-bn0: Parameter 'type' expects a netdev backend type
Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
Currently get_maintainer.pl claims that the configure script is
maintained by Kamil:
$ scripts/get_maintainer.pl -f configure
Kamil Rytarowski <kamil@netbsd.org> (maintainer:NETBSD)
qemu-devel@nongnu.org (open list:All patches CC here)
This happens because the regex pattern for the NETBSD entry triggers
on everything that contains the keyword "NetBSD". Ease the situation
a little bit by restricting the match to "Subject:" lines only, as is
already done in the "trivial patches" section.
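For illustration, a keyword entry restricted to the subject line looks
along these lines (the exact regex used in MAINTAINERS may differ):

    K: ^Subject:.*(?i)NetBSD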
Reported-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
Starting qemu-system-unicore32 without the -kernel parameter results in
an assert() failing and aborting QEMU. This patch replaces the assert()
with a proper error message followed by exit(1).
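A minimal sketch of the replacement described above, using QEMU's
error_report() (the exact message and variable name are illustrative):

    if (!args->kernel_filename) {
        error_report("kernel parameter cannot be empty");
        exit(1);
    }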
Signed-off-by: Eduardo Otubo <otubo@redhat.com>
Tested-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
user_creatable_add_opts() returns a reference (the other reference is
for the root parent/child link).
Leak introduced in commit a1af255f06.
Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
This reverts commit b87680427e.
I thought this was a harmless preliminary for XIVE enablement patches
we expect later on. However, due to some subtle interactions between
qemu and SLOF (guest firmware) this breaks some things. Revert it for
now, we'll work out how to fix it when the rest of the XIVE patches
are ready.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
If object_property_add_alias() returns an error in realize(), we should
propagate it to the caller and certainly not unref the DRC.
Same thing goes for unrealize(). Since object_property_del() is the last
call, we can even get rid of the intermediate Error *.
And finally, unrealize() should undo all registrations performed by
realize().
Signed-off-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
MIPS patches 2017-07-28
Changes:
* Improve the MIPS board kernel load error reporting
* Revert unnecessary warning messages
# gpg: Signature made Fri 28 Jul 2017 13:47:52 BST
# gpg: using RSA key 0x2238EB86D5F797C2
# gpg: Good signature from "Yongbok Kim <yongbok.kim@imgtec.com>"
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: 8600 4CF5 3415 A5D9 4CFA 2B5C 2238 EB86 D5F7 97C2
* remotes/yongbok/tags/mips-20170728:
Revert "elf-loader: warn about invalid endianness"
hw/mips: load_elf_strerror to report kernel loading failure
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This reverts c8e1158cf6 "elf-loader: warn about invalid endianness"
as it produces a useless message every time an LE kernel image is
passed via -kernel on a ppc64-pseries machine. The pseries machine
already checks for ELF_LOAD_WRONG_ENDIAN and tries with big_endian=0.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Yongbok Kim <yongbok.kim@imgtec.com>
Emulated MIPS boards bail out with a simple "could not load kernel" when
a kernel could not be loaded, without specifying the underlying reason.
Fix that by calling load_elf_strerror().
At the same time, use error_report() to report the error instead of
fprintf().
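For illustration, the reporting pattern described above looks roughly like
this (variable names are placeholders; load_elf_strerror() and
error_report() are the real QEMU helpers):

    if (kernel_size < 0) {
        error_report("could not load kernel '%s': %s",
                     loaderparams.kernel_filename,
                     load_elf_strerror(kernel_size));
        exit(1);
    }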
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Tested-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Yongbok Kim <yongbok.kim@imgtec.com>
The SPICE input code is currently detecting 0xe1 0x1d 0x45 as
the PAUSE key make sequence and 0xe1 0x9d 0xc5 as the break
sequence. This is incorrect, because all 6 scancodes together
are the make sequence, and there is no break sequence.
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170727174640.30359-1-berrange@redhat.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
According to the PoP, bit positions 0-3 and 8-32 of the format-1 CCW must
contain zeros. Bits 0-3 are already covered by cmd_code validity
checking, and bit 32 is covered by the CCW address checking.
Bits 8-31 correspond to CCW1.flags and CCW1.count. Currently we only
check for the absence of certain flags. Let's fix this.
Signed-off-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Message-Id: <20170725224442.13383-3-pasic@linux.vnet.ibm.com>
Reviewed-by: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
[CH: tweaked comment]
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
According to the PoP, channel command words (CCWs) must be doubleword
aligned and 31-bit addressable for format-1 and 24-bit addressable for
format-0 CCWs.
If the channel subsystem encounters a CCW address which does not satisfy
this alignment requirement, a program-check condition is recognized.
The situation with 31-bit addressability is a bit more complicated: both the
ORB and a format-1 CCW TIC hold the address of (the rest of) the channel
program, that is the address of the next CCW in a word, and the PoP
mandates that bit 0 of that word shall be zero -- or a program-check
condition is to be recognized -- and does not belong to the field holding
the CCW address.
Since in code the corresponding fields span the whole word (unlike
in the PoP, where these are defined as 31 bits wide), we can check this by
applying a mask. The 24-bit addressable case does not affect TIC because the
address is composed of a halfword and a byte portion (no additional zero
bit requirements) and only slightly complicates the ORB case, where
bits 1-7 also need to be zero.
The same requirements (especially n-bit addressability) apply to the
ccw addresses generated while chaining.
Let's make our CSS implementation follow the AR more closely.
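For illustration only, the format-1 check described above can be expressed
as a single mask test (a sketch, not necessarily the committed constant):
doubleword alignment means the low three bits are zero, and 31-bit
addressability means bit 0 of the word is zero:

    #include <stdbool.h>
    #include <stdint.h>

    static bool ccw1_addr_valid(uint32_t cda)
    {
        return (cda & 0x80000007) == 0;   /* else: program-check condition */
    }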
Signed-off-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Message-Id: <20170727154842.23427-1-pasic@linux.vnet.ibm.com>
Reviewed-by: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
The processing of the scancodes for PAUSE/BREAK has been broken since
the conversion to qcodes in:
commit 8c10e0baf0
Author: Hervé Poussineau <hpoussin@reactos.org>
Date: Thu Sep 15 22:06:26 2016 +0200
ps2: use QEMU qcodes instead of scancodes
When using a VNC client, with the raw scancode extension, the client
will send a scancode of 0xc6 for both PAUSE and BREAK. There is mistakenly
no entry in the qcode_to_number table for this scancode, so
ps2_keyboard_event() just generates a log message and discards the
scancode.
When using a SPICE client, it will also send 0xc6 for BREAK, but
will send 0xe1 0x1d 0x45 0xe1 0x9d 0xc5 for PAUSE. There is no
entry in the qcode_to_number table for the scancode 0xe1 because
it is a special XT keyboard prefix not mapping to any QKeyCode.
Again ps2_keyboard_event() just generates a log message and discards
the scancode. The following 0x1d, 0x45, 0x9d, 0xc5 scancodes get
handled correctly. Rather than trying to handle 3 byte sequences
of scancodes in the PS/2 driver, special case the SPICE input
code so that it captures the 3 byte pause sequence and turns it
into a Pause QKeyCode.
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170727113243.23991-1-berrange@redhat.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Page-up and Page-down were renamed. Add the names to the keysym list
so we can parse both old and new names. The keypad versions are already
present in the vnc map.
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20170726152918.11995-2-kraxel@redhat.com
x86 bug fix for -rc1
Fix for a bug in "-cpu max" that breaks libvirt usage of
query-cpu-model-expansion.
# gpg: Signature made Wed 26 Jul 2017 19:35:28 BST
# gpg: using RSA key 0x2807936F984DC5A6
# gpg: Good signature from "Eduardo Habkost <ehabkost@redhat.com>"
# Primary key fingerprint: 5A32 2FD5 ABC4 D3DB ACCF D1AA 2807 936F 984D C5A6
* remotes/ehabkost/tags/x86-pull-request:
target/i386: Don't use x86_cpu_load_def() on "max" CPU model
target/i386: Define CPUID_MODEL_ID_SZ macro
target/i386: Use host_vendor_fms() in max_x86_cpu_initfn()
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
When commit 0bacd8b304 ('i386: Don't set CPUClass::cpu_def on
"max" model') removed the CPUClass::cpu_def field, we kept using
the x86_cpu_load_def() helper directly in max_x86_cpu_initfn(),
emulating the previous behavior when CPUClass::cpu_def was set.
However, x86_cpu_load_def() is intended to help initialization of
CPU models from the builtin_x86_defs table, and does lots of
other steps that are not necessary for "max".
One of the things x86_cpu_load_def() does is to set the properties
listed at tcg_default_props/kvm_default_props. We must not do
that on the "max" CPU model, otherwise under KVM we will
incorrectly report all KVM features as always available, and the
"svm" feature as always unavailable. The latter caused the bug
reported at:
https://bugzilla.redhat.com/show_bug.cgi?id=1467599
("Unable to start domain: the CPU is incompatible with host CPU:
Host CPU does not provide required features: svm")
Replace x86_cpu_load_def() with simple object_property_set*()
calls. In addition to fixing the above bug, this makes the KVM
branch in max_x86_cpu_initfn() very similar to the existing TCG
branch.
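For illustration, that replacement amounts to a handful of calls of this
shape (the local variables holding the host values are placeholders, and
the property set used by the committed patch may differ):

    object_property_set_int(OBJECT(cpu), family, "family", &error_abort);
    object_property_set_int(OBJECT(cpu), model, "model", &error_abort);
    object_property_set_int(OBJECT(cpu), stepping, "stepping", &error_abort);
    object_property_set_str(OBJECT(cpu), model_id, "model-id", &error_abort);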
For reference, the full list of steps performed by
x86_cpu_load_def() is:
* Setting min-level and min-xlevel. Already done by
max_x86_cpu_initfn().
* Setting family/model/stepping/model-id. Done by the code added
to max_x86_cpu_initfn() in this patch.
* Copying def->features. Wrong because "-cpu max" features need to
be calculated at realize time. This was not a problem in the
current code because host_cpudef.features was all zeroes.
* x86_cpu_apply_props() calls. This causes the bug above, and
shouldn't be done.
* Setting CPUID_EXT_HYPERVISOR. Not needed because it is already
reported by x86_cpu_get_supported_feature_word(), and because
"-cpu max" features need to be calculated at realize time.
* Setting CPU vendor to host CPU vendor if on KVM mode.
Redundant, because max_x86_cpu_initfn() already sets it to the
host CPU vendor.
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Message-Id: <20170712162058.10538-5-ehabkost@redhat.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
hw/vfio/pci.c:308:29: warning: Use of memory after it is freed
qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
^~~~
Reported-by: Clang Static Analyzer
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
free the data _after_ using it.
hw/vfio/platform.c:126:29: warning: Use of memory after it is freed
qemu_set_fd_handler(*pfd, NULL, NULL, NULL);
^~~~
Reported-by: Clang Static Analyzer
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Fix leak of the 'encryptopts' string, which was mistakenly
declared const.
Fix leak of QemuOpts entry which should not have been deleted
from the opts array.
Reported-by: Coverity
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170714103105.5781-1-berrange@redhat.com
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
The sm501 device uses vmstate_register_ram_global() to register its
memory region for migration. This means it gets a name that is
assumed to be global to the whole system, which in turn means that if
you create two of the device we assert because of the duplication:
qemu-system-ppc -device sm501 -device sm501
RAMBlock "sm501.local" already registered, abort!
Aborted (core dumped)
Changing this to just use memory_region_init_ram()'s automatic
registration of the memory region with a device-local name fixes
this. The downside is that it breaks migration compatibility, but
luckily we only added migration support to this device in the 2.10
release cycle so we haven't released a QEMU version with the broken
implementation.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Message-id: 1500309462-12792-1-git-send-email-peter.maydell@linaro.org
Various changes for the s390x code:
- updates for cpu model handling
- fix compilation with --disable-tcg
- fixes in vfio-ccw and I/O instruction handling
# gpg: Signature made Tue 25 Jul 2017 10:15:37 BST
# gpg: using RSA key 0xDECF6B93C6F02FAF
# gpg: Good signature from "Cornelia Huck <conny@cornelia-huck.de>"
# gpg: aka "Cornelia Huck <huckc@linux.vnet.ibm.com>"
# gpg: aka "Cornelia Huck <cornelia.huck@de.ibm.com>"
# gpg: aka "Cornelia Huck <cohuck@kernel.org>"
# Primary key fingerprint: C3D0 D66D C362 4FF6 A8C0 18CE DECF 6B93 C6F0 2FAF
* remotes/cohuck/tags/s390x-20170725:
s390x/css: fix ilen in IO instruction handlers
target/s390x: Add remaining switches to compile with --disable-tcg
target/s390x: Move exception-related functions to a new excp_helper.c file
target/s390x: Rework program_interrupt() and related functions
target/s390x: Move diag helpers to a separate file
target/s390x: Move s390_cpu_dump_state() to helper.c
target/s390x: improve baselining if certain base features are missing
s390x/kvm: better comment regarding zPCI feature availability
target/s390x: introduce (test|set)_be_bit
target/s390x: indicate query subfunction in s390_fill_feat_block
target/s390x: drop BE_BIT()
s390/cpumodel: remove KSS from the default model of z14
vfio/ccw: fix initialization of the Object DeviceState pointer in the common base-device
vfio/ccw: allocate irq info with the right size
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
ppc patch queue 2017-07-25
Last pull request for the 2.10 hard freeze, and correspondingly small.
There are a handful of bugfixes here plus an update for the "pseries"
guest firmware (SLOF).
This is later than ideal for a guest firmware update. However, this
does include a number of fixes in that guest firmware, so I think it's
worth the risk of squeezing this in just before the hard freeze.
# gpg: Signature made Tue 25 Jul 2017 06:43:14 BST
# gpg: using RSA key 0x6C38CACA20D9B392
# gpg: Good signature from "David Gibson <david@gibson.dropbear.id.au>"
# gpg: aka "David Gibson (Red Hat) <dgibson@redhat.com>"
# gpg: aka "David Gibson (ozlabs.org) <dgibson@ozlabs.org>"
# gpg: aka "David Gibson (kernel.org) <dwg@kernel.org>"
# Primary key fingerprint: 75F4 6586 AE61 A66C C44E 87DC 6C38 CACA 20D9 B392
* remotes/dgibson/tags/ppc-for-2.10-20170725:
pseries: Update SLOF firmware image
spapr: Fix QEMU abort during memory unplug
spapr/htab: fix savevm
spapr_pci: Fix obsolete comment about MSIX encoding in addr/data
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
When initiating a program check interruption by calling program_interrupt,
the instruction length (ilen) of the current instruction is supplied as
the third parameter.
On s390x all the IO instructions are of instruction format S and their
ilen is 4. The calls to program_interrupt (introduced by commits
7b18aad543 ("s390: Add channel I/O instructions.", 2013-01-24) and
61bf0dcb2e ("s390x/ioinst: Add missing alignment checks for IO
instructions", 2013-06-21)) however use ilen == 2.
This is probably due to a confusion between ilen, which specifies the
instruction length in bytes, and ILC, which does the same but in halfwords.
If kvm_enabled(), this does not actually matter, because the ilen
parameter of program_interrupt is effectively unused.
Let's provide the correct ilen to program_interrupt.
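For illustration, in each affected handler the fix boils down to passing 4
instead of 2 (the interruption code shown here is just a placeholder):

    program_interrupt(&cpu->env, PGM_OPERAND, 4);   /* previously: ilen 2 */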
Signed-off-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Fixes: 7b18aad543 ("s390: Add channel I/O instructions.")
Fixes: 61bf0dcb2e ("s390x/ioinst: Add missing alignment checks for IO instructions")
Reviewed-by: David Hildenbrand <david@redhat.com>
Message-Id: <20170724143452.55534-1-pasic@linux.vnet.ibm.com>
Reviewed-by: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
These functions cannot be compiled with --disable-tcg. But since we
need the other functions from helper.c in the non-tcg build, we also cannot
simply remove helper.c from the non-tcg builds. Thus the problematic
functions have to be moved into a separate new file that we
can later omit in the non-tcg builds.
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-Id: <1500886370-14572-5-git-send-email-thuth@redhat.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
misc_helper.c won't be compiled with --disable-tcg anymore, but we
still need the program_interrupt() function in that case. Move it
to interrupt.c instead, and refactor it to re-use the code from
trigger_pgm_exception() (for TCG) and enter_pgmcheck() (for KVM,
which has now been renamed to kvm_s390_program_interrupt() for
clarity).
Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-Id: <1500886370-14572-4-git-send-email-thuth@redhat.com>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
translate.c cannot be compiled with --disable-tcg, but we need
s390_cpu_dump_state() in KVM-only builds, too. So let's move
that function to helper.c instead, which will also be compiled
when --disable-tcg has been specified.
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-Id: <1500886370-14572-2-git-send-email-thuth@redhat.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
There are certain features that we put into base models, but that are
not relevant for the actual search. The most prominent examples are the
MSA subfunctions, which might be disabled on certain real hardware out
there.
While the kvm host model detection will usually detect the correct model
on such machines (as it will in the common case not pass features to check
for into s390_find_cpu_def()), baselining will fall back to a quite old
model just because some MSA subfunctions are missing.
Let's improve that by ignoring lack of these features while performing
the search for a base model.
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20170720123721.12366-6-david@redhat.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
Using ordinary bitmap operations to set/test bits does not work properly
on architectures other than s390x. Let's drop (test|set)_bit_inv and
introduce (test|set)_be_bit instead. These functions work on uint8_t
arrays, not on unsigned long arrays, and are for now only used in the
context of CPU features.
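A minimal self-contained sketch of such MSB-first bit helpers on a uint8_t
array, assuming bit 0 is the most significant bit of byte 0 (the actual
s390x helpers may differ in detail):

    #include <stdbool.h>
    #include <stdint.h>

    static inline void set_be_bit(unsigned int bit_nr, uint8_t *array)
    {
        array[bit_nr / 8] |= 0x80 >> (bit_nr % 8);
    }

    static inline bool test_be_bit(unsigned int bit_nr, const uint8_t *array)
    {
        return array[bit_nr / 8] & (0x80 >> (bit_nr % 8));
    }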
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20170720123721.12366-4-david@redhat.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
The main changes are:
- fixes in the PCI bridge code;
- LUNs > 255 are now allowed in virtio-scsi.
The full list is:
> pci-scan: Fix pci-bridge-set-mem-base and pci-bridge-set-mem-limit
> pci: Avoid 32-bit prefetchable memory area if possible
> Remove unused functions ishexdigit and $cat-comma
> pci: Translate PCI addresses to host addresses at the end of map-in
> Define 'open' and 'close' words of the /aliases nodes right from the start
> virtio-scsi: Allow LUNs bigger than 255
> paflof: Silence gcc's -Warray-bounds warning for stack pointers
> board_qemu: move code out of fdt-fix-node-phandle
> board_qemu: drop unused values early in fdt-fix-node-phandle
> pci: Improve the pci-var-out debug function
> libhvcall: drop unused KVMPPC_H_REPORT_MC_ERR and KVMPPC_H_NMI_MCE defines
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Commit 0cffce56 (hw/ppc/spapr.c: adding pending_dimm_unplugs to
sPAPRMachineState) introduced a new way to track pending LMBs of DIMM
device that is marked for removal. Since this commit we can hit the
assert in spapr_pending_dimm_unplugs_add() in the following situation:
- DIMM device removal fails as the guest doesn't allow the removal.
- Subsequent attempt to remove the same DIMM would hit the assert
as the corresponding sPAPRDIMMState is still part of the
pending_dimm_unplugs list.
Fix this by removing the assert and conditionally adding the
sPAPRDIMMState to pending_dimm_unplugs list only when it is not
already present.
Fixes: 0cffce56ae
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
[dwg: Tweaked to avoid returning NULL when spapr_pending_dimm_unplugs_add()
does find an existing entry]
Reviewed-by: Daniel Barboza <danielhb@linux.vnet.ibm.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Commit 3a38429 ("spapr: Add a "no HPT" encoding to HTAB migration stream")
allows migrating an empty HPT, but doesn't correctly mark the
end of the migration stream.
The end condition (value returned by htab_save_iterate())
should be 1, whereas in 3a38429 it returns 0.
The problem can be reproduced with QEMU monitor command "savevm":
the command never stops and the disk image grows without limit.
Fixes: 3a38429748
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
f1c2dc7c86 "spapr-pci: rework MSI/MSIX" (07/2013) changed the MSIX encoding
but forgot to update the comment, so this changes it.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Instead of migrating the flash by creating the memory region
with memory_region_init_ram_nomigrate() and then calling
vmstate_register_ram_global(), just use memory_region_init_ram(),
which now handles migration registration automatically.
This is a migration compatibility break for the integratorcp
board, because the RAM region's migration name changes to
include the device path. This is OK because we don't guarantee
migration compatibility for this board.
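In diff form, the change is roughly the following (the field and size
names are placeholders for the board's actual ones):

    -    memory_region_init_ram_nomigrate(&s->flash, obj, "integrator.flash",
    -                                     FLASH_SIZE, &err);
    -    vmstate_register_ram_global(&s->flash);
    +    memory_region_init_ram(&s->flash, obj, "integrator.flash",
    +                           FLASH_SIZE, &err);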
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Message-id: 1500310341-28931-1-git-send-email-peter.maydell@linaro.org
The fsl-imx* boards accidentally forgot to register the ROM memory
regions for migration. This used to require a manual step of calling
vmstate_register_ram(), but following commits
1cfe48c1ce21..b08199c6fbea194 we can use memory_region_init_rom() to
have it do the migration for us.
This is a migration break, but the migration code currently does not
handle the case of having two RAM regions which were not registered
for migration, and so prior to this commit a migration load would
always fail with:
"qemu-system-arm: Length mismatch: 0x4000 in != 0x18000: Invalid argument"
NB: migration appears at this point to be broken for this board
anyway -- it succeeds but the destination hangs; probably some
device in the system does not yet support migration.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Message-id: 1500309775-18361-1-git-send-email-peter.maydell@linaro.org
Test cases 030, 041 and 055 used to sleep for a second after calling
block-job-pause to make sure that the block job had time to actually
get into paused state. We can instead poll its status and use that one
second only as a timeout.
The tests also slept a second for checking that the block jobs don't
make progress while being paused. Half a second is more than enough for
this.
These changes reduce the total time for the three tests by 25 seconds on
my laptop (from 155 seconds to 130).
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Commits 0db832f and 6cdbceb introduced the automatic insertion of filter
nodes above the top layer of mirror and commit block jobs. The
assumption made there was that since libvirt doesn't do node-level
management of the block layer yet, it shouldn't be affected by added
nodes.
This is true as far as commands issued by libvirt are concerned. It only
uses BlockBackend names to address nodes, so any operations it performs
still operate on the root of the tree as intended.
However, the assumption breaks down when you consider query commands,
which return data for the wrong node now. These commands also return
information on some child nodes (bs->file and/or bs->backing), which
libvirt does make use of, and which refer to the wrong nodes, too.
One of the consequences is that oVirt gets wrong information about the
image size and stops the VM in response as long as a mirror or commit
job is running:
https://bugzilla.redhat.com/show_bug.cgi?id=1470634
This patch fixes the problem by hiding the implicit nodes created
automatically by the mirror and commit block jobs in the output of
query-block and BlockBackend-based query-blockstats as long as the user
doesn't indicate that they are aware of those nodes by providing a node
name for them in the QMP command to start the block job.
The node-based commands query-named-block-nodes and query-blockstats
with query-nodes=true still show all nodes, including implicit ones.
This ensures that users that are capable of node-level management can
still access the full information; users that only know BlockBackends
won't use these commands.
Cc: qemu-stable@nongnu.org
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Peter Krempa <pkrempa@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Tested-by: Eric Blake <eblake@redhat.com>
We used MAX() instead of the intended MIN() when computing how many
sectors to view in the current loop iteration of qcow2_measure(),
and passed in a value of INT_MAX sectors instead of our more usual
limit of BDRV_REQUEST_MAX_SECTORS (the latter avoids 32-bit overflow
on conversion to bytes). For small files, the bug is harmless:
bdrv_get_block_status_above() clamps its *pnum answer to the BDS
size, regardless of any insanely larger input request. However, for
any file at least 2T in size, we can very easily end up going into an
infinite loop (the maximum of 0x100000000 sectors and INT_MAX is a
64-bit quantity, which becomes 0 when assigned to int; once nb_sectors
is 0, we never make progress).
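The intended per-iteration clamp is simply (sketch, with placeholder
variable names):

    nb_sectors = MIN(remaining_sectors, BDRV_REQUEST_MAX_SECTORS);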
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
We've been documenting the value in bytes since its introduction
in commit b9a9b3a4 (v1.3), where it was actually reported in bytes.
Commit e4654d2 (v2.0) then removed things from block/qapi.c, in
preparation for a rewrite to a list of dirty sectors in the next
commit 21b5683 in block.c, but the new code mistakenly started
reporting in sectors.
Fixes: https://bugzilla.redhat.com/1441460
CC: qemu-stable@nongnu.org
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
A run of './check -qcow2 -g quick' on my machine produced only
two tests that took longer than 5 seconds; 178 took 18, and
189 took 7. Remove them from the quick group.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
QAPI patches for 2017-07-18
# gpg: Signature made Mon 24 Jul 2017 12:40:56 BST
# gpg: using RSA key 0x3870B400EB918653
# gpg: Good signature from "Markus Armbruster <armbru@redhat.com>"
# gpg: aka "Markus Armbruster <armbru@pond.sub.org>"
# Primary key fingerprint: 354B C8B3 D7EB 2A6B 6867 4E5F 3870 B400 EB91 8653
* remotes/armbru/tags/pull-qapi-2017-07-18-v2:
migration: Use JSON null instead of "" to reset parameter to default
migration: Unshare MigrationParameters struct for now
migration: Add TODO comments on duplication of QAPI_CLONE()
migration: Clean up around tls_creds, tls_hostname
hmp: Clean up and simplify hmp_migrate_set_parameter()
block: Use JSON null instead of "" to disable backing file
tests/test-qobject-input-visitor: Drop redundant test
qapi: Introduce a first class 'null' type
qapi: Use QNull for a more regular visit_type_null()
qapi: Separate type QNull from QObject
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Clang 3.9 passes the CONFIG_AVX2_OPT configure test. However, the
supplied <cpuid.h> does not contain the bit_AVX2 define that we use
when detecting whether the routine can be enabled.
Introduce a qemu-specific header that uses the compiler's definition
of __cpuid et al, but supplies any missing bit_* definitions needed.
This avoids introducing any extra ifdefs to util/bufferiszero.c, and
allows quite a few to be removed from tcg/i386/tcg-target.inc.c.
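For illustration, the kind of fallback such a header supplies (the guard
follows <cpuid.h> naming; AVX2 is bit 5 of EBX in CPUID leaf 7):

    #ifndef bit_AVX2
    #define bit_AVX2  (1 << 5)
    #endif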
Signed-off-by: Richard Henderson <rth@twiddle.net>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Message-id: 20170719044018.18063-1-rth@twiddle.net
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
migrate-set-parameters sets migration parameters according to its
arguments like this:
* Present means "set the parameter to this value"
* Absent means "leave the parameter unchanged"
* Except for parameters tls_creds and tls_hostname, "" means "reset
the parameter to its default value"
The first two are perfectly normal: presence of the parameter makes
the command do something.
The third one overloads the parameter with a second meaning. The
overloading is *implicit*, i.e. it's not visible in the types. Works
here, because "" is neither a valid TLS credentials ID, nor a valid
host name.
Pressing argument values the schema accepts, but are semantically
invalid, into service to mean "reset to default" is not general, as
suitable invalid values need not exist. I also find it ugly.
To clean this up, we could add a separate flag argument to ask for
"reset to default", or add a distinct value to @tls_creds and
@tls_hostname. This commit implements the latter: add JSON null to
the values of @tls_creds and @tls_hostname, deprecate "".
Because we're so close to the 2.10 freeze, implement it in the
stupidest way possible: have qmp_migrate_set_parameters() rewrite null
to "" before anything else can see the null. The proper way to do it
would be rewriting "" to null, but that requires fixing up code to
work with null. Add TODO comments for that.
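For illustration, the newly accepted form looks like this on the wire
(QMP example; previously one would have passed "" instead of null):

    { "execute": "migrate-set-parameters",
      "arguments": { "tls-creds": null } }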
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Commit de63ab6 "migrate: Share common MigrationParameters struct"
reused MigrationParameters for the arguments of
migrate-set-parameters, with the following rationale:
It is rather verbose, and slightly error-prone, to repeat
the same set of parameters for input (migrate-set-parameters)
as for output (query-migrate-parameters), where the only
difference is whether the members are optional. We can just
document that the optional members will always be present
on output, and then share a common struct between both
commands. The next patch can then reduce the amount of
code needed on input.
I need to unshare them to correct a design flaw in a stupid, but
minimally invasive way, in the next commit. We can restore the
sharing when we redo that patch in a less stupid way. Add a suitable
TODO comment.
Note that I revert only the sharing part of commit de63ab6, not the
part that made the members of query-migrate-parameters' result
optional. The schema (and thus introspection) remains inaccurate for
query-migrate-parameters. If we decide not to restore the sharing, we
should revert that part, too.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
qmp_query_migrate_parameters() and qmp_migrate_set_parameters()
effectively duplicate QAPI_CLONE() inline. Add suitable TODO
comments.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Optional MigrationParameters members tls_creds and tls_hostname can't
actually be absent outside qmp_migrate_set_parameters() since commit
4af245d (v2.9.0).
Note that commit 4af245d reverted the part of commit de63ab6 (v2.8.0)
that made tls_creds and tls_hostname absent instead of "" in the value
of query-migrate-parameters, even though commit de63ab6 called that a
mistake. What a mess.
Drop the redundant tests for presence, and update documentation.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
The bulk of hmp_migrate_set_parameter()'s code sets one member of
MigrationParameters according to the command's arguments. It uses a
string visitor for integer and boolean members, but not for string and
size members. It calls visit_type_bool() right away, but delays
visit_type_int() some. The delaying requires a flag variable and a
bit of trickery: we set all integer members instead of just the one we
want, and rely on the has_FOOs to mask the unwanted ones.
Clean this up as follows. Don't delay calling visit_type_int(). Use
the string visitor for strings, too. This involves extra allocations
and cleanup, but doing them is simpler and cleaner than avoiding them.
Sadly, using the string visitor for sizes isn't possible, because it
defaults to Bytes rather than Mebibytes. Add a comment explaining
that.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
BlockdevRef is an alternate of BlockdevOptions (inline definition) and
str (reference to an existing block device by name). BlockdevRef
value "" is special: "no block device should be referenced." It's
actually interpreted that way in just one place: optional member
@backing of COW formats. Semantics:
* Present means "use this block device" as backing storage
* Absent means "default to the one stored in the image"
* Except "" means "don't use backing storage at all"
The first two are perfectly normal: when the parameter is absent, it
defaults to an implied value, but the value's meaning is the same.
The third one overloads the parameter with a second meaning. The
overloading is *implicit*, i.e. it's not visible in the types. Works
here, because "" is not a valid block device ID.
Pressing argument values the schema accepts, but are semantically
invalid, into service to mean "do something else entirely" is not
general, as suitable invalid values need not exist. I also find it
ugly.
To clean this up, we could add a separate flag argument to suppress
@backing, or add a distinct value to @backing. This commit implements
the latter: add JSON null to the values of @backing, deprecate "".
Because we're so close to the 2.10 freeze, implement it in the
stupidest way possible: have qmp_blockdev_add() rewrite null to ""
before anything else can see the null. Works, because BlockdevRef
occurs only within arguments of blockdev-add. The proper way to do it
would be rewriting "" to null, preferably in a cleaner way, but that
requires fixing up code to work with null. Add a TODO comment for
that.
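For illustration, a blockdev-add that explicitly disables the backing file
now looks like this (node names and filenames are made up):

    { "execute": "blockdev-add",
      "arguments": { "driver": "qcow2", "node-name": "ovl0",
                     "file": { "driver": "file",
                               "filename": "overlay.qcow2" },
                     "backing": null } }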
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Acked-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
test_visitor_in_alternate() tests UserDefAlternate with inadmissible
input. test_visitor_in_fail_alternate() does basically the same.
Drop the former, keep the latter.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
I expect the 'null' type to be useful mostly for members of alternate
types.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Make visit_type_null() take an @obj argument like its buddies. This
helps keep the next commit simple.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Under certain circumstances normal xen-mapcache functioning may be broken
by guest's actions. This may lead to either QEMU performing exit() due to
a caught bad pointer (and with QEMU process gone the guest domain simply
appears hung afterwards) or actual use of the incorrect pointer inside
QEMU address space -- a write to unmapped memory is possible. The bug is
hard to reproduce on a i440 machine as multiple DMA sources are required
(though it's possible in theory, using multiple emulated devices), but can
be reproduced somewhat easily on a Q35 machine using an emulated AHCI
controller -- each NCQ queue command slot may be used as an independent
DMA source ex. using READ FPDMA QUEUED command, so a single storage
device on the AHCI controller port will be enough to produce multiple DMAs
(up to 32). The detailed description of the issue follows.
Xen-mapcache provides an ability to map parts of a guest memory into
QEMU's own address space to work with.
There are two types of cache lookups:
- translating a guest physical address into a pointer in QEMU's address
space, mapping a part of guest domain memory if necessary (while trying
to reduce a number of such (re)mappings to a minimum)
- translating a QEMU's pointer back to its physical address in guest RAM
These lookups are managed via two linked-lists of structures.
MapCacheEntry is used for forward cache lookups, while MapCacheRev -- for
reverse lookups.
Every guest physical address is broken down into 2 parts:
address_index = phys_addr >> MCACHE_BUCKET_SHIFT;
address_offset = phys_addr & (MCACHE_BUCKET_SIZE - 1);
MCACHE_BUCKET_SHIFT depends on the system (32/64 bit) and is equal to 20 for
a 64-bit system (which is assumed for the further description). Basically,
this means that we deal with 1 MB chunks and offsets within those 1 MB
chunks. All mappings are created with 1MB-granularity, i.e. 1MB/2MB/3MB
etc. Most DMA transfers typically are less than 1MB, however, if the
transfer crosses any 1MB border(s) then the nearest larger mapping size
will be used, so e.g. a 512-byte DMA transfer with the start address
700FFF80h will actually require a 2MB range.
Current implementation assumes that MapCacheEntries are unique for a given
address_index and size pair and that a single MapCacheEntry may be reused
by multiple requests -- in this case the 'lock' field will be larger than
1. On the other hand, each requested guest physical address (with 'lock' flag)
is described by its own MapCacheRev. So there may be multiple MapCacheRev
entries corresponding to a single MapCacheEntry. The xen-mapcache code
uses MapCacheRev entries to retrieve the address_index & size pair, which
is in turn used to find a related MapCacheEntry. The 'lock' field within
a MapCacheEntry structure is actually a reference counter which shows
a number of corresponding MapCacheRev entries.
The bug lies in the ability for the guest to indirectly manipulate the
xen-mapcache MapCacheEntry list via a special sequence of DMA
operations, typically for storage devices. In order to trigger the bug,
the guest needs to issue DMA operations in a specific order and with
specific timing. Although xen-mapcache is protected by a mutex lock, this
doesn't help in this case, as the bug is not due to a race condition.
Suppose we have 3 DMA transfers, namely A, B and C, where
- transfer A crosses 1MB border and thus uses a 2MB mapping
- transfers B and C are normal transfers within 1MB range
- and all 3 transfers belong to the same address_index
In this case, if all these transfers are to be executed one-by-one
(without overlaps), no special treatment is necessary -- each transfer's
mapping lock will be set and then cleared on unmap before starting
the next transfer.
The situation changes when DMA transfers overlap in time, ex. like this:
|===== transfer A (2MB) =====|
         |===== transfer B (1MB) =====|
                                |===== transfer C (1MB) =====|
time --->
In this situation the following sequence of actions happens:
1. transfer A creates a mapping to 2MB area (lock=1)
2. transfer B (1MB) tries to find available mapping but cannot find one
because transfer A is still in progress, and it has 2MB size + non-zero
lock. So transfer B creates another mapping -- same address_index,
but 1MB size.
3. transfer A completes, making 1st mapping entry available by setting its
lock to 0
4. transfer C starts and tries to find available mapping entry and sees
that 1st entry has lock=0, so it uses this entry but remaps the mapping
to a 1MB size
5. transfer B completes and by this time
- there are two locked entries in the MapCacheEntry list with the SAME
values for both address_index and size
- the entry for transfer B actually resides farther in list while
transfer C's entry is first
6. xen_ram_addr_from_mapcache() for transfer B gets correct address_index
and size pair from corresponding MapCacheRev entry, but then it starts
looking for MapCacheEntry with these values and finds the first entry
-- which belongs to transfer C.
At this point there may be following possible (bad) consequences:
1. xen_ram_addr_from_mapcache() will use a wrong entry->vaddr_base value
in this statement:
raddr = (reventry->paddr_index << MCACHE_BUCKET_SHIFT) +
((unsigned long) ptr - (unsigned long) entry->vaddr_base);
resulting in an incorrect raddr value returned from the function. The
(ptr - entry->vaddr_base) expression may produce both positive and negative
numbers and its actual value may differ greatly as many
map/unmap operations take place. If the value is beyond guest RAM
limits then a "Bad RAM offset" error will be triggered and logged,
followed by exit() in QEMU.
2. If the raddr value doesn't exceed guest RAM boundaries, the same sequence
of actions will be performed for xen_invalidate_map_cache_entry() on DMA
unmap, resulting in a wrong MapCacheEntry being unmapped while the DMA
operation which uses it is still active. The above example must
be extended by one more DMA transfer in order to allow unmapping, as the
first mapping in the list is sort of resident.
The patch modifies the way in which MapCacheEntries are added to the
list, avoiding duplicates.
Signed-off-by: Alexey Gerasimenko <x1917x@gmail.com>
Signed-off-by: Stefano Stabellini <sstabellini@kernel.org>
Solaris 9 was released in 2002, its successor Solaris 10 was
released in 2005, and Solaris 9 was end-of-lifed in 2014.
Nobody has stepped forward to express interest in supporting
Solaris of any flavour, so removing support for the ancient
versions seems uncontroversial.
In particular, this allows us to remove a use of 'uname'
in configure that won't work if you're cross-compiling.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 1499955697-28045-1-git-send-email-peter.maydell@linaro.org
For a very long time we have used 'uname -s' as our fallback if
we don't identify the target OS using a compiler #define. This
obviously doesn't work for cross-compilation, and we've had
a comment suggesting we fix this in configure for a long time.
Since we now have an exhaustive list of which OSes we can run
on (thanks to commit 898be3e041 making an unrecognized OS
be a fatal error), we know which ones we're missing.
Add check_define tests for the remaining OSes we support. The
defines checked are based on ones we already use in the codebase for
identifying the host OS (with the exception of GNU/kFreeBSD).
We can now set bogus_os immediately rather than doing it later.
We leave the comment about uname being bad untouched, since
there is still a use of it for the fallback for unrecognized
host CPU type.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-id: 1499958932-23839-1-git-send-email-peter.maydell@linaro.org
On OpenBSD the compiler warns:
bsd-user/main.c:622:21: warning: variable 'sig' set but not used [-Wunused-but-set-variable]
This is because a lot of the signal delivery code is #if-0'd
out as unused. Reshuffle #ifdefs a bit to silence the warning.
(We make the minimum change here rather than removing the dead code
entirely: there is an out-of-tree bsd-user patchset which should make this
all work correctly, and there's no point giving them an awkward rebase task.)
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 1500395194-21455-5-git-send-email-peter.maydell@linaro.org
On OpenBSD the compiler complains:
bsd-user/bsdload.c:54:17: warning: variable 'id_change' set but not used [-Wunused-but-set-variable]
This is dead code that was originally copied from linux-user.
We fixed this in linux-user in commit 331c23b5ca in 2011;
delete the useless code from bsd-user too.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Message-id: 1500395194-21455-4-git-send-email-peter.maydell@linaro.org
MIPS patches 2017-07-21
Changes:
* Add Enhanced Virtual Addressing (EVA) support
# gpg: Signature made Fri 21 Jul 2017 03:25:15 BST
# gpg: using RSA key 0x2238EB86D5F797C2
# gpg: Good signature from "Yongbok Kim <yongbok.kim@imgtec.com>"
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: 8600 4CF5 3415 A5D9 4CFA 2B5C 2238 EB86 D5F7 97C2
* remotes/yongbok/tags/mips-20170721:
target/mips: Enable CP0_EBase.WG on MIPS64 CPUs
target/mips: Add EVA support to P5600
target/mips: Implement segmentation control
target/mips: Add segmentation control registers
target/mips: Add an MMU mode for ERL
target/mips: Abstract mmu_idx from hflags
target/mips: Check memory permissions with mem_idx
target/mips: Decode microMIPS EVA load & store instructions
target/mips: Decode MIPS32 EVA load & store instructions
target/mips: Prepare loads/stores for EVA
target/mips: Add CP0_Ebase.WG (write gate) support
target/mips: Weaken TLB flush on UX,SX,KX,ASID changes
target/mips: Fix TLBWI shadow flush for EHINV,XI,RI
target/mips: Fix MIPS64 MFC0 UserLocal on BE host
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Final CI updates for soft-freeze
Tweaks from Paolo for J=x Travis compiles
Bunch of updated cross-compile targets from Philippe
Additional debug tools in travis image from Me
# gpg: Signature made Tue 18 Jul 2017 11:00:26 BST
# gpg: using RSA key 0xFBD0DB095A9E2A44
# gpg: Good signature from "Alex Bennée (Master Work Key) <alex.bennee@linaro.org>"
# Primary key fingerprint: 6685 AE99 E751 67BC AFC8 DF35 FBD0 DB09 5A9E 2A44
* remotes/stsquad/tags/pull-ci-updates-for-softfreeze-180717-2: (32 commits)
docker: install clang since Shippable setup_ve() verify it is available
docker: warn users to use newer debian8/debian9 base image
docker: add debian Ports base image
shippable: add win32/64 targets
docker: add MXE (M cross environment) base image for MinGW-w64
shippable: add mips64el targets
docker: add debian/mips64el image
shippable: use debian/mips[eb] targets
docker: add debian/mips[eb] images
shippable: add powerpc target
docker: add debian/powerpc based on Jessie
docker: add 'apt-fake' script which generate fake debian packages
docker: add qemu:debian-jessie based on outdated jessie release
shippable: add x86_64 targets
shippable: add ppc64el targets
shippable: add armel targets
docker: enable nettle to extend code coverage on arm64
docker: enable gcrypt to extend code coverage on amd64
docker: enable netmap to extend code coverage on amd64
docker: enable virgl to extend code coverage on amd64
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Fix various warnings about set-but-not-used variables on OpenBSD:
bsd-user/elfload.c:1158:15: warning: variable 'mapped_addr' set but not used [-Wunused-but-set-variable]
bsd-user/elfload.c:1165:9: warning: variable 'status' set but not used [-Wunused-but-set-variable]
bsd-user/elfload.c:1168:15: warning: variable 'elf_stack' set but not used [-Wunused-but-set-variable]
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-id: 1500395194-21455-3-git-send-email-peter.maydell@linaro.org
On NetBSD, where tolower() and toupper() are implemented using an
array lookup, the compiler warns if you pass a plain 'char'
to these functions:
gdbstub.c:914:13: warning: array subscript has type 'char'
This reflects the fact that toupper() and tolower() give
undefined behaviour if they are passed a value that isn't
a valid 'unsigned char' or EOF.
We have qemu_tolower() and qemu_toupper() to avoid this problem;
use them.
(The use in scsi-generic.c does not trigger the warning because
it passes a uint8_t; we switch it anyway, for consistency.)
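For reference, the qemu_* wrappers simply cast to unsigned char first,
which is the standard way to avoid this undefined behaviour (a sketch; the
actual definitions may differ slightly):

    #define qemu_toupper(c)  toupper((unsigned char)(c))
    #define qemu_tolower(c)  tolower((unsigned char)(c))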
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com> for the s390 part.
Acked-by: David Gibson <david@gibson.dropbear.id.au>
Message-id: 1500568290-7966-1-git-send-email-peter.maydell@linaro.org
On NetBSD the compiler warns:
util/oslib-posix.c: In function 'sigaction_invoke':
util/oslib-posix.c:589:5: warning: missing braces around initializer [-Wmissing-braces]
siginfo_t si = { 0 };
^
util/oslib-posix.c:589:5: warning: (near initialization for 'si.si_pad') [-Wmissing-braces]
because on this platform siginfo_t is defined as
typedef union siginfo {
char si_pad[128]; /* Total size; for future expansion */
struct _ksiginfo _info;
} siginfo_t;
Avoid this warning by initializing the struct with {} instead;
this is a GCC extension but we use it all over the codebase already.
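So the initializer simply becomes:

    siginfo_t si = {};  /* empty braces: GCC extension, zero-initializes the union */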
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 1500568341-8389-1-git-send-email-peter.maydell@linaro.org
Add the Enhanced Virtual Addressing (EVA) feature to the P5600 core
configuration, along with the related Segmentation Control (SC) feature
and writable CP0_EBase.WG bit.
This allows it to run Malta EVA kernels.
Signed-off-by: James Hogan <james.hogan@imgtec.com>
Reviewed-by: Yongbok Kim <yongbok.kim@imgtec.com>
Cc: Aurelien Jarno <aurelien@aurel32.net>
Signed-off-by: Yongbok Kim <yongbok.kim@imgtec.com>
Implement the optional segmentation control feature in the virtual to
physical address translation code.
The fixed legacy segment and xkphys handling is replaced with a dynamic
layout based on the segmentation control registers (which should be set
up even when the feature is not exposed to the guest).
Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Yongbok Kim <yongbok.kim@imgtec.com>
Cc: Aurelien Jarno <aurelien@aurel32.net>
Reviewed-by: Yongbok Kim <yongbok.kim@imgtec.com>
[yongbok.kim@imgtec.com:
cosmetic changes]
Signed-off-by: Yongbok Kim <yongbok.kim@imgtec.com>
The optional segmentation control registers CP0_SegCtl0, CP0_SegCtl1 &
CP0_SegCtl2 control the behaviour and required privilege of the legacy
virtual memory segments.
Add them to the CP0 interface so they can be read and written when
CP0_Config3.SC=1, and initialise them to describe the standard legacy
layout so they can be used in future patches regardless of whether they
are exposed to the guest.
Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Yongbok Kim <yongbok.kim@imgtec.com>
Cc: Aurelien Jarno <aurelien@aurel32.net>
Reviewed-by: Yongbok Kim <yongbok.kim@imgtec.com>
Signed-off-by: Yongbok Kim <yongbok.kim@imgtec.com>
The segmentation control feature allows a legacy memory segment to
become unmapped uncached at error level (according to CP0_Status.ERL),
and in fact the user segment is already treated in this way by QEMU.
Add a new MMU mode for this state so that QEMU's mappings don't persist
between ERL=0 and ERL=1.
Signed-off-by: James Hogan <james.hogan@imgtec.com>
Reviewed-by: Yongbok Kim <yongbok.kim@imgtec.com>
Cc: Aurelien Jarno <aurelien@aurel32.net>
[yongbok.kim@imgtec.com:
cosmetic changes]
Signed-off-by: Yongbok Kim <yongbok.kim@imgtec.com>
The MIPS mmu_idx is sometimes calculated from hflags without an env
pointer available as cpu_mmu_index() requires.
Create a common hflags_mmu_index() for the purpose of this calculation
which can operate on any hflags, not just with an env pointer, and
update cpu_mmu_index() itself and gen_intermediate_code() to use it.
Also update debug_post_eret() and helper_mtc0_status() to log the MMU
mode with the status change (SM, UM, or nothing for kernel mode) based
on cpu_mmu_index() rather than directly testing hflags.
This will also allow the logic to be more easily updated when a new MMU
mode is added.
Signed-off-by: James Hogan <james.hogan@imgtec.com>
Reviewed-by: Yongbok Kim <yongbok.kim@imgtec.com>
Cc: Aurelien Jarno <aurelien@aurel32.net>
Signed-off-by: Yongbok Kim <yongbok.kim@imgtec.com>
When performing virtual to physical address translation, check the
required privilege level based on the mem_idx rather than the mode in
the hflags. This will allow EVA loads & stores to operate safely only on
user memory from kernel mode.
For the cases where the mmu_idx doesn't need to be overridden
(mips_cpu_get_phys_page_debug() and cpu_mips_translate_address()), we
calculate the required mmu_idx using cpu_mmu_index(). Note that this
only tests the MIPS_HFLAG_KSU bits rather than MIPS_HFLAG_MODE, so we
don't test the debug mode hflag MIPS_HFLAG_DM any longer. This should be
fine as get_physical_address() only compares against MIPS_HFLAG_UM and
MIPS_HFLAG_SM, neither of which should get set by compute_hflags() when
MIPS_HFLAG_DM is set.
Signed-off-by: James Hogan <james.hogan@imgtec.com>
Reviewed-by: Yongbok Kim <yongbok.kim@imgtec.com>
Cc: Aurelien Jarno <aurelien@aurel32.net>
Signed-off-by: Yongbok Kim <yongbok.kim@imgtec.com>
Implement decoding of microMIPS EVA load and store instruction groups in
the POOL31C pool. These use the same gen_ld(), gen_st(), gen_st_cond()
helpers as the MIPS32 decoding, passing the equivalent MIPS32 opcodes as
opc.
Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Yongbok Kim <yongbok.kim@imgtec.com>
Cc: Aurelien Jarno <aurelien@aurel32.net>
Reviewed-by: Yongbok Kim <yongbok.kim@imgtec.com>
Signed-off-by: Yongbok Kim <yongbok.kim@imgtec.com>
Implement decoding of MIPS32 EVA loads and stores. These access the user
address space from kernel mode when implemented, so for each instruction
we need to check that EVA is available from Config5.EVA & check for
sufficient COP0 privilege (with the new check_eva()), and then override
the mem_idx used for the operation.
Unfortunately some Loongson 2E instructions use overlapping encodings,
so we must be careful not to prevent those from being decoded when EVA
is absent.
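A rough sketch of the decode-time checks described above (the DisasContext
field and the exception plumbing shown here are assumptions, not the literal
patch):

    /* Sketch only: EVA instructions are reserved when Config5.EVA = 0. */
    static inline void check_eva(DisasContext *ctx)
    {
        if (unlikely(!(ctx->CP0_Config5 & (1 << CP0C5_EVA)))) {
            generate_exception_end(ctx, EXCP_RI);
        }
    }

    /* At the EVA call sites (sketch): require COP0 privilege, then force
     * the user-mode MMU index for the access. */
    check_cp0_enabled(ctx);
    check_eva(ctx);
    mem_idx = MIPS_HFLAG_UM;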
Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Yongbok Kim <yongbok.kim@imgtec.com>
Cc: Aurelien Jarno <aurelien@aurel32.net>
Reviewed-by: Yongbok Kim <yongbok.kim@imgtec.com>
Signed-off-by: Yongbok Kim <yongbok.kim@imgtec.com>
EVA load and store instructions access the user mode address map, so
they need to use mem_idx of MIPS_HFLAG_UM. Update the various utility
functions to allow mem_idx to be more easily overridden from the
decoding logic.
Specifically we add a mem_idx argument to the op_ld/st_* helpers used
for atomics, and a mem_idx local variable to gen_ld(), gen_st(), and
gen_st_cond().
Signed-off-by: James Hogan <james.hogan@imgtec.com>
Reviewed-by: Yongbok Kim <yongbok.kim@imgtec.com>
Cc: Yongbok Kim <yongbok.kim@imgtec.com>
Cc: Aurelien Jarno <aurelien@aurel32.net>
Signed-off-by: Yongbok Kim <yongbok.kim@imgtec.com>
Add support for the CP0_EBase.WG bit, which allows upper bits to be
written (bits 31:30 on MIPS32, or bits 63:30 on MIPS64), along with the
CP0_Config5.CV bit to control whether the exception vector for Cache
Error exceptions is forced into KSeg1.
This is necessary on MIPS32 to support Segmentation Control and Enhanced
Virtual Addressing (EVA) extensions (where KSeg1 addresses may not
represent an unmapped uncached segment).
It is also useful on MIPS64 to allow the exception base to reside in
XKPhys, and possibly out of range of KSEG0 and KSEG1.
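For illustration, the resulting write behaviour is roughly the following (a
hypothetical sketch of the CP0_EBase write helper; the mask values and the
CP0EBase_WG constant are assumptions, not the actual QEMU definitions):

    /* Sketch only: with WG set in the written value, the upper bits of the
     * exception base become writable as well. */
    target_ulong mask = 0x3FFFF000;          /* bits 29:12, as before */
    if (arg & (1 << CP0EBase_WG)) {
        mask |= ~(target_ulong)0x3FFFFFFF;   /* bits 31:30 (63:30 on MIPS64) */
        mask |= 1 << CP0EBase_WG;            /* keep the WG bit itself */
    }
    env->CP0_EBase = (env->CP0_EBase & ~mask) | (arg & mask);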
Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Yongbok Kim <yongbok.kim@imgtec.com>
Cc: Aurelien Jarno <aurelien@aurel32.net>
Reviewed-by: Yongbok Kim <yongbok.kim@imgtec.com>
[yongbok.kim@imgtec.com:
minor changes]
Signed-off-by: Yongbok Kim <yongbok.kim@imgtec.com>
There is no need to invalidate any shadow TLB entries when the ASID
changes or when access to one of the 64-bit segments has been disabled,
since doing so doesn't reveal to software whether any TLB entries have
been evicted into the shadow half of the TLB.
Therefore weaken the tlb flushes in these cases to only flush the QEMU
TLB.
Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Yongbok Kim <yongbok.kim@imgtec.com>
Cc: Aurelien Jarno <aurelien@aurel32.net>
Tested-by: Yongbok Kim <yongbok.kim@imgtec.com>
Signed-off-by: Yongbok Kim <yongbok.kim@imgtec.com>
Writing specific TLB entries with TLBWI flushes shadow TLB entries
unless an existing entry is having its access permissions upgraded. This
is necessary as software would from then on expect the previous mapping
in that entry to no longer be in effect (even if QEMU has quietly
evicted it to the shadow TLB on a TLBWR).
However it won't do this if only EHINV, XI, or RI bits have been set,
even if that results in a reduction of permissions, so add the necessary
checks to invoke the flush when these bits are set.
Fixes: 2fb58b7374 ("target-mips: add RI and XI fields to TLB entry")
Fixes: 9456c2fbcd ("target-mips: add TLBINV support")
Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Yongbok Kim <yongbok.kim@imgtec.com>
Cc: Aurelien Jarno <aurelien@aurel32.net>
Tested-by: Yongbok Kim <yongbok.kim@imgtec.com>
[yongbok.kim@imgtec.com:
cosmetic changes]
Signed-off-by: Yongbok Kim <yongbok.kim@imgtec.com>
Using MFC0 to read CP0_UserLocal uses tcg_gen_ld32s_tl, however
CP0_UserLocal is a target_ulong. On a big endian host with a MIPS64
target this reads and sign extends the more significant half of the
64-bit register.
Fix this by using ld_tl to load the whole target_ulong and ext32s_tl to
sign extend it, as done for various other target_ulong COP0 registers.
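Roughly, the change amounts to the following (a sketch, not the literal diff):

    /* Before (sketch): loads only 32 bits, which is the wrong half of the
     * 64-bit register on a big-endian host with a MIPS64 target. */
    tcg_gen_ld32s_tl(arg, cpu_env,
                     offsetof(CPUMIPSState, active_tc.CP0_UserLocal));

    /* After (sketch): load the whole target_ulong, then sign-extend. */
    tcg_gen_ld_tl(arg, cpu_env,
                  offsetof(CPUMIPSState, active_tc.CP0_UserLocal));
    tcg_gen_ext32s_tl(arg, arg);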
Fixes: d279279e2b ("target-mips: implement UserLocal Register")
Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Yongbok Kim <yongbok.kim@imgtec.com>
Cc: Aurelien Jarno <aurelien@aurel32.net>
Cc: Petar Jovanovic <petar.jovanovic@imgtec.com>
Reviewed-by: Yongbok Kim <yongbok.kim@imgtec.com>
Signed-off-by: Yongbok Kim <yongbok.kim@imgtec.com>
In various places in our test makefiles and scripts we use the
shell $RANDOM to create a random number. This is a bash-specific
extension, and doesn't work on other shells.
With dash the shell doesn't complain; it just effectively
evaluates $RANDOM to 0 every time:
echo $((RANDOM + 32768)) => 32768
However, on NetBSD the shell will complain:
"-sh: arith: syntax error: "RANDOM + 32768"
which means that "make check" fails.
Switch to using "${RANDOM:-0}" instead of $RANDOM,
which will portably either give us a random number or zero.
This means that on non-bash shells we don't get such
good test coverage via the MALLOC_PERTURB_ setting, but
we were already in that situation for non-bash shells.
Our only other uses of $RANDOM (in tests/qemu-iotests/check
and tests/qemu-iotests/162) are in shell scripts which use
a #!/bin/bash line so they are always run under bash.
Suggested-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Kamil Rytarowski <n54@gmx.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 1500029117-6387-1-git-send-email-peter.maydell@linaro.org
Don't try to build the ivshmem-server and ivshmem-client tools unless
CONFIG_IVSHMEM is set.
This fixes in passing a build bug on NetBSD, which fails to build the
ivshmem tools because they use shm_open() and on NetBSD that requires
linking against -lrt.
Signed-off-by: Kamil Rytarowski <n54@gmx.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Message-id: 1500021225-4118-4-git-send-email-peter.maydell@linaro.org
[PMM: moved some code into earlier patches; minor bugfixes;
added commit message]
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
The current CONFIG_IVSHMEM is confusing, because it looks like it's a
flag for "do we have ivshmem support?", but actually it's a flag for
"is the ivshmem PCI device being compiled?" (and implicitly "do we
have ivshmem support?" is tested with CONFIG_EVENTFD).
Rename it to CONFIG_IVSHMEM_DEVICE to clear this confusion up;
shortly we will add a new CONFIG_IVSHMEM which really does indicate
whether the host can support ivshmem.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Message-id: 1500021225-4118-2-git-send-email-peter.maydell@linaro.org
Queued tcg and tcg code gen related cleanups
# gpg: Signature made Thu 20 Jul 2017 00:32:00 BST
# gpg: using RSA key 0xAD1270CC4DD0279B
# gpg: Good signature from "Richard Henderson <rth7680@gmail.com>"
# gpg: aka "Richard Henderson <rth@redhat.com>"
# gpg: aka "Richard Henderson <rth@twiddle.net>"
# Primary key fingerprint: 9CB1 8DDA F8E8 49AD 2AFC 16A4 AD12 70CC 4DD0 279B
* remotes/rth/tags/pull-tcg-20170719:
tcg: Pass generic CPUState to gen_intermediate_code()
tcg/tci: enable bswap16_i64
target/alpha: optimize gen_cvtlq() using deposit op
target/sparc: optimize gen_op_mulscc() using deposit op
target/sparc: optimize various functions using extract op
target/ppc: optimize various functions using extract op
target/m68k: optimize bcd_flags() using extract op
target/arm: optimize aarch32 rev16
target/arm: Optimize aarch64 rev16
coccinelle: add a script to optimize tcg op using tcg_gen_extract()
coccinelle: ignore ASTs pre-parsed cached C files
tcg: Expand glue macros before stringifying helper names
util/cacheinfo: Add missing include for ppc linux
tcg/mips: reserve a register for the guest_base.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
gcc 7 is pickier about our sources:
hw/usb/bus.c: In function ‘usb_port_location’:
hw/usb/bus.c:410:66: error: ‘%d’ directive output may be truncated writing between 1 and 11 bytes into a region of size between 0 and 15 [-Werror=format-truncation=]
snprintf(downstream->path, sizeof(downstream->path), "%s.%d",
^~
hw/usb/bus.c:410:9: note: ‘snprintf’ output between 3 and 28 bytes into a destination of size 16
snprintf(downstream->path, sizeof(downstream->path), "%s.%d",
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
upstream->path, portnr);
~~~~~~~~~~~~~~~~~~~~~~~
But we know that there are at most 5 levels of USB hubs, with at
most two digits per level; that plus the separating dots means we
use at most 15 bytes (including trailing NUL) of our 16-byte field.
Adding an assertion to show gcc that we checked for truncation is
enough to shut up the false-positive warning.
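The pattern is roughly the following (a sketch; variable names follow the
snippet above):

    /* Sketch only: checking the snprintf() result lets gcc see that
     * truncation cannot happen for the sizes we actually produce. */
    int l = snprintf(downstream->path, sizeof(downstream->path), "%s.%d",
                     upstream->path, portnr);
    assert(l < sizeof(downstream->path));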
Inspired by an idea by Dr. David Alan Gilbert <dgilbert@redhat.com>.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-id: 20170717151334.17954-1-eblake@redhat.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Add a .editorconfig file for qemu. Specifies the indent and tab style
for various files (C code and Makefiles for starters). Most popular
editors support this either natively or via plugin.
Check http://editorconfig.org/ for details.
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Message-id: 20170717101547.22295-1-kraxel@redhat.com
QMP command
{ "execute": "change",
"arguments": { "device": "vnc", "target": "password", "arg": PWD } }
behaves just like
{ "execute": "change-vnc-password",
"arguments": { "password", "arg": PWD } }
Their documentation differs, however. According to
change-vnc-password's documentation, "an empty password [...] will set
the password to the empty string", while change's documentation claims
"no future logins will be allowed". The former is actually correct.
Replace the incorrect claim by a reference to change-vnc-password.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-id: 1500448182-21376-1-git-send-email-armbru@redhat.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Done with the Coccinelle semantic patch
scripts/coccinelle/tcg_gen_extract.cocci.
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
It is much shorter to reverse all 4 half-words in parallel
than to extract, reverse, and deposit each in turn.
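For illustration, the parallel form is roughly (a sketch with 64-bit TCG ops;
the actual temporaries in target/arm differ):

    /* Sketch only: swap the two bytes of every 16-bit lane in one go,
     * dst = ((src >> 8) & mask) | ((src & mask) << 8). */
    tcg_gen_shri_i64(t1, src, 8);
    tcg_gen_andi_i64(t1, t1, 0x00ff00ff00ff00ffull);
    tcg_gen_andi_i64(t2, src, 0x00ff00ff00ff00ffull);
    tcg_gen_shli_i64(t2, t2, 8);
    tcg_gen_or_i64(dst, t1, t2);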
Suggested-by: Aurelien Jarno <aurelien@aurel32.net>
Signed-off-by: Richard Henderson <rth@twiddle.net>
This include was forgotten when splitting cacheinfo.c out of
tcg/ppc/tcg-target.inc.c (see commit b255b2c8).
For a Centos7 host, the include path
<signal.h>
<bits/sigcontext.h>
<asm/sigcontext.h>
<asm/elf.h>
<asm/auxvec.h>
implicitly pulls in the desired AT_* defines.
Not so for Debian Jessie.
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-Id: <20170711015524.22936-1-f4bug@amsat.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
migration/next for 20170718
# gpg: Signature made Tue 18 Jul 2017 16:39:33 BST
# gpg: using RSA key 0xF487EF185872D723
# gpg: Good signature from "Juan Quintela <quintela@redhat.com>"
# gpg: aka "Juan Quintela <quintela@trasno.org>"
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: 1899 FF8E DEBF 58CC EE03 4B82 F487 EF18 5872 D723
* remotes/juanquintela/tags/migration/20170718:
migration: check global caps for validity
migration: provide migrate_cap_add()
migration: provide migrate_caps_check()
migration: remove check against colo support
migration: check global params for validity
migration: provide migrate_params_apply()
migration: introduce migrate_params_check()
migration: export capabilities to props
migration: export parameters to props
qdev: provide DEFINE_PROP_INT64()
migration/rdma: Send error during cancelling
migration/rdma: Safely convert control types
migration/rdma: Allow cancelling while waiting for wrid
migration/rdma: fix qemu_rdma_block_for_wrid error paths
migration: Close file on failed migration load
migration/rdma: Fix race on source
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Block layer patches
# gpg: Signature made Tue 18 Jul 2017 14:29:59 BST
# gpg: using RSA key 0x7F09B272C88F2FD6
# gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>"
# Primary key fingerprint: DC3D EB15 9A9A F95D 3D74 56FE 7F09 B272 C88F 2FD6
* remotes/kevin/tags/for-upstream: (21 commits)
qemu-img: Check for backing image if specified during create
blockdev: move BDRV_O_NO_BACKING option forward
block/vvfat: Fix compiler warning with gcc 7
vvfat: initialize memory after allocating it
vvfat: correctly parse non-ASCII short and long file names
vvfat: add a constant for bootsector name
vvfat: add constants for special values of name[0]
qemu-iotests: Test unplug of -device without drive
qemu-iotests: Test 'info block'
scsi-disk: bdrv_attach_dev() for empty CD-ROM
ide: bdrv_attach_dev() for empty CD-ROM
block: List anonymous device BBs in query-block
block/qapi: Use blk_all_next() for query-block
block: Make blk_all_next() public
block/qapi: Add qdev device name to query-block
block: Make blk_get_attached_dev_id() public
block/vpc.c: Handle write failures in get_image_offset()
block/vmdk: Report failures in vmdk_read_cid()
block: remove timer canceling in throttle_config()
block: add clock_type field to ThrottleGroup
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This patch adds an hmac speed benchmark; it helps us
measure the performance by using "make check-speed" or
by running "./tests/benchmark-crypto-hmac" directly.
Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
This patch adds a hash speed benchmark; it helps us
measure the performance by using "make check-speed" or
by running "./tests/benchmark-crypto-hash" directly.
Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Now we have two qcrypto backends, the library backend and the afalg
backend, but which one is faster? This patch adds a cipher speed
benchmark; it helps us measure the performance by using
"make check-speed" or by running "./tests/benchmark-crypto-cipher" directly.
Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Adds afalg-backend hmac support: first introduces some private APIs,
and then integrates them into qcrypto_hmac_afalg_driver.
Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Adds afalg-backend hash support: first introduces some private APIs,
and then integrates them into qcrypto_hash_afalg_driver.
Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Adds afalg-backend cipher support: first introduces some private APIs,
and then integrates them into qcrypto_cipher_afalg_driver.
Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
The AF_ALG socket family is the userspace interface to the Linux
crypto API. This patch adds af_alg family support and some common
functions for the af_alg backend. It'll be used by the afalg-backend
crypto code later.
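For background, the kernel's userspace API looks roughly like this (a minimal
standalone sketch of AF_ALG usage, not the QEMU code itself):

    #include <sys/socket.h>
    #include <linux/if_alg.h>

    /* Sketch only: bind an AF_ALG "hash" socket and obtain an operation fd;
     * data is then write()n to opfd and the digest read() back. */
    struct sockaddr_alg sa = {
        .salg_family = AF_ALG,
        .salg_type   = "hash",
        .salg_name   = "sha256",
    };
    int tfm  = socket(AF_ALG, SOCK_SEQPACKET, 0);
    bind(tfm, (struct sockaddr *)&sa, sizeof(sa));
    int opfd = accept(tfm, NULL, 0);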
Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
Maintainer: modified to report an error if AF_ALG is requested
but cannot be supported
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
1) makes the public APIs in hmac-nettle/gcrypt/glib static,
and rename them with "nettle/gcrypt/glib" prefix.
2) introduces hmac framework, including QCryptoHmacDriver
and new public APIs.
Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
1) Fixes a handle-leak problem in qcrypto_hmac_new(), which didn't
free ctx->handle if gcry_mac_setkey() fails.
2) Extracts qcrypto_hmac_ctx_new() from qcrypto_hmac_new() for
gcrypt-backend impls.
Reviewed-by: Gonglei <arei.gonglei@huawei.com>
Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
1) makes the public APIs in hash-nettle/gcrypt/glib static,
and rename them with "nettle/gcrypt/glib" prefix.
2) introduces hash framework, including QCryptoHashDriver
and new public APIs.
Reviewed-by: Gonglei <arei.gonglei@huawei.com>
Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
1) makes the public APIs in cipher-nettle/gcrypt/builtin static,
and rename them with "nettle/gcrypt/builtin" prefix.
2) introduces cipher framework, including QCryptoCipherDriver
and new public APIs.
Reviewed-by: Gonglei <arei.gonglei@huawei.com>
Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Refactors the qcrypto_cipher_free(), splits it into two parts. One
is gcrypt/nettle__cipher_free_ctx() to free the special context.
This makes code more clear, what's more, it would be used by the
later patch.
Reviewed-by: Gonglei <arei.gonglei@huawei.com>
Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Merge I/O 2017/07/18 v1
# gpg: Signature made Tue 18 Jul 2017 11:31:53 BST
# gpg: using RSA key 0xBE86EBB415104FDF
# gpg: Good signature from "Daniel P. Berrange <dan@berrange.com>"
# gpg: aka "Daniel P. Berrange <berrange@redhat.com>"
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: DAF3 A6FD B26B 6291 2D0E 8E3F BE86 EBB4 1510 4FDF
* remotes/berrange/tags/pull-qio-2017-07-18-1:
io: simplify qio_channel_attach_aio_context
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
The flags are arranged such that we can manipulate them either
as a whole, or as individual bytes. The computation within
cpu_get_tb_cpu_state is now reduced to a single load and mask.
Tested-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
This value is constant for the cpu and does not need
to be stored within the TB.
Tested-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
This enforces proper alignment and makes the register update
more natural. Note that there is a more serious bug fix for
fmov {DX}Rn,@(R0,Rn) to use a store instead of a load.
Signed-off-by: Richard Henderson <rth@twiddle.net>
Message-Id: <20170718200255.31647-17-rth@twiddle.net>
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
We were treating FREG as an index and REG as a TCGv.
Making FREG return a TCGv is both less confusing and
a step toward cleaner banking of cpu_fregs.
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
Signed-off-by: Richard Henderson <rth@twiddle.net>
Message-Id: <20170718200255.31647-12-rth@twiddle.net>
[aurel32: fix whitespace issues]
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
If a signal is delivered during the execution of a delay slot,
or a gUSA region, clear those bits from the environment so that
the signal handler does not start in that same state.
Cleaning the bits on signal return is paranoid good sense.
Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
Signed-off-by: Richard Henderson <rth@twiddle.net>
Message-Id: <20170718200255.31647-10-rth@twiddle.net>
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
We translate gUSA regions atomically in a parallel context.
But in a serial context a gUSA region may be interrupted.
In that case, restart the region as the kernel would.
Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
Signed-off-by: Richard Henderson <rth@twiddle.net>
Message-Id: <20170718200255.31647-9-rth@twiddle.net>
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
For many of the sequences produced by gcc or glibc,
we can translate these as host atomic operations,
which saves the need to acquire the exclusive lock.
Signed-off-by: Richard Henderson <rth@twiddle.net>
Message-Id: <20170718200255.31647-8-rth@twiddle.net>
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
For uniprocessors, SH4 uses optimistic restartable atomic sequences.
Upon an interrupt, a real kernel would simply notice magic values in
the registers and reset the PC to the start of the sequence.
For QEMU, we cannot do this in quite the same way. Instead, we notice
the normal start of such a sequence (mov #-x,r15), and start a new TB
that can be executed under cpu_exec_step_atomic.
Reported-by: Bruno Haible <bruno@clisp.org>
LP: https://bugs.launchpad.net/bugs/1701971
Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
Signed-off-by: Richard Henderson <rth@twiddle.net>
Message-Id: <20170718200255.31647-7-rth@twiddle.net>
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
Since the T bit of the SR register is mapped using a TCG global,
it's better to return the value through TCG than to write it directly.
This allows the helpers to be declared with the TCG_CALL_NO_WG flag.
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-Id: <20170702202814.27793-5-aurelien@aurel32.net>
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
The floating-point status/control register contains cause and flag
bits. The cause bits are set to 0 before executing the instruction,
while the flag bits hold the status of the exception generated after
the field was last cleared.
Message-Id: <20170702202814.27793-4-aurelien@aurel32.net>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
The SH4 manual is not fully clear about this, but real hardware does
not check the PR bit (which selects between single and double
precision) for the fabs instruction. This is probably what is meant by
"Same operation is performed regardless of precision."
Remove the check, and at the same time use a TCG instruction instead of
a helper to clear one bit.
LP: https://bugs.launchpad.net/qemu/+bug/1701821
Reported-by: Bruno Haible <bruno@clisp.org>
Message-Id: <20170702202814.27793-2-aurelien@aurel32.net>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
If we have a system with xenforeignmemory_map2() implemented,
we don't need to save/restore the physmap on suspend/restore
anymore. In case we resume a VM without a physmap, try to
recreate the physmap during the memory region restore phase and
remap map cache entries accordingly. The old code is left
for compatibility reasons.
Signed-off-by: Igor Druzhinin <igor.druzhinin@citrix.com>
Reviewed-by: Paul Durrant <paul.durrant@citrix.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
Signed-off-by: Stefano Stabellini <sstabellini@kernel.org>
This new call updates a requested map cache entry
according to the changes in the physmap. The call searches
for the entry, unmaps it, and maps it again at the same place using
a new guest address. If the mapping is a dummy, this call will
make it real.
This function makes use of a new xenforeignmemory_map2() call
with an extended interface that was recently introduced in
libxenforeignmemory [1].
[1] https://www.mail-archive.com/xen-devel@lists.xen.org/msg113007.html
Signed-off-by: Igor Druzhinin <igor.druzhinin@citrix.com>
Reviewed-by: Paul Durrant <paul.durrant@citrix.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
Signed-off-by: Stefano Stabellini <sstabellini@kernel.org>
Dummies are simple anonymous mappings that are placed instead
of regular foreign mappings in certain situations when we need
to postpone the actual mapping but still have to give a
memory region to QEMU to play with.
This is planned to be used for restore on Xen.
Signed-off-by: Igor Druzhinin <igor.druzhinin@citrix.com>
Reviewed-by: Paul Durrant <paul.durrant@citrix.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
Commit 090fa1c8 "add support for unplugging NVMe disks..." extended the
existing disk unplug flag to cover NVMe disks as well as IDE and SCSI.
The recent thread on the xen-devel mailing list [1] has highlighted that
this is not desirable behaviour: PV frontends should be able to distinguish
NVMe disks from other types of disk and should have separate control over
whether they are unplugged.
This patch defines a new bit in the unplug mask for this purpose (see Xen
commit [2]) and also tidies up the definitions of, and improves the
comments regarding, the previously existing bits in the protocol.
[1] https://lists.xen.org/archives/html/xen-devel/2017-03/msg02924.html
[2] http://xenbits.xen.org/gitweb/?p=xen.git;a=commit;h=1096aa02
Signed-off-by: Paul Durrant <paul.durrant@citrix.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
Signed-off-by: Stefano Stabellini <sstabellini@kernel.org>
Check the return status of the xen_host_pci_get_* functions we call in
xen_pt_msix_init(), and fail device init if the reads failed rather than
ploughing ahead. (Spotted by Coverity: CID 777338.)
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
Signed-off-by: Stefano Stabellini <sstabellini@kernel.org>
In an igd passthrough environment, the guest could only access the
opregion at first boot. Once that guest shut down, later guests couldn't
access the opregion anymore.
This is because qemu wrote the emulated guest opregion base address to the
host register. Later guests then read a wrong host opregion base address
and couldn't access it anymore.
This patch sets emu_mask for the igd_opregion register, so the guest won't
write the guest opregion base address to the host.
Signed-off-by: Xiong Zhang <xiong.y.zhang@intel.com>
Acked-by: Anthony PERARD <anthony.perard@citrix.com>
Signed-off-by: Stefano Stabellini <sstabellini@kernel.org>
s390: add z14 cpu model
- add a CPU model for the IBM z14 which was announced on July 17th 2017
- update linux headers to 4.13-rc0 to get a fix for an ioctl definition
# gpg: Signature made Tue 18 Jul 2017 09:56:24 BST
# gpg: using RSA key 0x117BBC80B5A61C7C
# gpg: Good signature from "Christian Borntraeger (IBM) <borntraeger@de.ibm.com>"
# Primary key fingerprint: F922 9381 A334 08F9 DBAB FBCA 117B BC80 B5A6 1C7C
* remotes/borntraeger/tags/s390x-20170718:
s390x/cpumodel: z14 cpu models
linux header sync against v4.13-rc1
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
The migration tests used two VMs, each with -m 1024; this caused
problems when run in some small, pessimistic test VMs (NetBSD).
We can just be meaner with the amount of RAM in the test and use -m 384.
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Message-id: 20170714152820.24034-1-dgilbert@redhat.com
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: John Snow <jsnow@redhat.com>
Complete the split by renaming ahci_public.h --> ahci.h and
moving the current ahci.h to hw/ide/ahci_internal.h.
Adjust ahci_internal.h to now load ahci.h instead of ahci_public.h.
Finalize the split by switching external users to the new header.
Signed-off-by: John Snow <jsnow@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Tested-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-id: 20170623220926.11479-4-jsnow@redhat.com
Signed-off-by: John Snow <jsnow@redhat.com>
Begin separating the public/private interface by removing the minimum
set of information used by code outside of hw/ide/ and calling this
a new ahci_public.h file, which will be renamed to ahci.h in a future
patch.
Signed-off-by: John Snow <jsnow@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Tested-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-id: 20170623220926.11479-3-jsnow@redhat.com
Signed-off-by: John Snow <jsnow@redhat.com>
Abstract helper function to check migration capabilities (from the old
qmp_migrate_set_capabilities). Prepare to be used somewhere else.
There is a side effect to the change: when applying the capabilities, we
used to skip the invalid ones but still apply the valid ones (if
they were provided in the same QMP request). After this refactoring,
we'll ignore all the capabilities if we detect an invalid setup along the
way. However, I don't think it is a problem since general users should
not provide anything invalid after all.
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <1500349150-13240-9-git-send-email-peterx@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
Export migration parameters to qdev properties. Then we can use, for
example:
-global migration.x-cpu-throttle-initial=xxx
To specify migration parameters during init.
Prefix "x-" is appended for each parameter exported to show that this is
not a stable interface, and only for debugging/testing purpose.
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <1500349150-13240-3-git-send-email-peterx@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
When we issue a cancel and clean up the RDMA channel
send a CONTROL_ERROR to get the destination to quit.
The rdma_cleanup code waits for the event to come back
from the rdma_disconnect; but that won't happen until the
destination quits and there's currently nothing to force
it.
Note this makes the case of a cancel work while the destination
is alive, and it already works if the destination is
truly dead. Note it doesn't fix the case where the destination
is hung (we get stuck waiting for the rdma_disconnect event).
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Message-Id: <20170717110936.23314-7-dgilbert@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
control_desc[] is an array of strings that correspond to a
series of message types; they're used only for error messages, but if
the message type is seriously broken then we could go off the end of
the array.
Convert the array to a function control_desc() that bounds-checks the index.
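A sketch of the bounds-checked replacement (the table contents shown are
placeholders):

    /* Sketch only: never index past the end of the name table. */
    static const char *control_desc(unsigned int control)
    {
        static const char *strs[] = {
            "NONE", "ERROR", "READY", /* ... remaining message types ... */
        };

        if (control >= ARRAY_SIZE(strs)) {
            return "UNKNOWN CONTROL VALUE";
        }
        return strs[control];
    }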
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Message-Id: <20170717110936.23314-6-dgilbert@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
When waiting for a WRID, if the other side dies we end up waiting
forever with no way to cancel the migration.
Cure this by poll()ing the fd first with a timeout and checking
error flags and migration state.
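The idea is roughly the following (a simplified sketch; the fd name is an
assumption, and the real code rechecks the RDMA error state and migration
status in a loop):

    /* Sketch only: poll() the completion channel fd before blocking. */
    struct pollfd pfd = {
        .fd     = comp_channel_fd,
        .events = POLLIN,
    };
    int ret = poll(&pfd, 1, 100 /* ms */);
    if (ret == 0) {
        /* timeout: re-check error flags and migration state, then retry */
    } else if (ret < 0 || (pfd.revents & (POLLERR | POLLHUP | POLLNVAL))) {
        /* peer has gone away: fail the migration instead of hanging */
    }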
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Message-Id: <20170717110936.23314-5-dgilbert@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
Fix a race where the destination might try and send the source a
WRID_READY before the source has done a post-recv for it.
rdma_post_recv has to happen after the qp exists, and we're
OK since we've already called qemu_rdma_source_init that calls
qemu_alloc_qp.
This corresponds to:
https://bugzilla.redhat.com/show_bug.cgi?id=1285044
The race can be triggered by adding a few ms wait before this
post_recv_control (which was originally due to me turning on loads of
debug).
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Message-Id: <20170717110936.23314-2-dgilbert@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
x86 and machine queue, 2017-07-17
# gpg: Signature made Mon 17 Jul 2017 19:46:14 BST
# gpg: using RSA key 0x2807936F984DC5A6
# gpg: Good signature from "Eduardo Habkost <ehabkost@redhat.com>"
# Primary key fingerprint: 5A32 2FD5 ABC4 D3DB ACCF D1AA 2807 936F 984D C5A6
* remotes/ehabkost/tags/x86-and-machine-pull-request:
qmp: Include parent type on 'qom-list-types' output
qmp: Include 'abstract' field on 'qom-list-types' output
tests: Simplify abstract-interfaces check with a helper
i386: add Skylake-Server cpu model
i386: Update comment about XSAVES on Skylake-Client
i386: expose "TCGTCGTCGTCG" in the 0x40000000 CPUID leaf
fw_cfg: move QOM type defines and fw_cfg types into fw_cfg.h
fw_cfg: move qdev_init_nofail() from fw_cfg_init1() to callers
fw_cfg: switch fw_cfg_find() to locate the fw_cfg device by type rather than path
qom: Fix ambiguous path detection when ambiguous=NULL
Revert "machine: Convert abstract typename on compat_props to subclass names"
test-qdev-global-props: Test global property ordering
qdev: fix the order compat and global properties are applied
tests: Test case for object_resolve_path*()
device-crash-test: Fix regexp on whitelist
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Or, rather, force the open of a backing image if one was specified
for creation. Using an unsafe option (-u) similar to rebase's, allow
qemu-img to ignore the backing file validation if possible.
It may not always be possible, as in the existing case when a filesize
for the new image was not specified.
This is accomplished by shifting around the conditionals in
bdrv_img_create, such that a backing file is always opened unless we
provide BDRV_O_NO_BACKING. qemu-img is adjusted to pass this new flag
when -u is provided to create.
Sorry for the heinous looking diffstat, but it's mostly whitespace.
Inspired by: https://bugzilla.redhat.com/show_bug.cgi?id=1213786
Signed-off-by: John Snow <jsnow@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
For both external_snapshot_prepare and qmp_drive_mirror, we eventually
append the option BDRV_O_NO_BACKING. However, we generally do so after
we create the image.
To accommodate image creation wanting to verify that a backing file
exists or not, add this option prior to create to override checking
the existence of the backing file. This prevents QEMU from trying to
re-open a backing file that's already in use (thanks to qcow2 locking).
Signed-off-by: John Snow <jsnow@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
gcc 7 complains that the sprintf() might write a null byte beyond the
end of the tail buffer. That is wrong, but we can silence it by making
i unsigned (it can never be negative anyway, see the if condition right
before). For some reason, this allows gcc to suddenly accurately
calculate the range of i so we can give the tail[] array the exact size
it needs to have (which is 8 bytes) without gcc complaining.
In addition, let us convert the sprintf() to snprintf(), because that is
always nicer, and add an assertion about the range of the return value
afterwards so we can see that "8 - len" will never be negative and thus
"entry->name + MIN(j, 8 - len)" will never be out of bounds.
Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Write support works again when the image contains non-ASCII names. This is
either the case when the user created a non-ASCII filename, or when the
initial directory contained a non-ASCII filename (since 0c36111f57).
Signed-off-by: Hervé Poussineau <hpoussin@reactos.org>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
This caused an assertion failure until recently because the BlockBackend
would be detached on unplug, but was in fact never attached in the first
place. Add a regression test.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
This test makes sure that all block devices show up on 'info block',
with all of the expected information, in different configurations.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
If no drive=... option is passed (for an empty drive), we don't only
lack the BlockBackend normally created by parse_drive(), but we also
need to manually call blk_attach_dev().
This fixes at least a segfault when unplugging such devices, the bug
that they didn't show up in query-block, and probably some more
problems.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
If no drive=... option is passed (for an empty drive), we don't only
lack the BlockBackend normally created by parse_drive(), but we also
need to manually call blk_attach_dev().
IDE does not support hot unplug, but if it did, qdev would take care to
call the matching blk_detach_dev() on unplug.
This fixes at least the bug that such devices didn't show up in
query-block, and probably some more problems.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Instead of listing only monitor-owned BlockBackends in query-block, also
add those anonymous BlockBackends that are owned by a qdev device and as
such under the control of the user.
This allows using query-block to inspect BlockBackends for the modern
configuration syntax with -blockdev and -device.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
This patch replaces the blk_next() loop in query-block by a
blk_all_next() one so that we also get access to BlockBackends that
aren't owned by the monitor. For now, the next thing we do is check
whether each BB has a name, so there is no semantic difference.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
With -blockdev/-device, users can indirectly create anonymous
BlockBackends, while the state of such backends is still of interest. As
a preparation for making such BBs visible in query-block, make sure that
they can be identified even without a name by adding the ID/QOM path of
their qdev device to BlockInfo.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Coverity (CID 1355236) points out that get_image_offset() doesn't check that
it actually succeeded in writing the updated block bitmap to the file.
Check the error return from bdrv_pwrite_sync() and propagate an error
response back up to the function which calls get_image_offset() for
a write so that it can return the error to its caller.
get_sector_offset() is only used for reads, but we move it to the
same API for consistency.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
The function vmdk_read_cid() can fail if the read on the underlying
block device fails, or if there's a format error in the VMDK file.
However its API doesn't provide a mechanism to report these errors,
and in some cases we were returning a CID of 0 and in some cases a
CID of 0xffffffff, either of which might potentially be valid values.
Change the function to return 0 on success or a negative errno, and
return the CID via a uint32_t* argument. Update the callsites to
handle and propagate the error appropriately.
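The new shape of the function is roughly (a sketch; the body that reads and
parses the descriptor is omitted):

    /* Sketch only: return 0 or a negative errno, hand the CID back via
     * an out-parameter instead of overloading the return value. */
    static int vmdk_read_cid(BlockDriverState *bs, int parent, uint32_t *pcid)
    {
        uint32_t cid = 0;
        /* ... read the descriptor, sscanf() the CID into cid, and return a
         *     negative errno if the read or the parse fails ... */
        *pcid = cid;
        return 0;
    }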
This fixes in passing a Coverity-spotted issue (CID 1350038) where
we weren't checking the return value from sscanf().
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
throttle_config() cancels the timers of the calling BlockBackend. This
doesn't make sense because other BlockBackends in the group remain
untouched. There's no need to cancel the timers in the one specific
BlockBackend so let's not do that. Throttled requests will run as
scheduled and future requests will follow the new configuration. This
also allows a throttle group's configuration to be changed even when it
has no members.
Signed-off-by: Manos Pitsidianakis <el13635@mail.ntua.gr>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Clock type in throttling is currently inferred by the ThrottleTimer's
clock type even though it is a per-ThrottleGroup property; it doesn't
make sense to have different clock types in the same group. Moving this
to a field in ThrottleGroup can simplify some of the throttle functions.
Signed-off-by: Manos Pitsidianakis <el13635@mail.ntua.gr>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
I can't see how overlay_bs could become NULL with the current code, but
other code in this function already checks it and we can make Coverity
happy with this check, so let's add it.
Cc: qemu-stable@nongnu.org
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
qemu-ga patch queue
* new command: qemu-get-osinfo
* build fix for OpenBSD
* better error-reporting for failure on keyfile dump
* remove redundant initialization of qa_state global
* include libpcre in w32 package
* w32 localization fixes for service installation/registration
v2:
* fix build issue with older GCCs introduced with guest_get_osinfo
* relocated some declarations in guest_get_osinfo
# gpg: Signature made Tue 18 Jul 2017 11:52:45 BST
# gpg: using RSA key 0x3353C9CEF108B584
# gpg: Good signature from "Michael Roth <flukshun@gmail.com>"
# gpg: aka "Michael Roth <mdroth@utexas.edu>"
# gpg: aka "Michael Roth <mdroth@linux.vnet.ibm.com>"
# Primary key fingerprint: CEAC C9E1 5534 EBAB B82D 3FA0 3353 C9CE F108 B584
* remotes/mdroth/tags/qga-pull-2017-07-17-v2-tag:
test-qga: add test for guest-get-osinfo
test-qga: pass environemnt to qemu-ga
qemu-ga: add guest-get-osinfo command
qga: report error on keyfile dump error
qga-win32: remove a redundancy code
qemu-ga: check if utmpx.h is available on the system
qemu-ga: add missing libpcre to MSI build
qga-win: fix installation on localized windows
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Add test for guest-get-osinfo command.
Qemu-ga was modified to accept QGA_OS_RELEASE environment variable. If
the variable is defined, it is interpreted as the path to the os-release
file, which is parsed instead of the default paths.
Signed-off-by: Tomáš Golembiovský <tgolembi@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
* move declarations to beginning of functions
Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
Add a new 'guest-get-osinfo' command for reporting basic information of
the guest operating system. This includes machine architecture,
version and release of the kernel and several fields from os-release
file if it is present (as defined in [1]).
[1] https://www.freedesktop.org/software/systemd/man/os-release.html
Signed-off-by: Vinzenz Feenstra <vfeenstr@redhat.com>
Signed-off-by: Tomáš Golembiovský <tgolembi@redhat.com>
* moved declarations to beginning of functions
* dropped unnecessary initialization of struct utsname
Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
Queued target/mips patches
# gpg: Signature made Mon 17 Jul 2017 15:50:27 BST
# gpg: using RSA key 0xBA9C78061DDD8C9B
# gpg: Good signature from "Aurelien Jarno <aurelien@aurel32.net>"
# gpg: aka "Aurelien Jarno <aurelien@jarno.fr>"
# gpg: aka "Aurelien Jarno <aurel32@debian.org>"
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: 7746 2642 A9EF 94FD 0F77 196D BA9C 7806 1DDD 8C9B
* remotes/aurel/tags/pull-target-mips-20170717:
target/mips: optimize WSBH, DSBH and DSHD
mips: set CP0 Debug DExcCode for SDBBP instruction
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
The previous commit changed the MIPS image from little to big endian.
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
By itself this doesn't add much to our coverage. However later patches
will extend this image to include more bleeding edge libraries which
are not yet widely available in distros.
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
[AJB: extend commit msg]
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
We'll also want to support some older Debian combinations for
architectures that didn't make the Debian 9 cut.
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
[AJB: extend commit msg]
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
When a test fails/hangs you don't want the hassle of getting the debug
tools installed. Let's install them on our image by default so we can
debug when we need to.
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Although the upstream Travis images don't need this library, our
"travis-lite" scripts are written in Python. This allows us to do:
make docker-travis@travis J=10
and approximate a travis run on their default image.
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
target-arm queue:
* new model of the ARM MPS2/MPS2+ FPGA based development board
* clean up DISAS_* exit conditions and fix various regressions
since commits e75449a3468a6b28c7b5 (in particular including
ones which broke OP-TEE guests)
* make Cortex-M3 and M4 correctly default to 8 PMSA regions
# gpg: Signature made Mon 17 Jul 2017 13:43:45 BST
# gpg: using RSA key 0x3C2525ED14360CDE
# gpg: Good signature from "Peter Maydell <peter.maydell@linaro.org>"
# gpg: aka "Peter Maydell <pmaydell@gmail.com>"
# gpg: aka "Peter Maydell <pmaydell@chiark.greenend.org.uk>"
# Primary key fingerprint: E1A5 C593 CD41 9DE2 8E83 15CF 3C25 25ED 1436 0CDE
* remotes/pmaydell/tags/pull-target-arm-20170717:
MAINTAINERS: Add entries for MPS2 board
hw/arm/mps2: Add ethernet
hw/arm/mps2: Add SCC
hw/misc/mps2_scc: Implement MPS2 Serial Communication Controller
hw/arm/mps2: Add timers
hw/char/cmsdk-apb-timer: Implement CMSDK APB timer device
hw/arm/mps2: Add UARTs
hw/char/cmsdk-apb-uart.c: Implement CMSDK APB UART
hw/arm/mps2: Implement skeleton mps2-an385 and mps2-an511 board models
target/arm: use DISAS_EXIT for eret handling
target/arm: use gen_goto_tb for ISB handling
target/arm/translate: ensure gen_goto_tb sets exit flags
target/arm/translate.h: expand comment on DISAS_EXIT
target/arm/translate: make DISAS_UPDATE match declared semantics
include/exec/exec-all: document common exit conditions
target/arm: Make Cortex-M3 and M4 default to 8 PMSA regions
qdev: support properties which don't set a default value
qdev-properties.h: Explicitly set the default value for arraylen properties
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Because global environment variables can be overridden when .travis.yml
is processed by the docker-travis target, the effect of this patch is
that docker-travis now obeys the "J=n" option.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
This is useful so that we can do builds at higher than -j3 when running
travis.py locally.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
# gpg: Signature made Mon 17 Jul 2017 13:17:17 BST
# gpg: using RSA key 0xEF04965B398D6211
# gpg: Good signature from "Jason Wang (Jason Wang on RedHat) <jasowang@redhat.com>"
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: 215D 46F4 8246 689E C77F 3562 EF04 965B 398D 6211
* remotes/jasowang/tags/net-pull-request:
virtio-net: fix offload ctrl endian
virtion-net: Prefer is_power_of_2()
docs/colo-proxy.txt: Update colo-proxy usage of net driver with vnet_header
net/filter-rewriter.c: Make filter-rewriter support vnet_hdr_len
net/colo-compare.c: Add vnet packet's tcp/udp/icmp compare
net/colo.c: Add vnet packet parse feature in colo-proxy
net/colo-compare.c: Make colo-compare support vnet_hdr_len
net/colo-compare.c: Introduce parameter for compare_chr_send()
net/colo.c: Make vnet_hdr_len as packet property
net/filter-mirror.c: Add new option to enable vnet support for filter-redirector
net/filter-mirror.c: Make filter mirror support vnet support.
net/filter-mirror.c: Introduce parameter for filter_send()
net/net.c: Add vnet_hdr support in SocketReadState
net: Add vnet_hdr_len arguments in NetClientState
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This patch documents (including their QMP invocations) all the four
major kinds of live block operations:
- `block-stream`
- `block-commit`
- `drive-mirror` (& `blockdev-mirror`)
- `drive-backup` (& `blockdev-backup`)
Things considered while writing this document:
- Use reStructuredText as markup language (with the goal of generating
the HTML output using the Sphinx Documentation Generator). It is
gentler on the eye, and can be trivially converted to different
formats. (Another reason: upstream QEMU is considering switching to
Sphinx, which uses reStructuredText as its markup language.)
- Raw QMP JSON output vs. 'qmp-shell'. I debated with myself whether
to only show raw QMP JSON output (as that is the canonical
representation), or use 'qmp-shell', which takes key-value pairs. I
settled on the approach of: for the first occurrence of a command,
use raw JSON; for subsequent occurrences, use 'qmp-shell', with an
occasional exception.
- Usage of `-blockdev` command-line.
- Usage of 'node-name' vs. file path to refer to disks. While we have
`blockdev-{mirror, backup}` as 'node-name'-alternatives for
`drive-{mirror, backup}`, the `block-commit` command still operates
on file names for parameters 'base' and 'top'. So I added a caveat
at the beginning to that effect.
Refer this related thread that I started (where I learnt
`block-stream` was recently reworked to accept 'node-name' for 'top'
and 'base' parameters):
https://lists.nongnu.org/archive/html/qemu-devel/2017-05/msg06466.html
"[RFC] Making 'block-stream', and 'block-commit' accept node-name"
All commands shown in this document were tested while documenting.
Thanks: Eric Blake for the section: "A note on points-in-time vs file
names". This useful bit was originally articulated by Eric in his
KVMForum 2015 presentation, so I included that specific bit in this
document.
Signed-off-by: Kashyap Chamarthy <kchamart@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20170717105205.32639-3-kchamart@redhat.com
Signed-off-by: Jeff Cody <jcody@redhat.com>
The first line of run_agent() already sets ga_state = s, so there is
no need to set ga_state = s again later.
Signed-off-by: Peng Hao <peng.hao2@zte.com.cn>
Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
Commit 161a56a906 added the command guest-get-users, which requires
utmpx.h (defined by POSIX) to work. It is however not always available
(e.g. on OpenBSD); therefore a check for its existence is necessary.
Signed-off-by: Tomáš Golembiovský <tgolembi@redhat.com>
Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
A typo in commit 23e099c set the size of buf[] used in response
to NBD_OPT_EXPORT_NAME according to the length needed for old-style
negotiation (4 bytes of flag information) instead of the intended
2 bytes used in new style. If the client doesn't enable
NBD_FLAG_C_NO_ZEROES, then the server sends two bytes too many,
and is then out of sync in response to the client's next command
(the bug is masked when modern qemu is the client, since we enable
the no zeroes flag).
While touching this code, add some more defines to nbd_internal.h
rather than having quite so many magic numbers in the .c; also,
use "" initialization rather than memset(), and tweak the oldstyle
negotiation to better match the spec description of the layout
(since the spec is big-endian, skipping two bytes as 0 followed by
writing a 2-byte flag is the same as writing a zero-extended 4-byte
flag), to make it a bit easier to follow compared to the spec.
[checkpatch.pl has some false positives in the comments]
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20170717192635.17880-3-eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
The rotation is to the left, but extract shifts to the right.
The computation of the extract parameters needs adjusting.
For the entry condition, simplify
64 - rot + len <= 64
-rot + len <= 0
len <= rot
Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
Reported-by: David Hildenbrand <david@redhat.com>
Suggested-by: Aurelien Jarno <aurelien@aurel32.net>
Signed-off-by: Richard Henderson <rth@twiddle.net>
STFL bits 4 and 5 are just indications to the guest of which TLB entries an
IDTE call will clear. These are performance indicators for the guest.
STFL bit 4:
INVALIDATE DAT TABLE ENTRY (IDTE) performs
the invalidation-and-clearing operation by
selectively clearing TLB segment-table entries
when a segment-table entry or entries are
invalidated. IDTE also performs the clearing-by-
ASCE operation. Unless bit 4 is one, IDTE simply
purges all TLBs. Bit 3 is one if bit 4 is one.
We can simply set STFL bit 4 ("idtes") and still purge the complete TLB.
Purging more than advertised is never bad. E.g. Linux doesn't even care
about this bit. We can optimize this later.
This is helpful, as the z9 base model contains this facility.
STFL bit 5 (clearing TLB region-table-entries) was never implemented on
real HW, therefore we can simply ignore it for now.
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20170627161032.5014-1-david@redhat.com>
Signed-off-by: Richard Henderson <rth@twiddle.net>
Drop TRT from the set of insns handled internally by EXECUTE.
It's more important to adjust the existing helper to handle
both TRT and TRTR.
Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
Signed-off-by: Richard Henderson <rth@twiddle.net>
Since we require all registers saved on input, read R0 from ENV instead
of passing it manually. Recognize the specification exception when R0
contains incorrect data. Keep high bits of result registers unmodified
when in 31 or 24-bit mode.
Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
Signed-off-by: Richard Henderson <rth@twiddle.net>
Commit 8ecaeae8 changed the way the client requests an NBD export,
and in the process also changed the resulting error message when
the export is not present, breaking a couple of iotests. The error
message is now directly given by the server (a failed NBD_OPT_GO)
instead of implied by the client (after exhausting NBD_OPT_LIST),
but looking at the testsuite changes, it proves worthwhile to
reword the error message to be slightly less verbose (as this is
one particular error message likely to be hit by a user).
Note that the error message is now sensitive to which binary is
running the server as well as the client (since the expected
output is replaying a message received from the server - for that
matter, it depends on a server new enough to understand NBD_OPT_GO);
in general iotests are run on client and server from the same source
code base so the default setup will pass; but if it proves
problematic for people overriding QEMU_PROG, QEMU_IMG_PROG,
QEMU_IO_PROG, and QEMU_NBD_PROG to point across multiple builds for
cross-version integration testing, we may have to later tweak or
sanitize the output somehow.
Reported-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20170717142310.17048-1-eblake@redhat.com>
Tested-by: John Snow <jsnow@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Include name of parent type of each type on 'qom-list-types' output.
Without this, there's no way to figure out the parents of a given type
without making additional 'qom-list-types' queries.
In addition to the test case for the new feature, update the
abstract-interface test case to use the new field and avoid the
"qom-list-types implements=object" trick.
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Message-Id: <20170707122215.8819-4-ehabkost@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
A client may be interested in getting the list of both abstract and
non-abstract types. Instead of requiring them to make multiple queries
with different filter arguments, just return an 'abstract' field in
'qom-list-types'.
In addition to the new test code for validating this field, update the
abstract-interfaces test case to query for all 'interface' subtypes
(including abstract ones), and to look at the 'abstract' field directly.
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Message-Id: <20170707122215.8819-3-ehabkost@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Introduce the Skylake-Server CPU model, which inherits the features from
Skylake-Client and supports some additional features: AVX512,
CLWB and PDPE1GB.
Signed-off-by: Boqun Feng (Intel) <boqun.feng@gmail.com>
Message-Id: <20170621052935.20715-1-boqun.feng@gmail.com>
[ehabkost: copied comment about XSAVES from Skylake-Client]
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Currently when running KVM, we expose "KVMKVMKVM\0\0\0" in
the 0x40000000 CPUID leaf. Other hypervisors (VMWare,
HyperV, Xen, BHyve) all do the same thing, which leaves
TCG as the odd one out.
The CPUID signature is used by software to detect which
virtual environment it is running in and (potentially)
change behaviour in certain ways. For example, systemd
supports a ConditionVirtualization= setting in unit files.
The virt-what command can also report the virt type it is
running on.
Currently both these apps have to resort to custom hacks
like looking for an 'fw-cfg' entry under /proc/device-tree
to identify TCG.
This change thus proposes a signature "TCGTCGTCGTCG" to be
reported when running under TCG.
To hide this, the -cpu option tcg-cpuid=off can be used.
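For illustration only (not part of this change), a guest-side sketch of
reading that signature leaf; it assumes GCC's <cpuid.h> and skips the
usual check of the hypervisor bit in CPUID leaf 1:

    #include <stdio.h>
    #include <string.h>
    #include <cpuid.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;
        char sig[13] = "";

        /* 0x40000000 is the conventional hypervisor vendor-id leaf */
        __cpuid(0x40000000, eax, ebx, ecx, edx);
        memcpy(sig + 0, &ebx, 4);
        memcpy(sig + 4, &ecx, 4);
        memcpy(sig + 8, &edx, 4);
        /* e.g. "KVMKVMKVM" under KVM, "TCGTCGTCGTCG" with this change */
        printf("hypervisor signature: %s\n", sig);
        return 0;
    }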
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-Id: <20170509132736.10071-3-berrange@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
When looking to instantiate a TYPE_FW_CFG_MEM or TYPE_FW_CFG_IO device to be
able to wire it up differently, it is much more convenient for the caller to
instantiate the device and have the fw_cfg default files already preloaded
during realize.
Move fw_cfg_init1() to the end of both the fw_cfg_mem_realize() and
fw_cfg_io_realize() functions so it no longer needs to be called manually
when instantiating the device, and also rename it to fw_cfg_common_realize()
which better describes its new purpose.
Since it is now the responsibility of the machine to wire up the fw_cfg device,
it is necessary to introduce an object_property_add_child() call into
fw_cfg_init_io() and fw_cfg_init_mem() to link the fw_cfg device to the root
machine object as before.
Finally with the previous change to fw_cfg_find() we can now remove the
assert() preventing multiple fw_cfg devices from being instantiated and
replace it with a simple call to fw_cfg_find() at realize time instead. This
allows us to remove FW_CFG_NAME and FW_CFG_PATH since they are no longer required.
Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Message-Id: <1500025208-14827-3-git-send-email-mark.cave-ayland@ilande.co.uk>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
object_resolve_path*() ambiguous path detection breaks when
ambiguous==NULL and the object tree has 3 objects of the same type where
only 2 of them are under the same parent, e.g.:
/container/obj1 (TYPE_FOO)
/container/obj2 (TYPE_FOO)
/obj2 (TYPE_FOO)
With the above tree, object_resolve_path_type("", TYPE_FOO, NULL) will
incorrectly return /obj2, because the search inside "/container" will
return NULL, and the match at "/obj2" won't be detected as ambiguous.
Fix that by always calling object_resolve_partial_path() with a non-NULL
ambiguous parameter.
Test case included.
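For reference, a minimal sketch of the call pattern affected (TYPE_FOO is
the placeholder type from the example above):

    bool ambiguous = false;
    Object *obj = object_resolve_path_type("", TYPE_FOO, &ambiguous);

    if (ambiguous) {
        /* more than one TYPE_FOO exists in the composition tree */
    } else if (obj) {
        /* exactly one match */
    }

With the fix, a non-NULL flag is always threaded through the recursive
search, so the lookup above no longer wrongly returns /obj2 when the
caller passes ambiguous == NULL.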
Reported-by: Igor Mammedov <imammedo@redhat.com>
Cc: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Message-Id: <20170707213052.13087-3-ehabkost@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
The current code recursively applies global properties from child up to
parent types. This can cause properties passed with the -global option to
be silently overridden by internal compat properties.
This is exactly what happened with virtio-*-pci drivers since commit:
"9a4c0e220d8a hw/virtio-pci: fix virtio behaviour"
Passing -global virtio-blk-pci.disable-modern=off had no effect on 2.6
machine types because the internal virtio-pci.disable-modern=on compat
property always prevailed.
A workaround for this was included with commit 0bcba41f ("machine:
Convert abstract typename on compat_props to subclass names").
This patch fixes the issue properly by reversing the logic: we now go
through the global property list and, for each property, we check if it
is applicable to the device.
This results in compat properties being applied first, in the order they
appear in the HW_COMPAT_* macros, followed by global properties, in the
order they appear on the command line.
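A rough sketch of the reversed logic, for illustration only (the list
accessor is hypothetical; the real helper names differ):

    static void apply_globals_to_device(DeviceState *dev)
    {
        GList *l;

        /* single ordered list: HW_COMPAT_* entries were registered first,
         * command-line -global entries after them */
        for (l = hypothetical_global_prop_list(); l; l = l->next) {
            GlobalProperty *prop = l->data;

            if (!object_dynamic_cast(OBJECT(dev), prop->driver)) {
                continue;   /* not applicable to this device's type */
            }
            /* ... set prop->property to prop->value on dev ... */
        }
    }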
Signed-off-by: Greg Kurz <groug@kaod.org>
Message-Id: <148103887228.22326.478406873609299999.stgit@bahia.lab.toulouse-stg.fr.ibm.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Message-Id: <20170711004303.3902-2-ehabkost@redhat.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
At the moment VFIO PCI device initialization works as follows:
vfio_realize
vfio_get_group
vfio_connect_container
register memory listeners (1)
update QEMU groups lists
vfio_kvm_device_add_group
Then (example for pseries) the machine reset hook triggers region_add()
for all regions where listeners from (1) are listening:
ppc_spapr_reset
spapr_phb_reset
spapr_tce_table_enable
memory_region_add_subregion
vfio_listener_region_add
vfio_spapr_create_window
This scheme works fine until we need to handle VFIO PCI device hotplug
and want to enable PPC64/sPAPR in-kernel TCE acceleration,
i.e. after PCI hotplug we need a place to call
ioctl(vfio_kvm_device_fd, KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE).
Since the ioctl needs a LIOBN fd (from sPAPRTCETable) and an IOMMU group fd
(from VFIOGroup), vfio_listener_region_add() seems to be the only place
for this ioctl().
However this only works during boot time because the machine reset
happens strictly after all devices are finalized. When hotplug happens,
vfio_listener_region_add() is called when a memory listener is registered
but when this happens:
1. new group is not added to the container->group_list yet;
2. VFIO KVM device is unaware of the new IOMMU group.
This moves bits around to have all necessary VFIO infrastructure
in place for both initial startup and hotplug cases.
[aw: ie, register vfio groups with kvm prior to memory listener
registration such that kvm-vfio pseudo device ioctls are available
during the region_add callback]
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
# gpg: Signature made Mon 17 Jul 2017 13:11:17 BST
# gpg: using RSA key 0x9CA4ABB381AB73C8
# gpg: Good signature from "Stefan Hajnoczi <stefanha@redhat.com>"
# gpg: aka "Stefan Hajnoczi <stefanha@gmail.com>"
# Primary key fingerprint: 8695 A8BF D3F9 7CDA AC35 775A 9CA4 ABB3 81AB 73C8
* remotes/stefanha/tags/tracing-pull-request:
trace: update old trace events in docs
trace: [trivial] Statically enable all guest events
trace: [tcg, trivial] Re-align generated code
trace: [tcg] Do not generate TCG code to trace dynamically-disabled events
exec: [tcg] Use different TBs according to the vCPU's dynamic tracing state
trace: [tcg] Delay changes to dynamic state when translating
trace: Allocate cpu->trace_dstate in place
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Use the same mask to avoid having to load two different constants.
Suggested-by: Richard Henderson <rth@twiddle.net>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
This patch fixes setting the DExcCode field of the CP0 Debug register
when the SDBBP instruction is executed. According to the EJTAG specification,
this field must be set to the value 9 (Bp).
Signed-off-by: Pavel Dovgalyuk <pavel.dovgaluk@ispras.ru>
Message-id: 20170502120350.3368.92338.stgit@PASHA-ISP
Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
Add entries to the MAINTAINERS file for the new MPS2
board and devices.
Since the CMSDK devices are not specific to the MPS2 board,
extend the existing 'PrimeCell' section to cover CMSDK
devices as well; in both cases these are devices implemented
by ARM and provided as RTL that may be used in multiple
SoCs and boards.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Message-id: 1500029487-14822-10-git-send-email-peter.maydell@linaro.org
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Implement a model of the Serial Communication Controller (SCC) found
in MPS2 FPGA images.
The primary purpose of this device is to communicate with the
Motherboard Configuration Controller (MCC) which is located on
the MPS board itself, outside the FPGA image. This is used
for programming the MPS clock generators. The SCC also has
some basic ID registers and an output for the board LEDs.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-id: 1500029487-14822-7-git-send-email-peter.maydell@linaro.org
Model the ARM MPS2/MPS2+ FPGA based development board.
The MPS2 and MPS2+ dev boards are FPGA based (the 2+ has a bigger
FPGA but is otherwise the same as the 2). Since the CPU itself
and most of the devices are in the FPGA, the details of the board
as seen by the guest depend significantly on the FPGA image.
We model the following FPGA images:
"mps2_an385" -- Cortex-M3 as documented in ARM Application Note AN385
"mps2_an511" -- Cortex-M3 'DesignStart' as documented in AN511
They are fairly similar but differ in the details for some
peripherals.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Message-id: 1500029487-14822-2-git-send-email-peter.maydell@linaro.org
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Previously DISAS_JUMP did ensure this but with the optimisation of
8a6b28c7 (optimize indirect branches) we might not leave the loop.
This means if any pending interrupts are cleared by changing IRQ flags
we might never get around to servicing them. You usually notice this
by seeing the lookup_tb_ptr() helper gainfully chaining TBs together
while cpu->interrupt_request remains high and the exit_request has not
been set.
This breaks amongst other things the OPTEE test suite which executes
an eret from the secure world after a non-secure world IRQ has gone
pending which then never gets serviced.
Instead of using the previously implied semantics of DISAS_JUMP we use
DISAS_EXIT which will always exit the run-loop.
CC: Etienne Carriere <etienne.carriere@linaro.org>
CC: Joakim Bech <joakim.bech@linaro.org>
CC: Jaroslaw Pelczar <j.pelczar@samsung.com>
CC: Peter Maydell <peter.maydell@linaro.org>
CC: Emilio G. Cota <cota@braap.org>
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-id: 20170713141928.25419-7-alex.bennee@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
While an ISB will ensure any raised IRQs happen on the next
instruction it doesn't cause any to get raised by itself. We can
therefore use a simple tb exit for ISB instructions and rely on the
exit_request check at the top of each TB to deal with exiting if
needed.
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-id: 20170713141928.25419-6-alex.bennee@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
DISAS_UPDATE should be used when the wider CPU state other than just
the PC has been updated and we should therefore exit the TCG runtime
and return to the main execution loop rather than assuming DISAS_JUMP
would do that.
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-id: 20170713141928.25419-3-alex.bennee@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
The Cortex-M3 and M4 CPUs always have 8 PMSA MPU regions (this isn't
a configurable option for the hardware). Make the default value of
the pmsav7-dregion property be set per-cpu, so we don't need to have
every user of these CPUs set it manually. (The existing default of
16 is correct for the other PMSAv7 core, the Cortex-R5.)
This fixes a bug where we were creating the M3 and M4 with
too many regions; most guest software would not notice or
care, though, since it would just not use the registers
associated with the unexpected extra regions.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-id: 1499788408-10096-4-git-send-email-peter.maydell@linaro.org
In some situations it's useful to have a qdev property which doesn't
automatically set its default value when qdev_property_add_static is
called (for instance when the default value is not constant).
Support this by adding a flag to the Property struct indicating
whether to set the default value. This replaces the existing test
for whether the PropertyInfo set_default_value function pointer is
NULL, and we set the .set_default field to true for all those cases
of struct Property which use a PropertyInfo with a non-NULL
set_default_value, so behaviour remains the same as before.
This gives us the semantics of:
* if .set_default is true, then .info->set_default_value must
be not NULL, and .defval is used as the default value of
the property
* otherwise, the property system does not set any default, and
the field will retain whatever initial value it was given by
the device's .instance_init method
We define two new macros DEFINE_PROP_SIGNED_NODEFAULT and
DEFINE_PROP_UNSIGNED_NODEFAULT, to cover the most plausible use cases
of wanting to set an integer property with no default value.
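A minimal usage sketch based on the pmsav7-dregion case elsewhere in this
series (the macro's argument order of name, state, field, PropertyInfo,
type is an assumption):

    static Property arm_cpu_properties[] = {
        /* no default here; instance_init picks the per-CPU value */
        DEFINE_PROP_UNSIGNED_NODEFAULT("pmsav7-dregion", ARMCPU, pmsav7_dregion,
                                       qdev_prop_uint32, uint32_t),
        DEFINE_PROP_END_OF_LIST(),
    };

    static void cortex_m3_initfn(Object *obj)
    {
        ARMCPU *cpu = ARM_CPU(obj);
        /* ... */
        cpu->pmsav7_dregion = 8;   /* M3/M4 always have 8 MPU regions */
    }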
Suggested-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Message-id: 1499788408-10096-3-git-send-email-peter.maydell@linaro.org
In DEFINE_PROP_ARRAY, because we use a PropertyInfo (qdev_prop_arraylen)
which has a .set_default_value member we will set the field to a default
value. That default value will be zero, by the C rule that struct
initialization sets unmentioned members to zero if at least one member
is initialized. However it's clearer to state it explicitly.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-id: 1499788408-10096-2-git-send-email-peter.maydell@linaro.org
We have a function that checks whether a given number is a power of two.
We should prefer it instead of open-coding the check.
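A minimal before/after sketch of the conversion (the surrounding function
and error value are illustrative):

    #include "qemu/osdep.h"
    #include "qemu/host-utils.h"

    static int check_size(uint64_t size)
    {
        /* before: open-coded, e.g.  if (size & (size - 1)) { ... } */
        if (!is_power_of_2(size)) {
            return -EINVAL;
        }
        return 0;
    }

Note that is_power_of_2() also rejects zero, which the naive open-coded
test does not.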
Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
We add the vnet_hdr_support option for filter-rewriter; it is disabled by default.
If you use virtio-net-pci or another driver that needs vnet_hdr, please enable it.
For example:
-object filter-rewriter,id=rew0,netdev=hn0,queue=all,vnet_hdr_support
We get the vnet_hdr_len from the NetClientState, which lets us
parse the net packet correctly.
Signed-off-by: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
We add the vnet_hdr_support option for colo-compare; it is disabled by default.
If you use virtio-net-pci or another driver that needs vnet_hdr, please enable it.
For example:
-object colo-compare,id=comp0,primary_in=compare0-0,secondary_in=compare1,outdev=compare_out0,vnet_hdr_support
COLO-compare can get the vnet header length from the filter.
Add vnet_hdr_len to struct packet and output packets with
the vnet_hdr_len.
Signed-off-by: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
This patch changes the compare_chr_send() parameter from CharBackend to CompareState,
so we can get more information such as vnet_hdr (we use it to support packets with a vnet header).
Signed-off-by: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
We add the vnet_hdr_support option for filter-redirector; it is disabled by default.
If you use the virtio-net-pci net driver or another driver that needs vnet_hdr, please enable it.
Because colo-compare and other modules need the vnet_hdr_len to parse
packets, this new option sends the length to them.
For example:
-object filter-redirector,id=r0,netdev=hn0,queue=tx,outdev=red0,vnet_hdr_support
Signed-off-by: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
We add the vnet_hdr_support option for filter-mirror; it is disabled by default.
If you use virtio-net-pci or another driver that needs vnet_hdr, please enable it.
For example:
-object filter-mirror,id=m0,netdev=hn0,queue=tx,outdev=mirror0,vnet_hdr_support
If the vnet_hdr_support flag is set, we change the sending packet format from
struct {int size; const uint8_t buf[];} to {int size; int vnet_hdr_len; const uint8_t buf[];},
which lets other modules (like colo-compare) know how to parse the net packet correctly.
Signed-off-by: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
If an event is dynamically disabled, the TCG code that calls the
execution-time tracer is not generated.
This removes the overhead of execution-time tracers for dynamically disabled
events. As a bonus, it also avoids checking the event state when the
execution-time tracer is called from TCG-generated code (since otherwise
TCG would simply not call it).
Signed-off-by: Lluís Vilanova <vilanova@ac.upc.edu>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-id: 149915799921.6295.13067154430923434035.stgit@frigg.lan
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Every vCPU now uses a separate set of TBs for each set of dynamic
tracing event state values. Each set of TBs can be used by any number of
vCPUs to maximize TB reuse when vCPUs have the same tracing state.
This feature is later used by tracetool to optimize tracing of guest
code events.
The maximum number of TB sets is defined as 2^E, where E is the number
of events that have the 'vcpu' property (their state is stored in
CPUState->trace_dstate).
For this to work, a change on the dynamic tracing state of a vCPU will
force it to flush its virtual TB cache (which is only indexed by
address), and fall back to the physical TB cache (which now contains the
vCPU's dynamic tracing state as part of the hashing function).
Signed-off-by: Lluís Vilanova <vilanova@ac.upc.edu>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Reviewed-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-id: 149915775266.6295.10060144081246467690.stgit@frigg.lan
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
There's little point in dynamically allocating the bitmap if we
know at compile-time the max number of events we want to support.
Thus, make room in the struct for the bitmap, which will make things
easier later: this paves the way for upcoming changes, in which
we'll use a u32 to fully capture cpu->trace_dstate.
This change also increases performance by saving a dereference and
improving locality--note that this is important since upcoming work
makes reading this bitmap fairly common.
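A sketch of the data-structure change being described (the exact name of
the compile-time bound is an assumption):

    #include "qemu/bitmap.h"

    struct CPUState {
        /* ... */
        /* before: unsigned long *trace_dstate;  (heap-allocated per vCPU) */
        DECLARE_BITMAP(trace_dstate, CPU_TRACE_DSTATE_MAX_EVENTS);
        /* ... */
    };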
Signed-off-by: Emilio G. Cota <cota@braap.org>
Reviewed-by: Lluís Vilanova <vilanova@ac.upc.edu>
Signed-off-by: Lluís Vilanova <vilanova@ac.upc.edu>
Message-id: 149915725977.6295.15069969323605305641.stgit@frigg.lan
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
This patch changes the filter_send() parameter from CharBackend to MirrorState,
so we can get more information such as vnet_hdr (we use it to support packets with a vnet header).
Signed-off-by: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Add a vnet_hdr_len argument in NetClientState
so that other modules can easily get the real vnet_hdr_len.
Signed-off-by: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
ppc patch queue 2017-07-17
This pull request supersedes the one from 2017-07-14. That one had a
couple of subtle regressions: there was a build error for mingw32, and
an instance_size which was theoretically wrong everywhere, but only
actually bit on the Travis OSX build.
There are two major batches in this set, rather than the usual
collection of assorted fixes.
* More DRC cleanup. This gets the state management into a state
which should fix many of the hotplug+migration problems we've
had. Plus it gets the migration stream format into something
well defined and pretty minimal which we can reasonably support
into the future.
* Hashed Page Table resizing. It's been a while since this was
posted, but it's been through several previous rounds of review.
The kernel parts (both guest and host) are merged in 4.11, so
this is the only remaining piece left to allow resizing of the
HPT in a running guest.
There are also a handful of unrelated fixes.
# gpg: Signature made Mon 17 Jul 2017 07:36:52 BST
# gpg: using RSA key 0x6C38CACA20D9B392
# gpg: Good signature from "David Gibson <david@gibson.dropbear.id.au>"
# gpg: aka "David Gibson (Red Hat) <dgibson@redhat.com>"
# gpg: aka "David Gibson (ozlabs.org) <dgibson@ozlabs.org>"
# gpg: aka "David Gibson (kernel.org) <dwg@kernel.org>"
# Primary key fingerprint: 75F4 6586 AE61 A66C C44E 87DC 6C38 CACA 20D9 B392
* remotes/dgibson/tags/ppc-for-2.10-20170717: (21 commits)
target/ppc: fix CPU hotplug when radix is enabled (TCG)
spapr: fix memory leak in spapr_core_pre_plug()
pseries: Allow HPT resizing with KVM
pseries: Use smaller default hash page tables when guest can resize
pseries: Enable HPT resizing for 2.10
pseries: Implement HPT resizing
pseries: Stubs for HPT resizing
ppc/pnv: Remove unused XICSState reference
spapr: fix potential memory leak in spapr_core_plug()
spapr: Implement DR-indicator for physical DRCs only
spapr: Remove sPAPRConfigureConnectorState sub-structure
spapr: Consolidate DRC state variables
spapr: Cleanups relating to DRC awaiting_release field
spapr: Refactor spapr_drc_detach()
spapr: Abort on delete failure in spapr_drc_release()
spapr: Simplify unplug path
spapr: Remove 'awaiting_allocation' DRC flag
spapr: Treat devices added before inbound migration as coldplugged
spapr: Minor cleanups to events handling
spapr: migrate pending_events of spapr state
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
# gpg: Signature made Mon 17 Jul 2017 04:47:05 BST
# gpg: using RSA key 0xCA35624C6A9171C6
# gpg: Good signature from "Fam Zheng <famz@redhat.com>"
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: 5003 7CB7 9706 0F76 F021 AD56 CA35 624C 6A91 71C6
* remotes/famz/tags/block-and-testing-pull-request:
travis: add no-TCG build
docker.py: Improve subprocess exit code handling
docker.py: Drop infile parameter
docker: Don't enable networking as a side-effect of DEBUG=1
ssh: support I/O from any AioContext
sheepdog: add queue_lock
qed: protect table cache with CoMutex
qed: introduce bdrv_qed_init_state
block: invoke .bdrv_drain callback in coroutine context and from AioContext
qed: move tail of qed_aio_write_main to qed_aio_write_{cow, alloc}
vvfat: make it thread-safe
vpc: make it thread-safe
vdi: make it thread-safe
coroutine-lock: add qemu_co_rwlock_downgrade and qemu_co_rwlock_upgrade
qcow2: call CoQueue APIs under CoMutex
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
The QMP query-vnc interface has gained a lot more information that
the HMP interface hasn't got yet. Update it.
Note the output format has changed, but this is HMP so that's OK.
In particular, this now includes client information for reverse
connections:
-vnc :0
(qemu) info vnc
default:
Server: 0.0.0.0:5900 (ipv4)
Auth: none (Sub: none)
(Now connect a client)
(qemu) info vnc
default:
Server: 0.0.0.0:5900 (ipv4)
Auth: none (Sub: none)
Client: 127.0.0.1:51828 (ipv4)
x509_dname: none
sasl_username: none
-vnc localhost:7000,reverse
(qemu) info vnc
default:
Client: ::1:7000 (ipv6)
x509_dname: none
sasl_username: none
Auth: none (Sub: none)
-vnc :1,password,id=pass -vnc localhost:7000,reverse
(qemu) info vnc
default:
Client: ::1:7000 (ipv6)
x509_dname: none
sasl_username: none
Auth: none (Sub: none)
rev:
Server: 0.0.0.0:5901 (ipv4)
Auth: vnc (Sub: none)
Client: 127.0.0.1:53616 (ipv4)
x509_dname: none
sasl_username: none
This was originally RH bz 1461682
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Message-id: 20170711154414.21111-1-dgilbert@redhat.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
The current VNC default keyboard delay is 1ms. With that we're constantly
typing faster than the guest receives keyboard events from an XHCI attached
USB HID device.
The default keyboard delay time in the input layer however is 10ms. I don't know
how that number came to be, but empirical tests on some OpenQA driven ARM
systems show that 10ms really is a reasonable default number for the delay.
This patch moves the VNC delay also to 10ms. That way our default is much
safer (good!) and also consistent with the input layer default (also good!).
Signed-off-by: Alexander Graf <agraf@suse.de>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 1499863425-103133-1-git-send-email-agraf@suse.de
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Rebase ipxe to latest git master.
Pick up four virtio-net fixes.
complete shortlog of ipxe changes
---------------------------------
Adamczyk, Konrad (1):
[thunderx] Use ThunderxConfigProtocol to obtain board configuration
Bartosz Szczepanek (1):
[thunderx] Fix hardware deinitialization
Christian Nilsson (1):
[intel] Add INTEL_NO_PHY_RST for I219-LM (2)
David Decotigny (2):
[build] Return const char * from uuid_ntoa()
[af_packet] Add new AF_PACKET driver for Linux
Jason Wang (1):
[virtio] Support VIRTIO_NET_F_IOMMU_PLATFORM
Jerone Young (1):
[intel] Add support for I219-V in 7th Gen Intel NUC
Konrad Adamczyk (1):
[thunderx] Don't disable NIC when exiting from iPXE
Ladi Prosek (3):
[virtio] Cap queue size to MAX_QUEUE_NUM
[virtio] Simplify virtqueue shutdown
[virtio] Remove queue size limit in legacy virtio
Martin Habets (1):
[sfc] Add driver for Solarflare SFC8XXX adapters
Michael Brown (159):
[interface] Provide intf_reinit() to reinitialise nullified interfaces
[iscsi] Avoid potential infinite loops during shutdown
[efi] Add basic EFI SAN booting capability
[undi] Allocate base memory before calling UNDI loader entry point
[romprefix] Avoid using PMM-allocated memory in UNDI loader entry point
[undi] Clean up driver and device name information
[prefix] Remove impossible progress message
[prefix] Include diagnostic information within progress messages
[undi] Try matching UNDI ROMs in BIOS enumeration order
[efi] Work around temporal anomaly encountered during ExitBootServices()
[ipv4] Accept unicast packets for the local network broadcast address
[build] Add %.vhd target for building VM bootable disk images
[virtio] Use separate RX and TX empty header buffers
[cloud] Add ability to retrieve Google Compute Engine metadata
[virtio] Use host-specified MTU when available
[netdevice] Allow MTU to be changed at runtime
[cloud] Show CPU vendor and model in example cloud boot scripts
[hyperv] Ignore unsolicited VMBus messages
[pic8259] Fix definitions for "read IRR" and "read ISR" commands
[efi] Fix building elf2efi.c when -fpic is enabled by default
[interface] Avoid unnecessary reference counting in intf_unplug()
[interface] Remove misleading comment
[interface] Unplug interface before calling intf_close() in intf_shutdown()
[netdevice] Limit MTU by hardware maximum frame length
[cpuid] Provide cpuid_supported() to test for supported functions
[time] Allow timer to be selected at runtime
[hyperv] Provide timer based on the 10MHz time reference count MSR
[int13] Avoid potential division by zero
[int13] Test correct return status from INT 13 calls
[settings] Add "unixtime" builtin setting to expose the current time
[time] Report attempts to use timers before initialisation
[interface] Provide the ability to shut down multiple interfaces
[http] Cleanly shut down potentially looped interfaces
[efi] Add missing SANBOOT_PROTO_HTTP to EFI default configuration
[block] Remove spurious comments
[block] Centralise SAN device abstraction
[block] Centralise "san-drive" setting
[int13] Refactor to use centralised SAN device abstraction
[efi] Refactor to use centralised SAN device abstraction
[block] Retry any SAN device operation
[iscsi] Use intfs_shutdown() when shutting down multiple interfaces
[scsi] Use intfs_shutdown() when shutting down multiple interfaces
[block] Use intfs_shutdown() when shutting down multiple interfaces
[scsi] Avoid duplicate calls to scsicmd_close()
[build] Provide common ARRAY_SIZE() definition
[efi] Update to current EDK2 headers
[efi] Add EFI_ACPI_TABLE_PROTOCOL header and GUID definition
[efi] Provide ACPI table description for SAN devices
[efi] Skip cable detection at initialisation where possible
[undi] Move PXE API caller back into UNDI driver
[dhcp] Allow vendor class to be changed in DHCP requests
[hermon] Avoid potential integer overflow when calculating memory mappings
[arbel] Avoid potential integer overflow when calculating memory mappings
[xfer] Ensure va_end() is called on failure path
[nfs] Fix double free bug on error path
[linda] Use correct length for memset()
[qib7322] Use correct length for memset()
[sis900] Remove extraneous memset() with incorrect length
[802.11] Remove redundant NULL pointer check after dereference
[crypto] Free correct pointer on the error path
[librm] Fail gracefully if asked to ioremap() a zero length
[usb] Use correct length for memcpy()
[mucurses] Attempt to fix test for empty string
[mucurses] Attempt to fix keypress processing logic
[mucurses] Attempt to fix resource leaks
[hyperv] Fix resource leaks on error path
[slam] Fix resource leak on error path
[slam] Avoid NULL pointer dereference in slam_pull_value()
[eoib] Avoid passing a NULL I/O buffer to netdev_tx_complete_err()
[http] Add missing check for memory allocation failure
[mucurses] Attempt to fix use of uninitialised buffer with strcat()
[xhci] Avoid accessing beyond end of endpoint context array
[build] Avoid confusing sparse in single-argument DBG() macros
[infiniband] Return status code from ib_create_cq() and ib_create_qp()
[infiniband] Return status code from ib_create_mi()
[block] Quell spurious Coverity size mismatch warning
[ath] Add missing break statements
[pixbuf] Avoid potential division by zero
[usb] Use correct length for memcpy()
[xen] Use standard calling pattern for asprintf()
[tcp] Use correct length for memset()
[video_subr] Use memmove() for overlapping memory copy
[arbel] Assert that mapping length is non-zero
[hermon] Assert that mapping length is non-zero
[tlan] Guard against failure to identify chip
[w89c840] Avoid potential array overrun
[sis190] Avoid NULL pointer dereference
[mucurses] Ensure SLK labels are always terminated
[coverity] Add Coverity user model
[malloc] Track maximum heap usage
[travis] Add minimal .travis.yml file
[travis] Build and run the unit test suite
[travis] Integrate with Coverity Scan
[rtl818x] Fix resource leak on error path
[pcnet32] Eliminate redundant register read
[iobuf] Increase minimum I/O buffer size to 128 bytes
[vxge] Fix use of stale I/O buffer on error path
[scsi] Avoid duplicate call to scsicmd_close() on TEST UNIT READY failure
[block] Add dummy SAN device
[block] Add basic multipath support
[int13] Improve geometry guessing for unaligned partitions
[int13con] Avoid overwriting random portions of SAN boot disks
[time] Add sleep_fixed() function to sleep without checking for Ctrl-C
[block] Allow SAN retry count to be reconfigured
[block] Add a small delay between attempts to reopen SAN targets
[block] Retry reopening indefinitely for multipath devices
[block] Gracefully close SAN device if registration fails
[linux] Use dummy SAN device
[block] Ignore redundant xfer_window_changed() messages
[block] Describe all SAN devices via ACPI tables
[iscsi] Do not install iBFT when no iSCSI targets exist
[http] Notify data transfer interface when underlying connection is ready
[mucurses] Fix erroneous __nonnull attribute
[build] Avoid implicit-fallthrough warnings on GCC 7
[linux] Fix building with kernel 4.11 headers
[scsi] Retry TEST UNIT READY command
[libc] Add stdbool.h standard header
[efi] Fix typo in efi_acpi_table_protocol_guid
[efi] Add efi_sprintf() and efi_vsprintf()
[block] Allow use of a non-default EFI SAN boot filename
[intel] Show original CTRL and STATUS values in debugging output
[intel] Do not enable ASDE on i350 backplane NIC
[block] Provide sandev_read() and sandev_write() as global symbols
[block] Provide abstraction to allow system to be quiesced
[hyperv] Do not fail if guest OS ID MSR is already set
[hyperv] Remove redundant return status code from mapping functions
[hyperv] Cope with Windows Server 2016 enlightenments
[efi] Standardise PCI debug messages
[iscsi] Always send FirstBurstLength parameter
[iscsi] Fix iBFT when no explicit initiator name setting exists
[xen] Provide 18 4kB receive buffers to work around xen-netback bug
[efi] Prevent EFI code from being linked in to non-EFI builds
[tls] Keep cipherstream window open until TLS negotiation is complete
[settings] Extend numerical setting tags to 64 bits
[acpi] Make acpi_find_rsdt() a per-platform method
[efi] Provide access to ACPI tables
[acpi] Expose ACPI tables via settings mechanism
[syslog] Handle backspace characters
[hdprefix] Avoid attempts to read beyond the end of the disk
[usb] Allow for USB network devices with no interrupt endpoint
[build] Use -no-pie on newer versions of gcc
[ecm] Display invalid MAC address strings in debug messages
[cpuid] Allow input %ecx value to be specified
[crypto] Expose RSA_CTX_SIZE constant
[crypto] Expose asn1_grow()
[crypto] Provide asn1_built() to construct a cursor from a builder
[crypto] Expose pem_asn1() for use with non-image data
[exanic] Add driver for Exablaze ExaNIC cards
[usb] Use non-zero language ID to retrieve strings
[mucurses] Avoid potential division by zero
[tls] Support RFC5746 secure renegotiation
[smscusb] Abstract out common SMSC USB device functionality
[smsc95xx] Use common SMSC USB device functionality
[smsc75xx] Use common SMSC USB device functionality
[smscusb] Add ability to read MAC address from OTP
[smscusb] Move non-inline register access functions to smscusb.c
[smscusb] Allow for alternative PHY register layouts
[smsc75xx] Expose functionality shared with LAN78xx devices
[lan78xx] Add driver for Microchip LAN78xx USB Ethernet NICs
Mika Tiainen (1):
[intel] Add INTEL_NO_PHY_RST for I219-V
Mike McCormack (1):
[sky2] Use 32-bit read to read Y2_VAUX_AVAIL
Raed Salem (2):
[golan] Update Connect-IB, ConnectX-4 and ConnectX-4 Lx (Infiniband) support
[golan] Bug fixes and improved paging allocation method
Vishvananda Ishaya (1):
[intel] Reset all virtual function settings
Vishvananda Ishaya Abrams (1):
[iscsi] Don't close when receiving NOP-In
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
But when a guest initializes radix mode, it issues an H_REGISTER_PROC_TBL
to update the LPCR of all CPUs. Hot-plugged CPUs inherit the same
setting under KVM but not under TCG. So let's check for radix and update
the default LPCR to keep new CPUs in sync.
Signed-off-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
In case of error, we must ensure the dynamically allocated base_core_type
is freed, like it is done everywhere else in this function.
This is a regression introduced in QEMU 2.9 by commit 8149e2992f.
Signed-off-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
So far, qemu implements the PAPR Hash Page Table (HPT) resizing extension
with TCG. The same implementation will work with KVM PR, but we don't
currently allow that. For KVM HV we can only implement resizing with the
assistance of the host kernel, which needs a new capability and ioctl()s.
This patch adds support for testing the new KVM capability and implementing
the resize in terms of KVM facilities when necessary. If we're running on
a kernel which doesn't have the new capability flag at all, we fall back to
testing for PR vs. HV KVM using the same hack that we already use in a
number of places for older kernels.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
We've now implemented a PAPR extension allowing PAPR guests to resize
their hash page table (HPT) during runtime.
This patch makes use of that facility to allocate smaller HPTs by default.
Specifically, when a guest is aware of the HPT resize facility, qemu sizes
the HPT to the initial memory size rather than the maximum memory size, on
the assumption that the guest will resize its HPT if necessary for
hot-plugged memory.
When the initial memory size is much smaller than the maximum memory size
(a common configuration with e.g. oVirt / RHEV) then this can save
significant memory on the HPT.
If the guest does *not* advertise HPT resize awareness when it makes the
ibm,client-architecture-support call, qemu resizes the HPT for the maximum
memory size (unless it's been configured not to allow such guests at all).
For now we make that reallocation assuming the guest has not yet used the
HPT at all. That's true in practice, but not, strictly, an architectural
or PAPR requirement. If we need to in future we can fix this by having
the client-architecture-support call reboot the guest with the revised
HPT size (the client-architecture-support call is explicitly permitted to
trigger a reboot in this way).
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
We've now implemented a PAPR extension which allows PAPR guests (i.e.
"pseries" machine type) to resize their hash page table during runtime.
However, that extension is only enabled if explicitly chosen on the
command line. This patch enables it by default for spapr-2.10, but leaves
it disabled (by default) for older machine types.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
This patch implements hypercalls allowing a PAPR guest to resize its own
hash page table. This will eventually allow for more flexible memory
hotplug.
The implementation is partially asynchronous, handled in a special thread
running the hpt_prepare_thread() function. The state of a pending resize
is stored in SPAPR_MACHINE->pending_hpt.
The H_RESIZE_HPT_PREPARE hypercall will kick off creation of a new HPT, or,
if one is already in progress, monitor it for completion. If there is an
existing HPT resize in progress that doesn't match the size specified in
the call, it will cancel it, replacing it with a new one matching the
given size.
The H_RESIZE_HPT_COMMIT completes transition to a resized HPT, and can only
be called successfully once H_RESIZE_HPT_PREPARE has successfully
completed initialization of a new HPT. The guest must ensure that there
are no concurrent accesses to the existing HPT while this is called (this
effectively means stop_machine() for Linux guests).
For now H_RESIZE_HPT_COMMIT goes through the whole old HPT, rehashing each
HPTE into the new HPT. This can have quite high latency, but it seems to
be of the order of typical migration downtime latencies for HPTs of size
up to ~2GiB (which would be used in a 256GiB guest).
In future we probably want to move more of the rehashing to the "prepare"
phase, by having H_ENTER and other hcalls update both current and
pending HPTs. That's a project for another day, but should be possible
without any changes to the guest interface.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
This introduces stub implementations of the H_RESIZE_HPT_PREPARE and
H_RESIZE_HPT_COMMIT hypercalls which we hope to add in a PAPR
extension to allow run time resizing of a guest's hash page table. It
also adds a new machine property for controlling whether this new
facility is available.
For now we only allow resizing with TCG, allowing it with KVM will require
kernel changes as well.
Finally, it adds a new string to the hypertas property in the device
tree, advertising to the guest the availability of the HPT resizing
hypercalls. This is a tentative suggested value, and would need to be
standardized by PAPR before being merged.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
e6f7e110ee "ppc/xics: remove the XICSState classes" got rid of
XICSState; this is just a leftover.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Since commit 5c1da81215 ("spapr: Remove unnecessary differences between
hotplug and coldplug paths"), the CPU DT for the DRC is always allocated.
This causes a memory leak for pseries-2.6 and older machine types, that
don't support CPU hotplug and don't allocate DRCs for CPUs.
Reported-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Signed-off-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
According to PAPR, the DR-indicator should only be valid for physical DRCs,
not logical DRCs. At the moment we implement it for all DRCs, so restrict
it to physical ones only.
We move the state to the physical DRC subclass, which means adding some
QOM boilerplate to handle the newly distinct type.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Daniel Barboza <danielhb@linux.vnet.ibm.com>
Tested-by: Daniel Barboza <danielhb@linux.vnet.ibm.com>
Most of the time, the state of a DRC object is contained in the single
'state' variable. However, the transition from UNISOLATE to
CONFIGURED state requires multiple calls to the ibm,configure-connector
RTAS call to retrieve the device tree for the attached device. We need
some extra state to keep track of where we're up to in delivering the
device tree information to the guest.
Currently that extra state is in a sPAPRConfigureConnectorState
substructure which is only allocated when we're in the middle of the
configure connector process. That sounds like a good idea, but the extra
state is only two integers - on many platforms that will take up the same
room as the (maybe NULL) ccs pointer even before malloc() overhead. Plus
it's another object whose lifetime we need to manage. In short, it's not
worth it.
So, fold the sPAPRConfigureConnectorState substructure directly into the
DRC object.
Previously the structure was allocated lazily when the configure-connector
call discovers it's not there. Now, we need to initialize the subfields
pre-emptively, as soon as we enter UNISOLATE state.
Although it's not strictly necessary (the field values should only ever
be consulted when in UNISOLATE state), we try to keep them at -1 when in
other states, as a debugging aid.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Daniel Barboza <danielhb@linux.vnet.ibm.com>
Tested-by: Daniel Barboza <danielhb@linux.vnet.ibm.com>
Each DRC has three fields describing its state: isolation_state,
allocation_state and configured. At first this seems like a reasonable
representation, since its based directly on the PAPR defined
isolation-state and allocation-state indicators. However:
* Only a few combinations of the two fields' values are permitted
* allocation_state isn't used at all for physical DRCs
* The indicators are write only so they don't really have a well
defined current value independent of each other
This replaces these variables with a single state variable, whose names
and numbers are based on the diagram in LoPAPR section 13.4. Along with
this we add code to check the current state on various operations and make
sure the requested transition is permitted.
Strictly speaking, this makes guest visible changes to behaviour (since we
probably allowed some transitions we shouldn't have before). However, a
hypothetical guest broken by that wasn't PAPR compliant, and probably
wouldn't have worked under PowerVM.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Daniel Barboza <danielhb@linux.vnet.ibm.com>
Tested-by: Daniel Barboza <danielhb@linux.vnet.ibm.com>
'awaiting_release' indicates that the host has requested an unplug of the
device attached to the DRC, but the guest has not (yet) put the device
into a state where it is safe to complete removal.
1. Rename it to 'unplug_requested' which to me at least is clearer
2. Remove the ->release_pending() method used to check this from outside
spapr_drc.c. The method only plausibly has one implementation, so use
a plain function (spapr_drc_unplug_requested()) instead.
3. Remove it from the migration stream. Attempting to migrate mid-unplug
is broken not just for spapr - in general management has no good way to
determine if the device should be present on the destination or not. So,
until that's fixed, there's no point adding extra things to the stream.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Greg Kurz <groug@kaod.org>
Tested-by: Daniel Barboza <danielhb@linux.vnet.ibm.com>
This function has two unused parameters - remove them.
It also sets awaiting_release on all paths, except one. On that path
setting it is harmless, since it will be immediately cleared by
spapr_drc_release(). So factor it out of the if statements.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Greg Kurz <groug@kaod.org>
Tested-by: Daniel Barboza <danielhb@linux.vnet.ibm.com>
We currently ignore errors from the object_property_del() in
spapr_drc_release(). But the only way that could fail is if the property
doesn't exist, in which case it's a bug that we're in spapr_drc_release()
at all. So change from ignoring to abort()ing on errors.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
spapr_lmb_release() and spapr_core_release() call hotplug_handler_unplug()
which after a bunch of indirection calls spapr_memory_unplug() or
spapr_core_unplug(). But we already know which is the appropriate thing
to call here, so we can just fold it directly into the release function.
Once that's done, there's no need for an hc->unplug method in the spapr
machine at all: since we also have an hc->unplug_request method, the
hotplug core will never use ->unplug.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Greg Kurz <groug@kaod.org>
Tested-by: Daniel Barboza <danielhb@linux.vnet.ibm.com>
The awaiting_allocation flag in the DRC was introduced by aab9913
"spapr_drc: Prevent detach racing against attach for CPU DR", allegedly to
prevent a guest crash on racing attach and detach. Except.. information
from the BZ actually suggests a qemu crash, not a guest crash. And there
shouldn't be a problem here anyway: if the guest has already moved the DRC
away from UNUSABLE state, the detach would already be deferred, and if it
hadn't it should be safe to detach it (the guest should fail gracefully
when it attempts to change the allocation state).
I think this was probably just a bandaid for some other problem in the
state management. So, remove awaiting_allocation and associated code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: Greg Kurz <groug@kaod.org>
Tested-by: Greg Kurz <groug@kaod.org>
Tested-by: Daniel Barboza <danielhb@linux.vnet.ibm.com>
When migrating a guest which has already had devices hotplugged,
libvirt typically starts the destination qemu with -incoming defer,
adds those hotplugged devices with qmp, then initiates the incoming
migration.
This causes problems for the management of spapr DRC state. Because
the device is treated as hotplugged, it goes into a DRC state for a
device immediately after it's plugged, but before the guest has
acknowledged its presence. However, chances are the guest on the
source machine *has* acknowledged the device's presence and configured
it.
If the source has fully configured the device, then DRC state won't be
sent in the migration stream: for maximum migration compatibility with
earlier versions we don't migrate DRCs in coldplug-equivalent state.
That means that the DRC effectively changes state over the migrate,
causing problems later on.
In addition, logging hotplug events for these devices isn't what we
want because a) those events should already have been issued on the
source host and b) the event queue should get wiped out by the
incoming state anyway.
In short, what we really want is to treat devices added before an
incoming migration as if they were coldplugged.
To do this, we first add a spapr_drc_hotplugged() helper which
determines if the device is hotplugged in the sense relevant for DRC
state management. We only send hotplug events when this is true.
Second, when we add a device which isn't hotplugged in this sense, we
force a reset of the DRC state - this ensures the DRC is in a
coldplug-equivalent state (there isn't usually a system reset between
these device adds and the incoming migration).
This is based on an earlier patch by Laurent Vivier, cleaned up and
extended.
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Greg Kurz <groug@kaod.org>
Tested-by: Daniel Barboza <danielhb@linux.vnet.ibm.com>
The rtas_error_log structure is marked packed, which strongly suggests its
precise layout is important to match an external interface. Along with
that one could expect it to have a fixed endianness to match the same
interface. That used to be the case - matching the layout of PAPR RTAS
event format and requiring BE fields.
Now, however, it's only used embedded within sPAPREventLogEntry with the
fields in native order, since they're processed internally.
Clear that up by removing the nested structure in sPAPREventLogEntry.
struct rtas_error_log is moved back to spapr_events.c where it is used as
a temporary to help convert the fields in sPAPREventLogEntry to the correct
in memory format when delivering an event to the guest.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
In racing situations between hotplug events and the migration operation,
an RTAS hotplug event could have not yet been delivered to the source
guest when migration is started. In this case the pending_events of the
spapr state need to be transmitted to the target so that the hotplug
event can be finished on the target.
To achieve the minimal VMSD possible to migrate the pending_events list,
this patch makes the changes in spapr_events.c:
- 'log_type' of sPAPREventLogEntry struct deleted. This information can be
derived by inspecting the rtas_error_log summary field. A new function
called 'spapr_event_log_entry_type' was added to retrieve the type of
a given sPAPREventLogEntry.
- sPAPREventLogEntry, epow_log_full and hp_log_full were redesigned. The
only data we're going to migrate in the VMSD is the event log data itself,
which can be divided in two parts: a rtas_error_log header and an extended
event log field. The rtas_error_log header contains information about the
size of the extended log field, which can be used inside VMSD as the size
parameter of the VBUFFER_ALLOC field that will store it. To allow this use,
the header.extended_length field must be exposed inline to the VMSD instead
of embedded into a 'data' field that holds everything. With this in mind,
the following changes were done:
* a new 'header' field was added to sPAPREventLogEntry. This field holds
a struct rtas_error_log inline.
* the declaration of the 'rtas_error_log' struct was moved to spapr.h
to be visible to the VMSD macros.
* 'data' field of sPAPREventLogEntry was renamed to 'extended_log' and
now holds only the contents of the extended event log.
* 'struct rtas_error_log hdr' were taken away from both epow_log_full
and hp_log_full. This information is now available at the header field of
sPAPREventLogEntry.
* epow_log_full and hp_log_full were renamed to epow_extended_log and
hp_extended_log respectively. This rename makes it clearer to understand
the new purpose of both structures: hold the information of an extended
event log field.
* spapr_powerdown_req and spapr_hotplug_req_event now creates a
sPAPREventLogEntry structure that contains the full rtas log entry.
* rtas_event_log_queue and rtas_event_log_dequeue now receives a
sPAPREventLogEntry pointer as a parameter instead of a void pointer.
- the endianness of the sPAPREventLogEntry header is now native instead
of be32. We can use the fields in native endianness internally and write
them in be32 in the guest physical memory inside 'check_exception'. This
allows the VMSD inside spapr.c to read the correct size of the
extended_log field.
- inside spapr.c, pending_events is put in a subsection in the spapr state
VMSD to make sure migration across different versions is not broken.
A small change in rtas_event_log_queue and rtas_event_log_dequeue was also
made: instead of calling qdev_get_machine(), both functions now receive
a pointer to the sPAPRMachineState. This pointer is already available in
the callers of these functions and we don't need to waste resources
calling qdev_get_machine() again.
Signed-off-by: Daniel Henrique Barboza <danielhb@linux.vnet.ibm.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
All the DRC subtypes explicitly list instance_size in TypeInfo (all as
sizeof(sPAPRDRConnector)). This isn't necessary, since if it's not listed
it will be derived from the parent type.
Worse, this is dangerous, because if a subtype is changed in future to
have a larger structure, then subtypes of that subtype also need to have
instance_size changed, or it will lead to hard to track memory corruption
bugs.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Some error handling is missing because we ignore the subprocess exit
code; for example, "docker build" errors are currently ignored.
Introduce _do_check() alongside the existing _do() method and use it in a
few places.
Signed-off-by: Fam Zheng <famz@redhat.com>
Message-Id: <20170712075528.22770-3-famz@redhat.com>
Tested-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Fam Zheng <famz@redhat.com>
When trying to debug problems with tests it is natural to set
DEBUG=1 when starting the docker environment. Unfortunately
this has a side-effect of enabling an eth0 network interface
in the container, which changes the operating environment of
the test suite. IOW tests which fail may suddenly start
working again if DEBUG=1 is set, due to changed network setup.
Add a separate NETWORK variable to allow enablement of
networking separately from DEBUG=1. This can be used in two
ways. To enable the default docker network backend
make docker-test-build@fedora NETWORK=1
while to enable a specific network backend, eg join the network
associated with the container 'wibble':
make docker-test-build@fedora NETWORK=container:wibble
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-Id: <20170713144352.2212-1-berrange@redhat.com>
[Drop the superfluous second $(subst ...). - Fam]
Signed-off-by: Fam Zheng <famz@redhat.com>
The VirtualBox driver is using a mutex to order all allocating writes,
but it is not protecting accesses to the bitmap because they implicitly
happen under the AioContext mutex. Change this to use a CoRwlock
explicitly.
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20170629132749.997-4-pbonzini@redhat.com>
Signed-off-by: Fam Zheng <famz@redhat.com>
sosendoob() can return a failure code, but all its callers ignore it.
This is OK in sbappend(), as the comment there states -- we will try
again later in sowrite(). Add a (void) cast to tell Coverity so.
In sowrite() we do need to check the return value -- we should handle
a write failure in sosendoob() the same way we handle a write failure
for the normal data.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Samuel Thibault <samuel.thibault@ens-lyon.org>
The code in sosendoob() assumes that slirp_send() always
succeeds, but it might return an OS error code (for instance
if the other end has disconnected). Catch these and return
the caller either -1 on error or the number of urgent bytes
actually written. (None of the callers check this return
value currently, though.)
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Samuel Thibault <samuel.thibault@ens-lyon.org>
In a fork_exec() error path we try to closesocket(s) when s might
be a negative number because the thing that failed was the
qemu_socket() call. Add a guard so we don't do this.
(Spotted by Coverity: CID 1005727 issue 1 of 2.)
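The guard has this shape (a sketch only; 's' is the descriptor returned by qemu_socket()):
    /* Only close the socket if it was actually created. */
    if (s >= 0) {
        closesocket(s);
    }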
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Samuel Thibault <samuel.thibault@ens-lyon.org>
Since we pass the same DeviceState object to
memory_region_init_rom_nomigrate() and vmstate_register_ram(), we can
switch to using memory_region_init_rom() instead.
(This isn't entirely obvious from the code since it is using
&pdev->qdev rather than DEVICE(pdev) for some reason, but
PCIDevice does indeed use 'qdev' for its parent DeviceState member.)
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Message-id: 1499438577-7674-10-git-send-email-peter.maydell@linaro.org
Use the new functions memory_region_init_{ram,rom,rom_device}()
instead of manually calling the _nomigrate() version and then
vmstate_register_ram_global().
Patch automatically created using coccinelle script:
spatch --in-place -sp_file scripts/coccinelle/memory-region-init-ram.cocci -dir hw
(As it turns out, there are no instances of the rom and
rom_device functions that are caught by this script.)
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Message-id: 1499438577-7674-8-git-send-email-peter.maydell@linaro.org
Add new utility functions which both initialize a RAM
MemoryRegion and arrange for its contents to be migrated;
we give these the memory_region_init_ram(), memory_region_init_rom()
and memory_region_init_rom_device() names that we just freed up
by renaming the old implementations to _nomigrate().
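A rough before/after sketch of a caller (QEMU memory API; the mr, owner and size variables are placeholders, not taken from a specific device):
    /* Before: initialize the region, then remember to register it for
     * migration as a separate step. */
    memory_region_init_ram_nomigrate(mr, owner, "example.ram", size,
                                     &error_fatal);
    vmstate_register_ram(mr, DEVICE(owner));

    /* After: one call that also arranges for the contents to migrate. */
    memory_region_init_ram(mr, owner, "example.ram", size, &error_fatal);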
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Message-id: 1499438577-7674-6-git-send-email-peter.maydell@linaro.org
The various functions for initializing RAM MemoryRegions do not do
anything to cause the data in the MemoryRegion to be migrated.
Note in their documentation comments that this is the responsibility
of the caller.
(We will shortly add a new function that *does* do this for you.)
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Message-id: 1499438577-7674-3-git-send-email-peter.maydell@linaro.org
Merge sockets 2017/07/11 v3
# gpg: Signature made Fri 14 Jul 2017 16:09:03 BST
# gpg: using RSA key 0xBE86EBB415104FDF
# gpg: Good signature from "Daniel P. Berrange <dan@berrange.com>"
# gpg: aka "Daniel P. Berrange <berrange@redhat.com>"
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: DAF3 A6FD B26B 6291 2D0E 8E3F BE86 EBB4 1510 4FDF
* remotes/berrange/tags/pull-sockets-2017-07-11-3:
io: preserve ipv4/ipv6 flags when resolving InetSocketAddress
sockets: ensure we don't accept IPv4 clients when IPv4 is disabled
sockets: don't block IPv4 clients when listening on "::"
sockets: ensure we can bind to both ipv4 & ipv6 separately
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
The original InetSocketAddress struct may have has_ipv4 and
has_ipv6 fields set, which will control both the ai_family
used during DNS resolution, and later use of the V6ONLY
flag.
Currently the standalone DNS resolver code drops the
has_ipv4 & has_ipv6 flags after resolving, which means
the later bind() code won't correctly set V6ONLY.
This fixes the following scenarios
-vnc :0,ipv4=off
-vnc :0,ipv6=on
-vnc :::0,ipv4=off
-vnc :::0,ipv6=on
which all mistakenly accepted IPv4 clients
Acked-by: Gerd Hoffmann <kraxel@gmail.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Currently if you disable listening on IPv4 addresses, via the
CLI flag ipv4=off, we still mistakenly accept IPv4 clients via
the IPv6 listener socket due to the IPV6_V6ONLY flag being unset.
We must ensure IPV6_V6ONLY is always set if ipv4=off.
This fixes the following scenarios
-incoming tcp::9000,ipv6=on
-incoming tcp:[::]:9000,ipv6=on
-chardev socket,id=cdev0,host=,port=9000,server,nowait,ipv4=off
-chardev socket,id=cdev0,host=,port=9000,server,nowait,ipv6=on
-chardev socket,id=cdev0,host=::,port=9000,server,nowait,ipv4=off
-chardev socket,id=cdev0,host=::,port=9000,server,nowait,ipv6=on
which all mistakenly accepted IPv4 clients
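At the socket level this comes down to the standard IPV6_V6ONLY option; a minimal POSIX-level illustration (not the QEMU code itself, which goes through its own socket helpers):
    #include <sys/socket.h>
    #include <netinet/in.h>

    /* With ipv4=off, make the IPv6 listener reject v4-mapped clients. */
    static int make_v6only(int listen_fd)
    {
        int on = 1;
        return setsockopt(listen_fd, IPPROTO_IPV6, IPV6_V6ONLY,
                          &on, sizeof(on));
    }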
Acked-by: Gerd Hoffmann <kraxel@gmail.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
When inet_parse() parses the hostname, it forces the
has_ipv6 and ipv6 flags if the address contains a ":". This
means that if the user had set the ipv4=on flag, to try to
restrict the listener to just IPv4, an error would not have
been raised. e.g.
-incoming tcp:[::]:9000,ipv4
should have raised an error because listening for IPv4
on "::" is a nonsensical combination. With this removed,
we now call getaddrinfo() on "::" passing PF_INET and
so getaddrinfo reports an error about the hostname being
incompatible with the requested protocol:
qemu-system-x86_64: -incoming tcp:[::]:9000,ipv4: address resolution
failed for :::9000: Address family for hostname not supported
Likewise it is explicitly setting the has_ipv4 & ipv4
flags when the address contains only digits + '.'. This
has no ill-effect, but also has no benefit, so is removed.
Acked-by: Gerd Hoffmann <kraxel@gmail.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
When binding to an IPv6 socket we currently force the
IPV6_V6ONLY flag to off. This means that the IPv6 socket
will accept both IPv4 & IPv6 sockets when QEMU is launched
with something like
-vnc :::1
While this is good for that case, it is bad for other
cases. For example if an empty hostname is given,
getaddrinfo resolves it to 2 addresses 0.0.0.0 and ::,
in that order. We will thus bind to 0.0.0.0 first, and
then fail to bind to :: on the same port. The same
problem can happen if any other hostname lookup causes
the IPv4 address to be reported before the IPv6 address.
When we get an IPv6 bind failure, we should re-try the
same port, but with IPV6_V6ONLY turned on again, to
avoid clash with any IPv4 listener.
This ensures that
-vnc :1
will bind successfully to both 0.0.0.0 and ::, and also
prevent
-vnc :1,to=2
from mistakenly using a 2nd port for the :: listener.
This is a regression due to commit 396f935 "ui: add ability to
specify multiple VNC listen addresses".
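A sketch of the retry logic described above, in plain POSIX terms (the QEMU code uses its own socket helpers; the function name here is made up):
    #include <errno.h>
    #include <sys/socket.h>
    #include <netinet/in.h>

    /* If binding "::" fails because an IPv4 wildcard listener already
     * owns the port, retry the same port with IPV6_V6ONLY turned on so
     * the two listeners can coexist. */
    static int bind_v6(int fd, struct sockaddr_in6 *sa6)
    {
        if (bind(fd, (struct sockaddr *)sa6, sizeof(*sa6)) == 0) {
            return 0;
        }
        if (errno == EADDRINUSE) {
            int on = 1;
            setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on));
            return bind(fd, (struct sockaddr *)sa6, sizeof(*sa6));
        }
        return -1;
    }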
Acked-by: Gerd Hoffmann <kraxel@gmail.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
s390x/kvm/migration/cpumodel: fixes, enhancements and cleanups
- add a network boot rom for s390 (Thomas Huth)
- migration of storage attributes like the CMMA used/unused state
- PCI related enhancements - full support for aen, ais and zpci
- migration support for css with vmstates (Halil Pasic)
- cpu model enhancements for cpu features
- guarded storage support
# gpg: Signature made Fri 14 Jul 2017 11:33:04 BST
# gpg: using RSA key 0x117BBC80B5A61C7C
# gpg: Good signature from "Christian Borntraeger (IBM) <borntraeger@de.ibm.com>"
# Primary key fingerprint: F922 9381 A334 08F9 DBAB FBCA 117B BC80 B5A6 1C7C
* remotes/borntraeger/tags/s390x-20170714: (40 commits)
s390x/gdb: add gs registers
s390x/arch_dump: also dump guarded storage control block
s390x/kvm: enable guarded storage
s390x/kvm: Enable KSS facility for nested virtualization
s390x/cpumodel: add esop/esop2 to z12 model
s390x/cpumodel: we are always in zarchitecture mode
s390x/cpumodel: wire up new hardware features
s390x/flic: migrate ais states
s390x/cpumodel: add zpci, aen and ais facilities
s390x: initialize cpu firstly
pc-bios/s390: rebuild s390-ccw.img
pc-bios/s390: add s390-netboot.img
pc-bios/s390-ccw: Link libnet into the netboot image and do the TFTP load
pc-bios/s390-ccw: Add virtio-net driver code
pc-bios/s390-ccw: Add core files for the network bootloading program
roms/SLOF: Update submodule to latest status
pc-bios/s390-ccw: Add code for virtio feature negotiation
pc-bios/s390-ccw: Remove unused structs from virtio.h
pc-bios/s390-ccw: Move byteswap functions to a separate header
pc-bios/s390-ccw: Add a write() function for stdio
...
Conflicts:
target/s390x/kvm.c
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Introduce guarded storage support for KVM guests on s390.
We need to enable the capability, extend machine check validity,
sigp store-additional-status-at-address, and migration.
The feature is fenced for older machine type versions.
Signed-off-by: Fan Zhang <zhangfan@linux.vnet.ibm.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Add esop and esop2 features to z12 model where esop2 was originally
introduced. Disable esop and esop2 when using compatibility machine
v2.9 or earlier.
Signed-off-by: Jason J. Herne <jjherne@linux.vnet.ibm.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
In QEMU, a guest VCPU has always started in z/Architecture mode and has
never been able to leave it. Now we have an architected way of showing this
condition.
The SIGP SET ARCHITECTURE instruction is simply rejected. Linux as a guest
seems not to care about the return value, which is a good thing.
The new handling is just like already being in z/Architecture mode.
We'll not try to fake absence of this facility, but still not indicate
the facility in case some strange CPU model turned z/Architecture off
completely (which doesn't work either way, but lets us see how a
guest would react to the lack of this facility).
Signed-off-by: Jason J. Herne <jjherne@linux.vnet.ibm.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
During migration we should transfer ais states to the target guest.
This patch introduces a subsection to kvm_s390_flic_vmstate and new
vmsd for qemu_flic. The ais states need to be migrated only when
ais is supported.
Signed-off-by: Yi Min Zhao <zyimin@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
zPCI instructions and facilities are available since IBM zEnterprise
EC12. To support zPCI in QEMU we enable the zpci, aen and ais facilities
starting with zEC12 GA1, and we always set the zpci and aen bits in the
max cpu model. Later they might be switched off when a real cpu model is
applied. For the ais bit, we only provide it in the full cpu model
beginning with zEC12 and defer its enablement in the default cpu model to
a later point in time. At the same time, disable them for 2.9 and older
machines.
With the AIS facility introduced, we can check whether it is enabled and
initialize flic->ais_supported with the real value.
Signed-off-by: Yi Min Zhao <zyimin@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
rebuild after the following commits
4b996d0 pc-bios/s390-ccw: Link libnet into the netboot image and do the TFTP load
e6879a6 pc-bios/s390-ccw: Add virtio-net driver code
766500f pc-bios/s390-ccw: Add core files for the network bootloading program
f807e55 pc-bios/s390-ccw: Add code for virtio feature negotiation
b4e3b4f pc-bios/s390-ccw: Remove unused structs from virtio.h
dd3dc5e pc-bios/s390-ccw: Move byteswap functions to a separate header
a20b4fe pc-bios/s390-ccw: Add a write() function for stdio
262e07c pc-bios/s390-ccw: Move virtio-block related functions into a separate file
7438d32 pc-bios/s390-ccw: Move ebc2asc to sclp.c
8760bad pc-bios/s390-ccw: Move libc functions to separate header
c68f450 pc-bios/s390-ccw: use STRIP variable in Makefile
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
It's already possible to do a network boot of an s390x guest with an
external netboot image based on a Linux installation, but it would
be much more convenient if the s390-ccw firmware supported network
booting right out of the box, without the need to assemble such an
external image first.
This is an s390-netboot.img that can be used for network booting.
You can download a combined kernel + initrd image via TFTP
by starting QEMU for example with:
qemu-system-s390x ... -device virtio-net,netdev=n1,bootindex=1 \
-netdev user,id=n1,tftp=/path/to/tftp,bootfile=kernel.img
Note that this version does not support downloading via config
files (i.e. pxelinux config files or .INS config files) yet. This
will be added later.
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
This is just a preparation for the next steps: Add a makefile and a
stripped down copy of pc-bios/s390-ccw/main.c as a basis for the network
bootloader program, linked against the libc from SLOF already (which we
will need for SLOF's libnet). The networking code is not included yet.
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-Id: <1499863793-18627-10-git-send-email-thuth@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
The upcoming netboot code will use the libc from SLOF. To be able
to still use s390-ccw.h there, the libc related functions in this
header have to be moved to a different location.
And while we're at it, remove the duplicate memcpy() function from
sclp.c.
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-Id: <1499863793-18627-2-git-send-email-thuth@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Since we are going to need a migration-compatibility-breaking change to
activate ChannelSubSys migration, let us use the opportunity to introduce
the ORB to the SubchDev before that (otherwise we would need separate
handling, e.g. a compat property).
The ORB will be useful for implementing IDA, or async handling of
subchannel work.
Signed-off-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Reviewed-by: Guenther Hutzl <hutzl@linux.vnet.ibm.com>
Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Message-Id: <20170711145441.33925-5-pasic@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Although we have recently vmstatified the migration of some css
infrastructure, for some css entities there is still state to be
migrated left, because the focus was keeping migration stream
compatibility (that is basically everything as-is).
Let us add vmstate helpers and extend existing vmstate descriptions so
that we have everything we need. Let us guard the added state via
css_migration_enabled, so we keep the compatible behavior if css
migration is disabled.
Let's also annotate the bits which do not need to be migrated for better
readability.
Signed-off-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Message-Id: <20170711145441.33925-4-pasic@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Currently the migration of the channel subsystem (css) is only partial
and is done by the virtio ccw proxies -- the only migratable css devices
existing at the moment.
With the current work on emulated and passthrough devices we need to
decouple the migration of the channel subsystem state from virtio ccw,
and have a separate section for it. A new section however necessarily
breaks the migration compatibility.
So let us introduce a switch at the machine class, and put it in 'off'
state for now. We will turn the switch 'on' for future machines once all
preparations are met. For compatibility machines the switch will stay
'off'.
Signed-off-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Message-Id: <20170711145441.33925-3-pasic@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Let's use the new inject_airq callback of the flic to inject adapter
interrupts. For the kvm case, if the kernel flic doesn't support the new
interface, the irq routine remains unchanged. For the non-kvm case,
qemu-flic handles the suppression process.
Signed-off-by: Yi Min Zhao <zyimin@linux.vnet.ibm.com>
Signed-off-by: Fei Li <sherrylf@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Let's introduce a specialized way to inject adapter interrupts that,
unlike the common interrupt injection method, allows taking the
characteristics of the adapter into account.
For adapters subject to the AIS facility:
- for the non-kvm case, we handle the suppression for a given ISC in QEMU.
- for the kvm case, we pass the adapter id to kvm to do airq injection.
Add tracepoints for suppressed and suppressing airqs.
Signed-off-by: Yi Min Zhao <zyimin@linux.vnet.ibm.com>
Signed-off-by: Fei Li <sherrylf@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
In order to emulate the adapter interruption suppression (AIS)
facility properly, the guest needs to be able to modify the AIS mask.
Interrupt suppression will be handled via the flic (for kvm, via a
recently introduced kernel backend; for !kvm, in the flic code), so
let's introduce a method to change the mode via the flic interface.
We introduce the 'simm' and 'nimm' fields to QEMUS390FLICState
to store interruption modes for each ISC. Each bit in 'simm' and
'nimm' targets one ISC, and collaboratively indicate three modes:
ALL-Interruptions, SINGLE-Interruption and NO-Interruptions. This
interface can initiate most transitions between the states; transition
from SINGLE-Interruption to NO-Interruptions via adapter interrupt
injection will be introduced in a following patch. The meaningful
combinations are as follows:
interruption mode | simm bit | nimm bit
------------------|----------|----------
ALL | 0 | 0
SINGLE | 1 | 0
NO | 1 | 1
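For illustration only, the table can be read as the following pure-C helper (the enum and function names are made up and not part of the flic code):
    enum AisMode { AIS_MODE_ALL, AIS_MODE_SINGLE, AIS_MODE_NO };

    /* Map the (simm, nimm) bit pair of one ISC to its interruption mode. */
    static enum AisMode ais_mode(int simm_bit, int nimm_bit)
    {
        if (!simm_bit) {
            return AIS_MODE_ALL;            /* simm=0, nimm=0 */
        }
        return nimm_bit ? AIS_MODE_NO       /* simm=1, nimm=1 */
                        : AIS_MODE_SINGLE;  /* simm=1, nimm=0 */
    }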
Co-authored-by: Yi Min Zhao <zyimin@linux.vnet.ibm.com>
Signed-off-by: Yi Min Zhao <zyimin@linux.vnet.ibm.com>
Signed-off-by: Fei Li <sherrylf@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Introduce a new 'flags' field to IoAdapter to contain further
characteristics of the adapter, like whether the adapter is subject to
adapter-interruption suppression.
For the kvm case, pass this value in the 'flags' field when
registering an adapter.
Signed-off-by: Fei Li <sherrylf@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Clean up spacing and add comments to clarify difference between base, full and
default models.
Not having spacing around the model definitions in gen-features.c is
particularly frustrating as the reader tends to misinterpret which model they
are looking at or editing.
Signed-off-by: Jason J. Herne <jjherne@linux.vnet.ibm.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Add an "info" monitor command to non-destructively inspect the state of
the storage attributes of the guest, and a normal command to toggle
migration mode (useful for debugging).
Signed-off-by: Claudio Imbrenda <imbrenda@linux.vnet.ibm.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
commit af3c8d98508d37541d4bf57f13a984a7f73a328c
Merge tag 'drm-for-v4.13' of git://people.freedesktop.org/~airlied/linux
There is a change pending for v4.13-rc1 in linux-headers/linux/kvm.h;
I will submit a fixup patch for 2.10 as soon as it hits the kernel.
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Unlike the usual object_property_add_link() invocations in other
devices, ivshmem checks the "is mapped" state of the backend in addition
to qdev_prop_allow_set_link_before_realize. To convert it without
specializing DEFINE_PROP_LINK which always uses the qdev callback, move
the extra check to device realize time.
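Roughly, the conversion described looks like this (a sketch with assumed names; the real ivshmem code may differ in detail):
    /* The property becomes a plain DEFINE_PROP_LINK, which uses the
     * generic qdev link check callback ... */
    static Property ivshmem_properties[] = {
        DEFINE_PROP_LINK("memdev", IVShmemState, hostmem,
                         TYPE_MEMORY_BACKEND, HostMemoryBackend *),
        DEFINE_PROP_END_OF_LIST(),
    };

    /* ... and the extra "is the backend already mapped?" check moves
     * into the device's realize function. */
    static void ivshmem_plain_realize(DeviceState *dev, Error **errp)
    {
        IVShmemState *s = IVSHMEM_COMMON(dev);

        if (s->hostmem && host_memory_backend_is_mapped(s->hostmem)) {
            error_setg(errp, "memory backend is already in use");
            return;
        }
        /* ... rest of realize ... */
    }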
Signed-off-by: Fam Zheng <famz@redhat.com>
Message-Id: <20170714021509.23681-12-famz@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Unlike the usual object_property_add_link() invocations in other
devices, dimm checks the "is mapped" state of the backend in addition to
qdev_prop_allow_set_link_before_realize. To convert it without
specializing DEFINE_PROP_LINK which always uses the qdev general check
callback, move the extra check to device realize time.
Signed-off-by: Fam Zheng <famz@redhat.com>
Message-Id: <20170714021509.23681-11-famz@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Unlike other object_property_add_link() occurrences in virtio devices,
virtio-crypto checks the "in use" state of the linked backend object in
addition to qdev_prop_allow_set_link_before_realize. To convert it
without needing to specialize DEFINE_PROP_LINK which always uses the
qdev callback, move the "in use" check to device realize time.
Signed-off-by: Fam Zheng <famz@redhat.com>
Message-Id: <20170714021509.23681-10-famz@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
A link's check callback is supposed to verify/permit setting the link;
however, currently nothing restricts it from being misused to modify the
target object from within.
Make sure that read-only semantics are checked by the compiler
to prevent misuse of the callback.
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: Fam Zheng <famz@redhat.com>
Message-Id: <20170714021509.23681-2-famz@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
This check is redundant because it is already performed by the only
caller of dump_exec_info -- the caller was updated by b7da97eef
("monitor: Check whether TCG is enabled before running the "info jit"
code").
Checking twice wouldn't necessarily be too bad, but here the check also
returns with tb_lock held. So we can either do the check before tb_lock is
acquired, or just get rid of it. Given that it is redundant, I am going
for the latter option.
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Commit e7b161d573 ("vl: add tcg_enabled() for tcg related code") adds
a check to exit the program when !tcg_enabled() while parsing the -tb-size
flag.
It turns out that when the -tb-size flag is evaluated, tcg_enabled() can
only return 0, since it is set (or not) much later by configure_accelerator().
Fix it by unconditionally exiting if the flag is passed to a QEMU binary
built with !CONFIG_TCG.
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
The upstream NBD Protocol has defined a new extension to allow
the server to advertise block sizes to the client, as well as
a way for the client to inform the server whether it intends to
obey block sizes.
When using the block layer as the client, we will obey block
sizes; but when used as 'qemu-nbd -c' to hand off to the
kernel nbd module as the client, we are still waiting for the
kernel to implement a way for us to learn if it will honor
block sizes (perhaps by an addition to sysfs, rather than an
ioctl), as well as any way to tell the kernel what additional
block sizes to obey (NBD_SET_BLKSIZE appears to be accurate
for the minimum size, but preferred and maximum sizes would
probably be new ioctl()s), so until then, we need to make our
request for block sizes conditional.
When using ioctl(NBD_SET_BLKSIZE) to hand off to the kernel,
use the minimum block size as the sector size if it is larger
than 512, which also has the nice effect of cooperating with
(non-qemu) servers that don't do read-modify-write when
exposing a block device with 4k sectors; it might also allow
us to visit a file larger than 2T on a 32-bit kernel.
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20170707203049.534-10-eblake@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
The upstream NBD Protocol has defined a new extension to allow
the server to advertise block sizes to the client, as well as
a way for the client to inform the server that it intends to
obey block sizes.
Thanks to a recent fix (commit df7b97ff), our real minimum
transfer size is always 1 (the block layer takes care of
read-modify-write on our behalf), but we're still more efficient
if we advertise 512 when the client supports it, as follows:
- OPT_INFO, but no NBD_INFO_BLOCK_SIZE: advertise 512, then
fail with NBD_REP_ERR_BLOCK_SIZE_REQD; client is free to try
something else since we don't disconnect
- OPT_INFO with NBD_INFO_BLOCK_SIZE: advertise 512
- OPT_GO, but no NBD_INFO_BLOCK_SIZE: advertise 1
- OPT_GO with NBD_INFO_BLOCK_SIZE: advertise 512
We can also advertise the optimum block size (presumably the
cluster size, when exporting a qcow2 file), and our absolute
maximum transfer size of 32M, to help newer clients avoid
EINVAL failures or abrupt disconnects on oversize requests.
We do not reject clients for using the older NBD_OPT_EXPORT_NAME;
we are no worse off for those clients than we used to be.
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20170707203049.534-9-eblake@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
NBD_OPT_EXPORT_NAME is lousy: per the NBD protocol, any failure
requires the server to close the connection rather than report an
error to us. Therefore, upstream NBD recently added NBD_OPT_GO as
the improved version of the option that does what we want [1]: it
reports sane errors on failures, and on success provides at least
as much info as NBD_OPT_EXPORT_NAME.
[1] https://github.com/NetworkBlockDevice/nbd/blob/extension-info/doc/proto.md
This is a first cut at use of the information types. Note that we
do not need to use NBD_OPT_INFO, and that use of NBD_OPT_GO means
we no longer have to use NBD_OPT_LIST to learn whether a server
requires TLS (this requires servers that gracefully handle unknown
NBD_OPT, many servers prior to qemu 2.5 were buggy, but I have patched
qemu, upstream nbd, and nbdkit in the meantime, in part because of
interoperability testing with this patch). We still fall back to
NBD_OPT_LIST when NBD_OPT_GO is not supported on the server, as it
is still one last chance for a nicer error message. Later patches
will use further info, like NBD_INFO_BLOCK_SIZE.
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20170707203049.534-8-eblake@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
NBD_OPT_EXPORT_NAME is lousy: per the NBD protocol, any failure
requires us to close the connection rather than report an error.
Therefore, upstream NBD recently added NBD_OPT_GO as the improved
version of the option that does what we want [1], along with
NBD_OPT_INFO that returns the same information but does not
transition to transmission phase.
[1] https://github.com/NetworkBlockDevice/nbd/blob/extension-info/doc/proto.md
This is a first cut at the information types, and only passes the
same information already available through NBD_OPT_LIST and
NBD_OPT_EXPORT_NAME; items like NBD_INFO_BLOCK_SIZE (and thus any
use of NBD_REP_ERR_BLOCK_SIZE_REQD) are intentionally left for
later patches.
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20170707203049.534-7-eblake@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reply directly in nbd_negotiate_handle_export_name(), rather than
waiting until nbd_negotiate_options() completes. This will make it
easier to implement NBD_OPT_GO. Pass additional parameters around,
rather than stashing things inside NBDClient.
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20170707203049.534-6-eblake@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
The NBD protocol has several constants defined in various extensions
that we are about to implement. Expose them to the code, along with
an easy way to map various constants to strings during diagnostic
messages.
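For illustration, constants of this kind plus a small name-mapping helper (the numeric values follow the public NBD protocol document; the helper name here is made up):
    /* Negotiation option codes from the NBD protocol. */
    #define NBD_OPT_EXPORT_NAME  1
    #define NBD_OPT_ABORT        2
    #define NBD_OPT_LIST         3
    #define NBD_OPT_STARTTLS     5
    #define NBD_OPT_INFO         6
    #define NBD_OPT_GO           7

    /* Map an option code to a human-readable string for diagnostics. */
    static const char *nbd_opt_name(unsigned int opt)
    {
        switch (opt) {
        case NBD_OPT_EXPORT_NAME: return "export name";
        case NBD_OPT_ABORT:       return "abort";
        case NBD_OPT_LIST:        return "list";
        case NBD_OPT_STARTTLS:    return "starttls";
        case NBD_OPT_INFO:        return "info";
        case NBD_OPT_GO:          return "go";
        default:                  return "<unknown>";
        }
    }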
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20170707203049.534-4-eblake@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
The NBD Protocol is introducing some additional information
about exports, such as minimum request size and alignment, as
well as an advertised maximum request size. It will be easier
to feed this information back to the block layer if we gather
all the information into a struct, rather than adding yet more
pointer parameters during negotiation.
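The shape of such a struct might be roughly as follows (field names are assumptions based on the description above, not necessarily the ones used in the patch):
    #include <stdint.h>

    typedef struct NBDExportInfo {
        uint64_t size;        /* export size in bytes */
        uint16_t flags;       /* transmission flags */
        uint32_t min_block;   /* minimum request size / alignment */
        uint32_t opt_block;   /* preferred request size */
        uint32_t max_block;   /* advertised maximum request size */
    } NBDExportInfo;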
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20170707203049.534-2-eblake@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
This finishes QOM'fication of IOMMUMemoryRegion by introducing
an IOMMUMemoryRegionClass. This also provides a fastpath analog for
IOMMU_MEMORY_REGION_GET_CLASS().
This makes IOMMUMemoryRegion an abstract class.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Message-Id: <20170711035620.4232-3-aik@ozlabs.ru>
Acked-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
This defines a new QOM object - IOMMUMemoryRegion - with MemoryRegion
as its parent.
This moves IOMMU-related fields from MR to IOMMU MR. However, to avoid
dynamic QOM casting in the fast path (address_space_translate, etc.),
this adds an @is_iommu boolean flag to MR and provides a new helper to
do a simple cast to IOMMU MR - memory_region_get_iommu. The flag
is set in the instance init callback. This defines
memory_region_is_iommu as memory_region_get_iommu() != NULL.
This switches MemoryRegion to IOMMUMemoryRegion in most places except
the ones where MemoryRegion may be an alias.
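A sketch of the resulting fast-path test, following the definition above that memory_region_is_iommu is just memory_region_get_iommu() != NULL (mr is a placeholder MemoryRegion pointer):
    /* No dynamic QOM cast on the hot path: the @is_iommu flag set at
     * instance-init time makes memory_region_get_iommu() a cheap check. */
    IOMMUMemoryRegion *iommu_mr = memory_region_get_iommu(mr);
    if (iommu_mr) {
        /* translate the access through the IOMMU */
    } else {
        /* plain RAM/MMIO path */
    }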
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Message-Id: <20170711035620.4232-2-aik@ozlabs.ru>
Acked-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
The thread-id of 0 means any CPU, but we then ignore the fact that we find
the first_cpu in this case, which can have an index of 0. Instead of
bailing out, just test whether we have managed to match the thread-id to a
CPU.
Otherwise you get:
gdb_handle_packet: command='vCont;C04:0;c'
put_packet: reply='E22'
The actual reason for gdb sending vCont;C04:0;c was fixed in a
previous commit where we ensure the first_cpu's tid is correctly
reported to gdb; however, we should still behave correctly the next time
it does send 0.
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Claudio Imbrenda <imbrenda@linux.vnet.ibm.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-Id: <20170712105216.747-5-alex.bennee@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
This was only used by the gdbstub and even then was only being set for
subsequent threads. Rather than continue duplicating the number, just
make the gdbstub get the information from the TaskState structure.
Now that the tid is correctly reported for all threads, the bug I was
seeing with "vCont;C04:0;c" packets is fixed, as the correct tid is
reported to gdb.
I moved cpu_gdb_index into the gdbstub to facilitate easy access to
the TaskState which is used elsewhere in gdbstub.
To prevent BSD failing to build I've included ts_tid into its
TaskStruct but not populated it - which was the same state as the old
cpu->host_tid. I'll leave it up to the BSD maintainers to actually
populate this properly if they want a working gdbstub with
user-threads.
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Claudio Imbrenda <imbrenda@linux.vnet.ibm.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Laurent Vivier <laurent@vivier.eu>
Message-Id: <20170712105216.747-4-alex.bennee@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Convert to a gdb_debug helper which compiles away to nothing when not
used but still ensures the format strings are checked. There is some
minor code motion for the incorrect-checksum message, to report it
before we attempt to send the reply.
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Greg Kurz <groug@kaod.org>
Message-Id: <20170712105216.747-2-alex.bennee@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
In mttcg, calling pause_all_vcpus() during execution from the
generated TBs causes a deadlock if some vCPU is waiting for exclusive
execution in start_exclusive(). Fix this by using the async_safe_*
framework instead of pausing vcpus for patching instructions.
CC: Paolo Bonzini <pbonzini@redhat.com>
CC: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>
Message-Id: <20170712215143.19594-2-bobby.prani@gmail.com>
[Get rid completely of the TCG-specific code. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
When accessing a guest's ram block during a DMA operation, use
'qemu_ram_ptr_length' to get the ram block pointer. It ensures
that a DMA operation of the given length is possible, and avoids
any OOB memory access situations.
Reported-by: Alex <broscutamaker@gmail.com>
Signed-off-by: Prasad J Pandit <pjp@fedoraproject.org>
Message-Id: <20170712123840.29328-1-ppandit@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
This allows changing the port's backend at runtime, e.g. changing it from
a file to a socket, making it possible to establish a debug session with
WinDbg:
> qemu-system [..] -chardev file,id=charchannel2,path=/tmp/charchannel2 \
-device isa-serial,chardev=charchannel2,id=channel2
QEMU 2.9.50 monitor - type 'help' for more information
(qemu) chardev-change charchannel2 \
socket,host=127.0.0.1,port=4242,server,nowait
For a backend change, a number of ioctls have to be replayed to sync
the current setup of a frontend to a backend tty. This is hopefully
enough, so we don't have to track, store and replay the whole original
control byte sequence.
Signed-off-by: Anton Nefedov <anton.nefedov@virtuozzo.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <1499342940-56739-14-git-send-email-anton.nefedov@virtuozzo.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
This patch adds the possibility to change a char device without removing
the frontend.
Ideally, this would happen transparently to the frontend, i.e. the
frontend would continue its regular operation.
However, backends are not stateless and are set up by the frontends
via qemu_chr_fe_<> functions, and it's not (generally) possible to replay
that setup entirely in backend code, as different chardevs respond
to the setup calls differently, and frontends behave differently based
on those setup responses.
Moreover, a frontend can generally get and save the backend pointer
(qemu_chr_fe_get_driver()), and it will become invalid after a backend
change.
So, a frontend which would like to support chardev hotswap has to register
a "backend change" handler, and redo its backend setup there.
Signed-off-by: Anton Nefedov <anton.nefedov@virtuozzo.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <1499342940-56739-4-git-send-email-anton.nefedov@virtuozzo.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
This warning is included in -Wall by clang, but not by GCC (which only
enables it for -Wextra). Include it in the list of warnings we enable
to minimize the differences between the compilers.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Let's keep track of cmma enablement and move the mem_path check into
the actual enablement. This now also warns users who do not use
cpu models that cmma is disabled when using huge pages.
Signed-off-by: Janosch Frank <frankja@linux.vnet.ibm.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
MIPS patches 2017-07-11
Changes:
* Fix MSA copy_[s|u]_df corner case of rd = 0
* Update malta to load the initrd at the end of the low memory
# gpg: Signature made Tue 11 Jul 2017 15:42:20 BST
# gpg: using RSA key 0x2238EB86D5F797C2
# gpg: Good signature from "Yongbok Kim <yongbok.kim@imgtec.com>"
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: 8600 4CF5 3415 A5D9 4CFA 2B5C 2238 EB86 D5F7 97C2
* remotes/yongbok/tags/mips-20170711:
mips/malta: load the initrd at the end of the low memory
target/mips: fix msa copy_[s|u]_df rd = 0 corner case
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
The gen_ prefix is awkward. Generated C should go through cgen()
exactly once (see commit 1f9a7a1). The common way to get this wrong is
passing a foo=gen_foo() keyword argument to mcgen(). I'd like us to
adopt a naming convention where gen_ means "something that's been piped
through cgen(), and thus must not be passed to cgen() or mcgen()".
Requires renaming gen_params(), gen_marshal_proto() and
gen_event_send_proto().
Suggested-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-Id: <20170601124143.10915-1-marcandre.lureau@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
The recent commit b097efc0 used qobject_decref(QOBJECT(E)), even
though we already have QDECREF(E) for that purpose. We can update
our coccinelle script to catch any future relapses; with that in
place, the rest of the patch is generated with:
spatch --sp-file scripts/coccinelle/qobject.cocci \
--macro-file scripts/cocci-macro-file.h --dir . --in-place
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20170624181008.25497-3-eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Dan's addition of key-secret improvements in commit 29cf9336 was
developed prior to the addition of QDict scalar insertion macros,
but merged after the general cleanup in commit 46f5ac20.
Patch created mechanically by rerunning:
spatch --sp-file scripts/coccinelle/qobject.cocci \
--macro-file scripts/cocci-macro-file.h --dir . --in-place
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Message-Id: <20170624181008.25497-2-eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
This function creates a collection of self-describing refcount
structures (including a new refcount table) at the end of a qcow2 image
file. Optionally, these structures can also describe a number of
additional clusters beyond themselves; this will be important for
preallocated truncation, which will place the data clusters and L2
tables there.
For now, we can use this function to replace the part of
alloc_refcount_block() that grows the refcount table (from which it is
actually derived).
Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20170613202107.10125-13-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
preallocate() is and will be called only from places that do not
otherwise need to lock s->lock: currently that is qcow2_create2(); as of
a future patch it will be called from qcow2_truncate(), too.
It therefore makes sense to move locking that mutex into preallocate()
itself.
Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20170613202107.10125-11-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
This patch adds two new parameters to the preallocate() function so we
will be able to use it not just for preallocating a new image but also
for preallocated image growth.
The offset parameter allows the caller to specify a virtual offset from
which to start preallocating. For newly created images this is always 0,
but for preallocating growth this will be the old image length.
The new_length parameter specifies the supposed new length of the image
(basically the "end offset" for preallocation). During image truncation,
bdrv_getlength() will return the old image length so we cannot rely on
its return value then.
Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20170613202107.10125-10-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
Currently, raw_regular_truncate() is intended for setting the size of a
newly created file. However, we also want to use it for truncating an
existing file in which case only the newly added space (when growing)
should be preallocated.
This also means that if resizing failed, we should try to restore the
original file size. This is important when using preallocation.
Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20170613202107.10125-8-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
Add a --preallocation command line option to qemu-img resize which can
be used to set the PreallocMode parameter of blk_truncate().
While touching this code, fix the fact that we did not handle errors
returned by blk_getlength().
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20170613202107.10125-5-mreitz@redhat.com
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
Some tests produce format-dependent output. Either the difference is
filtered out and ignored, or the test case is format-specific so we
don't need to worry about per-format output differences.
There is a third case: the test script is the same for all image formats
and the format-dependent output is relevant. An ugly workaround is to
copy-paste the test into multiple per-format test cases. This
duplicates code and is not maintainable.
This patch allows test cases to add per-format golden output files so a
single test case can work correctly when format-dependent output must be
checked:
123.out.qcow2
123.out.raw
123.out.vmdk
...
This naming scheme is not composable with 123.out.nocache or 123.pc.out,
two other scenarios where output files are split. I don't think it
matters since few test cases need these features.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Message-id: 20170705125738.8777-9-stefanha@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
The measure subcommand calculates the size required by a new image file.
This can be used by users or management tools that need to allocate
space on an LVM volume, SAN LUN, etc before creating or converting an
image file.
Suggested-by: Maor Lipchuk <mlipchuk@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Message-id: 20170705125738.8777-8-stefanha@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
The refcount metadata size calculation is inaccurate and can produce
numbers that are too small. This is bad because we should calculate a
conservative number - one that is guaranteed to be large enough.
This patch switches the approach to a fixed point calculation because
the existing equation is hard to solve when inaccuracies are taken care
of.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Message-id: 20170705125738.8777-5-stefanha@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
bdrv_measure() provides a conservative maximum for the size of a new
image. This information is handy if storage needs to be allocated (e.g.
a SAN or an LVM volume) ahead of time.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Message-id: 20170705125738.8777-2-stefanha@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
POSIX says that backslashes in the arguments to 'echo', as well as
any use of 'echo -n' and 'echo -e', are non-portable; it recommends
people should favor 'printf' instead. This is definitely true where
we do not control which shell is running (such as in makefile snippets
or in documentation examples). But even for scripts where we
require bash (and therefore, where echo does what we want by default),
it is still possible to use 'shopt -s xpg_echo' to change bash's
behavior of echo. And setting a good example never hurts when we are
not sure if a snippet will be copied from a bash-only script to a
general shell script (although I don't change the use of non-portable
\e for ESC when we know the running shell is bash).
Replace 'echo -n "..."' with 'printf %s "..."', and 'echo -e "..."'
with 'printf %b "...\n"', with the optimization that the %s/%b
argument can be omitted if the string being printed is a strict
literal with no '%', '$', or '`' (we could technically also make
this optimization when there are $ or `` substitutions but where
we can prove their results will not be problematic, but proving
that such substitutions are safe makes the patch less trivial
compared to just being consistent).
In the qemu-iotests check script, fix unusual shell quoting
that would result in word-splitting if 'date' outputs a space.
In test 051, take an opportunity to shorten the line.
In test 068, get rid of a pointless second invocation of bash.
CC: qemu-trivial@nongnu.org
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-id: 20170703180950.9895-1-eblake@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
A user may specify a relative path for accessing qemu, qemu-img, etc.
through environment variables ($QEMU_PROG and friends) or a symlink.
If a test decides to change its working directory, relative paths will
cease to work, however. Work around this by making all of the paths to
programs that should undergo testing absolute. Besides "realpath", we
also have to use "type -p" to support programs in $PATH.
As a side effect, this fixes specifying these programs as symlinks for
out-of-tree builds: Before, you would have to create two symlinks, one
in the build and one in the source tree (the first one for common.config
to find, the second one for the iotest to use). Now it is sufficient to
create one in the build tree because common.config will resolve it.
Reported-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20170702150510.23276-2-mreitz@redhat.com
Reviewed-by: Eric Blake <eblake@redhat.com>
Tested-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
On some distros, whenever you close a block device file
descriptor there is a udev rule that resets the file
permissions. This can race with the test script when
we run qemu-io multiple times against the same block
device. Occasionally the second qemu-io invocation
will find udev has reset the permissions causing failure.
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170626123510.20134-6-berrange@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
By default the PBKDF algorithm used with LUKS is tuned
based on the number of iterations to produce 1 second
of running time. This makes running the I/O test with
the LUKS format orders of magnitude slower than with
qcow2/raw formats.
When creating LUKS images, set the iteration time to
10ms to reduce the time overhead for LUKS, since
security does not matter in I/O tests.
Previously a full 'check -luks' would take
$ time ./check -luks
Passed all 22 tests
real 23m9.988s
user 21m46.223s
sys 0m22.841s
Now it takes
$ time ./check -luks
Passed all 22 tests
real 4m39.235s
user 3m29.590s
sys 0m24.234s
Still slow compared to qcow2/raw, but much improved
nonetheless.
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170626123510.20134-4-berrange@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
The tests 033, 140, 145 and 157 were all broken
when run with LUKS, since they did not correctly use
the required image-opts syntax to specify the
decryption secret. Further, test 120 simply does
not make sense to run with luks, as the scenario
exercised is not relevant.
Test 181 was broken when run with LUKS because
it didn't take into account the fact that $TEST_IMG was
already in image-opts syntax. The launch_qemu
helper also didn't register the secret object
providing the LUKS password.
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170626123510.20134-3-berrange@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
While the qemu-img dd command does accept --image-opts,
this is not sufficient to make it work with the LUKS
format yet. This is because bdrv_create() still always
requires the non-image-opts syntax.
Thus we must skip tests 159 and 170 with luks for now.
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170626123510.20134-2-berrange@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
The new field BdrvDirtyBitmap.persistent means that the bitmap should be
saved by the format driver in .bdrv_close and .bdrv_inactivate. No format
driver supports it yet.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-id: 20170628120530.31251-18-vsementsov@virtuozzo.com
[mreitz: Fixed indentation]
Signed-off-by: Max Reitz <mreitz@redhat.com>
This will be needed in the following commits for persistent bitmaps.
If a bitmap is loaded from read-only storage (and we can't mark it
"in use" in that storage), the corresponding BdrvDirtyBitmap should be
read-only.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-id: 20170628120530.31251-11-vsementsov@virtuozzo.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
Add the bitmap extension as specified in docs/specs/qcow2.txt.
For now, just mirror the extension header into the Qcow2 state and check
constraints. Also, calculate refcounts for qcow2 bitmaps, so as not to
break qemu-img check.
For now, disable image resize if the image has bitmaps. This will be
fixed later.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Message-id: 20170628120530.31251-9-vsementsov@virtuozzo.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
A bitmap directory entry is sometimes called a 'bitmap header'. This
patch leaves only one name - 'bitmap directory entry'. The name 'bitmap
header' creates confusion with 'qcow2 header' and 'qcow2 bitmap
header extension' (which is an extension of the qcow2 header).
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Message-id: 20170628120530.31251-3-vsementsov@virtuozzo.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
mirror_complete opens the backing chain, which should have the same
AioContext as the top when using iothreads. Make the code guarantee
this, which fixes a failed assertion in bdrv_attach_child.
Signed-off-by: sochin.jiang <sochin.jiang@huawei.com>
Message-id: 1498475064-39816-1-git-send-email-sochin.jiang@huawei.com
[mreitz: Reworded commit message]
Signed-off-by: Max Reitz <mreitz@redhat.com>
While the crypto layer uses a fixed option name "key-secret",
the upper block layer may have a prefix on the options. e.g.
"encrypt.key-secret", in order to avoid clashes between crypto
option names & other block option names. To ensure the crypto
layer can report accurate error messages, we must tell it what
option name prefix was used.
Reviewed-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170623162419.26068-19-berrange@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
Now that all encryption keys must be provided upfront via
the QCryptoSecret API and associated block driver properties
there is no need for any explicit encryption handling APIs
in the block layer. Encryption can be handled transparently
within the block driver. We only retain an API for querying
whether an image is encrypted or not, since that is a
potentially useful piece of metadata to report to the user.
Reviewed-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170623162419.26068-18-berrange@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
Now that qcow & qcow2 are wired up to get encryption keys
via the QCryptoSecret object, nothing is relying on the
interactive prompting for passwords. All the code related
to password prompting can thus be ripped out.
Reviewed-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170623162419.26068-17-berrange@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
This adds support for using LUKS as an encryption format
with the qcow2 file, using the new encrypt.format parameter
to request "luks" format. e.g.
# qemu-img create --object secret,data=123456,id=sec0 \
-f qcow2 -o encrypt.format=luks,encrypt.key-secret=sec0 \
test.qcow2 10G
The legacy "encryption=on" parameter still results in
creation of the old qcow2 AES format (and is equivalent
to the new 'encryption-format=aes'). e.g. the following are
equivalent:
# qemu-img create --object secret,data=123456,id=sec0 \
-f qcow2 -o encryption=on,encrypt.key-secret=sec0 \
test.qcow2 10G
# qemu-img create --object secret,data=123456,id=sec0 \
-f qcow2 -o encryption-format=aes,encrypt.key-secret=sec0 \
test.qcow2 10G
With the LUKS format it is necessary to store the LUKS
partition header and key material in the QCow2 file. This
data can be many MB in size, so cannot go into the QCow2
header region directly. Thus the spec defines a FDE
(Full Disk Encryption) header extension that specifies
the offset of a set of clusters to hold the FDE headers,
as well as the length of that region. The LUKS header is
thus stored in these extra allocated clusters before the
main image payload.
Aside from all the cryptographic differences implied by
use of the LUKS format, there is one further key difference
between the use of legacy AES and LUKS encryption in qcow2.
For LUKS, the initialization vectors are generated using
the host physical sector as the input, rather than the
guest virtual sector. This guarantees unique initialization
vectors for all sectors when qcow2 internal snapshots are
used, thus giving stronger protection against watermarking
attacks.
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170623162419.26068-14-berrange@redhat.com
Reviewed-by: Alberto Garcia <berto@igalia.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
This converts the qcow2 driver to make use of the QCryptoBlock
APIs for encrypting image content, using the legacy QCow2 AES
scheme.
With this change it is now required to use the QCryptoSecret
object for providing passwords, instead of the current block
password APIs / interactive prompting.
$QEMU \
-object secret,id=sec0,file=/home/berrange/encrypted.pw \
-drive file=/home/berrange/encrypted.qcow2,encrypt.key-secret=sec0
The test 087 could be simplified since there is no longer a
difference in behaviour when using blockdev_add with encrypted
images for the running vs stopped CPU state.
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170623162419.26068-12-berrange@redhat.com
Reviewed-by: Alberto Garcia <berto@igalia.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
Instead of requiring separate input/output buffers for
encrypting data, change qcow2_encrypt_sectors() to assume
use of a single buffer, encrypting in place. The current
callers all used the same buffer for input/output already.
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170623162419.26068-11-berrange@redhat.com
Reviewed-by: Alberto Garcia <berto@igalia.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
This converts the qcow driver to make use of the QCryptoBlock
APIs for encrypting image content. This is only wired up to
permit use of the legacy QCow encryption format. Users who wish
to have the strong LUKS format should switch to qcow2 instead.
With this change it is now required to use the QCryptoSecret
object for providing passwords, instead of the current block
password APIs / interactive prompting.
$QEMU \
-object secret,id=sec0,file=/home/berrange/encrypted.pw \
-drive file=/home/berrange/encrypted.qcow,encrypt.format=aes,\
encrypt.key-secret=sec0
Though note that running QEMU system emulators with the AES
encryption is no longer supported, so while the above syntax
is valid, QEMU will refuse to actually run the VM in this
particular example.
Likewise when creating images with the legacy AES-CBC format
qemu-img create -f qcow \
--object secret,id=sec0,file=/home/berrange/encrypted.pw \
-o encrypt.format=aes,encrypt.key-secret=sec0 \
/home/berrange/encrypted.qcow 64M
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170623162419.26068-10-berrange@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
Instead of requiring separate input/output buffers for
encrypting data, change encrypt_sectors() to assume
use of a single buffer, encrypting in place. One current
caller uses the same buffer for input/output already
and the other two callers are easily converted to do so.
Reviewed-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170623162419.26068-9-berrange@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
Historically the qcow & qcow2 image formats supported a property
"encryption=on" to enable their built-in AES encryption. We'll
soon be supporting LUKS for qcow2, so need a more general purpose
way to enable encryption, with a choice of formats.
This introduces an "encrypt.format" option, which will later be
joined by a number of other "encrypt.XXX" options. The use of
a "encrypt." prefix instead of "encrypt-" is done to facilitate
mapping to a nested QAPI schema at later date.
e.g. the preferred syntax is now
qemu-img create -f qcow2 -o encrypt.format=aes demo.qcow2
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170623162419.26068-8-berrange@redhat.com
Reviewed-by: Alberto Garcia <berto@igalia.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
Document that use of guest virtual sector numbers as the basis for
the initialization vectors is a potential weakness, when combined
with internal snapshots or multiple images using the same passphrase.
This fixes the formatting of the itemized list too.
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170623162419.26068-4-berrange@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
When integrating the crypto support with qcow/qcow2, we don't
want to use the bare LUKS option names "hash-alg", "key-secret",
etc. We need to namespace them to match the nested QAPI schema.
e.g. "encrypt.hash-alg", "encrypt.key-secret"
so that they don't clash with any general qcow options at a later
date.
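e.g. a LUKS-format image could then combine several of the namespaced
options in one command (a sketch only; the exact set of valid encrypt.*
keys is defined by the nested schema, and demo.qcow2 is just a placeholder):
# qemu-img create --object secret,data=123456,id=sec0 \
    -f qcow2 -o encrypt.format=luks,encrypt.key-secret=sec0,encrypt.hash-alg=sha256 \
    demo.qcow2 1G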
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170623162419.26068-3-berrange@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
The block/crypto.c file defines a set of QemuOpts that provide
parameters for encryption. This will also be needed by
the qcow/qcow2 integration, so expose the relevant pieces
in a new block/crypto.h header. Some helper methods taking
QemuOpts are changed to take QDict to simplify usage in
other places.
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170623162419.26068-2-berrange@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
ppc patch queue 2017-07-11
* Several minor cleanups from Greg Kurz
* Fix for migration of pseries-2.7 and earlier machine types
* More reworking of the DRC hotplug code, fixing several problems
though there are still more to go
* Fixes for CPU family / alias handling on POWER9
* Preliminary patches for POWER9 XIVE (new interrupt controller)
support
* Assorted other fixes
# gpg: Signature made Tue 11 Jul 2017 05:35:16 BST
# gpg: using RSA key 0x6C38CACA20D9B392
# gpg: Good signature from "David Gibson <david@gibson.dropbear.id.au>"
# gpg: aka "David Gibson (Red Hat) <dgibson@redhat.com>"
# gpg: aka "David Gibson (ozlabs.org) <dgibson@ozlabs.org>"
# gpg: aka "David Gibson (kernel.org) <dwg@kernel.org>"
# Primary key fingerprint: 75F4 6586 AE61 A66C C44E 87DC 6C38 CACA 20D9 B392
* remotes/dgibson/tags/ppc-for-2.10-20170711:
spapr: populate device tree depending on XIVE_EXPLOIT option
spapr: introduce the XIVE_EXPLOIT option in CAS
ppc/kvm: have the "family" CPU alias to point to TYPE_HOST_POWERPC_CPU
spapr: Only report host/guest IOMMU page size mismatches on KVM
spapr: fix memory hotplug error path
target/ppc: Add debug function for radix mmu translation
target/ppc: Refactor tcg radix mmu code
spapr: Use unplug_request for PCI hot unplug
spapr: Remove unnecessary differences between hotplug and coldplug paths
spapr: Add DRC release method
spapr: Uniform DRC reset paths
spapr: Leave DR-indicator management to the guest
target-ppc: SPR_BOOKE_ESR not set on FP exceptions
spapr: fix migration to pseries machine < 2.8
spapr: fix bogus function name in comment
spapr: refresh "platform-specific" hcalls comment
spapr: make spapr_populate_hotplug_cpu_dt() static
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Add documentation comments describing the public API of the
ptimer countdown timer.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
This was used to extract .txt documentation for QMP. This was
changed to use the QAPI schema instead, so zap it.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
Currently the malta board loads the initrd just after the kernel.
This doesn't work for kaslr-enabled kernels, as the initrd ends up being
overwritten.
Move the initrd to the end of low memory, which should leave a
sufficient gap for kaslr.
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
Tested-by: Yongbok Kim <yongbok.kim@imgtec.com>
Signed-off-by: Yongbok Kim <yongbok.kim@imgtec.com>
This patch fixes the msa copy_[s|u]_df instruction emulation when
the destination register rd is zero. Without this patch the zero
register would get clobbered, which should never happen because it
is supposed to be hardwired to 0.
Fix this corner case by explicitly checking for rd = 0 and effectively
making the emulation of these instructions a no-op in that case.
Signed-off-by: Miodrag Dinic <miodrag.dinic@imgtec.com>
Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
Acked-by: Aurelien Jarno <aurelien@aurel32.net>
Signed-off-by: Yongbok Kim <yongbok.kim@imgtec.com>
For v7M, writes to the CONTROL register are only permitted for
privileged code. However even if the code is privileged, the
write must not affect the SPSEL bit in the CONTROL register
if the CPU is in Thread mode (as documented in the pseudocode
for the MSR instruction). Implement this, instead of permitting
SPSEL to be written in all cases.
This was causing mbed applications not to run, because the
RTX RTOS they use relies on this behaviour.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-id: 1498820791-8130-1-git-send-email-peter.maydell@linaro.org
When running with KVM enabled, you can choose between emulating the
gic in kernel or user space. If the kernel supports in-kernel virtualization
of the interrupt controller, it will default to that. If not, it will
default to user space emulation.
Unfortunately when running in user mode gic emulation, we miss out on
interrupt events which are only available from kernel space, such as the timer.
This patch leverages the new kernel/user space pending line synchronization for
timer events. It does not handle PMU events yet.
Signed-off-by: Alexander Graf <agraf@suse.de>
Reviewed-by: Andrew Jones <drjones@redhat.com>
Message-id: 1498577737-130264-1-git-send-email-agraf@suse.de
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
The ast2400 contains two and the ast2500 contains three watchdogs.
Add this information to the AspeedSoCInfo and realise the correct number
of watchdogs for each SoC type.
Signed-off-by: Joel Stanley <joel@jms.id.au>
Reviewed-by: Cédric Le Goater <clg@kaod.org>
Tested-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Add emulation for Exynos4210 Pseudo Random Number Generator which could
work on fixed seeds or with seeds provided by True Random Number
Generator block inside the SoC.
Implement only the fixed seeds part of it in polling mode (no
interrupts).
Emulation tested with two independent Linux kernel exynos-rng drivers:
1. New kcapi-rng interface (targeting Linux v4.12),
2. Old hwrng interface
# echo "exynos" > /sys/class/misc/hw_random/rng_current
# dd if=/dev/hwrng of=/dev/null bs=1 count=16
Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
Message-id: 20170425180609.11004-1-krzk@kernel.org
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
[PMM: wrapped a few overlong lines; more efficient implementation
of exynos4210_rng_seed_ready()]
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
The content of the backends/trace-events file was entirely
removed in
commit 6b10e573d1
Author: Marc-André Lureau <marcandre.lureau@redhat.com>
Date: Mon May 29 12:39:42 2017 +0400
char: move char devices to chardev/
Leaving the empty file around causes tracetool to generate
an empty .dtrace file, which makes the dtrace compiler throw
a syntax error.
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-id: 20170629162046.4135-1-berrange@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
The caller of SetupDiGetClassDevs must delete the returned device information
set when it is no longer needed by calling SetupDiDestroyDeviceInfoList.
Signed-off-by: Li Ping <li.ping288@zte.com.cn>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
fprintf(stderr) is how errors are reported in this file.
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
Our FORTIFY_SOURCE check assumes that $cxx refers to a working C++
compiler, with the result that if you don't happen to have one
then configure will spuriously print
configure: line 4685: c++: command not found
Fix this by adding a 'has $cxx' check.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
The kludged field 'msi_nonbroken' is declared in "hw/pci/msi.h" and defined in
hw/pci/msi.c.
When using an ARM config with CONFIG_PCI disabled, hw/pci/msi.c is not included.
Without being PCI-related, the files hw/intc/arm_gicv[23*].c do access this
field (to enable the kludge if PCI is enabled).
The final link fails since hw/pci/msi.c is not included.
Defining this field in pci-stub is safe enough for configs without CONFIG_PCI.
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
The configure script prefers pkg-config over sdl-config, but
the "--static-libs" parameter only exists for the latter. With
pkg-config, "--static --libs" have to be used instead.
Buglink: https://bugs.launchpad.net/qemu/+bug/984516
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
When XIVE is supported, the device tree should be populated
accordingly and the XIVE memory regions mapped to activate MMIOs.
Depending on the design we choose, we could also allocate different
ICS and ICP objects, or switch between objects. This needs to be
discussed.
Signed-off-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
On POWER9, the Client Architecture Support (CAS) negotiation process
determines whether the guest operates in XIVE Legacy compatibility
(the former POWER8 interrupt model) or in XIVE exploitation mode (the
newer POWER9 interrupt model).
Bit 7 of Byte 23 of vector 5 is used for this purpose.
Signed-off-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
When running KVM on POWER, we allow the user to pass "-cpu POWERx" instead
of "-cpu host". This is achieved by patching the ppc_cpu_aliases[] array
so that "POWERx" points to the CPU class with the same PVR as the host CPU.
This causes CPUs to be instantiated from this CPU class instead of the
TYPE_HOST_POWERPC_CPU class which is used with "-cpu host". These CPUs thus
miss all the KVM specific tuning from kvmppc_host_cpu_class_init().
This currently causes QEMU with "-cpu POWER9" to fail when running KVM on a
POWER9 DD1 host:
qemu-system-ppc64: Register sync failed... If you're using kvm-hv.ko, only
"-cpu host" is possible
kvm_init_vcpu failed: Invalid argument
Let's have the "POWERx" alias point to TYPE_HOST_POWERPC_CPU directly,
so that "-cpu POWERx" instantiates CPUs from the same class as "-cpu host".
Signed-off-by: Greg Kurz <groug@kaod.org>
Tested-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
We print a warning if the spapr IOMMU isn't configured to support a page
size matching the host page size backing RAM. When that's the case we need
more complex logic to translate VFIO mappings, which is slower.
But, it's not so slow that it would be at all noticeable against the
general slowness of TCG. So, only warn when using KVM. This removes some
noisy and unhelpful warnings from make check on hosts with page sizes
which typically differ from those on POWER (e.g. Sparc).
Reported-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Thomas Huth <thuth@redhat.com>
QEMU shouldn't abort if spapr_add_lmbs()->spapr_drc_attach() fails.
Let's propagate the error instead, like it is done everywhere else
where spapr_drc_attach() is called.
Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Daniel Barboza <danielhb@linux.vnet.ibm.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
In target/ppc/mmu-hash64.c there already exists the function
ppc_hash64_get_phys_page_debug() to get the physical (real) address for
a given effective address in hash mode.
Implement the function ppc_radix64_get_phys_page_debug() to allow a real
address to be obtained for a given effective address in radix mode.
This is used when a debugger is attached to qemu.
Previously we just had a comment saying this was unimplemented; the code
then fell through to the default case and caused an abort due to an
unrecognised mmu model, as the default had no case for the V3 mmu, which
was misleading at best.
We reuse ppc_radix64_walk_tree() which is used by the radix fault
handler since the process of walking the radix tree is identical.
Reported-by: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
The mmu-radix64.c file implements functions to enable the radix mmu
emulation in tcg mode. There is a function ppc_radix64_walk_tree() which
performs the radix tree walk and also implicitly checks the pte
protection.
Move the protection checking of the pte from the ppc_radix64_walk_tree()
function into the caller. This means the ppc_radix64_walk_tree() function
can be used without protection checking which is useful for debugging.
ppc_radix64_walk_tree() no longer needs to take the rwx and prot variables.
Signed-off-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
AIUI, ->unplug_request in the HotplugHandler is used for "soft"
unplug, where acknowledgement from the guest is required before
completing the unplug, whereas ->unplug is used for "hard" unplug
where qemu unilaterally removes the device, and the guest just has to
cope with its sudden absence. For spapr we (correctly) use
->unplug_request for CPU and memory hot unplug but we use ->unplug for
PCI.
While I think it might be possible to support "hard" PCI unplug within
the PAPR model, that's not how it actually works now. Although it's
called from ->unplug, the PCI unplug path will usually just mark the
device for removal, with completion of the unplug delayed until
the guest responds to the unplug notification. If the guest doesn't
respond as expected, that could delay the unplug completion arbitrarily
long.
To reflect that, change the PCI unplug path to be called from
->unplug_request. We also rename spapr_phb_hot_plug_child() and
spapr_phb_hot_unplug_child() to spapr_pci_plug() and
spapr_pci_unplug_request() to more obviously reflect the callbacks they're
implementing.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: Greg Kurz <groug@kaod.org>
spapr_drc_attach() has a 'coldplug' parameter which sets the DRC into
configured state initially, instead of the usual ISOLATED/UNUSABLE state.
It turns out this is unnecessary: although coldplugged devices do need to
be in CONFIGURED state once the guest starts, that will already be
accomplished by the reset code which will move DRCs for already plugged
devices into a coldplug equivalent state.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: Greg Kurz <groug@kaod.org>
At the moment, spapr_drc_release() has an ugly switch on the DRC type to
call the right, device-specific release function. This cleans it up by
doing that via a proper QOM method.
It's still arguably an abstraction violation for the DRC code to call into
the specific device code, but one mess at a time.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: Greg Kurz <groug@kaod.org>
DRC objects have a regular device reset method. However, it only gets
called in the usual way for PCI DRCs. Because of where CPU and LMB DRCs
are in the QOM tree, their device reset method isn't automatically called.
So, the machine manually registers reset handlers to call device_reset().
This patch removes the device reset method, and instead always explicitly
registers the reset handler from realize(). This means the callers don't
have to worry about the two cases, and we always get proper resets.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
The DR-indicator is essentially a "virtual LED" attached to a hotpluggable
device, which the guest can set to various states for the attention of
the operator or management layers.
It's mostly guest managed, except that we once-off set it to
ACTIVE/INACTIVE in the attach/detach path. While that makes certain sense,
there's no indication in PAPR that the hypervisor should do this, and the
drmgr code on the guest side doesn't appear to need it (it will already set
the indicator to ACTIVE on hotplug, and INACTIVE on remove).
So, leave the DR-indicator entirely to the guest; the only thing we need
to do is ensure it's in a sane state on reset.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: Greg Kurz <groug@kaod.org>
Properly set the book E exception syndrome register when a floating
point exception occurs.
Currently on a book E processor, the POWERPC_EXCP_FP exception handler
fails to set "env->spr[SPR_BOOKE_ESR] = ESR_FP;" as required by the
book E specification.
Signed-off-by: Aaron Larson <alarson@ddci.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
since commit 5c4537bd ("spapr: Fix 2.7<->2.8 migration of PCI host bridge"),
some migration fields are forged from the new ones in spapr_pci_pre_save().
It works well, except when the number of MSI devices is 0,
because in this case the function exits immediately.
This fix moves the migration code before the early exit.
The problem can be reproduced with these commands:
source qemu-2.9:
qemu-system-ppc64 -monitor stdio -M pseries-2.6 -nodefaults -S
destination qemu-2.6:
qemu-system-ppc64 -monitor stdio -M pseries-2.6 -nodefaults \
-incoming tcp:0:4444
on the source:
migrate tcp:localhost:4444
Destination fails with the following error:
qemu-system-ppc64: error while loading state for
instance 0x0 of device 'spapr_pci'
qemu-system-ppc64: load of migration failed: Invalid argument
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
$ git grep spapr_ppc_reset
hw/ppc/spapr.c: * as part of spapr_ppc_reset().
$ git grep ppc_spapr_reset
hw/ppc/spapr.c:static void ppc_spapr_reset(void)
hw/ppc/spapr.c: mc->reset = ppc_spapr_reset;
hw/ppc/spapr_hcall.c: /* If ppc_spapr_reset() did not set up a HPT but one is necessary
Signed-off-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Since commit ff9006ddbf ("spapr: move spapr_core_[foo]plug() callbacks
close to machine code in spapr.c"), this function doesn't need to be extern
anymore.
Signed-off-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Migration pull 2017-07-10
# gpg: Signature made Mon 10 Jul 2017 18:04:57 BST
# gpg: using RSA key 0x0516331EBC5BFDE7
# gpg: Good signature from "Dr. David Alan Gilbert (RH2) <dgilbert@redhat.com>"
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: 45F5 C71B 4A0C B7FB 977A 9FA9 0516 331E BC5B FDE7
* remotes/dgilbert/tags/pull-migration-20170710a:
migration: Make compression_threads use save/load_setup/cleanup()
migration: Convert ram to use new load_setup()/load_cleanup()
migration: Create load_setup()/cleanup() methods
migration: Rename cleanup() to save_cleanup()
migration: Rename save_live_setup() to save_setup()
doc: update TYPE_MIGRATION documents
doc: add item for "-M enforce-config-section"
vl: move global property, migrate init earlier
migration: fix handling for --only-migratable
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Once there, I rename ram_migration_cleanup() to ram_save_cleanup().
Notice that this is the first pass, and I only passed XBZRLE to the
new scheme. Moved decoded_buf to inside XBZRLE struct.
As a bonus, I don't have to export xbzrle functions from ram.c.
Signed-off-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
--
loaded_data pointer was needed because the called function can change it (dave)
spell loaded correctly in comment (dave)
Message-Id: <20170628095228.4661-5-quintela@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
We need to do things at load time and at cleanup time.
Signed-off-by: Juan Quintela <quintela@redhat.com>
--
Move the printing of the error message so we can print the device
giving the error.
Add call to postcopy stuff
Message-Id: <20170628095228.4661-4-quintela@redhat.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Currently drive_init_func() may call migrate_get_current() while the
migration object is not yet ready. Move the migration
object init earlier, along with the global properties, right after
acceleration init.
This fixes a breakage for iotest 055, which caused an assertion failure.
Reported-by: Max Reitz <mreitz@redhat.com>
Reported-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
Tested-by: QingFeng Hao <haoqf@linux.vnet.ibm.com>
Fixes: 3df663 ("migration: move only_migratable to MigrationState")
Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <1499242883-2184-3-git-send-email-peterx@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Intel 82599 VFs report a PCIe capability version of 0, which is
invalid. The earliest version of the PCIe spec used version 1. This
causes Windows to fail startup on the device and it will be disabled
with error code 10. Our choices are either to drop the PCIe cap on
such devices, which has the side effect of likely preventing the guest
from discovering any extended capabilities, or performing a fixup to
update the capability to the earliest valid version. This implements
the latter.
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
VFIOGroup.device_list is effectively our reference tracking mechanism
such that we can tear down a group when all of the device references
are removed. However, we also use this list from our machine reset
handler for processing resets that affect multiple devices. Generally
device removals are fully processed (exitfn + finalize) when this
reset handler is invoked; however, if the removal is triggered via
another reset handler (piix4_reset->acpi_pcihp_reset) then the device
exitfn may run, but not finalize. In this case we hit asserts when
we start trying to access PCI helpers since much of the PCI state of
the device is released. To resolve this, add a pointer to the Object
DeviceState in our common base-device and skip non-realized devices
as we iterate.
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Let NBD use the trace mechanisms already present in qemu. Now you can
use the -trace option of qemu, or the -T/--trace option of qemu-img,
qemu-io, and qemu-nbd, to select nbd traces. For qemu, the QMP commands
trace-event-{get,set}-state can also toggle tracing on the fly.
Example:
qemu-nbd --trace 'nbd_*' <image file> # enables all nbd traces
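A client-side sketch (the NBD server URL is hypothetical):
qemu-io --trace 'nbd_*' -c 'read 0 512' nbd://localhost:10809 # enables nbd traces in qemu-io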
Recompilation with CFLAGS=-DDEBUG_NBD is no longer needed; furthermore,
the DEBUG_NBD macro is removed from the code.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20170707152918.23086-11-vsementsov@virtuozzo.com>
[eblake: minor tweaks to a couple of traces]
Signed-off-by: Eric Blake <eblake@redhat.com>
Combine two successive "if (oldStyle) {...} else {...}" into one.
Block "if (client->tlscreds)" under "if (oldStyle)" is unreachable,
as we have "oldStyle = client->exp != NULL && !client->tlscreds;".
So, delete this block.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20170707152918.23086-3-vsementsov@virtuozzo.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
Separate the case when a client sends NBD_OPT_ABORT from all other
errors. It will be needed for the following patch, where errors will be
reported.
This particular case is not actually an error - it honestly follows the
NBD protocol. Therefore it should not be reported like an error.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-Id: <20170707152918.23086-2-vsementsov@virtuozzo.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
We are promising more than just odd fixes, and Paolo is hoping
to offload the pull requests to me. Also, enough of NBD is related
to the block layer that it is worth including qemu-block on patches.
While at it, include blockdev-nbd.c and qemu-nbd.texi in the set
of maintained files.
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20170707182151.29872-1-eblake@redhat.com>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Block layer patches
# gpg: Signature made Mon 10 Jul 2017 12:26:44 BST
# gpg: using RSA key 0x7F09B272C88F2FD6
# gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>"
# Primary key fingerprint: DC3D EB15 9A9A F95D 3D74 56FE 7F09 B272 C88F 2FD6
* remotes/kevin/tags/for-upstream: (40 commits)
block: Make bdrv_is_allocated_above() byte-based
block: Minimize raw use of bds->total_sectors
block: Make bdrv_is_allocated() byte-based
backup: Switch backup_run() to byte-based
backup: Switch backup_do_cow() to byte-based
backup: Switch block_backup.h to byte-based
backup: Switch BackupBlockJob to byte-based
block: Drop unused bdrv_round_sectors_to_clusters()
mirror: Switch mirror_iteration() to byte-based
mirror: Switch mirror_do_read() to byte-based
mirror: Switch mirror_cow_align() to byte-based
mirror: Update signature of mirror_clip_sectors()
mirror: Switch mirror_do_zero_or_discard() to byte-based
mirror: Switch MirrorBlockJob to byte-based
commit: Switch commit_run() to byte-based
commit: Switch commit_populate() to byte-based
stream: Switch stream_run() to byte-based
stream: Drop reached_end for stream_complete()
stream: Switch stream_populate() to byte-based
trace: Show blockjob actions via bytes, not sectors
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
We are gradually moving away from sector-based interfaces, towards
byte-based. In the common case, allocation is unlikely to ever use
values that are not naturally sector-aligned, but it is possible
that byte-based values will let us be more precise about allocation
at the end of an unaligned file that can do byte-based access.
Changing the signature of the function to use int64_t *pnum ensures
that the compiler enforces that all callers are updated. For now,
the io.c layer still assert()s that all callers are sector-aligned,
but that can be relaxed when a later patch implements byte-based
block status. Therefore, for the most part this patch is just the
addition of scaling at the callers followed by inverse scaling at
bdrv_is_allocated(). But some code, particularly stream_run(),
gets a lot simpler because it no longer has to mess with sectors.
Leave comments where we can further simplify by switching to
byte-based iterations, once later patches eliminate the need for
sector-aligned operations.
For ease of review, bdrv_is_allocated() was tackled separately.
Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
bdrv_is_allocated_above() was relying on intermediate->total_sectors,
which is a field that can have stale contents depending on the value
of intermediate->has_variable_length. An audit shows that we are safe
(we were first calling through bdrv_co_get_block_status() which in
turn calls bdrv_nb_sectors() and therefore just refreshed the current
length), but it's nicer to favor our accessor functions to avoid having
to repeat such an audit, even if it means refresh_total_sectors() is
called more frequently.
Suggested-by: John Snow <jsnow@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Manos Pitsidianakis <el13635@mail.ntua.gr>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
We are gradually moving away from sector-based interfaces, towards
byte-based. In the common case, allocation is unlikely to ever use
values that are not naturally sector-aligned, but it is possible
that byte-based values will let us be more precise about allocation
at the end of an unaligned file that can do byte-based access.
Changing the signature of the function to use int64_t *pnum ensures
that the compiler enforces that all callers are updated. For now,
the io.c layer still assert()s that all callers are sector-aligned
on input and that *pnum is sector-aligned on return to the caller,
but that can be relaxed when a later patch implements byte-based
block status. Therefore, this code adds usages like
DIV_ROUND_UP(,BDRV_SECTOR_SIZE) to callers that still want aligned
values, where the call might reasonably give non-aligned results
in the future; on the other hand, no rounding is needed for callers
that should just continue to work with byte alignment.
For the most part this patch is just the addition of scaling at the
callers followed by inverse scaling at bdrv_is_allocated(). But
some code, particularly bdrv_commit(), gets a lot simpler because it
no longer has to mess with sectors; also, it is now possible to pass
NULL if the caller does not care how much of the image is allocated
beyond the initial offset. Leave comments where we can further
simplify once a later patch eliminates the need for sector-aligned
requests through bdrv_is_allocated().
For ease of review, bdrv_is_allocated_above() will be tackled
separately.
Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
We are gradually converting to byte-based interfaces, as they are
easier to reason about than sector-based. Change the internal
loop iteration of backups to track by bytes instead of sectors
(although we are still guaranteed that we iterate by steps that
are cluster-aligned).
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
We are gradually converting to byte-based interfaces, as they are
easier to reason about than sector-based. Convert another internal
function (no semantic change).
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
We are gradually converting to byte-based interfaces, as they are
easier to reason about than sector-based. Continue by converting
the public interface to backup jobs (no semantic change), including
a change to CowRequest to track by bytes instead of cluster indices.
Note that this does not change the difference between the public
interface (starting point, and size of the subsequent range) and
the internal interface (starting and end points).
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Xie Changlong <xiechanglong@cmss.chinamobile.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
We are gradually converting to byte-based interfaces, as they are
easier to reason about than sector-based. Continue by converting an
internal structure (no semantic change), and all references to
tracking progress. Drop a redundant local variable bytes_per_cluster.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Now that the last user [mirror_iteration()] has converted to using
bytes, we no longer need a function to round sectors to clusters.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
We are gradually converting to byte-based interfaces, as they are
easier to reason about than sector-based. Change the internal
loop iteration of mirroring to track by bytes instead of sectors
(although we are still guaranteed that we iterate by steps that
are both sector-aligned and multiples of the granularity). Drop
the now-unused mirror_clip_sectors().
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
We are gradually converting to byte-based interfaces, as they are
easier to reason about than sector-based. Convert another internal
function, preserving all existing semantics, and adding one more
assertion that things are still sector-aligned (so that conversions
to sectors in mirror_read_complete don't need to round).
Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
We are gradually converting to byte-based interfaces, as they are
easier to reason about than sector-based. Convert another internal
function (no semantic change), and add mirror_clip_bytes() as a
counterpart to mirror_clip_sectors(). Some of the conversion is
a bit tricky, requiring temporaries to convert between units; it
will be cleared up in a following patch.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Rather than having a void function that modifies its input
in-place as the output, change the signature to reduce a layer
of indirection and return the result.
Suggested-by: John Snow <jsnow@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
We are gradually converting to byte-based interfaces, as they are
easier to reason about than sector-based. Convert another internal
function (no semantic change).
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
We are gradually converting to byte-based interfaces, as they are
easier to reason about than sector-based. Continue by converting an
internal structure (no semantic change), and all references to the
buffer size.
Add an assertion that our use of s->granularity >> BDRV_SECTOR_BITS
(necessary for interaction with sector-based dirty bitmaps, until
a later patch converts those to be byte-based) does not suffer from
truncation problems.
[checkpatch has a false positive on use of MIN() in this patch]
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
We are gradually converting to byte-based interfaces, as they are
easier to reason about than sector-based. Change the internal
loop iteration of committing to track by bytes instead of sectors
(although we are still guaranteed that we iterate by steps that
are sector-aligned).
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
We are gradually converting to byte-based interfaces, as they are
easier to reason about than sector-based. Start by converting an
internal function (no semantic change).
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
We are gradually converting to byte-based interfaces, as they are
easier to reason about than sector-based. Change the internal
loop iteration of streaming to track by bytes instead of sectors
(although we are still guaranteed that we iterate by steps that
are sector-aligned).
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
stream_complete() skips the work of rewriting the backing file if
the job was cancelled, if data->reached_end is false, or if there
was an error detected (non-zero data->ret) during the streaming.
But note that in stream_run(), data->reached_end is only set if the
loop ran to completion, and data->ret is only 0 in two cases:
either the loop ran to completion (possibly by cancellation, but
stream_complete checks for that), or we took an early goto out
because there is no bs->backing. Thus, we can preserve the same
semantics without the use of reached_end, by merely checking for
bs->backing (and logically, if there was no backing file, streaming
is a no-op, so there is no backing file to rewrite).
Suggested-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
We are gradually converting to byte-based interfaces, as they are
easier to reason about than sector-based. Start by converting an
internal function (no semantic change).
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Upcoming patches are going to switch to byte-based interfaces
instead of sector-based. Even worse, trace_backup_do_cow_enter()
had a weird mix of cluster and sector indices.
The trace interface is low-level enough that there are no stability
guarantees, and therefore nothing wrong with changing our units,
even in cases like trace_backup_do_cow_skip() where we are not
changing the trace output. So make the tracing uniformly use
bytes.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
The user interface specifies job rate limits in bytes/second.
It's pointless to have our internal representation track things
in sectors/second, particularly since we want to move away from
sector-based interfaces.
Fix up a doc typo found while verifying that the ratelimit
code handles the scaling difference.
Repetition of expressions like 'n * BDRV_SECTOR_SIZE' will be
cleaned up later when functions are converted to iterate over
images by bytes rather than by sectors.
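e.g. the externally visible knob is already byte-based; from the HMP
monitor something like this sets a 10 MB/s limit (a sketch, with a
hypothetical job running on drive0):
(qemu) block_job_set_speed drive0 10M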
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
We likely do not want to carry these legacy -drive options along forever.
Let's emit a deprecation warning for the -drive options that have a
replacement with the -device option, so that the (hopefully few) remaining
users are aware of this and can adapt their scripts / behaviour accordingly.
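e.g. a typical move away from such options puts the property on -device
instead; the first (legacy) form below would now warn, the second is the
replacement (a hedged sketch; -drive serial= is used purely to illustrate
the pattern):
$QEMU -drive file=test.img,if=virtio,serial=abc123
$QEMU -drive file=test.img,if=none,id=disk0 \
    -device virtio-blk-pci,drive=disk0,serial=abc123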
Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
The '-e' and '-6' options to the 'create' & 'convert' commands were
"deprecated" in favour of the more generic '-o' option many years ago:
commit eec77d9e71
Author: Jes Sorensen <Jes.Sorensen@redhat.com>
Date: Tue Dec 7 17:44:34 2010 +0100
qemu-img: Deprecate obsolete -6 and -e options
Except this was never actually a deprecation, which would imply giving
the user a warning while the functionality continues to work for a
number of releases before eventual removal. Instead the options were
immediately turned into an error + exit. Given that the functionality
is already broken, there's no point in keeping these pseudo-deprecation
messages around any longer.
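For reference, the generic '-o' equivalents look like this (a sketch;
'-e' corresponded to the qcow/qcow2 encryption option and '-6' to the
VMDK compat6 creation option):
qemu-img create --object secret,data=123456,id=sec0 \
    -f qcow2 -o encryption=on,encrypt.key-secret=sec0 encrypted.qcow2 1G
qemu-img convert -O vmdk -o compat6=on input.img output.vmdk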
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
According to the specification:
"'MSWIN4.1' is the recommended setting, because it is the setting least likely
to cause compatibility problems. If you want to put something else in here,
that is your option, but the result may be that some FAT drivers might not
recognize the volume."
Specification: "FAT: General overview of on-disk format" v1.03, page 9
Signed-off-by: Hervé Poussineau <hpoussin@reactos.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Specification: "FAT: General overview of on-disk format" v1.03, page 23
Signed-off-by: Hervé Poussineau <hpoussin@reactos.org>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
FAT12/FAT16 root directory is two sectors in size, which allows only 512 directory entries.
Prevent QEMU startup if too many files exist, instead of overflowing the root directory.
Also introduce variable root_entries, which will be required for FAT32.
Fixes: https://bugs.launchpad.net/qemu/+bug/1599539/comments/4
Signed-off-by: Hervé Poussineau <hpoussin@reactos.org>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
More specifically:
- try without numeric-tail only if LFN didn't have invalid short chars
- start at ~1 (instead of ~0)
- handle the case where the numeric tail is more than one char (i.e. > 10)
Windows 9x Scandisk no longer sees mismatches between short file names and
long file names for non-ASCII filenames.
Specification: "FAT: General overview of on-disk format" v1.03, page 31
Signed-off-by: Hervé Poussineau <hpoussin@reactos.org>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
More specifically, create the short name from the filename and change the
blacklist of invalid chars to a whitelist of valid chars.
Windows 9x now also correctly sees long file names for filenames containing a space,
but Scandisk still complains about a mismatch between SFN and LFN.
[kwolf: Build fix for this intermediate patch (it included declarations
for variables that are only used in the next patch) ]
Specification: "FAT: General overview of on-disk format" v1.03, pages 30-31
Signed-off-by: Hervé Poussineau <hpoussin@reactos.org>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Assume that the input filename is encoded as UTF-8, so the UTF-16 encoding is created correctly.
Signed-off-by: Hervé Poussineau <hpoussin@reactos.org>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
readdir() doesn't always return the . and .. entries first, or in that order.
This leads to them not being created first in the directory, which raises
errors in file system checking utilities like MS-DOS Scandisk.
Specification: "FAT: General overview of on-disk format" v1.03, page 25
Fixes: https://bugs.launchpad.net/qemu/+bug/1599539
Signed-off-by: Hervé Poussineau <hpoussin@reactos.org>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Specification: "FAT: General overview of on-disk format" v1.03, pages 11-13
Signed-off-by: Hervé Poussineau <hpoussin@reactos.org>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
- offset_to_bootsector is the number of sectors up to FAT bootsector
- offset_to_fat is the number of sectors up to first File Allocation Table
- offset_to_root_dir is the number of sectors up to root directory sector
Replace first_sectors_number - 1 by offset_to_bootsector.
Replace first_sectors_number by offset_to_fat.
Replace faked_sectors by offset_to_root_dir.
Signed-off-by: Hervé Poussineau <hpoussin@reactos.org>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
MODE_FAKED and MODE_RENAMED are not and were never used.
Signed-off-by: Hervé Poussineau <hpoussin@reactos.org>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
This was a complete mess. On 2299 indented lines:
- 1329 were with spaces only
- 617 with tabulations only
- 353 with spaces and tabulations
Signed-off-by: Hervé Poussineau <hpoussin@reactos.org>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
- bs->total_sectors is the number of sectors of the whole disk
- s->sector_count is the number of sectors of the FAT partition
This fixes the following assert in qemu-img map:
qemu-img.c:2641: get_block_status: Assertion `nb_sectors' failed.
This also fixes an infinite loop in qemu-img convert.
Fixes: 4480e0f924
Fixes: https://bugs.launchpad.net/qemu/+bug/1599539
Cc: qemu-stable@nongnu.org
Signed-off-by: Hervé Poussineau <hpoussin@reactos.org>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Without a passthrough status of BDRV_BLOCK_RAW, anything wrapped by
blkdebug appears 100% allocated as data. Better is treating it the
same as the underlying file being wrapped.
Update iotest 177 for the new expected output.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
The lone caller that cares about a return of BDRV_BLOCK_RAW
(namely, io.c:bdrv_co_get_block_status) completely replaces the
return value, so there is no point in passing BDRV_BLOCK_DATA.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
We document that *file is valid if the return is not an error and
includes BDRV_BLOCK_OFFSET_VALID, but forgot to obey this contract
when a driver (such as blkdebug) lacks a callback. Messed up in
commit 67a0fd2 (v2.6), when we added the file parameter.
Enhance qemu-iotest 177 to cover this, using a sequence that would
print garbage or even SEGV, because it was dereferencing through
uninitialized memory. [The resulting test output shows that we
have less-than-ideal block status from the blkdebug driver, but
that's a separate fix coming up soon.]
Setting *file on all paths that return BDRV_BLOCK_OFFSET_VALID is
enough to fix the crash, but we can go one step further: always
setting *file, even on error, means that a broken caller that
blindly dereferences file without checking for error is now more
likely to get a reliable SEGV instead of randomly acting on garbage,
making it easier to diagnose such buggy callers. Adding an
assertion that file is set where expected doesn't hurt either.
CC: qemu-stable@nongnu.org
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Most callback commands in qemu-io return 0 to keep the interpreter
loop running, or 1 to quit immediately. However, open_f() just
passed through the return value of openfile(), which has different
semantics of returning 0 if a file was opened, or 1 on any failure.
As a result of mixing the return semantics, we are forcing the
qemu-io interpreter to exit early on any failures, which is rather
annoying when some of the failures are obviously trying to give
the user a hint of how to proceed (if we didn't then kill qemu-io
out from under the user's feet):
$ qemu-io
qemu-io> open foo
qemu-io> open foo
file open already, try 'help close'
$ echo $?
0
In general, we WANT openfile() to report failures, since it is the
function used in the form 'qemu-io -c "$something" no_such_file'
for performing one or more -c options on a single file, and it is
not worth attempting $something if the file itself cannot be opened.
So the solution is to fix open_f() to always return 0 (when we are
in interactive mode, even failure to open should not end the
session), and save the return value of openfile() for command line
use in main().
Note, however, that we do have some qemu-iotests that do 'qemu-io
-c "open file" -c "$something"'; such tests will now proceed to
attempt $something whether or not the open succeeded, the same way
as if the two commands had been attempted in interactive mode. As
such, the expected output for those tests has to be modified. But it
also means that it is now possible to use -c close and have a single
qemu-io command line operate on more than one file even without
using interactive mode. Although the '-c open' action is a subtle
change in behavior, remember that qemu-io is for debugging purposes,
so as long as it serves the needs of qemu-iotests while still being
reasonable for interactive use, it should not be a problem that we
are changing tests to the new behavior.
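e.g. a single invocation can now operate on two images (a sketch; the
image names are hypothetical):
qemu-io -c 'read 0 512' -c close -c 'open second.img' -c 'read 0 512' first.img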
This has been awkward since at least as far back as commit
e3aff4f, in 2009.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Clang generates the following warning on aarch64 host:
CC util/cacheinfo.o
/home/pranith/qemu/util/cacheinfo.c:121:48: warning: value size does not match register size specified by the constraint and modifier [-Wasm-operand-widths]
asm volatile("mrs\t%0, ctr_el0" : "=r"(ctr));
^
/home/pranith/qemu/util/cacheinfo.c:121:28: note: use constraint modifier "w"
asm volatile("mrs\t%0, ctr_el0" : "=r"(ctr));
^~
%w0
Constraint modifier 'w' is not (yet?) accepted by gcc. Fix this by increasing the ctr size.
Tested-by: Emilio G. Cota <cota@braap.org>
Reviewed-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>
Message-Id: <20170630153946.11997-1-bobby.prani@gmail.com>
Signed-off-by: Richard Henderson <rth@twiddle.net>
We use ADRP+ADD to compute the target address for goto_tb. This patch
introduces the NOP instruction which is used to align the above
instruction pair so that we can use one atomic instruction to patch
the destination offsets.
CC: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>
Message-Id: <20170630143614.31059-2-bobby.prani@gmail.com>
Signed-off-by: Richard Henderson <rth@twiddle.net>
When the guest unplugs the emulated NICs, cleanup the peer for each NIC
as it is not needed anymore. Most importantly, this allows the tap
interfaces which QEMU holds open to be closed and removed.
Signed-off-by: Ross Lagerwall <ross.lagerwall@citrix.com>
Acked-by: Anthony PERARD <anthony.perard@citrix.com>
Signed-off-by: Stefano Stabellini <sstabellini@kernel.org>
Initialize xenfb properly, as all other backends, from its own
"initialise" function.
Remove the dependency of vkbd on vfb: use qemu_console_lookup_by_index
to find the principal console (to get the size of the screen) instead of
relying on a vfb backend to be available (which adds a dependency
between the two).
Signed-off-by: Stefano Stabellini <sstabellini@kernel.org>
Reviewed-by: Paul Durrant <paul.durrant@citrix.com>
* qemu-thread portability improvement (Fam)
* virtio-scsi IOMMU fix (Jason)
* poisoning and common-obj-y cleanups (Thomas)
* initial Hypervisor.framework refactoring (Sergio)
* x86 TCG interrupt injection fixes (Wu Xiang, me)
* --disable-tcg support for x86 (Yang Zhong, me)
* various other bugfixes and cleanups (Daniel, Peter, Thomas)
# gpg: Signature made Wed 05 Jul 2017 08:12:56 BST
# gpg: using RSA key 0xBFFBD25F78C7AE83
# gpg: Good signature from "Paolo Bonzini <bonzini@gnu.org>"
# gpg: aka "Paolo Bonzini <pbonzini@redhat.com>"
# Primary key fingerprint: 46F5 9FBD 57D6 12E7 BFD4 E2F7 7E15 100C CD36 69B1
# Subkey fingerprint: F133 3857 4B66 2389 866C 7682 BFFB D25F 78C7 AE83
* remotes/bonzini/tags/for-upstream: (42 commits)
target/i386: add the CONFIG_TCG into Makefiles
target/i386: add the tcg_enabled() in target/i386/
target/i386: move TLB refill function out of helper.c
target/i386: split cpu_set_mxcsr() and make cpu_set_fpuc() inline
target/i386: make cpu_get_fp80()/cpu_set_fp80() static
target/i386: move cpu_sync_bndcs_hflags() function
tcg: add the CONFIG_TCG into Makefiles
tcg: add CONFIG_TCG guards in headers
exec: elide calls to tb_lock and tb_unlock
tcg: move tb_lock out of translate-all.h
tcg: add the tcg-stub.c file into accel/stubs/
vapic: use tcg_enabled
monitor: disable "info jit" and "info opcount" if !TCG
tcg: make tcg_allowed global
cpu: move interrupt handling out of translate-common.c
tcg: move page_size_init() function
vl: add tcg_enabled() for tcg related code
vl: convert -tb-size to qemu_strtoul
configure: add --disable-tcg configure option
configure: early test for supported targets
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This patch is based on a similar patch from Stefan Hajnoczi -
commit c324fd0a39 ("virtio-pci: use ioeventfd even when KVM is disabled")
Do not check kvm_eventfds_enabled() when KVM is disabled since it
always returns 0. Since commit 8c56c1a592
("memory: emulate ioeventfd") it has been possible to use ioeventfds in
qtest or TCG mode.
This patch makes -device virtio-scsi-ccw,iothread=iothread0 work even
when KVM is disabled.
Currently we don't have an equivalent to "memory: emulate ioeventfd"
for ccw yet, but this doesn't hurt, and qemu-iotests 068 can pass by
skipping iothread arguments.
I have tested that virtio-scsi-ccw works under tcg both with and without
iothread.
This patch fixes qemu-iotests 068, which was accidentally merged early
despite the dependency on ioeventfd.
Signed-off-by: QingFeng Hao <haoqf@linux.vnet.ibm.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Message-Id: <20170704132350.11874-2-haoqf@linux.vnet.ibm.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
The response for query-cpu-definitions didn't include the
unavailable-features field, which is used by libvirt to figure
out whether a certain cpu model is usable on the host.
The unavailable features are now computed by obtaining the host CPU
model and comparing it against the known CPU models. The comparison
takes into account the generation, the GA level and the feature
bitmaps. In the case of a CPU generation/GA level mismatch
a feature called "type" is reported to be missing.
As a result, the output of virsh domcapabilities would change
from something like
...
<mode name='custom' supported='yes'>
<model usable='unknown'>z10EC-base</model>
<model usable='unknown'>z9EC-base</model>
<model usable='unknown'>z196.2-base</model>
<model usable='unknown'>z900-base</model>
<model usable='unknown'>z990</model>
...
to
...
<mode name='custom' supported='yes'>
<model usable='yes'>z10EC-base</model>
<model usable='yes'>z9EC-base</model>
<model usable='no'>z196.2-base</model>
<model usable='yes'>z900-base</model>
<model usable='yes'>z990</model>
...
Signed-off-by: Viktor Mihajlovski <mihajlov@linux.vnet.ibm.com>
Message-Id: <1499082529-16970-1-git-send-email-mihajlov@linux.vnet.ibm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Acked-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Commit f6f4ce4211 ("s390x: add property adapter_routes_max_batch",
2016-12-09) introduces a realize intended to be common to all the flic
subclasses, but fails to make sure that kvm-flic, which already had its
own realize, actually calls this common one.
This omission fortunately does not result in a grave problem. The common
realize was only supposed to catch a possible programming mistake by
validating a value of a property set via the compat machine macros. Since
there was no programming mistake we don't need this fixed for stable.
Let's fix this problem by making sure kvm flic honors the realize of its
parent class.
Let us also improve on the error message we would hypothetically emit
when the validation fails.
Signed-off-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Fixes: f6f4ce4211 ("s390x: add property adapter_routes_max_batch")
Reviewed-by: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Reviewed-by: Yi Min Zhao <zyimin@linux.vnet.ibm.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Since it was introduced by commit a2875e6f98 ("s390x/kvm: implement
floating-interrupt controller device", 2013-07-16), kvm-flic has not
been making realize fail properly when it is impossible to create the
KVM device, which serves as the backend and is absolutely essential for
an operational kvm-flic.
Let's fix this by making sure we do proper error propagation in realize.
Signed-off-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Fixes: a2875e6f98 "s390x/kvm: implement floating-interrupt controller device"
Reviewed-by: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Reviewed-by: Yi Min Zhao <zyimin@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Commit bab482d740 ("s390x/css: ccw translation infrastructure")
introduced an instruction interception handler for different types of
subchannels. For emulated 3270 devices, we should assign the virtual
subchannel handler to them during the device realization process, or
3270 will not work.
Fixes: bab482d740 ("s390x/css: ccw translation infrastructure")
Reviewed-by: Jing Liu <liujbjl@linux.vnet.ibm.com>
Reviewed-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Let's vmstatify virtio_ccw_save_config and virtio_ccw_load_config for
flexibility (extending using subsections) and for fun.
To achieve this we need to hack the config_vector, which is VirtIODevice
(that is common virtio) state, in the middle of the VirtioCcwDevice state
representation. This is somewhat ugly, but we have no choice because the
stream format needs to be preserved.
There are almost no changes in behavior. The exception is everything that
comes with vmstate, like extra bookkeeping about what's in the stream, and
maybe some extra checks and better error reporting.
Signed-off-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Message-Id: <20170703213414.94298-1-pasic@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Add CONFIG_TCG guards for the frontend and backend files in the
related Makefiles.
Signed-off-by: Yang Zhong <yang.zhong@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Add tcg_enabled() checks where the x86 target needs to disable
TCG-specific code.
Signed-off-by: Yang Zhong <yang.zhong@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
This function calls tlb_set_page_with_attrs, which is not available
when TCG is disabled. Move it to excp_helper.c.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Split cpu_set_mxcsr() and make cpu_set_fpuc() inline, keeping the
TCG-specific code separate.
Signed-off-by: Yang Zhong <yang.zhong@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Move cpu_get_fp80()/cpu_set_fp80() from fpu_helper.c to
machine.c because fpu_helper.c will be disabled if tcg is
disabled in the build.
Signed-off-by: Yang Zhong <yang.zhong@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Move the cpu_sync_bndcs_hflags() function from mpx_helper.c
to helper.c because mpx_helper.c needs to be disabled when
tcg is disabled.
Signed-off-by: Yang Zhong <yang.zhong@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Add CONFIG_TCG guards for the frontend and backend files in the
related Makefiles.
Signed-off-by: Yang Zhong <yang.zhong@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Add CONFIG_TCG around TLB-related functions and structure declarations.
Some of these functions are defined in ./accel/tcg/cputlb.c, which will
not be linked in if TCG is disabled, and have no stubs; therefore, their
callers will also be compiled out for --disable-tcg.
Signed-off-by: Yang Zhong <yang.zhong@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
If tcg is disabled, the stub functions in the tcg-stub.c file will be
called instead. This is a target-independent file; do not add any
platform-related stub functions to it.
Signed-off-by: Yang Zhong <yang.zhong@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Change tcg_enabled() to make sure that user-mode builds still enable
tcg even when the x86 softmmu target disables it.
Signed-off-by: Yang Zhong <yang.zhong@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
translate-common.c will not be available anymore with --disable-tcg,
so we cannot leave cpu_interrupt_handler there.
Move the TCG-specific handler to accel/tcg/tcg-all.c, and adopt
KVM's handler as the default one, since it works just as well for
Xen and qtest.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
translate-all.c will not be built when tcg is disabled, so move the
page_size_init() function and related variables to exec.c.
Signed-off-by: Yang Zhong <yang.zhong@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Disable the tcg-related code in vl.c when the --disable-tcg option is
passed to ./configure.
Signed-off-by: Yang Zhong <yang.zhong@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
This lets you build without TCG (hardware acceleration or qtest only). When
this flag is passed to configure, it will automatically filter out the target
list to only those targets that support KVM, Xen or HAX.
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Check for unsupported targets in target_list, and print an
error early in the configuration process.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
This will be useful when the functions are called, early in the configure
process, to filter out targets that do not support hardware acceleration.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Not all platforms check whether a lock is initialized before it is used. In
particular, Linux seems to be more permissive than OSX.
Check the initialization state explicitly in our code to catch such bugs
earlier.
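A minimal sketch of the approach (the 'initialized' field name is an
assumption for illustration):

    struct QemuMutex {
        pthread_mutex_t lock;
        bool initialized;              /* set by qemu_mutex_init() */
    };

    void qemu_mutex_lock(QemuMutex *mutex)
    {
        assert(mutex->initialized);    /* catch use-before-init explicitly */
        pthread_mutex_lock(&mutex->lock);
    }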
Signed-off-by: Fam Zheng <famz@redhat.com>
Message-Id: <20170704122325.25634-1-famz@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Using signal to establish a signal handler is not portable; on
SysV systems, the signal handler would be reset to SIG_DFL after
delivery, while BSD preserves the signal handler. Daniel Berrange
reported that (to complicate matters further) the signal system call
has SysV behavior, but glibc signal() actually calls the sigaction
system call to provide BSD behavior.
However, using signal() to set a signal's disposition to SIG_DFL
or SIG_IGN is portable and is a relatively common occurrence in
QEMU source code, so allow that.
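For reference, a portable way to install a handler (a generic sketch, not
QEMU code) is to use sigaction() and reserve signal() for SIG_DFL and
SIG_IGN:

    #include <signal.h>
    #include <string.h>

    static void handler(int sig) { /* ... */ }

    struct sigaction act;
    memset(&act, 0, sizeof(act));
    act.sa_handler = handler;
    sigemptyset(&act.sa_mask);
    sigaction(SIGINT, &act, NULL);   /* handler stays installed (BSD semantics) */

    signal(SIGPIPE, SIG_IGN);        /* setting SIG_IGN/SIG_DFL is portable */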
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Reviewed-by: Richard W.M. Jones <rjones@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
In QEMU's main_loop() we used to check whether we should do
a nonblocking call to main_loop_wait(); this was deleted in commit e330c118f2,
because now that vCPUs always drop the I/O thread lock it is an unnecessary
optimization.
The loop in test-char.c copied the old QEMU main_loop() code, but
here the nonblocking check has never been necessary because this
standalone test case doesn't hold the I/O lock anyway. Remove it,
so we can drop the main_loop_wait() return value.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Message-Id: <1498584769-12439-2-git-send-email-peter.maydell@linaro.org>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
The original ready < nhandles - 1 can be re-written as ready + 1 <
nhandles. The check was actually incorrect because
WAIT_OBJECT_0 was not subtracted from ready; it worked only because
WAIT_OBJECT_0 is zero. After subtracting WAIT_OBJECT_0,
the result is the same condition that we check on the first
iteration of the for loop. This means we can remove the if statement
and let the for loop do the check.
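The resulting shape of the code is roughly (a sketch, not the literal
patch):

    DWORD i;
    DWORD ready = WaitForMultipleObjects(nhandles, handles, FALSE, timeout);
    if (ready == WAIT_FAILED || ready == WAIT_TIMEOUT) {
        return 0;                      /* illustrative error handling */
    }
    /* No separate "ready + 1 < nhandles" test: the loop starts at the
     * first signaled handle and re-checks the remaining ones itself. */
    for (i = ready - WAIT_OBJECT_0; i < nhandles; i++) {
        if (WaitForSingleObject(handles[i], 0) == WAIT_OBJECT_0) {
            /* record handles[i] as ready */
        }
    }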
Signed-off-by: Alistair Francis <alistair.francis@xilinx.com>
Message-Id: <a14083d681951f3999a0e9314605cb706381ae8d.1498756113.git.alistair.francis@xilinx.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
The 'sun_path' field in the sockaddr_un struct is not required
to be NUL terminated, so when reporting an error, we must use
the separate 'path' variable, which is guaranteed to be terminated.
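A sketch of the corrected report (illustrative, not the exact hunk):

    struct sockaddr_un un;
    /* un.sun_path may legally lack a trailing NUL, so never print it
     * with %s; use the original, NUL-terminated 'path' string instead. */
    if (strlen(path) + 1 > sizeof(un.sun_path)) {
        error_setg(errp, "UNIX socket path '%s' is too long", path);
        return -1;
    }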
Fixes a bug spotted by coverity that was introduced in
commit ad9579aaa1
Author: Daniel P. Berrange <berrange@redhat.com>
Date: Thu May 25 16:53:00 2017 +0100
sockets: improve error reporting if UNIX socket path is too long
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-Id: <20170626103756.22974-1-berrange@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Commit 1f5c00cfdb ("qom/cpu: move tlb_flush to cpu_common_reset")
moved the call to tlb_flush() from the target-specific reset handlers
into the common code qom/cpu.c file, and protected the call with
"#ifdef CONFIG_SOFTMMU" to avoid that it is called for linux-user
only targets. But since qom/cpu.c is common code, CONFIG_SOFTMMU is
*never* defined here, so the tlb_flush() was simply never executed
anymore. Fix it by introducing a wrapper for tlb_flush() in a file
that is re-compiled for each target, i.e. in translate-all.c.
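A minimal sketch of such a wrapper (the function name is assumed):

    /* translate-all.c is compiled once per target, so CONFIG_SOFTMMU is
     * actually meaningful here, unlike in the common qom/cpu.c code. */
    void tcg_flush_softmmu_tlb(CPUState *cs)
    {
    #ifdef CONFIG_SOFTMMU
        tlb_flush(cs);
    #endif
    }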
Fixes: 1f5c00cfdb
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-Id: <1498454578-18709-5-git-send-email-thuth@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
CONFIG_KVM is only defined for target-specific code, so nobody should
use it by accident in common code. To avoid such subtle bugs,
CONFIG_KVM is now marked as poisoned in common code. The header
include/sysemu/kvm.h is somewhat special since it is included
all over the place from common code, too, so we need some extra
logic via "#ifdef NEED_CPU_H" here to make sure that we can
compile all files without problems.
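The mechanics look roughly like this (a sketch; where exactly the poison
pragma lives is an assumption):

    /* In a header seen only by common code: */
    #pragma GCC poison CONFIG_KVM

    /* In include/sysemu/kvm.h, included from both worlds, the
     * target-specific bits are exposed only to per-target compilation: */
    #ifdef NEED_CPU_H
    # ifdef CONFIG_KVM
    /* ... declarations that may depend on CONFIG_KVM ... */
    # endif
    #endif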
Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-Id: <1498454578-18709-4-git-send-email-thuth@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
pc.h and sysemu/kvm.h are also included from common code (where
CONFIG_KVM is not available), so the #defines that depend on CONFIG_KVM
should not be declared here, to avoid anybody using them in a
wrong way. Since we're also going to poison CONFIG_KVM for common code,
let's move them to kvm_i386.h instead. Most of the dummy definitions
from sysemu/kvm.h are also unused since the code that uses them is
only compiled for CONFIG_KVM (e.g. target/i386/kvm.c), so the unused
defines are also simply dropped here instead of being moved.
Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-Id: <1498454578-18709-3-git-send-email-thuth@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Move the handling of conforming code segments before the handling
of stack switch.
Because dpl == cpl after the new "if", it's now unnecessary to check
the C bit when testing dpl < cpl. Furthermore, dpl > cpl is checked
slightly above the modified code, so the final "else" is unreachable
and we can remove it.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
In do_interrupt64(), when the interrupt stack table (IST) is enabled
and the target code segment is conforming (e2 & DESC_C_MASK), the
old implementation always set the new CPL to 0 and SS.RPL to 0.
This is incorrect: when CPL3 code accesses a CPL0 conforming code
segment, the CPL should remain unchanged; otherwise higher-privileged
code can be compromised.
The patch fixes this by always setting dpl = cpl when the target code
segment is conforming, and by modifying the last parameter, `flags`, which
carries the correct new CPL, in cpu_x86_load_seg_cache().
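In outline, the fix amounts to something like the following (a sketch
built from the existing helpers, not the literal diff):

    if (e2 & DESC_C_MASK) {
        dpl = cpl;     /* conforming segment: privilege level unchanged */
    }
    /* ... make the 'flags' argument carry the new CPL in its DPL field */
    e2 = (e2 & ~(3 << DESC_DPL_SHIFT)) | (dpl << DESC_DPL_SHIFT);
    cpu_x86_load_seg_cache(env, R_CS, selector,
                           get_seg_base(e1, e2), get_seg_limit(e1, e2), e2);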
Signed-off-by: Wu Xiang <willx8@gmail.com>
Message-Id: <20170621142152.GA18094@wxdeubuntu.ipads-lab.se.sjtu.edu.cn>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
When attaching the NBD QIOChannel to an AioContext, the TLS channel should
be used, not the underlying socket channel. This is because, trivially,
the TLS channel will be the one that we read/write to and thus the one
that will get the qio_channel_yield() call.
Fixes: ff82911cd3
Cc: qemu-stable@nongnu.org
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Tested-by: Daniel P. Berrange <berrange@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Since commit 3f2ce724f1 ("Move the qemu-ga description into a
separate chapter"), the qemu.1 man page looks pretty much screwed
up, e.g. the title was "qemu-ga - QEMU Guest Agent" instead of
"qemu-doc - QEMU Emulator User Documentation". However, that movement
of the qemu-ga chapter is not the real problem, it just triggered
another bug in qemu-doc.texi: there are some parts in the file
which introduce a "@c man begin OPTIONS" section, but never close
it again with "@c man end". After adding the proper end tags here,
the title of the man page is right again and the previously wrongly
tagged sections now also show up correctly in the man page, too.
Reported-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-Id: <1497863771-24929-1-git-send-email-thuth@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
edgar/xilinx-next.for-upstream
# gpg: Signature made Tue 04 Jul 2017 10:00:47 BST
# gpg: using RSA key 0x29C596780F6BCA83
# gpg: Good signature from "Edgar E. Iglesias (Xilinx key) <edgar.iglesias@xilinx.com>"
# gpg: aka "Edgar E. Iglesias <edgar.iglesias@gmail.com>"
# Primary key fingerprint: AC44 FEDC 14F7 F1EB EDBF 4151 29C5 9678 0F6B CA83
* remotes/edgar/tags/edgar/xilinx-next.for-upstream:
xilinx-dp: Add support for the yuy2 video format
target-microblaze: Add CPU version 10.0
target-microblaze: dec_barrel: Add BSIFI
target-microblaze: dec_barrel: Add BSEFI
target-microblaze: dec_barrel: Plug TCG temp leak
target-microblaze: dec_barrel: Add braces around if-statements
target-microblaze: dec_barrel: Use extract32
target-microblaze: dec_barrel: Use bool instead of unsigned int
target-microblaze: Introduce a use-pcmp-instr property
target-microblaze: Introduce a use-msr-instr property
target-microblaze: Introduce a use-hw-mul property
target-microblaze: Introduce a use-div property
target-microblaze: Introduce a use-barrel property
target-microblaze: Add CPU versions 9.4, 9.5 and 9.6
target-microblaze: Don't hard code 0xb as initial MB version
target-microblaze: Correct bit shift for the PVR0 version field
disas/microblaze: Add missing 'const' attributes
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
pc, acpi, pci, virtio: fixes, cleanups, features, tests
Some fixes and cleanups. New tests.
Configurable tx queue size for virtio-net.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
# gpg: Signature made Mon 03 Jul 2017 20:43:17 BST
# gpg: using RSA key 0x281F0DB8D28D5469
# gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>"
# gpg: aka "Michael S. Tsirkin <mst@redhat.com>"
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: 0270 606B 6F3C DF3D 0B17 0970 C350 3912 AFBE 8E67
# Subkey fingerprint: 5D09 FD08 71C8 F85B 94CA 8A0D 281F 0DB8 D28D 5469
* remotes/mst/tags/for_upstream: (21 commits)
i386/acpi: update expected acpi files
virtio-net: fix tx queue size for !vhost-user
tests: Add unit tests for the VM Generation ID feature
vhost-user: unregister slave req handler at cleanup time
vhost: ensure vhost_ops are set before calling iotlb callback
intel_iommu: fix migration breakage on mr switch
hw/acpi: remove dead acpi code
fw_cfg: move setting of FW_CFG_VERSION_DMA bit to fw_cfg_init1()
fw_cfg: don't map the fw_cfg IO ports in fw_cfg_io_realize()
i386/kvm/pci-assign: Use errp directly rather than local_err
i386/kvm/pci-assign: Fix return type of verify_irqchip_kernel()
pci: Convert shpc_init() to Error
pci: Convert to realize
pci: Replace pci_add_capability2() with pci_add_capability()
pci: Make errp the last parameter of pci_add_capability()
pci: Fix the wrong assertion.
pci: Add comment for pci_add_capability2()
pci: Clean up error checking in pci_add_capability()
intel_iommu: relax iq tail check on VTD_GCMD_QIE enable
hw/pci-bridge/dec: Classify the DEC PCI bridge as bridge device
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Add braces around if-statements.
No functional change.
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
Use extract32 instead of opencoding the shifting and masking.
No functional change.
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
Use bool instead of unsigned int to represent flags.
No functional change.
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
Making the opcode list 'const' saves memory.
Some function arguments and local variables needed 'const', too.
Add also 'static' to two local functions.
Reviewed-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
Signed-off-by: Stefan Weil <sw@weilnetz.de>
[EI: Removed old prototypes to fix the build]
Signed-off-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
We dropped some dead code, so update the expected table binaries.
Fixes: 4d7e7f2702 ("hw/acpi: remove dead acpi code")
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
The current code segfaults when no NIC peer is specified.
Fix it up by falling back to the default queue size.
Fixes: 9b02e1618c ("virtio-net: enable configurable tx queue size")
Cc: Wei Wang <wei.w.wang@intel.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
The following tests are implemented:
* test that a GUID passed in by command line is propagated to the guest.
Read the GUID from guest memory
* test that the "auto" argument to the GUID generates a valid GUID, as
seen by the guest.
* test that a GUID passed in can be queried from the monitor
This patch is loosely based on a previous patch from:
Gal Hammer <ghammer@redhat.com> and Igor Mammedov <imammedo@redhat.com>
Signed-off-by: Ben Warren <ben@skyportsystems.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
If the backend sends a request just before closing the socket,
the aio dispatcher might schedule its reading after the vhost
device has been cleaned up, leading to a NULL pointer dereference
in slave_read().
vhost_user_cleanup() already closes the socket but it is not
enough, the handler has to be unregistered.
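The cleanup ends up looking roughly like this (a sketch; the slave_fd
field name is taken from the slave channel description and may differ):

    /* Unregister the slave request handler before tearing the device
     * down, so the aio dispatcher can no longer schedule slave_read(). */
    if (u->slave_fd != -1) {
        qemu_set_fd_handler(u->slave_fd, NULL, NULL, NULL);
        close(u->slave_fd);
        u->slave_fd = -1;
    }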
Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
This patch fixes a crash that happens when vhost-user iommu
support is enabled and vhost-user socket is closed.
When it happens, if an IOTLB invalidation notification is sent
by the IOMMU, vhost_ops's NULL pointer is dereferenced.
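A sketch of the guard (illustrative):

    /* The IOMMU unmap notifier bails out early if the backend has
     * already been cleaned up and vhost_ops was set to NULL. */
    if (!dev->vhost_ops) {
        return;
    }
    /* ... otherwise forward the IOTLB invalidation to the backend ... */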
Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Migration is broken after the vfio integration work:
qemu-kvm: AHCI: Failed to start FIS receive engine: bad FIS receive buffer address
qemu-kvm: Failed to load ich9_ahci:ahci
qemu-kvm: error while loading state for instance 0x0 of device '0000:00:1f.2/ich9_ahci'
qemu-kvm: load of migration failed: Operation not permitted
The problem is that vfio work introduced dynamic memory region
switching (actually it is also used for future PT mode), and this memory
region layout is not properly delivered to the destination when migration
happens. The solution is to rebuild the layout in post_load.
Bug: https://bugzilla.redhat.com/show_bug.cgi?id=1459906
Fixes: 558e0024 ("intel_iommu: allow dynamic switch of IOMMU region")
Reviewed-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
The setting of the FW_CFG_VERSION_DMA bit is the same across both the
TYPE_FW_CFG_MEM and TYPE_FW_CFG_IO devices, so unify the logic in
fw_cfg_init1().
Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Reviewed-by: Laszlo Ersek <lersek@redhat.com>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Tested-by: Gabriel Somlo <somlo@cmu.edu>
As indicated by Laszlo it is a QOM bug for the realize() method to actually
map the device. Set up the IO regions within fw_cfg_io_realize() and defer
the mapping with sysbus_add_io() to the caller, as already done in
fw_cfg_init_mem_wide().
This makes the iobase and dma_iobase properties now obsolete so they can be
removed.
Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Reviewed-by: Laszlo Ersek <lersek@redhat.com>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Tested-by: Gabriel Somlo <somlo@cmu.edu>
When a function has no success value to transmit, it is usually made to
return void. That has turned out to be a mistake here, because it means
that an extra local_err variable and error_propagate() are needed. This
leads to cumbersome code, so transmitting success/failure in the return
value is worthwhile. Fix the return type to avoid it.
Cc: pbonzini@redhat.com
Cc: rth@twiddle.net
Cc: ehabkost@redhat.com
Cc: mst@redhat.com
Cc: armbru@redhat.com
Cc: marcel@redhat.com
Signed-off-by: Mao Zhongyi <maozy.fnst@cn.fujitsu.com>
Reviewed-by: Marcel Apfelbaum <marcel@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
On success, pci_add_capability2() returns a positive value. On
failure, it sets an error and returns a negative value.
pci_add_capability() laboriously checks this behavior. No other
caller does. Drop the checks from pci_add_capability().
Cc: mst@redhat.com
Cc: marcel@redhat.com
Signed-off-by: Mao Zhongyi <maozy.fnst@cn.fujitsu.com>
Reviewed-by: Marcel Apfelbaum <marcel@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
The VT-d spec (section 6.5.2) prescribes software to zero the
Invalidation Queue Tail Register before enabling the VTD_GCMD_QIE
Global Command Register bit. Windows Server 2012 R2 and possibly
other older Windows versions violate the protocol and set a
non-zero queue tail first, which in effect makes them crash early
on boot with -device intel-iommu,intremap=on.
This commit relaxes the check and instead of failing to enable
VTD_GCMD_QIE with vtd_err_qi_enable, it behaves as if the tail
register was set just after enabling VTD_GCMD_QIE
(see vtd_handle_iqt_write).
Signed-off-by: Ladi Prosek <lprosek@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
This patch enables the virtio-net tx queue size to be configurable
between 256 (the default queue size) and 1024 by the user when the
vhost-user backend is used.
Currently, the maximum tx queue size for other backends is 512 due
to the following limitations:
- QEMU backend: the QEMU backend implementation in some cases may
send 1024+1 iovs to writev.
- Vhost_net backend: the guest may send a vring_desc of memory which
crosses a MemoryRegion boundary, thereby generating more than 1024 iovs
after translation from guest-physical addresses in the backend.
Signed-off-by: Wei Wang <wei.w.wang@intel.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Some code paths can lead to atomic accesses racing with memset()
on cpu->tb_jmp_cache, which can result in torn reads/writes
and is undefined behaviour in C11.
These torn accesses are unlikely to show up as bugs, but from code
inspection they seem possible. For example, tb_phys_invalidate does:
    /* remove the TB from the hash list */
    h = tb_jmp_cache_hash_func(tb->pc);
    CPU_FOREACH(cpu) {
        if (atomic_read(&cpu->tb_jmp_cache[h]) == tb) {
            atomic_set(&cpu->tb_jmp_cache[h], NULL);
        }
    }
Here atomic_set might race with a concurrent memset (such as the
ones scheduled via "unsafe" async work, e.g. tlb_flush_page) and
therefore we might end up with a torn pointer (or who knows what,
because we are under undefined behaviour).
This patch converts parallel accesses to cpu->tb_jmp_cache to use
atomic primitives, thereby bringing these accesses back to defined
behaviour. The price to pay is to potentially execute more instructions
when clearing cpu->tb_jmp_cache, but given how infrequently they happen
and the small size of the cache, the performance impact I have measured
is within noise range when booting debian-arm.
Note that under "safe async" work (e.g. do_tb_flush) we could use memset
because no other vcpus are running. However I'm keeping these accesses
atomic as well to keep things simple and to avoid confusing analysis
tools such as ThreadSanitizer.
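For instance, clearing the cache can be written with atomic stores
instead of memset() (a sketch; the helper name is made up):

    static void tb_jmp_cache_clear_cpu(CPUState *cpu)
    {
        unsigned int i;

        for (i = 0; i < TB_JMP_CACHE_SIZE; i++) {
            atomic_set(&cpu->tb_jmp_cache[i], NULL);
        }
    }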
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1497486973-25845-1-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
We are relying on cpu_env being defined as a global, yet most
targets (i.e. all but arm/a64) have it defined as a local variable.
Luckily all of them use the same "cpu_env" name, but really
compilation shouldn't break if the name of that local variable
changed.
Fix it by using tcg_ctx.tcg_env, which all targets set in their
translate_init function. This change also helps pave the way
for the upcoming "translation loop common to all targets" work.
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1497639397-19453-3-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
# gpg: Signature made Fri 30 Jun 2017 15:08:45 BST
# gpg: using RSA key 0xCA35624C6A9171C6
# gpg: Good signature from "Fam Zheng <famz@redhat.com>"
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: 5003 7CB7 9706 0F76 F021 AD56 CA35 624C 6A91 71C6
* remotes/famz/tags/block-pull-request:
block: Exploit BDRV_BLOCK_EOF for larger zero blocks
block: Add BDRV_BLOCK_EOF to bdrv_get_block_status()
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
When we have a BDS with unallocated clusters, but asking the status
of its underlying bs->file or backing layer encounters an end-of-file
condition, we know that the rest of the unallocated area will read as
zeroes. However, pre-patch, this required two separate calls to
bdrv_get_block_status(), as the first call stops at the point where
the underlying file ends. Thanks to BDRV_BLOCK_EOF, we can now widen
the results of the primary status if the secondary status already
includes BDRV_BLOCK_ZERO.
In turn, this fixes a TODO mentioned in iotest 154, where we can now
see that all sectors in a partial cluster at the end of a file read
as zero when coupling the shorter backing file's status along with our
knowledge that the remaining sectors came from an unallocated cluster.
Also, note that the loop in bdrv_co_get_block_status_above() had an
inefficient exit: in cases where the active layer sets BDRV_BLOCK_ZERO
but does NOT set BDRV_BLOCK_ALLOCATED (namely, where we know we read
zeroes merely because our unallocated clusters lie beyond the backing
file's shorter length), we still ended up probing the backing layer
even though we already had a good answer.
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20170505021500.19315-3-eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Fam Zheng <famz@redhat.com>
Just as the block layer already sets BDRV_BLOCK_ALLOCATED as a
shortcut for subsequent operations, there are also some optimizations
that are made easier if we can quickly tell that *pnum will advance
us to the end of a file, via a new BDRV_BLOCK_EOF which gets set
by the block layer.
This just plumbs up the new bit; subsequent patches will make use
of it.
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20170505021500.19315-2-eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Fam Zheng <famz@redhat.com>
ppc patch queue 2017-06-30
* More DRC cleanups, these now actually fix a few bugs
* Properly implements the openpic timers (they now count and
generate interrupts)
* Fixes for XICS migration
* Fixes for migration of POWER9 RPT guests
* The last of the compatibility mode rework
# gpg: Signature made Fri 30 Jun 2017 10:52:25 BST
# gpg: using RSA key 0x6C38CACA20D9B392
# gpg: Good signature from "David Gibson <david@gibson.dropbear.id.au>"
# gpg: aka "David Gibson (Red Hat) <dgibson@redhat.com>"
# gpg: aka "David Gibson (ozlabs.org) <dgibson@ozlabs.org>"
# gpg: aka "David Gibson (kernel.org) <dwg@kernel.org>"
# Primary key fingerprint: 75F4 6586 AE61 A66C C44E 87DC 6C38 CACA 20D9 B392
* remotes/dgibson/tags/ppc-for-2.10-20170630: (21 commits)
spapr: Clean up DRC set_isolation_state() path
spapr: Clean up DRC set_allocation_state path
spapr: Make DRC reset force DRC into known state
spapr: Split DRC release from DRC detach
spapr: Eliminate DRC 'signalled' state variable
spapr: Start hotplugged PCI devices in ISOLATED state
target-ppc: Enable open-pic timers to count and generate interrupts
hw/ppc/spapr.c: consecutive 'spapr->patb_entry = 0' statements
spapr: prevent QEMU crash when CPU realization fails
target/ppc: Proper cleanup when ppc_cpu_realizefn fails
spapr: fix migration of ICPState objects from/to older QEMU
xics: directly register ICPState objects to vmstate
target/ppc: Fix return value in tcg radix mmu fault handler
target/ppc/excp_helper: Take BQL before calling cpu_interrupt()
spapr: Fix migration of Radix guests
spapr: Add a "no HPT" encoding to HTAB migration stream
ppc: Rework CPU compatibility testing across migration
pseries: Reset CPU compatibility mode
pseries: Move CPU compatibility property to machine
qapi: add explicit null to string input and output visitors
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Old kvm.ko versions only supported a tiny number of ioeventfds so
virtio-pci avoids ioeventfds when kvm_has_many_ioeventfds() returns 0.
Do not check kvm_has_many_ioeventfds() when KVM is disabled since it
always returns 0. Since commit 8c56c1a592
("memory: emulate ioeventfd") it has been possible to use ioeventfds in
qtest or TCG mode.
This patch makes -device virtio-blk-pci,iothread=iothread0 work even
when KVM is disabled.
I have tested that virtio-blk-pci works under TCG both with and without
iothread.
This patch fixes qemu-iotests 068, which was accidentally merged early
despite the dependency on ioeventfd.
Cc: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Tested-by: Eric Blake <eblake@redhat.com>
Tested-by: Kevin Wolf <kwolf@redhat.com>
Message-id: 20170628184724.21378-7-stefanha@redhat.com
Message-id: 20170615163813.7255-2-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Existing tests do not touch the virtqueue used ring. Instead they poll
the virtqueue ISR register and peek into their request's device-specific
status field.
It turns out that the virtqueue ISR register can be set to 1 more than
once for a single notification (see commit
83d768b564 "virtio: set ISR on dataplane
notifications"). This causes problems for tests that assume a 1:1
correspondence between the ISR being 1 and request completion.
Peeking at device-specific status fields is also problematic if the
device has no field that can be abused for EINPROGRESS polling
semantics. This is the case if all the field's values may be set by the
device; there's no magic constant left for polling.
It's time to process the used ring for completed requests, just like a
real virtio guest driver. This patch adds the necessary APIs.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Tested-by: Eric Blake <eblake@redhat.com>
Tested-by: Kevin Wolf <kwolf@redhat.com>
Message-id: 20170628184724.21378-3-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
There are substantial differences in the various paths through
set_isolation_state(), both for setting to ISOLATED versus UNISOLATED
state and for logical versus physical DRCs.
So, split the set_isolation_state() method into isolate() and unisolate()
methods, and give it different implementations for the two DRC types.
Factor some minimal common checks, including for valid indicator values
(which we weren't previously checking) into rtas_set_isolation_state().
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Michael Roth <mdroth@linux.vnet.ibm.com>
The allocation-state indicator should only actually be implemented for
"logical" DRCs, not physical ones. Factor a check for this, and also for
valid indicator state values into rtas_set_allocation_state(). Because
they don't exist for physical DRCs, there's no reason that we'd ever want
more than one method implementation, so it can just be a plain function.
In addition, the setting to USABLE and setting to UNUSABLE paths in
set_allocation_state() don't actually have much in common. So, split the
method into separate functions for each parameter value (drc_set_usable()
and drc_set_unusable()).
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Michael Roth <mdroth@linux.vnet.ibm.com>
The reset handler for DRCs attempts several state transitions which are
subject to various checks and restrictions. But at reset time we know
there is no guest, so we can ignore most of the usual sequencing rules and
just set the DRC back to a known state. In fact, it's safer to do so.
The existing code also has several redundant checks for
drc->awaiting_release inside a block which has already tested that. This
patch removes those and sets the DRC to a fixed initial state based only
on whether a device is currently plugged or not.
With DRCs correctly reset to a state based on device presence, we don't
need to force state transitions as cold plugged devices are processed.
This allows us to remove all the callers of the set_*_state() methods from
outside spapr_drc.c.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Michael Roth <mdroth@linux.vnet.ibm.com>
spapr_drc_detach() is called when qemu generic code requests a device be
unplugged. It makes a number of tests, which could well delay further
action until later, before actually detaching the device from the DRC.
This splits out the part which actually removes the device from the DRC
into spapr_drc_release(). This will be useful for further cleanups.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Michael Roth <mdroth@linux.vnet.ibm.com>
The 'signalled' field in the DRC appears to be entirely a torturous
workaround for the fact that PCI devices were started in UNISOLATED state
for unclear reasons.
1) 'signalled' is already meaningless for logical (so far, all non PCI)
DRCs. It's always set to true (at least at any point it might be tested),
and can't be assigned any real meaning due to the way signalling works for
logical DRCs.
2) For PCI DRCs, the only time signalled would be false is when non-zero
functions of a multifunction device are hotplugged, followed by function
zero (the other way around is explicitly not permitted). In that case the
secondary function DRCs are attached, but the notification isn't sent to
the guest until function 0 is plugged.
3) signalled being false is used to allow a DRC detach to switch mode
back to ISOLATED state, which allows a secondary function to be hotplugged
then unplugged with function 0 never inserted. Without this, a secondary
function starting in UNISOLATED state couldn't be detached again without
function 0 being inserted, all the functions being configured by the guest,
and then being sent back to ISOLATED state.
4) But now that PCI DRCs start in ISOLATED state, there's nothing to be
done. If the guest doesn't get the notification, it won't switch the
device to UNISOLATED state, so nothing prevents it from being unplugged.
If the guest does move it to UNISOLATED state without the signal (due to
a manual drmgr call, for instance) then it really isn't safe to unplug it.
So, this patch removes the signalled variable and all code related to it.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Michael Roth <mdroth@linux.vnet.ibm.com>
PCI DRCs, and only PCI DRCs, are immediately moved to UNISOLATED isolation
state once the device is attached. This has been there from the initial
implementation, and it's not clear why.
The state diagram in PAPR 13.4 suggests PCI devices should start in
ISOLATED state until the guest moves them into UNISOLATED, and the code in
the guest-side drmgr tool seems to work that way too.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Michael Roth <mdroth@linux.vnet.ibm.com>
Reviewed-by: Greg Kurz <groug@kaod.org>
Previously QEMU open-pic implemented the 4 open-pic timers including
all timer registers, but the timers did not "count" or generate any
interrupts. The patch makes the timers both count and generate
interrupts. The timer clock frequency is fixed at 25 MHz.
--
Responding to V2 patch comments.
- Simplify clock frequency logic and commentary.
- Remove camelCase variables.
- Timer objects now created at init rather than lazily.
Signed-off-by: Aaron Larson <alarson@ddci.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
In ppc_spapr_reset(), if the guest is using HPT, the code was executing:
    } else {
        spapr->patb_entry = 0;
        spapr_setup_hpt_and_vrma(spapr);
    }
And, at the end of spapr_setup_hpt_and_vrma:
    /* We're setting up a hash table, so that means we're not radix */
    spapr->patb_entry = 0;
Resulting in spapr->patb_entry being assigned to 0 twice in a row.
Given that 'spapr_setup_hpt_and_vrma' is also called inside
'spapr_check_setup_free_hpt' of spapr_hcall.c, this trivial patch removes
the 'patb_entry = 0' assignment from the 'else' clause inside ppc_spapr_reset
to avoid this behavior.
Signed-off-by: Daniel Henrique Barboza <danielhb@linux.vnet.ibm.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
ICPState objects were being allocated before CPU thread realization.
However commit 9ed656631d (xics: setup cpu at realize time) reversed it
by allocating ICPState objects after CPU thread is realized. But it
didn't take care to fix the error path because of which we observe
a SIGSEGV when CPU thread realization fails during cold/hotplug.
Fix this by ensuring that we do object_unparent() of the ICPState object
only when it was created earlier.
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Reviewed-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
If ppc_cpu_realizefn() fails after cpu_exec_realizefn() has been
called, we have to undo whatever cpu_exec_realizefn() did
by explicitly calling cpu_exec_unrealizefn(), which is currently
missing. Failure to do this proper cleanup results in a CPU
which was never fully realized lingering on the cpus list, causing a
SIGSEGV later (e.g. when running "info cpus").
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Reviewed-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Commit 5bc8d26de2 ("spapr: allocate the ICPState object from under
sPAPRCPUCore") moved ICPState objects from the machine to CPU cores.
This is an improvement since we no longer allocate ICPState objects
that will never be used. But it has the side-effect of breaking
migration of older machine types from older QEMU versions.
This patch allows spapr to register dummy "icp/server" entries to vmstate.
These entries use a dedicated VMStateDescription that can swallow and
discard the state of an incoming migration stream, and that doesn't send
anything on outgoing migration.
As for real ICPState objects, the instance_id is the cpu_index of the
corresponding vCPU, which happens to be equal to the generated instance_id
of older machine types.
The machine can unregister/register these entries when CPUs are dynamically
plugged/unplugged.
This is only available for pseries-2.9 and older machines, thanks to a
compat property.
Signed-off-by: Greg Kurz <groug@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
The ICPState objects are currently registered to vmstate as qdev objects.
Their instance ids are hence computed automatically in the migration code,
and thus depends on the order the CPU cores were plugged.
If the destination had its CPU cores plugged in a different order than the
source, then ICPState objects will have different instance_ids and load
the wrong state.
Since CPU objects have a reliable cpu_index which is already used as
instance_id in vmstate, let's use it for ICPState as well.
Please note that this doesn't break migration. Older machine types used to
allocate and realize all ICPState objects at machine init time, for the whole
lifetime of the machine. The qdev instance ids are thus 0,1,2... nr_servers
and happen to map to the vCPU indexes.
Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
The mmu fault handler should return 0 if it was able to successfully
handle the fault and a positive value otherwise.
Currently the tcg radix mmu fault handler will return 1 after
successfully handling a fault in virtual mode. This is incorrect
so fix it so that it returns 0 in this case.
The handler already correctly returns 0 when a fault was handled
in real mode and 1 if an interrupt was generated.
Fixes: d5fee0bbe6 ("target/ppc: Implement ISA V3.00 radix page fault handler")
Signed-off-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Add a "no HPT" encoding (using value -1) to the HTAB migration
stream (in the place of HPT size) when the guest doesn't allocate HPT.
This will help the target side to match target HPT with the source HPT
and thus enable successful migration.
Suggested-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Migrating between different CPU versions is a bit complicated for ppc.
A long time ago, we ensured identical CPU versions at either end by
checking the PVR had the same value. However, this breaks under KVM
HV, because we always have to use the host's PVR - it's not
virtualized. That would mean we couldn't migrate between hosts with
different PVRs, even if the CPUs are close enough to compatible in
practice (sometimes identical cores with different surrounding logic
have different PVRs, so this happens in practice quite often).
So, we removed the PVR check, but instead checked that several flags
indicating supported instructions matched. This turns out to be a bad
idea, because those instruction masks are not architected information, but
essentially a TCG implementation detail. So changes to qemu internal CPU
modelling can break migration - this happened between qemu-2.6 and
qemu-2.7. That was addressed by 146c11f1 "target-ppc: Allow eventual
removal of old migration mistakes".
Now, verification of CPU compatibility across a migration basically doesn't
happen. We simply ignore the PVR of the incoming migration, and hope the
cpu on the destination is close enough to work.
Now that we've cleaned up handling of processor compatibility modes
for the pseries machine type, we can do better. For new machine types
(pseries-2.10+) we allow migration if:
* The source and destination PVRs are for the same type of CPU, as
determined by CPU class's pvr_match function
OR * When the source was in a compatibility mode, and the destination CPU
supports the same compatibility mode
For older machine types we retain the existing behaviour - current CAS
code will usually set a compat mode which would break backwards
migration if we made them use the new behaviour. [Fixed from an
earlier version by Greg Kurz].
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
Tested-by: Andrea Bolognani <abologna@redhat.com>
Currently, the CPU compatibility mode is set when the cpu is initialized,
then again when the guest negotiates features. This means if a guest
negotiates a compatibility mode, then reboots, that compatibility mode
will be retained across the reset.
Usually that will get overridden when features are negotiated on the next
boot, but it's still not really correct. This patch moves the initial set
up of the compatibility mode from cpu init to reset time. The mode *is*
retained if the reboot was caused by the feature negotiation (it might
be important in that case, though it's unlikely).
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Michael Roth <mdroth@linux.vnet.ibm.com>
Tested-by: Andrea Bolognani <abologna@redhat.com>
Server class POWER CPUs have a "compat" property, which is used to set the
backwards compatibility mode for the processor. However, this only makes
sense for machine types which don't give the guest access to hypervisor
privilege - otherwise the compatibility level is under the guest's control.
To reflect this, this removes the CPU 'compat' property and instead
creates a 'max-cpu-compat' property on the pseries machine. Strictly
speaking this breaks compatibility, but AFAIK the 'compat' option was
never (directly) used with -device or device_add.
The option was used with -cpu. So, to maintain compatibility, this
patch adds a hack to the cpu option parsing to strip out any compat
options supplied with -cpu and set them on the machine property
instead of the now deprecated cpu property.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Tested-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
Reviewed-by: Greg Kurz <groug@kaod.org>
Tested-by: Greg Kurz <groug@kaod.org>
Tested-by: Andrea Bolognani <abologna@redhat.com>
When using the 40p machine, soundhw_init() is currently called twice,
one time from vl.c and one time from ibm_40p_init(). The call in
ibm_40p_init() was likely just a copy-and-paste from an old version
of the prep machine - but there the call to audio_init() (which was
the previous name of this function) has been removed many years ago
already, with commit b3e6d591b0
("audio: enable PCI audio cards for all PCI-enabled targets"), so
we certainly also do not need the soundhw_init() in the 40p function
anymore nowadays.
Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Sahid Ferdjaoui <sferdjao@redhat.com>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
Reviewed-by: Hervé Poussineau <hpoussin@reactos.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
HMP pull 2017-06-29
# gpg: Signature made Thu 29 Jun 2017 17:27:55 BST
# gpg: using RSA key 0x0516331EBC5BFDE7
# gpg: Good signature from "Dr. David Alan Gilbert (RH2) <dgilbert@redhat.com>"
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: 45F5 C71B 4A0C B7FB 977A 9FA9 0516 331E BC5B FDE7
* remotes/dgilbert/tags/pull-hmp-20170629:
Add chardev-send-break monitor command
monitor: Add -a (all) option to info registers
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Sending a break on a serial console can be useful for debugging the
guest. But not all chardev backends support sending breaks (only telnet
and mux do). The chardev-send-break command allows sending a break even
when using other backends.
Signed-off-by: Stefan Fritsch <sf@sfritsch.de>
Acked-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Message-Id: <20170611074817.13621-1-sf@sfritsch.de>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Use 'send a break' in all 3 pieces of text as suggested by eblake
The info registers command in the qemu monitor is used to dump register
values.
Currently this command uses the monitor cpu (which can be set by the
user) as the cpu whose registers will be dumped. Sometimes it is
useful to see the registers for all cpus, and currently this requires
setting the monitor cpu and then re-running the command for each cpu
in the system. It would be nice if there were an easier way to do this.
Add the "-a" option to the info registers command to dump the register
values for all cpus.
Signed-off-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
Message-Id: <20170608054116.17203-1-sjitindarsingh@gmail.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
- fixes a minor bug that could possibly prevent old guests from removing
directories
- makes default permissions for new files configurable from the cmdline
when using mapped security modes
- handle transport errors
- g_malloc()+memcpy() converted to g_memdup()
# gpg: Signature made Thu 29 Jun 2017 14:12:42 BST
# gpg: using DSA key 0x02FC3AEB0101DBC2
# gpg: Good signature from "Greg Kurz <groug@kaod.org>"
# gpg: aka "Greg Kurz <groug@free.fr>"
# gpg: aka "Greg Kurz <gkurz@linux.vnet.ibm.com>"
# gpg: aka "Gregory Kurz (Groug) <groug@free.fr>"
# gpg: aka "[jpeg image of size 3330]"
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: 2BD4 3B44 535E C0A7 9894 DBA2 02FC 3AEB 0101 DBC2
* remotes/gkurz/tags/for-upstream:
9pfs: handle transport errors in pdu_complete()
xen-9pfs: disconnect if buffers are misconfigured
virtio-9p: break device if buffers are misconfigured
virtio-9p: message header is 7-byte long
virtio-9p: record element after sanity checks
9pfs: replace g_malloc()+memcpy() with g_memdup()
9pfs: local: Add support for custom fmode/dmode in 9ps mapped security modes
9pfs: local: remove: use correct path component
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
The [NSEvent modifierFlags] method returns an NSEventModifierFlags type value in Mac OS 10.10. It used to be of type NSUInteger. Replacing NSEventModifierFlags with NSUInteger allows the cocoa.m file to be compiled on older versions of Mac OS. This patch has been tested on Mac OS 10.6 and Mac OS 10.12 without problems.
Signed-off-by: John Arbuckle <programmingkidx@gmail.com>
Message-id: F6C36C1A-4661-48F4-BEA6-3994889927D0@gmail.com
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
It is hard to analyze trace logs with multiple virtio-blk devices
because none of the trace events include the VirtIODevice *vdev.
This patch adds vdev so it's clear which device a request is associated
with.
I considered using VirtIOBlock *s instead but VirtIODevice *vdev is more
general and may be correlated with generic virtio trace events like
virtio_set_status.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170614092930.11234-1-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Contrary to what is written in the comment, a buggy guest can misconfigure
the transport buffers and pdu_marshal() may return an error. If this ever
happens, it is up to the transport layer to handle the situation (9P is
transport agnostic).
This fixes Coverity issue CID1348518.
Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
Implement xen_9pfs_disconnect by unbinding the event channels. On
xen_9pfs_free, call disconnect if any event channels haven't been
disconnected.
If the frontend misconfigured the buffers, set the backend to "Closing"
and disconnect it. Misconfigurations include requesting a read of more
bytes than available on the ring buffer, or claiming to be writing more
data than available on the ring buffer.
Signed-off-by: Stefano Stabellini <stefano@aporeto.com>
Signed-off-by: Greg Kurz <groug@kaod.org>
The 9P protocol is transport agnostic: if the guest misconfigured the
buffers, the best we can do is to set the broken flag on the device.
Signed-off-by: Greg Kurz <groug@kaod.org>
The 9p spec at http://man.cat-v.org/plan_9/5/intro reads:
"Each 9P message begins with a four-byte size field specify-
ing the length in bytes of the complete message including
the four bytes of the size field itself. The next byte is
the message type, one of the constants in the enumeration in
the include file <fcall.h>. The next two bytes are an iden-
tifying tag, described below."
ie, each message starts with a 7-byte long header.
The core 9P code already assumes this pretty much everywhere. This patch
does the following:
- makes the assumption explicit in the common 9p.h header, since it isn't
related to the transport
- open codes the header size in handle_9p_output() and hardens the sanity
check on the space needed for the reply message
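For illustration, the 7-byte header maps onto the obvious packed layout (a sketch only; the struct and field names here are assumptions, and the 9p code uses its marshalling helpers rather than raw struct access):
    /* size[4] type[1] tag[2] => 7 bytes, little-endian on the wire */
    typedef struct QEMU_PACKED P9MsgHeader {
        uint32_t size_le;   /* total message length, including these 7 bytes */
        uint8_t  id;        /* message type */
        uint16_t tag_le;    /* request identifier */
    } P9MsgHeader;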
Signed-off-by: Greg Kurz <groug@kaod.org>
Acked-by: Stefano Stabellini <sstabellini@kernel.org>
If the guest sends a malformed request, we end up with a dangling pointer
in V9fsVirtioState. This doesn't seem to cause any bug, but let's remove
this side effect anyway.
Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
I found these patterns by grepping the source tree. I don't have a
coccinelle script for it!
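The transformation is mechanical; schematically (variable names are illustrative):
    /* before */
    dst = g_malloc(len);
    memcpy(dst, src, len);
    /* after */
    dst = g_memdup(src, len);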
Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
In mapped security modes, files are created with very restrictive
permissions (600 for files and 700 for directories). This makes
file sharing between virtual machines and users on the host rather
complicated. Imagine, e.g., a group of users who need to access data
produced by processes on a virtual machine. Giving those users access
to the data will be difficult since the group access mode is always 0.
This patch makes the default mode for both files and directories
configurable. Existing setups that don't know about the new parameters
keep using the current secure behavior.
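As an assumed example of how the new parameters would appear on the command line (the path, tag and mode values are chosen arbitrarily):
    -virtfs local,path=/srv/share,mount_tag=share,security_model=mapped-xattr,fmode=0644,dmode=0775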
Signed-off-by: Tobias Schramm <tobleminer@gmail.com>
Signed-off-by: Greg Kurz <groug@kaod.org>
Commit a0e640a8 introduced a path processing error.
Pass fstatat() the dirpath-based path component instead
of the entire path.
Signed-off-by: Bruce Rogers <brogers@suse.com>
Signed-off-by: Greg Kurz <groug@kaod.org>
migration/next for 20170628
# gpg: Signature made Wed 28 Jun 2017 12:16:44 BST
# gpg: using RSA key 0xF487EF185872D723
# gpg: Good signature from "Juan Quintela <quintela@redhat.com>"
# gpg: aka "Juan Quintela <quintela@trasno.org>"
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: 1899 FF8E DEBF 58CC EE03 4B82 F487 EF18 5872 D723
* remotes/juanquintela/tags/migration/20170628:
exec: fix access to ram_list.dirty_memory when sync dirty bitmap
migration: add "return-path" capability
vmstate: error hint for failed equal checks
migration: add comment for TYPE_MIGRATE
migration: hmp: dump globals
migration: merge enforce_config_section somewhat
migration: move skip_section_footers
migration: move skip_configuration out
migration: move only_migratable to MigrationState
migration: move global_state.optional out
migration: let MigrationState be a qdev
vl: clean up global property registration
accel: introduce AccelClass.global_props
machine: export register_compat_prop()
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Xen 2017/06/27
# gpg: Signature made Tue 27 Jun 2017 23:02:43 BST
# gpg: using RSA key 0x894F8F4870E1AE90
# gpg: Good signature from "Stefano Stabellini <stefano.stabellini@eu.citrix.com>"
# gpg: aka "Stefano Stabellini <sstabellini@kernel.org>"
# Primary key fingerprint: D04E 33AB A51F 67BA 07D3 0AEA 894F 8F48 70E1 AE90
* remotes/sstabellini/tags/xen-20170627-tag:
xen-disk: add support for multi-page shared rings
xen-disk: only advertize feature-persistent if grant copy is not available
xen/disk: don't leak stack data via response ring
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
The 32-bit PPC auxv is a bit complicated because in the
mists of time it used to be 16-aligned rather than directly
after the environment. Older glibc versions had code to
try to probe for whether it needed alignment or not:
https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/powerpc/dl-sysdep.c;hb=e84eabb3871c9b39e59323bf3f6b98c2ca9d1cd0
and the kernel has code which puts some magic entries at
the bottom to ensure that the alignment probe fails:
http://elixir.free-electrons.com/linux/latest/source/arch/powerpc/include/asm/elf.h#L158
QEMU has similar code too, but it was broken by commit
7c4ee5bcc8, which changed elfload.c from filling in
the auxv starting at the highest address and working down
to starting at the lowest address and working up. This
means that the ARCH_DLINFO hook must now be invoked first
rather than last, and the entries in it for PPC must
be reversed so that the magic AT_IGNOREPPC entries come
at the lowest address in the auxv as they should.
The effect of this was that, when running a guest binary that
used an old glibc with the alignment probing, the guest ld.so
code would segfault if the size of the guest environment and
argv happened to put the auxv at an address that triggered
the alignment code in the guest glibc.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Tested-by: Richard Henderson <rth@twiddle.net>
Message-id: 1498582198-6649-1-git-send-email-peter.maydell@linaro.org
In cpu_physical_memory_sync_dirty_bitmap(rb, start, ...), the 2nd
argument 'start' is relative to the start of the ramblock 'rb'. When
it's used to access the dirty memory bitmap of ram_list (i.e.
ram_list.dirty_memory[DIRTY_MEMORY_MIGRATION]->blocks[]), an offset to
the start of all RAM (i.e. rb->offset) should be added to it, which has
however been missed since c/s 6b6712efcc. For a ramblock of host memory
backend whose offset is not zero, cpu_physical_memory_sync_dirty_bitmap()
synchronizes the incorrect part of the dirty memory bitmap of ram_list
to the per ramblock dirty bitmap. As a result, a guest with host
memory backend may crash after migration.
Fix it by adding the offset of ramblock when accessing the dirty memory
bitmap of ram_list in cpu_physical_memory_sync_dirty_bitmap().
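A minimal sketch of the idea (simplified; the real code in include/exec/ram_addr.h walks the bitmap in long-sized chunks rather than page by page):
    /* the index into ram_list's dirty bitmap must be global, not ramblock-relative */
    unsigned long page = (rb->offset + start) >> TARGET_PAGE_BITS;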
Reported-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
Message-Id: <20170628083704.24997-1-haozhong.zhang@intel.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Tested-by: Juan Quintela <quintela@redhat.com>
Tested-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
When this capability is enabled, QEMU will use the return path even for
precopy migration. This helps at least in one case: the destination
fails to load the image while the source quits without confirmation.
With the return path, the source will wait for the last response from
the destination, and if the destination fails, the migration fails on
the source, so the guest can be run again there (rather than being
assumed good and then lost after the source quits).
The capability needs to be enabled explicitly on the source; it is
disabled by default.
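For example, on the source it can be turned on via HMP before starting the migration (QMP's migrate-set-capabilities works as well):
    (qemu) migrate_set_capability return-path on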
Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <1498472935-14461-1-git-send-email-peterx@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
In some cases a failing VMSTATE_*_EQUAL does not mean we detected a bug,
but it's actually the best we can do. Especially in these cases a verbose
error message is required.
Let's introduce infrastructure for specifying an error hint to be used if
the equal check fails. Let's do this by adding a parameter to the _EQUAL
macros called _err_hint. Also change all current users to pass NULL as
the last parameter so nothing changes for them.
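An assumed usage of the new parameter might look like the following (the field name, state struct and hint text are hypothetical):
    VMSTATE_UINT32_EQUAL(num_queues, SomeDeviceState,
                         "Number of queues mismatch: check the device configuration on both sides"),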
Signed-off-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Message-Id: <20170623144823.42936-1-pasic@linux.vnet.ibm.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
Now we have some globals that can be configured for migration. Dump them
in HMP info migration for better debugging.
(we can also use this to monitor whether COMPAT fields are applied
correctly on compatible machines)
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <1498536619-14548-11-git-send-email-peterx@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
These two parameters:
- MachineState::enforce_config_section
- MigrationState::send_configuration
play a similar role here. This patch merges the first one into the
second, so we'll have a single place to check whether we need to
send the configuration section.
I didn't remove the MachineState.enforce_config_section field, since when
that machine property is applied (in machine_set_property()) we haven't
yet initialized the global properties and the migration object. So it's
still not easy to pass that boolean to MigrationState at such an early
time.
A natural benefit of this patch is that the meaning of
"enforce-config-section" is kept, since it still has the highest
priority (that's what "enforce" means, I guess).
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <1498536619-14548-10-git-send-email-peterx@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
The flag used to live in SaveState; now it is moved to MigrationState
altogether, its meaning is inverted, and it is renamed to
"send_configuration". Again, use HW_COMPAT_2_3 for old PC/SPAPR machines,
and accel_register_prop() for xen_init().
Remove savevm_skip_configuration().
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <1498536619-14548-8-git-send-email-peterx@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
One less global variable, and it only matters for migration.
We keep the old "--only-migratable" option, but now we also support:
-global migration.only-migratable=true
The old interface is still kept for now.
Hmm, now vl.c has no way to access migrate_get_current(). Export a
function for it to set up only_migratable.
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <1498536619-14548-7-git-send-email-peterx@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
Put it into MigrationState, so we can use the properties to specify
whether to enable storing global state.
Removing global_state_set_optional() since now we can use HW_COMPAT_2_3
for x86/power, and AccelClass.global_props for Xen.
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <1498536619-14548-6-git-send-email-peterx@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
Let the old man "MigrationState" join the object family. Direct benefit
is that we can start to use all the property features derived from
current QDev, like HW_COMPAT_* bits, command-line setup for migration
parameters (so we'll never need to set them up each time using HMP/QMP;
this is really, really attractive for test writers), etc.
I see no reason to disallow this yet. So let's start from this
one, to see whether it turns out to be anything good.
Now we init the MigrationState struct statically in main() to make sure
it's initialized after global properties are applied, since we'll use
them during creation of the object.
No functional change at all.
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <1498536619-14548-5-git-send-email-peterx@redhat.com>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
It's not that clear how the global properties are registered to
global_props (nor what their priority relationship is). Let's provide a
single function to be called in main() for that, with a comment to
explain it a bit.
Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <1498536619-14548-4-git-send-email-peterx@redhat.com>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
Introduce this new field for the accelerator classes so that in the
future each specific accelerator can register its own global properties
to be used further by the system. It works just like the old machine
compat properties do, but tailored for accelerators.
Introduce register_compat_props_array() for it. Export it so that it may
be used by other code as well in the future.
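A hedged sketch of how an accelerator class might use the new field (the property values and class name here are hypothetical, not what any accelerator actually registers):
    static GlobalProperty accel_compat_props[] = {
        {
            .driver   = "migration",
            .property = "store-global-state",
            .value    = "off",
        },
        { /* end of list */ }
    };

    static void my_accel_class_init(ObjectClass *oc, void *data)
    {
        AccelClass *ac = ACCEL_CLASS(oc);
        ac->global_props = accel_compat_props;
    }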
Suggested-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <1498536619-14548-3-git-send-email-peterx@redhat.com>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
We have HW_COMPAT_*, however that's only bound to machines, not other
things (like accelerators). Behind it, it is register_compat_prop()
that plays the trick. Let's export the function for further use
outside the HW_COMPAT_* magic.
Meanwhile, move it to qdev-properties.c, which seems more appropriate
(since it'll be used not only in machine code).
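For illustration, a caller outside of the HW_COMPAT_* macros could then do something like this (the driver/property/value triple is purely hypothetical):
    /* force a compat default for every instance of the given driver */
    register_compat_prop("virtio-blk-device", "scsi", "off");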
Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <1498536619-14548-2-git-send-email-peterx@redhat.com>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
The blkif protocol has had provision for negotiation of multi-page shared
rings for some time now, and many guest OSes have support in their
frontend drivers.
This patch makes the necessary modifications for xen-disk to support a
shared ring up to order 4 (i.e. 16 pages).
Signed-off-by: Paul Durrant <paul.durrant@citrix.com>
Signed-off-by: Stefano Stabellini <sstabellini@kernel.org>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
If grant copy is available then it will always be used in preference to
persistent maps. In this case, feature-persistent should not be advertised
to the frontend; otherwise it may needlessly copy data into persistently
granted buffers.
Signed-off-by: Paul Durrant <paul.durrant@citrix.com>
Signed-off-by: Stefano Stabellini <sstabellini@kernel.org>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
Rather than constructing a local structure instance on the stack, fill
the fields directly on the shared ring, just like other (Linux)
backends do. Build on the fact that all response structure flavors are
actually identical (aside from alignment and padding at the end).
This is XSA-216.
Reported-by: Anthony Perard <anthony.perard@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Stefano Stabellini <sstabellini@kernel.org>
Acked-by: Anthony PERARD <anthony.perard@citrix.com>
edgar/mmio-exec-v2.for-upstream
# gpg: Signature made Tue 27 Jun 2017 16:22:30 BST
# gpg: using RSA key 0x29C596780F6BCA83
# gpg: Good signature from "Edgar E. Iglesias (Xilinx key) <edgar.iglesias@xilinx.com>"
# gpg: aka "Edgar E. Iglesias <edgar.iglesias@gmail.com>"
# Primary key fingerprint: AC44 FEDC 14F7 F1EB EDBF 4151 29C5 9678 0F6B CA83
* remotes/edgar/tags/edgar/mmio-exec-v2.for-upstream:
xilinx_spips: allow mmio execution
exec: allow to get a pointer for some mmio memory region
introduce mmio_interface
qdev: add MemoryRegion property
cputlb: fix the way get_page_addr_code fills the tlb
cputlb: move get_page_addr_code
cputlb: cleanup get_page_addr_code to use VICTIM_TLB_HIT
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This allows executing from the lqspi area.
When request_ptr is called, the device loads 1024 bytes from the SPI device.
This code can then be executed by the guest.
Tested-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
Reviewed-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com>
Signed-off-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
This introduces a special callback which allows running code from some MMIO
devices.
A SysBusDevice with a MemoryRegion which implements the request_ptr callback
will be notified when the guest tries to execute code from its offset. It
will then be able to, e.g., pre-load some code from an SPI device or ask for
a pointer from an external simulator, etc.
When the pointer or the data in it are no longer valid, the device has to
invalidate it.
Reviewed-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com>
Signed-off-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
get_page_addr_code(..) does a cpu_ldub_code to fill the TLB.
This can lead to side effects if a device is mapped at this address.
So this patch replaces the cpu_memory_ld with a tlb_fill.
Reviewed-by: Richard Henderson <rth@twiddle.net>
Reviewed-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com>
Signed-off-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
Block layer patches
# gpg: Signature made Mon 26 Jun 2017 14:07:32 BST
# gpg: using RSA key 0x7F09B272C88F2FD6
# gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>"
# Primary key fingerprint: DC3D EB15 9A9A F95D 3D74 56FE 7F09 B272 C88F 2FD6
* remotes/kevin/tags/for-upstream: (60 commits)
qemu-img: don't shadow opts variable in img_dd()
block: Do not strcmp() with NULL uri->scheme
blkverify: Catch bs->exact_filename overflow
blkdebug: Catch bs->exact_filename overflow
fix: avoid an infinite loop or a dangling pointer problem in img_commit
block: change variable names in BlockDriverState
block: Remove bdrv_aio_readv/writev/flush()
qed: Use bdrv_co_* for coroutine_fns
qed: Add coroutine_fn to I/O path functions
qed: Use a coroutine for need_check_timer
qed: Simplify request handling
qed: Use CoQueue for serialising allocations
qed: Implement .bdrv_co_readv/writev
qed: Remove recursion in qed_aio_next_io()
qed: Remove ret argument from qed_aio_next_io()
qed: Add return value to qed_aio_read/write_data()
qed: Add return value to qed_aio_write_inplace/alloc()
qed: Add return value to qed_aio_write_cow()
qed: Add return value to qed_aio_write_main()
qed: Add return value to qed_aio_write_l2_update()
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Block patches for the block queue
# gpg: Signature made Mon Jun 26 14:56:24 2017 CEST
# gpg: using RSA key 0xF407DB0061D5CF40
# gpg: Good signature from "Max Reitz <mreitz@redhat.com>"
# Primary key fingerprint: 91BE B60A 30DB 3E88 57D1 1829 F407 DB00 61D5 CF40
* mreitz/tags/pull-block-2017-06-26:
qemu-img: don't shadow opts variable in img_dd()
block: Do not strcmp() with NULL uri->scheme
blkverify: Catch bs->exact_filename overflow
blkdebug: Catch bs->exact_filename overflow
fix: avoid an infinite loop or a dangling pointer problem in img_commit
block: change variable names in BlockDriverState
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
uri_parse(...)->scheme may be NULL. In fact, probably every field may be
NULL, and the callers do test this for all of the other fields but not
for scheme (except for block/gluster.c; block/vxhs.c does not access
that field at all).
We can easily fix this by using g_strcmp0() instead of strcmp().
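A minimal sketch of the pattern (the scheme being checked and the error handling are illustrative):
    URI *uri = uri_parse(filename);
    /* strcmp(uri->scheme, "nbd") would dereference NULL for scheme-less URIs */
    if (!uri || g_strcmp0(uri->scheme, "nbd") != 0) {
        return -EINVAL;
    }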
Cc: qemu-stable@nongnu.org
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20170613205726.13544-1-mreitz@redhat.com
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
All functions that are marked coroutine_fn can directly call the
bdrv_co_* version of functions instead of going through the wrapper.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Manos Pitsidianakis <el13635@mail.ntua.gr>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Now that we stay in coroutine context for the whole request when doing
reads or writes, we can add coroutine_fn annotations to many functions
that can do I/O or yield directly.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
This fixes the last place where we degraded from AIO to actual blocking
synchronous I/O requests. Putting it into a coroutine means that instead
of blocking, the coroutine simply yields while doing I/O.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Now that we process a request in the same coroutine from beginning to
end and don't drop out of it any more, we can look like a proper
coroutine-based driver and simply call qed_aio_next_io() and get a
return value from it instead of spawning an additional coroutine that
reenters the parent when it's done.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Now that we're running in coroutine context, the ad-hoc serialisation
code (which drops a request that has to wait out of coroutine context)
can be replaced by a CoQueue.
This means that when we resume a serialised request, it is running in
coroutine context again and its I/O isn't blocking any more.
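Schematically, the serialisation then becomes the standard CoQueue pattern (a sketch with assumed field names and condition, not the exact driver code; the qemu_co_queue_wait() signature has varied across QEMU versions):
    /* serialise allocating writes: later requests queue up behind the first */
    if (alloc_in_progress) {
        qemu_co_queue_wait(&s->allocating_write_reqs, NULL);
    }
    /* ... perform the allocating write ... */
    /* wake the next queued request; it resumes in coroutine context */
    qemu_co_queue_next(&s->allocating_write_reqs);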
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Most of the qed code is now synchronous and matches the coroutine model.
One notable exception is the serialisation between requests which can
still schedule a callback. Before we can replace this with coroutine
locks, let's convert the driver's external interfaces to the coroutine
versions.
We need to be careful to handle both requests that call the completion
callback directly from the calling coroutine (i.e. fully synchronous
code) and requests that involve some callback, in which case we need to
yield and wait for the completion callback coming from outside the
coroutine.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Manos Pitsidianakis <el13635@mail.ntua.gr>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Instead of calling itself recursively as the last thing, just convert
qed_aio_next_io() into a loop.
This patch is best reviewed with 'git show -w' because most of it is
just whitespace changes.
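The shape of the transformation, schematically (the helper name qed_process_request_piece is made up for this sketch):
    /* before: tail-recursive */
    static void qed_aio_next_io(QEDAIOCB *acb)
    {
        if (qed_process_request_piece(acb) > 0) {
            qed_aio_next_io(acb);       /* more work left: recurse */
        }
    }
    /* after: iterative */
    static void qed_aio_next_io(QEDAIOCB *acb)
    {
        while (qed_process_request_piece(acb) > 0) {
            /* keep going until the request completes or fails */
        }
    }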
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Don't recurse into qed_aio_next_io() and qed_aio_complete() here, but
just return an error code and let the caller handle it.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Don't recurse into qed_aio_next_io() and qed_aio_complete() here, but
just return an error code and let the caller handle it.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Don't recurse into qed_aio_next_io() and qed_aio_complete() here, but
just return an error code and let the caller handle it.
While refactoring qed_aio_write_alloc() to accommodate the change,
qed_aio_write_zero_cluster() ended up with a single line, so I chose to
inline that line and remove the function completely.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Don't recurse into qed_aio_next_io() and qed_aio_complete() here, but
just return an error code and let the caller handle it.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Don't recurse into qed_aio_next_io() and qed_aio_complete() here, but
just return an error code and let the caller handle it.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Don't recurse into qed_aio_next_io() and qed_aio_complete() here, but
just return an error code and let the caller handle it.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
qed_commit_l2_update() is unconditionally called at the end of
qed_aio_write_l1_update(). Inline it.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Note that this code is generally not running in coroutine context, so
this is an actual blocking synchronous operation. We'll fix this in a
moment.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Note that this code is generally not running in coroutine context, so
this is an actual blocking synchronous operation. We'll fix this in a
moment.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
The GenericCB infrastructure isn't used any more. Remove it.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Note that this code is generally not running in coroutine context, so
this is an actual blocking synchronous operation. We'll fix this in a
moment.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Note that this code is generally not running in coroutine context, so
this is an actual blocking synchronous operation. We'll fix this in a
moment.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
With this change, qed_aio_write_prefill() and qed_aio_write_postfill()
collapse into a single function. This is reflected by a rename of the
combined function to qed_aio_write_cow().
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Note that this code is generally not running in coroutine context, so
this is an actual blocking synchronous operation. We'll fix this in a
moment.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Note that this code is generally not running in coroutine context, so
this is an actual blocking synchronous operation. We'll fix this in a
moment.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Instead of passing the return value to a callback, return it to the
caller so that the callback can be inlined there.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Note that this code is generally not running in coroutine context, so
this is an actual blocking synchronous operation. We'll fix this in a
moment.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
The qed driver serialises allocating write requests. When the active
allocation is finished, the AIO callback is called, but after this, the
next allocating request is immediately processed instead of leaving the
coroutine. Resuming another allocation request in the same request
coroutine means that the request now runs in the wrong coroutine.
The following is one of the possible effects of this: The completed
request will generally reenter its request coroutine in a bottom half,
expecting that it completes the request in bdrv_driver_pwritev().
However, if the second request actually yielded before leaving the
coroutine, the reused request coroutine is in an entirely different
place and is reentered prematurely. Not a good idea.
Let's make sure that we exit the coroutine after completing the first
request by resuming the next allocating request only with a bottom
half.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
We already have functions for doing these calculations, so let's use
them instead of doing everything by hand. This makes the code a bit
more readable.
Signed-off-by: Alberto Garcia <berto@igalia.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
If the guest tries to write data that results on the allocation of a
new cluster, instead of writing the guest data first and then the data
from the COW regions, write everything together using one single I/O
operation.
This can improve the write performance by 25% or more, depending on
several factors such as the media type, the cluster size and the I/O
request size.
Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Instead of passing a single buffer pointer to do_perform_cow_write(),
pass a QEMUIOVector. This will allow us to merge the write requests
for the COW regions and the actual data into a single one.
Although do_perform_cow_read() does not strictly need to change its
API, we're doing it here as well for consistency.
Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reading both COW regions requires two separate requests, but it's
perfectly possible to merge them and perform only one. This generally
improves performance, particularly on rotating disk drives. The
downside is that the data in the middle region is read but discarded.
This patch takes a conservative approach and only merges reads when
the size of the middle region is <= 16KB.
Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
This patch splits do_perform_cow() into three separate functions to
read, encrypt and write the COW regions.
perform_cow() can now read both regions first, then encrypt them and
finally write them to disk. The memory allocation is also done in
this function now, using one single buffer large enough to hold both
regions.
Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Instead of calling perform_cow() twice with a different COW region
each time, call it just once and make perform_cow() handle both
regions.
This patch simply moves code around. The next one will do the actual
reordering of the COW operations.
Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Qcow2COWRegion has two attributes:
- The offset of the COW region from the start of the first cluster
touched by the I/O request. Since it's always going to be positive
and the maximum request size is at most INT_MAX, we can use a
regular unsigned int to store this offset.
- The size of the COW region in bytes. This is guaranteed to be >= 0,
so we should use an unsigned type instead.
In x86_64 this reduces the size of Qcow2COWRegion from 16 to 8 bytes.
It will also help keep some assertions simpler now that we know that
there are no negative numbers.
The prototype of do_perform_cow() is also updated to reflect these
changes.
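After the change the structure is roughly the following (a sketch of the layout the text describes):
    typedef struct Qcow2COWRegion {
        /* offset of the region from the start of the first cluster touched
         * by the I/O request; always >= 0 and bounded by the request size */
        unsigned    offset;
        /* size of the region in bytes (0 means no COW is needed) */
        unsigned    nb_bytes;
    } Qcow2COWRegion;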
Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
We are using the return value of qcow2_encrypt_sectors() to detect
problems but we are throwing away the returned Error since we have no
way to report it to the user. Therefore we can simply get rid of the
local Error variable and pass NULL instead.
Alternatively we could try to figure out a way to pass the original
error instead of simply returning -EIO, but that would be more
invasive, so let's keep the current approach.
Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Add the ability for the NVMe model to support both the RDS and WDS
modes in the Controller Memory Buffer.
Although not currently supported in the upstream Linux kernel, a fork
with support exists [1], and user-space test programs that build on
this also exist [2].
This is useful for testing CMB functionality in preparation for real
CMB-enabled NVMe devices (coming soon).
[1] https://github.com/sbates130272/linux-p2pmem
[2] https://github.com/sbates130272/p2pmem-test
Signed-off-by: Stephen Bates <sbates@raithlin.com>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Perform the savevm/loadvm test with both iothread on and off. This
covers the recently found savevm/loadvm hang when iothread is enabled.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
The legacy -hda option does not support -drive/-device parameters. They
will be required by the next patch that extends this test case.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
migration_incoming_state_destroy() uses qemu_fclose() on the vmstate
file. Make sure to call it inside an AioContext acquire/release region.
This fixes a 'qemu: qemu_mutex_unlock: Operation not permitted' abort
in loadvm.
This patch closes the vmstate file before ending the drained region.
Previously we closed the vmstate file after ending the drained region.
The order does not matter.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
There used to be throttle_timers_{detach,attach}_aio_context() calls
in bdrv_set_aio_context(), but since 7ca7f0f6db
they are now in blk_set_aio_context().
Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
This documents the driver-specific options for the raw, qcow2 and file
block drivers for the man page. For everything else, we refer to the
QAPI documentation.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
This adds documentation for the -blockdev options that apply to all
nodes independent of the block driver used.
All options that are shared by -blockdev and -drive are now explained in
the section for -blockdev. The documentation of -drive mentions that all
-blockdev options are accepted as well.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
blk/bdrv_drain_all() only takes effect for a single instant and then
resumes block jobs, guest devices, and other external clients like the
NBD server. This can be handy when performing a synchronous drain
before terminating the program, for example.
Monitor commands usually need to quiesce I/O across an entire code
region so blk/bdrv_drain_all() is not suitable. They must use
bdrv_drain_all_begin/end() to mark the region. This prevents new I/O
requests from slipping in or worse - block jobs completing and modifying
the graph.
I audited other blk/bdrv_drain_all() callers but did not find anything
that needs a similar fix. This patch fixes the savevm/loadvm commands.
Although I haven't encountered a real-world issue, this makes the code
safer.
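The intended usage pattern for such a code region, sketched (do_monitor_work() stands in for the actual savevm/loadvm body):
    bdrv_drain_all_begin();
    /* all external I/O (jobs, guest devices, NBD clients) stays quiesced here,
     * so the block graph cannot change under our feet */
    ret = do_monitor_work();
    bdrv_drain_all_end();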
Suggested-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
AioContext was designed to allow nested acquire/release calls. It uses
a recursive mutex so callers don't need to worry about nesting...or so
we thought.
BDRV_POLL_WHILE() is used to wait for block I/O requests. It releases
the AioContext temporarily around aio_poll(). This gives IOThreads a
chance to acquire the AioContext to process I/O completions.
It turns out that recursive locking and BDRV_POLL_WHILE() don't mix.
BDRV_POLL_WHILE() only releases the AioContext once, so the IOThread
will not be able to acquire the AioContext if it was acquired
multiple times.
Instead of trying to release AioContext n times in BDRV_POLL_WHILE(),
this patch simply avoids nested locking in save_vmstate(). It's the
simplest fix and we should step back to consider the big picture with
all the recent changes to block layer threading.
This patch is the final fix to solve 'savevm' hanging with -object
iothread.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Calling aio_poll() directly may have been fine previously, but this is
the future, man! The difference between an aio_poll() loop and
BDRV_POLL_WHILE() is that BDRV_POLL_WHILE() releases the AioContext
around aio_poll().
This allows the IOThread to run fd handlers or BHs to complete the
request. Failure to release the AioContext causes deadlocks.
Using BDRV_POLL_WHILE() partially fixes a 'savevm' hang with -object
iothread.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Call bdrv_inc/dec_in_flight() for vmstate reads/writes. This seems
unnecessary at first glance because vmstate reads/writes are done
synchronously while the guest is stopped. But we need the bdrv_wakeup()
in bdrv_dec_in_flight() so the main loop sees request completion.
Besides, it's cleaner to count vmstate reads/writes like ordinary
read/write requests.
The bdrv_wakeup() partially fixes a 'savevm' hang with -object iothread.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
When qemu is exited, all running jobs should be cancelled successfully.
This adds a test for this for all types of block jobs that currently
exist in qemu.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
After _cleanup_qemu(), test cases should be able to start the next qemu
process and call _cleanup_qemu() for that one as well. For this to work
cleanly, we need to improve the cleanup so that the second invocation
doesn't try to kill the qemu instances from the first invocation a
second time (which would result in error messages).
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
commit_complete() can't assume that after its block_job_completed() the
job is actually immediately freed; someone else may still be holding
references. In this case, the op blockers on the intermediate nodes make
the graph reconfiguration in the completion code fail.
Call block_job_remove_all_bdrv() manually so that we know for sure that
any blockers on intermediate nodes are given up.
Cc: qemu-stable@nongnu.org
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
We want the wide character functions from the ncurses header.
Unfortunately it doesn't provide them by default, but only
if either:
* NCURSES_WIDECHAR is defined (for ncurses 20111030 and up)
* _XOPEN_SOURCE/_XOPEN_SOURCE_EXTENDED are suitably defined
So far we have been implicitly relying on the latter, because
for GNU libc when we define _GNU_SOURCE this causes libc
to define the _XOPEN_SOURCE macros for us. Unfortunately
this doesn't work on all libcs, because some (like OSX and
musl libc) do not define _XOPEN_SOURCE when _GNU_SOURCE
is defined.
We can't fix this by defining _XOPEN_SOURCE ourselves, because
that also means "and don't provide any functions that aren't in
that standard", and not all libcs provide any way to override
that to also get the non-standard functions. In particular
FreeBSD has no such mechanism, and OSX's _DARWIN_C_SOURCE
doesn't reenable everything (for instance getpagesize()
is still not prototyped if _DARWIN_C_SOURCE and _XOPEN_SOURCE
are both defined).
So we have to define NCURSES_WIDECHAR. (This will only work
if your ncurses is at least 20111030, as older versions
don't honour this macro.)
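The effect is roughly equivalent to the following (in the actual tree the define comes from the configure-generated compiler flags rather than the source file):
    #define NCURSES_WIDECHAR 1
    #include <curses.h>
    /* wide-character APIs such as wget_wch() and setcchar() are now declared */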
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Acked-by: Laszlo Ersek <lersek@redhat.com>
Message-id: 1496414138-7622-1-git-send-email-peter.maydell@linaro.org
Let's keep it very simple for now and flush the complete TLB; we
currently can't find the right entries in our TLB, as we would have
to store the used tables for each element.
As we now fully implement the DAT-enhancement facility, we can allow
enabling it for the qemu CPU model.
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20170622094151.28633-4-david@redhat.com>
Signed-off-by: Richard Henderson <rth@twiddle.net>
Most of the PSW bits that were being copied into TB->flags
are not relevant to translation. Removing those that are
unnecessary reduces the amount of translation required.
Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
We missed the proper alignment in TRTO/TRTT, and were ignoring the M3
field for all TRXX insns without ETF2-ENH.
Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
Signed-off-by: Richard Henderson <rth@twiddle.net>
This facility bit includes execution-hint, load-and-trap,
miscellaneous-instruction-extensions and processor-assist.
Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
Signed-off-by: Richard Henderson <rth@twiddle.net>
This facility bit includes load-on-condition-2 and
load-and-zero-rightmost-byte.
Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
Signed-off-by: Richard Henderson <rth@twiddle.net>
This facility bit includes DFP-rounding, FPR-GR-transfer,
FPS-sign-handling, and IEEE-exception-simulation. We do
support all of these.
Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
Signed-off-by: Richard Henderson <rth@twiddle.net>
This adds support for the MOVE WITH OPTIONAL SPECIFICATIONS (MVCOS)
instruction. Allow enabling it for the qemu cpu model using
qemu-system-s390x ... -cpu qemu,mvcos=on ...
This allows booting a Linux kernel that uses it for uaccess.
We are missing (as for most other parts) low-address protection checks,
PSW key / storage key checks and support for AR-mode.
We fake an ADDRESSING exception when called from problem state (which
seems to rely on PSW key checks to be in place) and if AR-mode is used.
User mode will always see a PRIVILEGED exception.
This patch is based on an original patch by Miroslav Benes (thanks!).
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20170614133819.18480-3-david@redhat.com>
Signed-off-by: Richard Henderson <rth@twiddle.net>
The FAC_ names were placeholders prior to the introduction
of the current facility modeling.
Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
Signed-off-by: Richard Henderson <rth@twiddle.net>
pull-seccomp-20170622
# gpg: Signature made Thu 22 Jun 2017 09:01:01 BST
# gpg: using RSA key 0xDF32E7C0F0FFF9A2
# gpg: Good signature from "Eduardo Otubo (Senior Software Engineer) <otubo@redhat.com>"
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: D67E 1B50 9374 86B4 0723 DBAB DF32 E7C0 F0FF F9A2
* remotes/otubo/tags/pull-seccomp-20170622:
MAINTAINERS: seccomp: change email contact for Eduardo Otubo
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Programs running inside of QEMU can sometimes use more CPU time than is really
needed. To solve this problem, we just need to throttle the virtual CPU. This
feature will stop laptops from burning up.
This patch adds a menu called Speed that has menu items from 100% to 1% that
represent the speed options. 100% is full speed and 1% is slowest.
Signed-off-by: John Arbuckle <programmingkidx@gmail.com>
Message-id: D6FAAABF-064D-49C0-B572-C73679F34052@gmail.com
[PMM: Moved "mark 100% menu item as checked initially" code to
after menu item is allocated, not before it]
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
As of release 10.12.4, OS X (Sierra) refuses to boot unless the
AppleSMC supports an additional I/O port, expected to provide an
error status code.
Update the [cmd|data]_write() and data_read() methods to implement
the required state machine, and add I/O region & methods to handle
access to the error port.
Originally proposed by Eric Shelton <eshelton@pobox.com> based in
part on FakeSMC (git://git.assembla.com/fakesmc.git).
Signed-off-by: Gabriel Somlo <gsomlo@gmail.com>
Reviewed-by: Alexander Graf <agraf@suse.de>
Reviewed-by: Phil Dennis-Jordan <phil@philjordan.eu>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-id: 1497639316-22202-3-git-send-email-gsomlo@gmail.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
If properly preceded by qio_channel_detach_aio_context, this function really
has nothing to do except setting ioc->ctx.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
"Note that if you provide several changes to single variable\n"
"last change will stay in effect.\n"
"\n"
QEMU_HELP_BOTTOM"\n"
,
TARGET_NAME,
interp_prefix,
@@ -900,10 +902,6 @@ int main(int argc, char **argv)
/* NOTE: we need to init the CPU at this stage to get
qemu_host_page_size */
cpu=cpu_init(cpu_model);
if(!cpu){
fprintf(stderr,"Unable to find CPU definition\n");
exit(1);
}
env=cpu->env_ptr;
#if defined(TARGET_SPARC) || defined(TARGET_PPC)
cpu_reset(cpu);