Merge remote-tracking branch 'mreitz/tags/pull-block-for-kevin-2015-09-04' into queue-block
Block patches from 2015-08-24 until 2015-09-04. # gpg: Signature made Fri Sep 4 21:02:10 2015 CEST using RSA key ID E838ACAD # gpg: Good signature from "Max Reitz <mreitz@redhat.com>" * mreitz/tags/pull-block-for-kevin-2015-09-04: quorum: validate vote threshold against num_children even if read-pattern is fifo qcow2: reorder fields in Qcow2CachedTable to reduce padding docs: document how to configure the qcow2 L2/refcount caches qcow2: add option to clean unused cache entries after some time qcow2: mark the memory as no longer needed after qcow2_cache_empty() Signed-off-by: Kevin Wolf <kwolf@redhat.com>
This commit is contained in:
		@@ -22,16 +22,24 @@
 | 
				
			|||||||
 * THE SOFTWARE.
 | 
					 * THE SOFTWARE.
 | 
				
			||||||
 */
 | 
					 */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* Needed for CONFIG_MADVISE */
 | 
				
			||||||
 | 
					#include "config-host.h"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#if defined(CONFIG_MADVISE) || defined(CONFIG_POSIX_MADVISE)
 | 
				
			||||||
 | 
					#include <sys/mman.h>
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#include "block/block_int.h"
 | 
					#include "block/block_int.h"
 | 
				
			||||||
#include "qemu-common.h"
 | 
					#include "qemu-common.h"
 | 
				
			||||||
 | 
					#include "qemu/osdep.h"
 | 
				
			||||||
#include "qcow2.h"
 | 
					#include "qcow2.h"
 | 
				
			||||||
#include "trace.h"
 | 
					#include "trace.h"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
typedef struct Qcow2CachedTable {
 | 
					typedef struct Qcow2CachedTable {
 | 
				
			||||||
    int64_t  offset;
 | 
					    int64_t  offset;
 | 
				
			||||||
    bool     dirty;
 | 
					 | 
				
			||||||
    uint64_t lru_counter;
 | 
					    uint64_t lru_counter;
 | 
				
			||||||
    int      ref;
 | 
					    int      ref;
 | 
				
			||||||
 | 
					    bool     dirty;
 | 
				
			||||||
} Qcow2CachedTable;
 | 
					} Qcow2CachedTable;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
struct Qcow2Cache {
 | 
					struct Qcow2Cache {
 | 
				
			||||||
@@ -41,6 +49,7 @@ struct Qcow2Cache {
 | 
				
			|||||||
    bool                    depends_on_flush;
 | 
					    bool                    depends_on_flush;
 | 
				
			||||||
    void                   *table_array;
 | 
					    void                   *table_array;
 | 
				
			||||||
    uint64_t                lru_counter;
 | 
					    uint64_t                lru_counter;
 | 
				
			||||||
 | 
					    uint64_t                cache_clean_lru_counter;
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static inline void *qcow2_cache_get_table_addr(BlockDriverState *bs,
 | 
					static inline void *qcow2_cache_get_table_addr(BlockDriverState *bs,
 | 
				
			||||||
@@ -60,6 +69,56 @@ static inline int qcow2_cache_get_table_idx(BlockDriverState *bs,
 | 
				
			|||||||
    return idx;
 | 
					    return idx;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
					 * Mark the memory backing @num_tables consecutive tables of cache @c,
					 * starting at table index @i, as no longer needed.
					 *
					 * Only whole host pages that lie completely inside the table range are
					 * handed to qemu_madvise(); partial pages at either end are left alone.
					 * The body is compiled out when QEMU_MADV_DONTNEED is QEMU_MADV_INVALID.
					 */
					static void qcow2_cache_table_release(BlockDriverState *bs, Qcow2Cache *c,
					                                      int i, int num_tables)
					{
					#if QEMU_MADV_DONTNEED != QEMU_MADV_INVALID
					    BDRVQcowState *s = bs->opaque;
					    void *t = qcow2_cache_get_table_addr(bs, c, i);
					    int align = getpagesize();
					    size_t mem_size = (size_t) s->cluster_size * num_tables;
					    /* Bytes to skip so that the advised region starts on a page boundary */
					    size_t offset = QEMU_ALIGN_UP((uintptr_t) t, align) - (uintptr_t) t;
					    size_t length;

					    /*
					     * If the range ends at or before the first page boundary there is no
					     * whole page to release.  Testing this first also prevents the
					     * unsigned subtraction below from wrapping around to a huge length.
					     */
					    if (mem_size <= offset) {
					        return;
					    }

					    length = QEMU_ALIGN_DOWN(mem_size - offset, align);
					    if (length > 0) {
					        qemu_madvise((uint8_t *) t + offset, length, QEMU_MADV_DONTNEED);
					    }
					#endif
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
					 * Tell whether entry @i of cache @c may be dropped by the cache cleaner.
					 *
					 * An entry qualifies when its reference count is zero, it is not dirty,
					 * its offset is non-zero and its lru_counter does not exceed
					 * c->cache_clean_lru_counter.
					 */
					static inline bool can_clean_entry(Qcow2Cache *c, int i)
					{
					    const Qcow2CachedTable *entry = &c->entries[i];

					    if (entry->ref != 0 || entry->dirty || entry->offset == 0) {
					        return false;
					    }

					    return entry->lru_counter <= c->cache_clean_lru_counter;
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
					 * Walk all entries of cache @c, reset every entry accepted by
					 * can_clean_entry() (offset and lru_counter become 0) and release the
					 * memory of each run of adjacent cleaned entries with one call to
					 * qcow2_cache_table_release().
					 *
					 * Afterwards c->cache_clean_lru_counter is set to the current
					 * c->lru_counter.
					 */
					void qcow2_cache_clean_unused(BlockDriverState *bs, Qcow2Cache *c)
					{
					    int pos = 0;

					    while (pos < c->size) {
					        int run_start;

					        /* Step over entries that have to stay in the cache */
					        if (!can_clean_entry(c, pos)) {
					            pos++;
					            continue;
					        }

					        /* Reset every entry of the cleanable run that starts here */
					        run_start = pos;
					        do {
					            c->entries[pos].offset = 0;
					            c->entries[pos].lru_counter = 0;
					            pos++;
					        } while (pos < c->size && can_clean_entry(c, pos));

					        /* The run holds at least one entry; release it in one go */
					        qcow2_cache_table_release(bs, c, run_start, pos - run_start);
					    }

					    c->cache_clean_lru_counter = c->lru_counter;
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables)
 | 
					Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
    BDRVQcowState *s = bs->opaque;
 | 
					    BDRVQcowState *s = bs->opaque;
 | 
				
			||||||
@@ -237,6 +296,8 @@ int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c)
 | 
				
			|||||||
        c->entries[i].lru_counter = 0;
 | 
					        c->entries[i].lru_counter = 0;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    qcow2_cache_table_release(bs, c, 0, c->size);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    c->lru_counter = 0;
 | 
					    c->lru_counter = 0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    return 0;
 | 
					    return 0;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -467,6 +467,11 @@ static QemuOptsList qcow2_runtime_opts = {
 | 
				
			|||||||
            .type = QEMU_OPT_SIZE,
 | 
					            .type = QEMU_OPT_SIZE,
 | 
				
			||||||
            .help = "Maximum refcount block cache size",
 | 
					            .help = "Maximum refcount block cache size",
 | 
				
			||||||
        },
 | 
					        },
 | 
				
			||||||
 | 
					        {
 | 
				
			||||||
 | 
					            .name = QCOW2_OPT_CACHE_CLEAN_INTERVAL,
 | 
				
			||||||
 | 
					            .type = QEMU_OPT_NUMBER,
 | 
				
			||||||
 | 
					            .help = "Clean unused cache entries after this time (in seconds)",
 | 
				
			||||||
 | 
					        },
 | 
				
			||||||
        { /* end of list */ }
 | 
					        { /* end of list */ }
 | 
				
			||||||
    },
 | 
					    },
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
@@ -482,6 +487,49 @@ static const char *overlap_bool_option_names[QCOW2_OL_MAX_BITNR] = {
 | 
				
			|||||||
    [QCOW2_OL_INACTIVE_L2_BITNR]    = QCOW2_OPT_OVERLAP_INACTIVE_L2,
 | 
					    [QCOW2_OL_INACTIVE_L2_BITNR]    = QCOW2_OPT_OVERLAP_INACTIVE_L2,
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static void cache_clean_timer_cb(void *opaque)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					    BlockDriverState *bs = opaque;
 | 
				
			||||||
 | 
					    BDRVQcowState *s = bs->opaque;
 | 
				
			||||||
 | 
					    qcow2_cache_clean_unused(bs, s->l2_table_cache);
 | 
				
			||||||
 | 
					    qcow2_cache_clean_unused(bs, s->refcount_block_cache);
 | 
				
			||||||
 | 
					    timer_mod(s->cache_clean_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
 | 
				
			||||||
 | 
					              (int64_t) s->cache_clean_interval * 1000);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
					 * Create the cache-clean timer of @bs in AioContext @context and arm it
					 * to fire cache_clean_interval seconds from now.
					 *
					 * Nothing is created when the interval is 0.
					 */
					static void cache_clean_timer_init(BlockDriverState *bs, AioContext *context)
					{
					    BDRVQcowState *s = bs->opaque;
					    int64_t first_run;

					    /* cache_clean_interval is unsigned, so "not > 0" means exactly 0 */
					    if (s->cache_clean_interval == 0) {
					        return;
					    }

					    s->cache_clean_timer = aio_timer_new(context, QEMU_CLOCK_VIRTUAL,
					                                         SCALE_MS, cache_clean_timer_cb,
					                                         bs);

					    /* Convert the interval from seconds to the timer's millisecond scale */
					    first_run = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
					                (int64_t) s->cache_clean_interval * 1000;
					    timer_mod(s->cache_clean_timer, first_run);
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
					 * Stop and free the cache-clean timer of @bs, if there is one, and clear
					 * the pointer so that a repeated call does nothing.
					 */
					static void cache_clean_timer_del(BlockDriverState *bs)
					{
					    BDRVQcowState *s = bs->opaque;
					    QEMUTimer *timer = s->cache_clean_timer;

					    if (!timer) {
					        return;
					    }

					    timer_del(timer);
					    timer_free(timer);
					    s->cache_clean_timer = NULL;
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
					 * .bdrv_detach_aio_context callback of the qcow2 driver: the cache-clean
					 * timer is deleted when @bs is detached from its AioContext.
					 */
					static void qcow2_detach_aio_context(BlockDriverState *bs)
					{
					    cache_clean_timer_del(bs);
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static void qcow2_attach_aio_context(BlockDriverState *bs,
 | 
				
			||||||
 | 
					                                     AioContext *new_context)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					    cache_clean_timer_init(bs, new_context);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts,
 | 
					static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts,
 | 
				
			||||||
                             uint64_t *l2_cache_size,
 | 
					                             uint64_t *l2_cache_size,
 | 
				
			||||||
                             uint64_t *refcount_cache_size, Error **errp)
 | 
					                             uint64_t *refcount_cache_size, Error **errp)
 | 
				
			||||||
@@ -555,6 +603,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
 | 
				
			|||||||
    const char *opt_overlap_check, *opt_overlap_check_template;
 | 
					    const char *opt_overlap_check, *opt_overlap_check_template;
 | 
				
			||||||
    int overlap_check_template = 0;
 | 
					    int overlap_check_template = 0;
 | 
				
			||||||
    uint64_t l2_cache_size, refcount_cache_size;
 | 
					    uint64_t l2_cache_size, refcount_cache_size;
 | 
				
			||||||
 | 
					    uint64_t cache_clean_interval;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
 | 
					    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
 | 
				
			||||||
    if (ret < 0) {
 | 
					    if (ret < 0) {
 | 
				
			||||||
@@ -848,6 +897,16 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
 | 
				
			|||||||
        goto fail;
 | 
					        goto fail;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    cache_clean_interval =
 | 
				
			||||||
 | 
					        qemu_opt_get_number(opts, QCOW2_OPT_CACHE_CLEAN_INTERVAL, 0);
 | 
				
			||||||
 | 
					    if (cache_clean_interval > UINT_MAX) {
 | 
				
			||||||
 | 
					        error_setg(errp, "Cache clean interval too big");
 | 
				
			||||||
 | 
					        ret = -EINVAL;
 | 
				
			||||||
 | 
					        goto fail;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    s->cache_clean_interval = cache_clean_interval;
 | 
				
			||||||
 | 
					    cache_clean_timer_init(bs, bdrv_get_aio_context(bs));
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    s->cluster_cache = g_malloc(s->cluster_size);
 | 
					    s->cluster_cache = g_malloc(s->cluster_size);
 | 
				
			||||||
    /* one more sector for decompressed data alignment */
 | 
					    /* one more sector for decompressed data alignment */
 | 
				
			||||||
    s->cluster_data = qemu_try_blockalign(bs->file, QCOW_MAX_CRYPT_CLUSTERS
 | 
					    s->cluster_data = qemu_try_blockalign(bs->file, QCOW_MAX_CRYPT_CLUSTERS
 | 
				
			||||||
@@ -1013,6 +1072,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
 | 
				
			|||||||
    qemu_vfree(s->l1_table);
 | 
					    qemu_vfree(s->l1_table);
 | 
				
			||||||
    /* else pre-write overlap checks in cache_destroy may crash */
 | 
					    /* else pre-write overlap checks in cache_destroy may crash */
 | 
				
			||||||
    s->l1_table = NULL;
 | 
					    s->l1_table = NULL;
 | 
				
			||||||
 | 
					    cache_clean_timer_del(bs);
 | 
				
			||||||
    if (s->l2_table_cache) {
 | 
					    if (s->l2_table_cache) {
 | 
				
			||||||
        qcow2_cache_destroy(bs, s->l2_table_cache);
 | 
					        qcow2_cache_destroy(bs, s->l2_table_cache);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
@@ -1471,6 +1531,7 @@ static void qcow2_close(BlockDriverState *bs)
 | 
				
			|||||||
        }
 | 
					        }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    cache_clean_timer_del(bs);
 | 
				
			||||||
    qcow2_cache_destroy(bs, s->l2_table_cache);
 | 
					    qcow2_cache_destroy(bs, s->l2_table_cache);
 | 
				
			||||||
    qcow2_cache_destroy(bs, s->refcount_block_cache);
 | 
					    qcow2_cache_destroy(bs, s->refcount_block_cache);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -2977,6 +3038,9 @@ BlockDriver bdrv_qcow2 = {
 | 
				
			|||||||
    .create_opts         = &qcow2_create_opts,
 | 
					    .create_opts         = &qcow2_create_opts,
 | 
				
			||||||
    .bdrv_check          = qcow2_check,
 | 
					    .bdrv_check          = qcow2_check,
 | 
				
			||||||
    .bdrv_amend_options  = qcow2_amend_options,
 | 
					    .bdrv_amend_options  = qcow2_amend_options,
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    .bdrv_detach_aio_context  = qcow2_detach_aio_context,
 | 
				
			||||||
 | 
					    .bdrv_attach_aio_context  = qcow2_attach_aio_context,
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static void bdrv_qcow2_init(void)
 | 
					static void bdrv_qcow2_init(void)
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -96,6 +96,7 @@
 | 
				
			|||||||
#define QCOW2_OPT_CACHE_SIZE "cache-size"
 | 
					#define QCOW2_OPT_CACHE_SIZE "cache-size"
 | 
				
			||||||
#define QCOW2_OPT_L2_CACHE_SIZE "l2-cache-size"
 | 
					#define QCOW2_OPT_L2_CACHE_SIZE "l2-cache-size"
 | 
				
			||||||
#define QCOW2_OPT_REFCOUNT_CACHE_SIZE "refcount-cache-size"
 | 
					#define QCOW2_OPT_REFCOUNT_CACHE_SIZE "refcount-cache-size"
 | 
				
			||||||
 | 
					#define QCOW2_OPT_CACHE_CLEAN_INTERVAL "cache-clean-interval"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
typedef struct QCowHeader {
 | 
					typedef struct QCowHeader {
 | 
				
			||||||
    uint32_t magic;
 | 
					    uint32_t magic;
 | 
				
			||||||
@@ -239,6 +240,8 @@ typedef struct BDRVQcowState {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    Qcow2Cache* l2_table_cache;
 | 
					    Qcow2Cache* l2_table_cache;
 | 
				
			||||||
    Qcow2Cache* refcount_block_cache;
 | 
					    Qcow2Cache* refcount_block_cache;
 | 
				
			||||||
 | 
					    QEMUTimer *cache_clean_timer;
 | 
				
			||||||
 | 
					    unsigned cache_clean_interval;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    uint8_t *cluster_cache;
 | 
					    uint8_t *cluster_cache;
 | 
				
			||||||
    uint8_t *cluster_data;
 | 
					    uint8_t *cluster_data;
 | 
				
			||||||
@@ -581,6 +584,7 @@ int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c,
 | 
				
			|||||||
    Qcow2Cache *dependency);
 | 
					    Qcow2Cache *dependency);
 | 
				
			||||||
void qcow2_cache_depends_on_flush(Qcow2Cache *c);
 | 
					void qcow2_cache_depends_on_flush(Qcow2Cache *c);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					void qcow2_cache_clean_unused(BlockDriverState *bs, Qcow2Cache *c);
 | 
				
			||||||
int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c);
 | 
					int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
 | 
					int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -889,6 +889,12 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
 | 
				
			|||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    s->threshold = qemu_opt_get_number(opts, QUORUM_OPT_VOTE_THRESHOLD, 0);
 | 
					    s->threshold = qemu_opt_get_number(opts, QUORUM_OPT_VOTE_THRESHOLD, 0);
 | 
				
			||||||
 | 
					    /* and validate it against s->num_children */
 | 
				
			||||||
 | 
					    ret = quorum_valid_threshold(s->threshold, s->num_children, &local_err);
 | 
				
			||||||
 | 
					    if (ret < 0) {
 | 
				
			||||||
 | 
					        goto exit;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    ret = parse_read_pattern(qemu_opt_get(opts, QUORUM_OPT_READ_PATTERN));
 | 
					    ret = parse_read_pattern(qemu_opt_get(opts, QUORUM_OPT_READ_PATTERN));
 | 
				
			||||||
    if (ret < 0) {
 | 
					    if (ret < 0) {
 | 
				
			||||||
        error_setg(&local_err, "Please set read-pattern as fifo or quorum");
 | 
					        error_setg(&local_err, "Please set read-pattern as fifo or quorum");
 | 
				
			||||||
@@ -897,12 +903,6 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
 | 
				
			|||||||
    s->read_pattern = ret;
 | 
					    s->read_pattern = ret;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) {
 | 
					    if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) {
 | 
				
			||||||
        /* and validate it against s->num_children */
 | 
					 | 
				
			||||||
        ret = quorum_valid_threshold(s->threshold, s->num_children, &local_err);
 | 
					 | 
				
			||||||
        if (ret < 0) {
 | 
					 | 
				
			||||||
            goto exit;
 | 
					 | 
				
			||||||
        }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        /* is the driver in blkverify mode */
 | 
					        /* is the driver in blkverify mode */
 | 
				
			||||||
        if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false) &&
 | 
					        if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false) &&
 | 
				
			||||||
            s->num_children == 2 && s->threshold == 2) {
 | 
					            s->num_children == 2 && s->threshold == 2) {
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										164
									
								
								docs/qcow2-cache.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										164
									
								
								docs/qcow2-cache.txt
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,164 @@
 | 
				
			|||||||
 | 
					qcow2 L2/refcount cache configuration
 | 
				
			||||||
 | 
					=====================================
 | 
				
			||||||
 | 
					Copyright (C) 2015 Igalia, S.L.
 | 
				
			||||||
 | 
					Author: Alberto Garcia <berto@igalia.com>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					This work is licensed under the terms of the GNU GPL, version 2 or
 | 
				
			||||||
 | 
					later. See the COPYING file in the top-level directory.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Introduction
 | 
				
			||||||
 | 
					------------
 | 
				
			||||||
 | 
					The QEMU qcow2 driver has two caches that can improve the I/O
 | 
				
			||||||
 | 
					performance significantly. However, setting the right cache sizes is
 | 
				
			||||||
 | 
					not a straightforward operation.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					This document attempts to give an overview of the L2 and refcount
 | 
				
			||||||
 | 
					caches, and how to configure them.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Please refer to the docs/specs/qcow2.txt file for an in-depth
 | 
				
			||||||
 | 
					technical description of the qcow2 file format.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Clusters
 | 
				
			||||||
 | 
					--------
 | 
				
			||||||
 | 
					A qcow2 file is organized in units of constant size called clusters.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					The cluster size is configurable, but it must be a power of two and
 | 
				
			||||||
 | 
					its value 512 bytes or higher. QEMU currently defaults to 64 KB
 | 
				
			||||||
 | 
					clusters, and it does not support sizes larger than 2MB.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					The 'qemu-img create' command supports specifying the size using the
 | 
				
			||||||
 | 
					cluster_size option:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					   qemu-img create -f qcow2 -o cluster_size=128K hd.qcow2 4G
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					The L2 tables
 | 
				
			||||||
 | 
					-------------
 | 
				
			||||||
 | 
					The qcow2 format uses a two-level structure to map the virtual disk as
 | 
				
			||||||
 | 
					seen by the guest to the disk image in the host. These structures are
 | 
				
			||||||
 | 
					called the L1 and L2 tables.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					There is one single L1 table per disk image. The table is small and is
 | 
				
			||||||
 | 
					always kept in memory.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					There can be many L2 tables, depending on how much space has been
 | 
				
			||||||
 | 
					allocated in the image. Each table is one cluster in size. In order to
 | 
				
			||||||
 | 
					read or write data from the virtual disk, QEMU needs to read its
 | 
				
			||||||
 | 
					corresponding L2 table to find out where that data is located. Since
 | 
				
			||||||
 | 
					reading the table for each I/O operation can be expensive, QEMU keeps
 | 
				
			||||||
 | 
					an L2 cache in memory to speed up disk access.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					The size of the L2 cache can be configured, and setting the right
 | 
				
			||||||
 | 
					value can improve the I/O performance significantly.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					The refcount blocks
 | 
				
			||||||
 | 
					-------------------
 | 
				
			||||||
 | 
					The qcow2 format also maintains a reference count for each cluster.
 | 
				
			||||||
 | 
					Reference counts are used for cluster allocation and internal
 | 
				
			||||||
 | 
					snapshots. The data is stored in a two-level structure similar to the
 | 
				
			||||||
 | 
					L1/L2 tables described above.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					The second-level structures are called refcount blocks. They are also one
 | 
				
			||||||
 | 
					cluster in size and the number is also variable and dependent on the
 | 
				
			||||||
 | 
					amount of allocated space.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Each block contains a number of refcount entries. Their size (in bits)
 | 
				
			||||||
 | 
					is a power of two and must not be higher than 64. It defaults to 16
 | 
				
			||||||
 | 
					bits, but a different value can be set using the refcount_bits option:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					   qemu-img create -f qcow2 -o refcount_bits=8 hd.qcow2 4G
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					QEMU keeps a refcount cache to speed up I/O much like the
 | 
				
			||||||
 | 
					aforementioned L2 cache, and its size can also be configured.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Choosing the right cache sizes
 | 
				
			||||||
 | 
					------------------------------
 | 
				
			||||||
 | 
					In order to choose the cache sizes we need to know how they relate to
 | 
				
			||||||
 | 
					the amount of allocated space.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					The amount of virtual disk that can be mapped by the L2 and refcount
 | 
				
			||||||
 | 
					caches (in bytes) is:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					   disk_size = l2_cache_size * cluster_size / 8
 | 
				
			||||||
 | 
					   disk_size = refcount_cache_size * cluster_size * 8 / refcount_bits
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					With the default values for cluster_size (64KB) and refcount_bits
 | 
				
			||||||
 | 
					(16), that is
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					   disk_size = l2_cache_size * 8192
 | 
				
			||||||
 | 
					   disk_size = refcount_cache_size * 32768
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					So in order to cover n GB of disk space with the default values we
 | 
				
			||||||
 | 
					need:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					   l2_cache_size = disk_size_GB * 131072
 | 
				
			||||||
 | 
					   refcount_cache_size = disk_size_GB * 32768
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					QEMU has a default L2 cache of 1MB (1048576 bytes) and a refcount
 | 
				
			||||||
 | 
					cache of 256KB (262144 bytes), so using the formulas we've just seen
 | 
				
			||||||
 | 
					we have
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					   1048576 / 131072 = 8 GB of virtual disk covered by that cache
 | 
				
			||||||
 | 
					    262144 /  32768 = 8 GB
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					How to configure the cache sizes
 | 
				
			||||||
 | 
					--------------------------------
 | 
				
			||||||
 | 
					Cache sizes can be configured using the -drive option in the
 | 
				
			||||||
 | 
					command-line, or the 'blockdev-add' QMP command.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					There are three options available, and all of them take bytes:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					"l2-cache-size":         maximum size of the L2 table cache
 | 
				
			||||||
 | 
					"refcount-cache-size":   maximum size of the refcount block cache
 | 
				
			||||||
 | 
					"cache-size":            maximum size of both caches combined
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					There are two things that need to be taken into account:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 - Both caches must have a size that is a multiple of the cluster
 | 
				
			||||||
 | 
					   size.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 - If you only set one of the options above, QEMU will automatically
 | 
				
			||||||
 | 
					   adjust the others so that the L2 cache is 4 times bigger than the
 | 
				
			||||||
 | 
					   refcount cache.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					This means that these options are equivalent:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					   -drive file=hd.qcow2,l2-cache-size=2097152
 | 
				
			||||||
 | 
					   -drive file=hd.qcow2,refcount-cache-size=524288
 | 
				
			||||||
 | 
					   -drive file=hd.qcow2,cache-size=2621440
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					The reason for this 1/4 ratio is to ensure that both caches cover the
 | 
				
			||||||
 | 
					same amount of disk space. Note however that this is only valid with
 | 
				
			||||||
 | 
					the default value of refcount_bits (16). If you are using a different
 | 
				
			||||||
 | 
					value you might want to calculate both cache sizes yourself since QEMU
 | 
				
			||||||
 | 
					will always use the same 1/4 ratio.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					It's also worth mentioning that there's no strict need for both caches
 | 
				
			||||||
 | 
					to cover the same amount of disk space. The refcount cache is used
 | 
				
			||||||
 | 
					much less often than the L2 cache, so it's perfectly reasonable to
 | 
				
			||||||
 | 
					keep it small.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Reducing the memory usage
 | 
				
			||||||
 | 
					-------------------------
 | 
				
			||||||
 | 
					It is possible to clean unused cache entries in order to reduce the
 | 
				
			||||||
 | 
					memory usage during periods of low I/O activity.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					The parameter "cache-clean-interval" defines an interval (in seconds).
 | 
				
			||||||
 | 
					All cache entries that haven't been accessed during that interval are
 | 
				
			||||||
 | 
					removed from memory.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					This example removes all unused cache entries every 15 minutes:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					   -drive file=hd.qcow2,cache-clean-interval=900
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					If unset, the default value for this parameter is 0 and it disables
 | 
				
			||||||
 | 
					this feature.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Note that this functionality currently relies on the MADV_DONTNEED
 | 
				
			||||||
 | 
					argument for madvise() to actually free the memory, so it is not
 | 
				
			||||||
 | 
					useful on systems that don't follow that behavior.
 | 
				
			||||||
@@ -1592,6 +1592,10 @@
 | 
				
			|||||||
# @refcount-cache-size:   #optional the maximum size of the refcount block cache
 | 
					# @refcount-cache-size:   #optional the maximum size of the refcount block cache
 | 
				
			||||||
#                         in bytes (since 2.2)
 | 
					#                         in bytes (since 2.2)
 | 
				
			||||||
#
 | 
					#
 | 
				
			||||||
 | 
					# @cache-clean-interval:  #optional clean unused entries in the L2 and refcount
 | 
				
			||||||
 | 
					#                         caches. The interval is in seconds. The default value
 | 
				
			||||||
 | 
					#                         is 0 and it disables this feature (since 2.5)
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
# Since: 1.7
 | 
					# Since: 1.7
 | 
				
			||||||
##
 | 
					##
 | 
				
			||||||
{ 'struct': 'BlockdevOptionsQcow2',
 | 
					{ 'struct': 'BlockdevOptionsQcow2',
 | 
				
			||||||
@@ -1603,7 +1607,8 @@
 | 
				
			|||||||
            '*overlap-check': 'Qcow2OverlapChecks',
 | 
					            '*overlap-check': 'Qcow2OverlapChecks',
 | 
				
			||||||
            '*cache-size': 'int',
 | 
					            '*cache-size': 'int',
 | 
				
			||||||
            '*l2-cache-size': 'int',
 | 
					            '*l2-cache-size': 'int',
 | 
				
			||||||
            '*refcount-cache-size': 'int' } }
 | 
					            '*refcount-cache-size': 'int',
 | 
				
			||||||
 | 
					            '*cache-clean-interval': 'int' } }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
##
 | 
					##
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user