qemu/backends/hostmem-memfd-private.c

/*
 * QEMU host private memfd memory backend
 *
 * Copyright (C) 2021 Intel Corporation
 *
 * Authors:
 *   Chao Peng <chao.p.peng@linux.intel.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */
#include "qemu/osdep.h"

/*
 * RamDiscardManager in the private memfd backend:
 *
 * RamDiscardManager was first introduced for special RAM memory regions
 * (managed by virtio-mem). It coordinates which parts of a memory region
 * are currently populated for use by the VM, notifying listeners after
 * parts are discarded and before parts are populated. VFIO needs this
 * because it registers the whole region with the kernel and pins all
 * pages, which is incompatible with discarding RAM; with
 * RamDiscardManager, VFIO maps only the currently populated parts and
 * keeps parts that are expected to remain discarded from silently being
 * populated and consuming memory.
 *
 * The same interface applies to the private memfd backend, which backs
 * two types of memory: shared and private. Shared memory is accessible
 * to the host VMM and can be mapped into IOMMU page tables; private
 * memory is inaccessible to the host and cannot be DMA-mapped. Memory
 * regions backed by a private memfd therefore coordinate to map only the
 * shared parts and to update the mapping when notified about page
 * attribute conversions between shared and private.
 *
 * This backend implements the RamDiscardManager interface and reuses the
 * existing VFIORamDiscardListener (registered in
 * vfio_register_ram_discard_listener()). Users call
 * priv_memfd_backend_state_change() to DMA-map the shared parts and
 * unmap the private parts.
 *
 * Implementation notes:
 * - VFIO's population listener DMA-maps at the minimum granularity
 *   (block_size, as in virtio-mem). The private memfd block_size is the
 *   host page size, because a conversion can be as small as one page.
 * - VFIO performs each map/unmap at this minimal granularity, which
 *   avoids partial-unmap requests from a TD guest.
 *
 * Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com>
 */

#include "qemu/error-report.h"
#include "sysemu/hostmem-memfd-private.h"
#include "qom/object_interfaces.h"
#include "qemu/memfd.h"
#include "qemu/module.h"
#include "qapi/error.h"
#include "qom/object.h"
static void
priv_memfd_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
{
    HostMemoryBackendPrivateMemfd *m = MEMORY_BACKEND_MEMFD_PRIVATE(backend);
    uint32_t ram_flags;
    char *name;
    int fd, priv_fd;

    if (!backend->size) {
        error_setg(errp, "can't create backend with size 0");
        return;
    }

    fd = qemu_memfd_create("memory-backend-memfd-shared", backend->size,
                           m->hugetlb, m->hugetlbsize, 0, errp);
    if (fd == -1) {
        return;
    }

    priv_fd = qemu_memfd_restricted(backend->size, 0, errp);
    if (priv_fd == -1) {
        close(fd); /* don't leak the shared memfd on failure */
        return;
    }

    name = host_memory_backend_get_name(backend);
    ram_flags = backend->share ? RAM_SHARED : 0;
    ram_flags |= backend->reserve ? 0 : RAM_NORESERVE;
    memory_region_init_ram_from_fd(&backend->mr, OBJECT(backend), name,
                                   backend->size, ram_flags, fd, 0, errp);
    g_free(name);

    memory_region_set_restricted_fd(&backend->mr, priv_fd);
    memory_region_set_ram_discard_manager(&backend->mr,
                                          RAM_DISCARD_MANAGER(m));
    ram_block_alloc_cgs_bitmap(backend->mr.ram_block);
}

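/*
 * Note: allocation leaves the backend with two fds: an ordinary memfd
 * (fd) backing the host-mappable shared view of the region, and a
 * restricted memfd (priv_fd) holding private pages that the host cannot
 * mmap; the restricted fd is only attached to the region via
 * memory_region_set_restricted_fd(), never mapped.
 */
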
static bool
priv_memfd_backend_get_hugetlb(Object *o, Error **errp)
{
    return MEMORY_BACKEND_MEMFD_PRIVATE(o)->hugetlb;
}

static void
priv_memfd_backend_set_hugetlb(Object *o, bool value, Error **errp)
{
    MEMORY_BACKEND_MEMFD_PRIVATE(o)->hugetlb = value;
}

static void
priv_memfd_backend_set_hugetlbsize(Object *obj, Visitor *v, const char *name,
                                   void *opaque, Error **errp)
{
    HostMemoryBackendPrivateMemfd *m = MEMORY_BACKEND_MEMFD_PRIVATE(obj);
    uint64_t value;

    if (host_memory_backend_mr_inited(MEMORY_BACKEND(obj))) {
        error_setg(errp, "cannot change property value");
        return;
    }

    if (!visit_type_size(v, name, &value, errp)) {
        return;
    }
    if (!value) {
        error_setg(errp, "Property '%s.%s' doesn't take value '%" PRIu64 "'",
                   object_get_typename(obj), name, value);
        return;
    }
    m->hugetlbsize = value;
}

static void
priv_memfd_backend_get_hugetlbsize(Object *obj, Visitor *v, const char *name,
                                   void *opaque, Error **errp)
{
    HostMemoryBackendPrivateMemfd *m = MEMORY_BACKEND_MEMFD_PRIVATE(obj);
    uint64_t value = m->hugetlbsize;

    visit_type_size(v, name, &value, errp);
}

static void
priv_memfd_backend_instance_init(Object *obj)
{
    HostMemoryBackendPrivateMemfd *m = MEMORY_BACKEND_MEMFD_PRIVATE(obj);

    m->block_size = qemu_real_host_page_size();
    QLIST_INIT(&m->rdl_list);

    MEMORY_BACKEND(obj)->reserve = false;
}

/*
 * Adjust the memory section to cover the intersection with the given range.
 *
 * Returns false if the intersection is empty, otherwise returns true.
 */
static bool
priv_memfd_backend_intersect_memory_section(MemoryRegionSection *s,
                                            uint64_t offset, uint64_t size)
{
    uint64_t start = MAX(s->offset_within_region, offset);
    uint64_t end = MIN(s->offset_within_region + int128_get64(s->size),
                       offset + size);

    if (end <= start) {
        return false;
    }

    s->offset_within_address_space += start - s->offset_within_region;
    s->offset_within_region = start;
    s->size = int128_make64(end - start);
    return true;
}

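/*
 * Worked example with hypothetical numbers: a listener section covering
 * region offsets [0x2000, 0x6000) intersected with a converted range at
 * offset 0x4000, size 0x4000 gives start = 0x4000 and end = 0x6000; the
 * section is clipped to [0x4000, 0x6000), offset_within_address_space
 * advances by 0x2000, and the function returns true. A range lying
 * entirely outside the section (say offset 0x8000) yields end <= start,
 * so the function returns false and the listener is skipped.
 */
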
static uint64_t
priv_memfd_backend_rdm_get_min_granularity(const RamDiscardManager *rdm,
                                           const MemoryRegion *mr)
{
    HostMemoryBackendPrivateMemfd *m = MEMORY_BACKEND_MEMFD_PRIVATE(rdm);
    HostMemoryBackend *backend = MEMORY_BACKEND(rdm);

    g_assert(mr == &backend->mr);
    return m->block_size;
}

static bool
priv_memfd_backend_valid_range(const HostMemoryBackendPrivateMemfd *memfd,
                               uint64_t offset, uint64_t size)
{
    MemoryRegion *mr = &(MEMORY_BACKEND(memfd)->mr);
    uint64_t region_size = memory_region_size(mr);

    if (!QEMU_IS_ALIGNED(offset, memfd->block_size)) {
        return false;
    }
    if (offset + size < offset || !size) {
        return false;
    }
    if (offset >= region_size || offset + size > region_size) {
        return false;
    }
    return true;
}

static void priv_memfd_backend_rdm_register_listener(RamDiscardManager *rdm,
                                                     RamDiscardListener *rdl,
                                                     MemoryRegionSection *s)
{
    HostMemoryBackendPrivateMemfd *m = MEMORY_BACKEND_MEMFD_PRIVATE(rdm);
    HostMemoryBackend *backend = MEMORY_BACKEND(rdm);

    g_assert(s->mr == &backend->mr);
    rdl->section = memory_region_section_new_copy(s);
    QLIST_INSERT_HEAD(&m->rdl_list, rdl, next);
}

static void priv_memfd_backend_rdm_unregister_listener(RamDiscardManager *rdm,
                                                       RamDiscardListener *rdl)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(rdm);

    g_assert(rdl->section->mr == &backend->mr);
    memory_region_section_free_copy(rdl->section);
    rdl->section = NULL;
    QLIST_REMOVE(rdl, next);
}

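/*
 * Registration sketch for a consumer of this interface (VFIO does the
 * equivalent inside vfio_register_ram_discard_listener()). my_populate,
 * my_discard and section are hypothetical; ram_discard_listener_init()
 * and ram_discard_manager_register_listener() are assumed to be the
 * generic RamDiscardManager helpers available in this QEMU tree:
 *
 *   RamDiscardManager *rdm =
 *       memory_region_get_ram_discard_manager(section->mr);
 *
 *   ram_discard_listener_init(&rdl, my_populate, my_discard, true);
 *   ram_discard_manager_register_listener(rdm, &rdl, section);
 */
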
static void
priv_memfd_backend_notify_discard(HostMemoryBackendPrivateMemfd *memfd,
                                  uint64_t offset, uint64_t size)
{
    RamDiscardListener *rdl;

    QLIST_FOREACH(rdl, &memfd->rdl_list, next) {
        MemoryRegionSection tmp = *rdl->section;

        if (!priv_memfd_backend_intersect_memory_section(&tmp, offset, size)) {
            continue;
        }
        rdl->notify_discard(rdl, &tmp);
    }
}

static int
priv_memfd_backend_notify_populate(HostMemoryBackendPrivateMemfd *memfd,
                                   uint64_t offset, uint64_t size)
{
    RamDiscardListener *rdl, *rdl2;
    int ret = 0;

    QLIST_FOREACH(rdl, &memfd->rdl_list, next) {
        MemoryRegionSection tmp = *rdl->section;

        if (!priv_memfd_backend_intersect_memory_section(&tmp, offset, size)) {
            continue;
        }
        ret = rdl->notify_populate(rdl, &tmp);
        if (ret) {
            break;
        }
    }

    if (ret) {
        /* Roll back: notify discard on all already-populated listeners. */
        QLIST_FOREACH(rdl2, &memfd->rdl_list, next) {
            MemoryRegionSection tmp = *rdl2->section;

            if (rdl2 == rdl) {
                break;
            }
            if (!priv_memfd_backend_intersect_memory_section(&tmp, offset,
                                                             size)) {
                continue;
            }
            rdl2->notify_discard(rdl2, &tmp);
        }
    }
    return ret;
}

static int
priv_memfd_backend_state_change_notify(HostMemoryBackendPrivateMemfd *memfd,
                                       uint64_t offset, uint64_t size,
                                       bool shared)
{
    int ret = 0;

    if (!QEMU_IS_ALIGNED(offset, memfd->block_size) ||
        !QEMU_IS_ALIGNED(size, memfd->block_size)) {
        return -1;
    }

    if (!shared) {
        priv_memfd_backend_notify_discard(memfd, offset, size);
    } else {
        ret = priv_memfd_backend_notify_populate(memfd, offset, size);
    }
    return ret;
}

int priv_memfd_backend_state_change(HostMemoryBackendPrivateMemfd *memfd,
                                    uint64_t offset, uint64_t size,
                                    bool shared)
{
    if (!priv_memfd_backend_valid_range(memfd, offset, size)) {
        error_report("%s: invalid range: offset 0x%" PRIx64 ", size 0x%" PRIx64,
                     __func__, offset, size);
        return -1;
    }

    return priv_memfd_backend_state_change_notify(memfd, offset, size, shared);
}

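/*
 * Caller sketch (hypothetical): a handler for a guest-initiated page
 * attribute conversion would update the attributes and then propagate
 * the change so listeners (e.g. VFIO) remap the IOMMU. offset and size
 * must be block_size (host-page) aligned or the notify step fails:
 *
 *   bool to_shared = ...;  // true: convert to shared, DMA-map the range
 *   if (priv_memfd_backend_state_change(memfd, offset, size, to_shared)) {
 *       error_report("failed to notify page conversion");
 *   }
 */
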
static bool priv_memfd_backend_rdm_is_populated(const RamDiscardManager *rdm,
                                                const MemoryRegionSection *s)
{
    /* TODO: TD live migration with pass-thru device */
    return false;
}

static int priv_memfd_backend_replay_populated(const RamDiscardManager *rdm,
                                               MemoryRegionSection *s,
                                               ReplayRamPopulate replay_fn,
                                               void *opaque)
{
    /* TODO: TD live migration with pass-thru device */
    return 0;
}

static void priv_memfd_backend_replay_discarded(const RamDiscardManager *rdm,
                                                MemoryRegionSection *s,
                                                ReplayRamDiscard replay_fn,
                                                void *opaque)
{
    /* TODO: TD live migration with pass-thru device */
}

static void
priv_memfd_backend_class_init(ObjectClass *oc, void *data)
{
    HostMemoryBackendClass *bc = MEMORY_BACKEND_CLASS(oc);
    RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_CLASS(oc);

    bc->alloc = priv_memfd_backend_memory_alloc;

    if (qemu_memfd_check(MFD_HUGETLB)) {
        object_class_property_add_bool(oc, "hugetlb",
                                       priv_memfd_backend_get_hugetlb,
                                       priv_memfd_backend_set_hugetlb);
        object_class_property_set_description(oc, "hugetlb",
                                              "Use huge pages");
        object_class_property_add(oc, "hugetlbsize", "int",
                                  priv_memfd_backend_get_hugetlbsize,
                                  priv_memfd_backend_set_hugetlbsize,
                                  NULL, NULL);
        object_class_property_set_description(oc, "hugetlbsize",
                                              "Huge pages size (ex: 2M, 1G)");
    }

    rdmc->get_min_granularity = priv_memfd_backend_rdm_get_min_granularity;
    rdmc->register_listener = priv_memfd_backend_rdm_register_listener;
    rdmc->unregister_listener = priv_memfd_backend_rdm_unregister_listener;
    rdmc->is_populated = priv_memfd_backend_rdm_is_populated;
    rdmc->replay_populated = priv_memfd_backend_replay_populated;
    rdmc->replay_discarded = priv_memfd_backend_replay_discarded;
}

static const TypeInfo priv_memfd_backend_info = {
    .name = TYPE_MEMORY_BACKEND_MEMFD_PRIVATE,
    .parent = TYPE_MEMORY_BACKEND,
    .instance_init = priv_memfd_backend_instance_init,
    .class_init = priv_memfd_backend_class_init,
    .instance_size = sizeof(HostMemoryBackendPrivateMemfd),
    .interfaces = (InterfaceInfo[]) {
        { TYPE_RAM_DISCARD_MANAGER },
        { }
    },
};

static void register_types(void)
{
    if (qemu_memfd_check(MFD_ALLOW_SEALING)) {
        type_register_static(&priv_memfd_backend_info);
    }
}

type_init(register_types);