2007-03-13 Gwenole Beauchesne * Merge in KVM rev 4486. Requires kernel 2.6.17 >= 12mdv. ================================================================================ --- qemu-0.9.0/Makefile.target +++ qemu-0.9.0/Makefile.target @@ -1,5 +1,9 @@ +CFLAGS= +LDFLAGS= + include config.mak +LDFLAGS_BASE:=$(LDFLAGS) TARGET_BASE_ARCH:=$(TARGET_ARCH) ifeq ($(TARGET_ARCH), x86_64) TARGET_BASE_ARCH:=i386 @@ -227,8 +231,8 @@ OBJS+= libqemu.a # cpu emulator library -LIBOBJS=exec.o kqemu.o translate-op.o translate-all.o cpu-exec.o\ - translate.o op.o +LIBOBJS=exec.o kqemu.o qemu-kvm.o translate-op.o translate-all.o cpu-exec.o\ + translate.o op.o ifdef CONFIG_SOFTFLOAT LIBOBJS+=fpu/softfloat.o else @@ -365,6 +369,13 @@ # PCI network cards VL_OBJS+= ne2000.o rtl8139.o pcnet.o +# KVM layer +ifeq ($(USE_KVM), yes) +VL_OBJS+= kvmctl.o +# PCI Hypercall +VL_OBJS+= hypercall.o +endif + ifeq ($(TARGET_BASE_ARCH), i386) # Hardware support VL_OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o $(AUDIODRV) @@ -429,7 +440,7 @@ VL_OBJS+=$(addprefix slirp/, $(SLIRP_OBJS)) endif -VL_LDFLAGS= +VL_LDFLAGS=$(LDFLAGS_BASE) # specific flags are needed for non soft mmu emulator ifdef CONFIG_STATIC VL_LDFLAGS+=-static @@ -440,7 +451,7 @@ ifndef CONFIG_DARWIN ifndef CONFIG_WIN32 ifndef CONFIG_SOLARIS -VL_LIBS=-lutil -lrt +VL_LIBS=-lutil -lrt -luuid endif endif endif @@ -462,7 +473,7 @@ SDL_LIBS := $(filter-out -mwindows, $(SDL_LIBS)) -mconsole endif -$(QEMU_SYSTEM): $(VL_OBJS) libqemu.a +$(QEMU_SYSTEM): $(VL_OBJS) libqemu.a $(DEPLIBS) $(CC) $(VL_LDFLAGS) -o $@ $^ $(LIBS) $(SDL_LIBS) $(COCOA_LIBS) $(VL_LIBS) cocoa.o: cocoa.m @@ -521,6 +532,9 @@ cpu-exec.o: cpu-exec.c $(CC) $(HELPER_CFLAGS) $(CPPFLAGS) $(BASE_CFLAGS) -c -o $@ $< +qemu-kvm.o: qemu-kvm.c + $(CC) $(HELPER_CFLAGS) $(CPPFLAGS) $(BASE_CFLAGS) -c -o $@ $< + # Note: this is a workaround. The real fix is to avoid compiling # cpu_signal_handler() in cpu-exec.c. 
signal.o: signal.c --- qemu-0.9.0/configure +++ qemu-0.9.0/configure @@ -89,7 +89,9 @@ bsd="no" linux="no" kqemu="no" +kvm="no" profiler="no" +kernel_path="" cocoa="no" check_gfx="yes" check_gcc="yes" @@ -114,6 +116,7 @@ oss="yes" if [ "$cpu" = "i386" -o "$cpu" = "x86_64" ] ; then kqemu="yes" + kvm="yes" fi ;; NetBSD) @@ -141,6 +144,7 @@ linux_user="yes" if [ "$cpu" = "i386" -o "$cpu" = "x86_64" ] ; then kqemu="yes" + kvm="yes" fi ;; esac @@ -232,8 +236,12 @@ ;; --disable-kqemu) kqemu="no" ;; + --enable-kvm) kvm="yes" + ;; --enable-profiler) profiler="yes" ;; + --kernel-path=*) kernel_path="$optarg" + ;; --enable-cocoa) cocoa="yes" ; coreaudio="yes" ; sdl="no" ;; --disable-gfx-check) check_gfx="no" @@ -277,6 +285,8 @@ echo "" echo "kqemu kernel acceleration support:" echo " --disable-kqemu disable kqemu support" +echo " --kernel-path=PATH set the kernel path (configure probes it)" +echo " --enable-kvm enable kernel virtual machine support" echo "" echo "Advanced options (experts only):" echo " --source-path=PATH path of source code [$source_path]" @@ -623,6 +633,7 @@ fi echo "FMOD support $fmod $fmod_support" echo "kqemu support $kqemu" +echo "kvm support $kvm" echo "Documentation $build_docs" [ ! 
-z "$uname_release" ] && \ echo "uname -r $uname_release" @@ -857,6 +868,13 @@ interp_prefix1=`echo "$interp_prefix" | sed "s/%M/$target_cpu/g"` echo "#define CONFIG_QEMU_PREFIX \"$interp_prefix1\"" >> $config_h +configure_kvm() { + if test $kvm = "yes" -a "$target_softmmu" = "yes" -a $cpu = "$target_cpu" ; then + echo "#define USE_KVM 1" >> $config_h + echo "USE_KVM=yes" >> $config_mak + fi +} + if test "$target_cpu" = "i386" ; then echo "TARGET_ARCH=i386" >> $config_mak echo "#define TARGET_ARCH \"i386\"" >> $config_h @@ -864,6 +882,7 @@ if test $kqemu = "yes" -a "$target_softmmu" = "yes" -a $cpu = "i386" ; then echo "#define USE_KQEMU 1" >> $config_h fi + configure_kvm elif test "$target_cpu" = "arm" -o "$target_cpu" = "armeb" ; then echo "TARGET_ARCH=arm" >> $config_mak echo "#define TARGET_ARCH \"arm\"" >> $config_h @@ -895,6 +914,7 @@ if test $kqemu = "yes" -a "$target_softmmu" = "yes" -a $cpu = "x86_64" ; then echo "#define USE_KQEMU 1" >> $config_h fi + configure_kvm elif test "$target_cpu" = "mips" -o "$target_cpu" = "mipsel" ; then echo "TARGET_ARCH=mips" >> $config_mak echo "#define TARGET_ARCH \"mips\"" >> $config_h --- qemu-0.9.0/cpu-all.h +++ qemu-0.9.0/cpu-all.h @@ -834,6 +834,7 @@ extern int phys_ram_fd; extern uint8_t *phys_ram_base; extern uint8_t *phys_ram_dirty; +extern uint8_t *bios_mem; /* physical memory access */ #define TLB_INVALID_MASK (1 << 3) --- qemu-0.9.0/cpu-exec.c +++ qemu-0.9.0/cpu-exec.c @@ -35,6 +35,11 @@ #include #endif +#ifdef USE_KVM +#include "qemu-kvm.h" +extern int kvm_allowed; +#endif + int tb_invalidated_flag; //#define DEBUG_EXEC @@ -401,6 +406,12 @@ } #endif +#ifdef USE_KVM + if (kvm_allowed) { + kvm_cpu_exec(env); + longjmp(env->jmp_env, 1); + } +#endif T0 = 0; /* force lookup of first TB */ for(;;) { #if defined(__sparc__) && !defined(HOST_SOLARIS) --- qemu-0.9.0/exec.c +++ qemu-0.9.0/exec.c @@ -69,6 +69,10 @@ #define TARGET_PHYS_ADDR_SPACE_BITS 32 #endif +#ifdef USE_KVM +extern int kvm_allowed; +#endif + 
TranslationBlock tbs[CODE_GEN_MAX_BLOCKS]; TranslationBlock *tb_phys_hash[CODE_GEN_PHYS_HASH_SIZE]; int nb_tbs; @@ -82,6 +86,7 @@ int phys_ram_fd; uint8_t *phys_ram_base; uint8_t *phys_ram_dirty; +uint8_t *bios_mem; static int in_migration; CPUState *first_cpu; @@ -1044,6 +1049,11 @@ if (env->nb_breakpoints >= MAX_BREAKPOINTS) return -1; env->breakpoints[env->nb_breakpoints++] = pc; + +#ifdef USE_KVM + if (kvm_allowed) + kvm_update_debugger(env); +#endif breakpoint_invalidate(env, pc); return 0; @@ -1067,6 +1077,11 @@ if (i < env->nb_breakpoints) env->breakpoints[i] = env->breakpoints[env->nb_breakpoints]; +#ifdef USE_KVM + if (kvm_allowed) + kvm_update_debugger(env); +#endif + breakpoint_invalidate(env, pc); return 0; #else @@ -1085,6 +1100,10 @@ /* XXX: only flush what is necessary */ tb_flush(env); } +#ifdef USE_KVM + if (kvm_allowed) + kvm_update_debugger(env); +#endif #endif } @@ -1425,6 +1444,9 @@ { int r=0; +#ifdef USE_KVM + r = kvm_physical_memory_set_dirty_tracking(enable); +#endif in_migration = enable; return r; } --- qemu-0.9.0/hw/cirrus_vga.c +++ qemu-0.9.0/hw/cirrus_vga.c @@ -28,6 +28,9 @@ */ #include "vl.h" #include "vga_int.h" +#ifndef _WIN32 +#include +#endif /* * TODO: @@ -231,6 +234,10 @@ int cirrus_linear_io_addr; int cirrus_linear_bitblt_io_addr; int cirrus_mmio_io_addr; +#ifdef USE_KVM + unsigned long cirrus_lfb_addr; + unsigned long cirrus_lfb_end; +#endif uint32_t cirrus_addr_mask; uint32_t linear_mmio_mask; uint8_t cirrus_shadow_gr0; @@ -267,6 +274,10 @@ int last_hw_cursor_y_end; int real_vram_size; /* XXX: suppress that */ CPUWriteMemoryFunc **cirrus_linear_write; +#ifdef USE_KVM + unsigned long map_addr; + unsigned long map_end; +#endif } CirrusVGAState; typedef struct PCICirrusVGAState { @@ -2525,6 +2536,48 @@ cirrus_linear_bitblt_writel, }; +#ifdef USE_KVM + +#include "qemu-kvm.h" + +extern kvm_context_t kvm_context; + +static void *set_vram_mapping(unsigned long begin, unsigned long end) +{ + void *vram_pointer = NULL; + + /* align 
begin and end address */ + begin = begin & TARGET_PAGE_MASK; + end = begin + VGA_RAM_SIZE; + end = (end + TARGET_PAGE_SIZE -1 ) & TARGET_PAGE_MASK; + + vram_pointer = kvm_create_phys_mem(kvm_context, begin, end - begin, 1, + 1, 1); + + if (vram_pointer == NULL) { + printf("set_vram_mapping: cannot allocate memory: %m\n"); + return NULL; + } + + memset(vram_pointer, 0, end - begin); + + return vram_pointer; +} + +static int unset_vram_mapping(unsigned long begin, unsigned long end) +{ + /* align begin and end address */ + end = begin + VGA_RAM_SIZE; + begin = begin & TARGET_PAGE_MASK; + end = (end + TARGET_PAGE_SIZE -1 ) & TARGET_PAGE_MASK; + + kvm_destroy_phys_mem(kvm_context, begin, end - begin); + + return 0; +} + +#endif + /* Compute the memory access functions */ static void cirrus_update_memory_access(CirrusVGAState *s) { @@ -2543,11 +2596,45 @@ mode = s->gr[0x05] & 0x7; if (mode < 4 || mode > 5 || ((s->gr[0x0B] & 0x4) == 0)) { +#ifdef USE_KVM + if (kvm_allowed && s->cirrus_lfb_addr && s->cirrus_lfb_end && + !s->map_addr) { + void *vram_pointer, *old_vram; + + vram_pointer = set_vram_mapping(s->cirrus_lfb_addr, + s->cirrus_lfb_end); + if (!vram_pointer) + fprintf(stderr, "NULL vram_pointer\n"); + else { + old_vram = vga_update_vram((VGAState *)s, vram_pointer, + VGA_RAM_SIZE); + qemu_free(old_vram); + } + s->map_addr = s->cirrus_lfb_addr; + s->map_end = s->cirrus_lfb_end; + } +#endif s->cirrus_linear_write[0] = cirrus_linear_mem_writeb; s->cirrus_linear_write[1] = cirrus_linear_mem_writew; s->cirrus_linear_write[2] = cirrus_linear_mem_writel; } else { generic_io: +#ifdef USE_KVM + if (kvm_allowed && s->cirrus_lfb_addr && s->cirrus_lfb_end && + s->map_addr) { + int error; + void *old_vram = NULL; + + error = unset_vram_mapping(s->cirrus_lfb_addr, + s->cirrus_lfb_end); + if (!error) + old_vram = vga_update_vram((VGAState *)s, NULL, + VGA_RAM_SIZE); + if (old_vram) + munmap(old_vram, s->map_addr - s->map_end); + s->map_addr = s->map_end = 0; + } +#endif 
s->cirrus_linear_write[0] = cirrus_linear_writeb; s->cirrus_linear_write[1] = cirrus_linear_writew; s->cirrus_linear_write[2] = cirrus_linear_writel; @@ -2946,6 +3033,13 @@ qemu_put_be32s(f, &s->hw_cursor_y); /* XXX: we do not save the bitblt state - we assume we do not save the state when the blitter is active */ + +#ifdef USE_KVM + if (kvm_allowed) { /* XXX: KVM images ought to be loadable in QEMU */ + qemu_put_be32s(f, &s->real_vram_size); + qemu_put_buffer(f, s->vram_ptr, s->real_vram_size); + } +#endif } static int cirrus_vga_load(QEMUFile *f, void *opaque, int version_id) @@ -2996,6 +3090,22 @@ qemu_get_be32s(f, &s->hw_cursor_x); qemu_get_be32s(f, &s->hw_cursor_y); +#ifdef USE_KVM + if (kvm_allowed) { + int real_vram_size; + qemu_get_be32s(f, &real_vram_size); + if (real_vram_size != s->real_vram_size) { + if (real_vram_size > s->real_vram_size) + real_vram_size = s->real_vram_size; + printf("%s: REAL_VRAM_SIZE MISMATCH !!!!!! SAVED=%d CURRENT=%d", + __FUNCTION__, real_vram_size, s->real_vram_size); + } + qemu_get_buffer(f, s->vram_ptr, real_vram_size); + cirrus_update_memory_access(s); + } +#endif + + /* force refresh */ s->graphic_mode = -1; cirrus_update_bank_ptr(s, 0); @@ -3151,6 +3261,17 @@ /* XXX: add byte swapping apertures */ cpu_register_physical_memory(addr, s->vram_size, s->cirrus_linear_io_addr); +#ifdef USE_KVM + if (kvm_allowed) { + s->cirrus_lfb_addr = addr; + s->cirrus_lfb_end = addr + VGA_RAM_SIZE; + + if (s->map_addr && (s->cirrus_lfb_addr != s->map_addr) && + (s->cirrus_lfb_end != s->map_end)) + printf("cirrus vga map change while on lfb mode\n"); + } +#endif + cpu_register_physical_memory(addr + 0x1000000, 0x400000, s->cirrus_linear_bitblt_io_addr); } --- qemu-0.9.0/hw/hypercall.c +++ qemu-0.9.0/hw/hypercall.c @@ -0,0 +1,302 @@ +/* + * QEMU-KVM Hypercall emulation + * + * Copyright (c) 2003-2004 Fabrice Bellard + * Copyright (c) 2006 Qumranet + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this 
software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "vl.h" +#include "hypercall.h" +#include + +int use_hypercall_dev = 0; + +typedef struct VmChannelCharDriverState { + CharDriverState *vmchannel_hd; + uint32_t deviceid; +} VmChannelCharDriverState; + +static VmChannelCharDriverState vmchannel_hds[MAX_VMCHANNEL_DEVICES]; + +typedef struct HypercallState { + uint32_t hcr; + uint32_t hsr; + uint32_t txsize; + uint32_t txbuff; + uint32_t rxsize; + uint8_t RxBuff[HP_MEM_SIZE]; + uint8_t txbufferaccu[HP_MEM_SIZE]; + int txbufferaccu_offset; + int irq; + PCIDevice *pci_dev; + uint32_t index; +} HypercallState; + +HypercallState *pHypercallStates[MAX_VMCHANNEL_DEVICES] = {NULL}; + +//#define HYPERCALL_DEBUG 1 + +static void hp_reset(HypercallState *s) +{ + s->hcr = 0; + s->hsr = 0; + s->txsize = 0; + s->txbuff = 0; + s->rxsize= 0; + s->txbufferaccu_offset = 0; +} + +static void hypercall_update_irq(HypercallState *s); + + +static void hp_ioport_write(void *opaque, uint32_t addr, uint32_t val) +{ + HypercallState *s = opaque; + 
+#ifdef HYPERCALL_DEBUG + printf("%s: addr=0x%x, val=0x%x\n", __FUNCTION__, addr, val); +#endif + addr &= 0xff; + + switch(addr) + { + case HCR_REGISTER: + { + s->hcr = val; + if (s->hcr & HCR_DI) + hypercall_update_irq(s); + if (val & HCR_GRS){ + hp_reset(s); + } + break; + } + + case HP_TXSIZE: + { + // handle the case when the we are being called when txsize is not 0 + if (s->txsize != 0) { + printf("txsize is being set, but txsize is not 0!!!\n"); + } + if (val > HP_MEM_SIZE) { + printf("txsize is larger than allowed by hw!!!\n"); + } + s->txsize = val; + s->txbufferaccu_offset = 0; + break; + } + + case HP_TXBUFF: + { + if (s->txsize == 0) { + printf("error with txbuff!!!\n"); + break; + } + + s->txbufferaccu[s->txbufferaccu_offset] = val; + s->txbufferaccu_offset++; + if (s->txbufferaccu_offset >= s->txsize) { + qemu_chr_write(vmchannel_hds[s->index].vmchannel_hd, s->txbufferaccu, s->txsize); + s->txbufferaccu_offset = 0; + s->txsize = 0; + } + break; + } + default: + { + printf("hp_ioport_write to unhandled address!!!\n"); + } + } +} + +static uint32_t hp_ioport_read(void *opaque, uint32_t addr) +{ + HypercallState *s = opaque; + int ret; + + addr &= 0xff; +#ifdef HYPERCALL_DEBUG + // Since HSR_REGISTER is being repeatedly read in the guest ISR we don't print it + if (addr != HSR_REGISTER) + printf("%s: addr=0x%x\n", __FUNCTION__, addr); +#endif + + if (addr >= offsetof(HypercallState, RxBuff) ) + { + int RxBuffOffset = addr - (offsetof(HypercallState, RxBuff)); + ret = s->RxBuff[RxBuffOffset]; + return ret; + } + + switch (addr) + { + case HSR_REGISTER: + ret = s->hsr; + if (ret & HSR_VDR) { + s->hsr &= ~HSR_VDR; + } + break; + case HP_RXSIZE: + ret = s->rxsize; + break; + + default: + ret = 0x00; + break; + } + + return ret; +} + +/***********************************************************/ +/* PCI Hypercall definitions */ + +typedef struct PCIHypercallState { + PCIDevice dev; + HypercallState hp; +} PCIHypercallState; + +static void hp_map(PCIDevice 
*pci_dev, int region_num, + uint32_t addr, uint32_t size, int type) +{ + PCIHypercallState *d = (PCIHypercallState *)pci_dev; + HypercallState *s = &d->hp; + + register_ioport_write(addr, 0x100, 1, hp_ioport_write, s); + register_ioport_read(addr, 0x100, 1, hp_ioport_read, s); + +} + + +static void hypercall_update_irq(HypercallState *s) +{ + /* PCI irq */ + pci_set_irq(s->pci_dev, 0, !(s->hcr & HCR_DI)); +} + +void pci_hypercall_single_init(PCIBus *bus, uint32_t deviceid, uint32_t index) +{ + PCIHypercallState *d; + HypercallState *s; + uint8_t *pci_conf; + char name[sizeof("HypercallX")]; + +#ifdef HYPERCALL_DEBUG + printf("%s\n", __FUNCTION__); +#endif + + // If the vmchannel wasn't initialized, we don't want the Hypercall device in the guest + if (use_hypercall_dev == 0) { + return; + } + + d = (PCIHypercallState *)pci_register_device(bus, + name, sizeof(PCIHypercallState), + -1, + NULL, NULL); + + pci_conf = d->dev.config; + pci_conf[0x00] = 0x02; // Qumranet vendor ID 0x5002 + pci_conf[0x01] = 0x50; + pci_conf[0x02] = deviceid & 0x00ff; + pci_conf[0x03] = (deviceid & 0xff00) >> 8; + + pci_conf[0x09] = 0x00; // ProgIf + pci_conf[0x0a] = 0x00; // SubClass + pci_conf[0x0b] = 0x05; // BaseClass + + pci_conf[0x0e] = 0x00; // header_type + pci_conf[0x3d] = 1; // interrupt pin 0 + + pci_register_io_region(&d->dev, 0, 0x100, + PCI_ADDRESS_SPACE_IO, hp_map); + s = &d->hp; + pHypercallStates[index] = s; + s->index = index; + s->irq = 16; /* PCI interrupt */ + s->pci_dev = (PCIDevice *)d; + + hp_reset(s); +} + +void pci_hypercall_init(PCIBus *bus) +{ + int i; + + // loop devices & call pci_hypercall_single_init with device id's + for(i = 0; i < MAX_VMCHANNEL_DEVICES; i++){ + if (vmchannel_hds[i].vmchannel_hd) { + pci_hypercall_single_init(bus, vmchannel_hds[i].deviceid, i); + } + } +} + +static int vmchannel_can_read(void *opaque) +{ + return 128; +} + +static void vmchannel_event(void *opaque, int event) +{ + +#ifdef HYPERCALL_DEBUG + // if index is to be used outside 
the printf, take it out of the #ifdef block! + long index = (long)opaque; + printf("%s index:%ld, got event %i\n", __FUNCTION__, index, event); +#endif + + return; +} + +// input from vmchannel outside caller +static void vmchannel_read(void *opaque, const uint8_t *buf, int size) +{ + int i; + long index = (long)opaque; + +#ifdef HYPERCALL_DEBUG + printf("vmchannel_read buf size:%d\n", size); +#endif + + // if the hypercall device is in interrupts disabled state, don't accept the data + if (pHypercallStates[index]->hcr & HCR_DI) { + return; + } + + for(i = 0; i < size; i++) { + pHypercallStates[index]->RxBuff[i] = buf[i]; + } + pHypercallStates[index]->rxsize = size; + pHypercallStates[index]->hsr = HSR_VDR; + hypercall_update_irq(pHypercallStates[index]); +} + +void vmchannel_init(CharDriverState *hd, uint32_t deviceid, uint32_t index) +{ +#ifdef HYPERCALL_DEBUG + printf("vmchannel_init, index=%d, deviceid=0x%x\n", index, deviceid); +#endif + + vmchannel_hds[index].deviceid = deviceid; + vmchannel_hds[index].vmchannel_hd = hd; + + use_hypercall_dev = 1; + qemu_chr_add_handlers(vmchannel_hds[index].vmchannel_hd, vmchannel_can_read, vmchannel_read, + vmchannel_event, (void *)(long)index); +} --- qemu-0.9.0/hw/hypercall.h +++ qemu-0.9.0/hw/hypercall.h @@ -0,0 +1,45 @@ +/* + * QEMU-KVM Hypercall emulation + * + * Copyright (c) 2003-2004 Fabrice Bellard + * Copyright (c) 2006 Qumranet + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the 
Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#define HCR_REGISTER 0x00 // Hypercall Command Register WR +#define HSR_REGISTER 0x04 // Hypercall Status Register RD +#define HP_TXSIZE 0x08 +#define HP_TXBUFF 0x0c +#define HP_RXSIZE 0x10 +#define HP_RXBUFF 0x14 + +// HCR_REGISTER commands +#define HCR_DI 1 // disable interrupts +#define HCR_EI 2 // enable interrupts +#define HCR_GRS 4 // Global reset +#define HCR_RESET (HCR_GRS|HCR_DI) + + +// Bits in HSR_REGISTER +#define HSR_VDR 0x01 // vmchannel Data is ready to be read + +#define HP_MEM_SIZE 0xE0 + + --- qemu-0.9.0/hw/pc.c +++ qemu-0.9.0/hw/pc.c @@ -22,6 +22,10 @@ * THE SOFTWARE. 
*/ #include "vl.h" +#ifdef USE_KVM +#include "qemu-kvm.h" +extern int kvm_allowed; +#endif /* output Bochs bios info messages */ //#define DEBUG_BIOS @@ -444,6 +448,11 @@ nb_ne2k++; } +#ifdef USE_KVM +extern kvm_context_t kvm_context; +extern int kvm_allowed; +#endif + /* PC hardware initialisation */ static void pc_init1(int ram_size, int vga_ram_size, int boot_device, DisplayState *ds, const char **fd_filename, int snapshot, @@ -511,6 +520,11 @@ /* setup basic memory access */ cpu_register_physical_memory(0xc0000, 0x10000, vga_bios_offset | IO_MEM_ROM); +#ifdef USE_KVM + if (kvm_allowed) + memcpy(phys_ram_base + 0xc0000, phys_ram_base + vga_bios_offset, + 0x10000); +#endif /* map the last 128KB of the BIOS in ISA space */ isa_bios_size = bios_size; @@ -522,6 +536,26 @@ isa_bios_size, (bios_offset + bios_size - isa_bios_size) | IO_MEM_ROM); +#ifdef USE_KVM + if (kvm_allowed) + memcpy(phys_ram_base + 0x100000 - isa_bios_size, + phys_ram_base + (bios_offset + bios_size - isa_bios_size), + isa_bios_size); +#endif + +#ifdef USE_KVM + if (kvm_allowed) { + bios_mem = kvm_create_phys_mem(kvm_context, (uint32_t)(-bios_size), + bios_size, 2, 0, 1); + if (!bios_mem) + exit(1); + memcpy(bios_mem, phys_ram_base + bios_offset, bios_size); + + cpu_register_physical_memory(phys_ram_size - KVM_EXTRA_PAGES * 4096, KVM_EXTRA_PAGES * 4096, + (phys_ram_size - KVM_EXTRA_PAGES * 4096) | IO_MEM_ROM); + } +#endif + option_rom_offset = 0; for (i = 0; i < nb_option_roms; i++) { int offset = bios_offset + bios_size + option_rom_offset; @@ -718,6 +752,11 @@ } } +#ifdef USE_KVM + if (kvm_allowed) { + pci_hypercall_init(pci_bus); + } +#endif if (pci_enabled) { pci_piix3_ide_init(pci_bus, bs_table, piix3_devfn + 1); } else { --- qemu-0.9.0/hw/vga.c +++ qemu-0.9.0/hw/vga.c @@ -1373,6 +1373,26 @@ } } +#ifdef USE_KVM + +#include "kvmctl.h" +extern kvm_context_t kvm_context; + +static int bitmap_get_dirty(unsigned long *bitmap, unsigned nr) +{ + unsigned word = nr / ((sizeof bitmap[0]) * 8); + 
unsigned bit = nr % ((sizeof bitmap[0]) * 8); + + //printf("%x -> %ld\n", nr, (bitmap[word] >> bit) & 1); + return (bitmap[word] >> bit) & 1; +} + +#endif + +#ifdef USE_KVM +extern int kvm_allowed; +#endif + /* * graphic modes */ @@ -1385,6 +1405,20 @@ uint32_t v, addr1, addr; vga_draw_line_func *vga_draw_line; +#ifdef USE_KVM + + /* HACK ALERT */ +#define BITMAP_SIZE ((8*1024*1024) / 4096 / 8 / sizeof(long)) + unsigned long bitmap[BITMAP_SIZE]; + int r; + + if (kvm_allowed) { + r = kvm_get_dirty_pages(kvm_context, 1, &bitmap); + if (r < 0) + fprintf(stderr, "kvm: get_dirty_pages returned %d\n", r); + } +#endif + full_update |= update_basic_params(s); s->get_resolution(s, &width, &height); @@ -1491,10 +1525,20 @@ update = full_update | cpu_physical_memory_get_dirty(page0, VGA_DIRTY_FLAG) | cpu_physical_memory_get_dirty(page1, VGA_DIRTY_FLAG); +#ifdef USE_KVM + if (kvm_allowed) { + update |= bitmap_get_dirty(bitmap, (page0 - s->vram_offset) >> TARGET_PAGE_BITS); + update |= bitmap_get_dirty(bitmap, (page1 - s->vram_offset) >> TARGET_PAGE_BITS); + } +#endif if ((page1 - page0) > TARGET_PAGE_SIZE) { /* if wide line, can use another page */ update |= cpu_physical_memory_get_dirty(page0 + TARGET_PAGE_SIZE, VGA_DIRTY_FLAG); +#ifdef USE_KVM + if (kvm_allowed) + update |= bitmap_get_dirty(bitmap, (page0 - s->vram_offset) >> TARGET_PAGE_BITS); +#endif } /* explicit invalidation for the hardware cursor */ update |= (s->invalidated_y_table[y >> 5] >> (y & 0x1f)) & 1; @@ -1751,6 +1795,7 @@ } } +/* when used on xen/kvm environment, the vga_ram_base is not used */ void vga_common_init(VGAState *s, DisplayState *ds, uint8_t *vga_ram_base, unsigned long vga_ram_offset, int vga_ram_size) { @@ -1781,7 +1826,14 @@ vga_reset(s); +#ifndef USE_KVM s->vram_ptr = vga_ram_base; +#else + if (kvm_allowed) + s->vram_ptr = qemu_malloc(vga_ram_size); + else + s->vram_ptr = vga_ram_base; +#endif s->vram_offset = vga_ram_offset; s->vram_size = vga_ram_size; s->ds = ds; @@ -1909,6 +1961,31 @@ 
return 0; } +void *vga_update_vram(VGAState *s, void *vga_ram_base, int vga_ram_size) +{ + uint8_t *old_pointer; + + if (s->vram_size != vga_ram_size) { + fprintf(stderr, "No support to change vga_ram_size\n"); + return NULL; + } + + if (!vga_ram_base) { + vga_ram_base = qemu_malloc(vga_ram_size); + if (!vga_ram_base) { + fprintf(stderr, "reallocate error\n"); + return NULL; + } + } + + /* XXX lock needed? */ + memcpy(vga_ram_base, s->vram_ptr, vga_ram_size); + old_pointer = s->vram_ptr; + s->vram_ptr = vga_ram_base; + + return old_pointer; +} + /********************************************************/ /* vga screen dump */ --- qemu-0.9.0/hw/vga_int.h +++ qemu-0.9.0/hw/vga_int.h @@ -174,5 +174,6 @@ unsigned int color0, unsigned int color1, unsigned int color_xor); +void *vga_update_vram(VGAState *s, void *vga_ram_base, int vga_ram_size); extern const uint8_t sr_mask[8]; extern const uint8_t gr_mask[16]; --- qemu-0.9.0/kvm.h +++ qemu-0.9.0/kvm.h @@ -0,0 +1,247 @@ +#ifndef __LINUX_KVM_H +#define __LINUX_KVM_H + +/* + * Userspace interface for /dev/kvm - kernel based virtual machine + * + * Note: this interface is considered experimental and may change without + * notice. + */ + +#include +#include + +#define KVM_API_VERSION 4 + +/* + * Architectural interrupt line count, and the size of the bitmap needed + * to hold them. 
+ */ +#define KVM_NR_INTERRUPTS 256 +#define KVM_IRQ_BITMAP_SIZE_BYTES ((KVM_NR_INTERRUPTS + 7) / 8) +#define KVM_IRQ_BITMAP_SIZE(type) (KVM_IRQ_BITMAP_SIZE_BYTES / sizeof(type)) + + +/* for KVM_CREATE_MEMORY_REGION */ +struct kvm_memory_region { + __u32 slot; + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; /* bytes */ +}; + +/* for kvm_memory_region::flags */ +#define KVM_MEM_LOG_DIRTY_PAGES 1UL + + +#define KVM_EXIT_TYPE_FAIL_ENTRY 1 +#define KVM_EXIT_TYPE_VM_EXIT 2 + +enum kvm_exit_reason { + KVM_EXIT_UNKNOWN = 0, + KVM_EXIT_EXCEPTION = 1, + KVM_EXIT_IO = 2, + KVM_EXIT_CPUID = 3, + KVM_EXIT_DEBUG = 4, + KVM_EXIT_HLT = 5, + KVM_EXIT_MMIO = 6, + KVM_EXIT_IRQ_WINDOW_OPEN = 7, + KVM_EXIT_SHUTDOWN = 8, +}; + +/* for KVM_RUN */ +struct kvm_run { + /* in */ + __u32 emulated; /* skip current instruction */ + __u32 mmio_completed; /* mmio request completed */ + __u8 request_interrupt_window; + __u8 padding1[7]; + + /* out */ + __u32 exit_type; + __u32 exit_reason; + __u32 instruction_length; + __u8 ready_for_interrupt_injection; + __u8 if_flag; + __u16 padding2; + + /* in (pre_kvm_run), out (post_kvm_run) */ + __u64 cr8; + __u64 apic_base; + + union { + /* KVM_EXIT_UNKNOWN */ + struct { + __u32 hardware_exit_reason; + } hw; + /* KVM_EXIT_EXCEPTION */ + struct { + __u32 exception; + __u32 error_code; + } ex; + /* KVM_EXIT_IO */ + struct { +#define KVM_EXIT_IO_IN 0 +#define KVM_EXIT_IO_OUT 1 + __u8 direction; + __u8 size; /* bytes */ + __u8 string; + __u8 string_down; + __u8 rep; + __u8 pad; + __u16 port; + __u64 count; + union { + __u64 address; + __u32 value; + }; + } io; + struct { + } debug; + /* KVM_EXIT_MMIO */ + struct { + __u64 phys_addr; + __u8 data[8]; + __u32 len; + __u8 is_write; + } mmio; + }; +}; + +/* for KVM_GET_REGS and KVM_SET_REGS */ +struct kvm_regs { + /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ + __u64 rax, rbx, rcx, rdx; + __u64 rsi, rdi, rsp, rbp; + __u64 r8, r9, r10, r11; + __u64 r12, r13, r14, r15; + __u64 rip, rflags; +}; + +struct 
kvm_segment { + __u64 base; + __u32 limit; + __u16 selector; + __u8 type; + __u8 present, dpl, db, s, l, g, avl; + __u8 unusable; + __u8 padding; +}; + +struct kvm_dtable { + __u64 base; + __u16 limit; + __u16 padding[3]; +}; + +/* for KVM_GET_SREGS and KVM_SET_SREGS */ +struct kvm_sregs { + /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */ + struct kvm_segment cs, ds, es, fs, gs, ss; + struct kvm_segment tr, ldt; + struct kvm_dtable gdt, idt; + __u64 cr0, cr2, cr3, cr4, cr8; + __u64 efer; + __u64 apic_base; + __u64 interrupt_bitmap[KVM_IRQ_BITMAP_SIZE(__u64)]; +}; + +struct kvm_msr_entry { + __u32 index; + __u32 reserved; + __u64 data; +}; + +/* for KVM_GET_MSRS and KVM_SET_MSRS */ +struct kvm_msrs { + __u32 nmsrs; /* number of msrs in entries */ + __u32 pad; + + struct kvm_msr_entry entries[0]; +}; + +/* for KVM_GET_MSR_INDEX_LIST */ +struct kvm_msr_list { + __u32 nmsrs; /* number of msrs in entries */ + __u32 indices[0]; +}; + +/* for KVM_TRANSLATE */ +struct kvm_translation { + /* in */ + __u64 linear_address; + + /* out */ + __u64 physical_address; + __u8 valid; + __u8 writeable; + __u8 usermode; + __u8 pad[5]; +}; + +/* for KVM_INTERRUPT */ +struct kvm_interrupt { + /* in */ + __u32 irq; +}; + +struct kvm_breakpoint { + __u32 enabled; + __u32 padding; + __u64 address; +}; + +/* for KVM_DEBUG_GUEST */ +struct kvm_debug_guest { + /* int */ + __u32 enabled; + __u32 pad; + struct kvm_breakpoint breakpoints[4]; + __u32 singlestep; +}; + +/* for KVM_GET_DIRTY_LOG */ +struct kvm_dirty_log { + __u32 slot; + __u32 padding; + union { + void __user *dirty_bitmap; /* one bit per page */ + __u64 padding; + }; +}; + +#define KVMIO 0xAE + +/* + * ioctls for /dev/kvm fds: + */ +#define KVM_GET_API_VERSION _IO(KVMIO, 1) +#define KVM_CREATE_VM _IO(KVMIO, 2) /* returns a VM fd */ +#define KVM_GET_MSR_INDEX_LIST _IOWR(KVMIO, 15, struct kvm_msr_list) + +/* + * ioctls for VM fds + */ +#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 10, struct kvm_memory_region) +/* + * KVM_CREATE_VCPU 
receives as a parameter the vcpu slot, and returns + * a vcpu fd. + */ +#define KVM_CREATE_VCPU _IOW(KVMIO, 11, int) +#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 12, struct kvm_dirty_log) + +/* + * ioctls for vcpu fds + */ +#define KVM_RUN _IOWR(KVMIO, 2, struct kvm_run) +#define KVM_GET_REGS _IOR(KVMIO, 3, struct kvm_regs) +#define KVM_SET_REGS _IOW(KVMIO, 4, struct kvm_regs) +#define KVM_GET_SREGS _IOR(KVMIO, 5, struct kvm_sregs) +#define KVM_SET_SREGS _IOW(KVMIO, 6, struct kvm_sregs) +#define KVM_TRANSLATE _IOWR(KVMIO, 7, struct kvm_translation) +#define KVM_INTERRUPT _IOW(KVMIO, 8, struct kvm_interrupt) +#define KVM_DEBUG_GUEST _IOW(KVMIO, 9, struct kvm_debug_guest) +#define KVM_GET_MSRS _IOWR(KVMIO, 13, struct kvm_msrs) +#define KVM_SET_MSRS _IOW(KVMIO, 14, struct kvm_msrs) + +#endif --- qemu-0.9.0/kvmctl.c +++ qemu-0.9.0/kvmctl.c @@ -0,0 +1,809 @@ +/* + * Kernel-based Virtual Machine control library + * + * This library provides an API to control the kvm hardware virtualization + * module. + * + * Copyright (C) 2006 Qumranet + * + * Authors: + * + * Avi Kivity + * Yaniv Kamay + * + * This work is licensed under the GNU LGPL license, version 2. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "kvmctl.h" + +#define EXPECTED_KVM_API_VERSION 4 + +#if EXPECTED_KVM_API_VERSION != KVM_API_VERSION +#error libkvm: userspace and kernel version mismatch +#endif + +#define PAGE_SIZE 4096ul + +/* FIXME: share this number with kvm */ +/* FIXME: or dynamically alloc/realloc regions */ +#define KVM_MAX_NUM_MEM_REGIONS 4u + +/** + * \brief The KVM context + * + * The verbose KVM context + */ +struct kvm_context { + /// Filedescriptor to /dev/kvm + int fd; + int vm_fd; + int vcpu_fd[1]; + /// Callbacks that KVM uses to emulate various unvirtualizable functionality + struct kvm_callbacks *callbacks; + void *opaque; + /// A pointer to the memory used as the physical memory for the guest + void *physical_memory; + /// is dirty pages logging enabled for all regions or not + int dirty_pages_log_all; + /// memory regions parameters + struct kvm_memory_region mem_regions[KVM_MAX_NUM_MEM_REGIONS]; +}; + +struct translation_cache { + unsigned long linear; + void *physical; +}; + +static void translation_cache_init(struct translation_cache *tr) +{ + tr->physical = 0; +} + +static int translate(kvm_context_t kvm, int vcpu, struct translation_cache *tr, + unsigned long linear, void **physical) +{ + unsigned long page = linear & ~(PAGE_SIZE-1); + unsigned long offset = linear & (PAGE_SIZE-1); + + if (!(tr->physical && tr->linear == page)) { + struct kvm_translation kvm_tr; + int r; + + kvm_tr.linear_address = page; + + r = ioctl(kvm->vcpu_fd[vcpu], KVM_TRANSLATE, &kvm_tr); + if (r == -1) + return -errno; + + if (!kvm_tr.valid) + return -EFAULT; + + tr->linear = page; + tr->physical = kvm->physical_memory + kvm_tr.physical_address; + } + *physical = tr->physical + offset; + return 0; +} + +/* + * memory regions parameters + */ +static void kvm_memory_region_save_params(kvm_context_t kvm, + struct kvm_memory_region *mem) +{ + if (!mem || (mem->slot >= KVM_MAX_NUM_MEM_REGIONS)) { + 
fprintf(stderr, "BUG: %s: invalid parameters\n", __FUNCTION__); + return; + } + kvm->mem_regions[mem->slot] = *mem; +} + +static void kvm_memory_region_clear_params(kvm_context_t kvm, int regnum) +{ + if (regnum >= KVM_MAX_NUM_MEM_REGIONS) { + fprintf(stderr, "BUG: %s: invalid parameters\n", __FUNCTION__); + return; + } + kvm->mem_regions[regnum].memory_size = 0; +} + +/* + * dirty pages logging control + */ +static int kvm_dirty_pages_log_change(kvm_context_t kvm, int regnum, __u32 flag) +{ + int r; + struct kvm_memory_region *mem; + + if (regnum >= KVM_MAX_NUM_MEM_REGIONS) { + fprintf(stderr, "BUG: %s: invalid parameters\n", __FUNCTION__); + return 1; + } + mem = &kvm->mem_regions[regnum]; + if (mem->memory_size == 0) /* not used */ + return 0; + if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) /* log already enabled */ + return 0; + mem->flags |= flag; /* temporary turn on flag */ + r = ioctl(kvm->vm_fd, KVM_SET_MEMORY_REGION, mem); + mem->flags &= ~flag; /* back to previous value */ + if (r == -1) { + fprintf(stderr, "%s: %m\n", __FUNCTION__); + } + return r; +} + +static int kvm_dirty_pages_log_change_all(kvm_context_t kvm, __u32 flag) +{ + int i, r; + + for (i=r=0; idirty_pages_log_all) + return 0; + kvm->dirty_pages_log_all = 1; + return kvm_dirty_pages_log_change_all(kvm, KVM_MEM_LOG_DIRTY_PAGES); +} + +/** + * Enable dirty page logging only for memory regions that were created with + * dirty logging enabled (disable for all other memory regions). 
+ */ +int kvm_dirty_pages_log_reset(kvm_context_t kvm) +{ + if (!kvm->dirty_pages_log_all) + return 0; + kvm->dirty_pages_log_all = 0; + return kvm_dirty_pages_log_change_all(kvm, 0); +} + + +kvm_context_t kvm_init(struct kvm_callbacks *callbacks, + void *opaque) +{ + int fd; + kvm_context_t kvm; + int r; + + fd = open("/dev/kvm", O_RDWR); + if (fd == -1) { + perror("open /dev/kvm"); + return NULL; + } + r = ioctl(fd, KVM_GET_API_VERSION, 0); + if (r == -1) { + fprintf(stderr, "kvm kernel version too old\n"); + goto out_close; + } + if (r < EXPECTED_KVM_API_VERSION) { + fprintf(stderr, "kvm kernel version too old\n"); + goto out_close; + } + if (r > EXPECTED_KVM_API_VERSION) { + fprintf(stderr, "kvm userspace version too old\n"); + goto out_close; + } + kvm = malloc(sizeof(*kvm)); + kvm->fd = fd; + kvm->vm_fd = -1; + kvm->callbacks = callbacks; + kvm->opaque = opaque; + kvm->dirty_pages_log_all = 0; + memset(&kvm->mem_regions, 0, sizeof(kvm->mem_regions)); + + return kvm; + out_close: + close(fd); + return NULL; +} + +void kvm_finalize(kvm_context_t kvm) +{ + if (kvm->vcpu_fd[0] != -1) + close(kvm->vcpu_fd[0]); + if (kvm->vm_fd != -1) + close(kvm->vm_fd); + close(kvm->fd); + free(kvm); +} + +int kvm_create(kvm_context_t kvm, unsigned long memory, void **vm_mem) +{ + unsigned long dosmem = 0xa0000; + unsigned long exmem = 0xc0000; + int fd = kvm->fd; + int r; + struct kvm_memory_region low_memory = { + .slot = 3, + .memory_size = memory < dosmem ? memory : dosmem, + .guest_phys_addr = 0, + }; + struct kvm_memory_region extended_memory = { + .slot = 0, + .memory_size = memory < exmem ? 0 : memory - exmem, + .guest_phys_addr = exmem, + }; + + kvm->vcpu_fd[0] = -1; + + fd = ioctl(fd, KVM_CREATE_VM, 0); + if (fd == -1) { + fprintf(stderr, "kvm_create_vm: %m\n"); + return -1; + } + kvm->vm_fd = fd; + + /* 640K should be enough. 
*/ + r = ioctl(fd, KVM_SET_MEMORY_REGION, &low_memory); + if (r == -1) { + fprintf(stderr, "kvm_create_memory_region: %m\n"); + return -1; + } + if (extended_memory.memory_size) { + r = ioctl(fd, KVM_SET_MEMORY_REGION, &extended_memory); + if (r == -1) { + fprintf(stderr, "kvm_create_memory_region: %m\n"); + return -1; + } + } + + kvm_memory_region_save_params(kvm, &low_memory); + kvm_memory_region_save_params(kvm, &extended_memory); + + *vm_mem = mmap(0, memory, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + if (*vm_mem == MAP_FAILED) { + fprintf(stderr, "mmap: %m\n"); + return -1; + } + kvm->physical_memory = *vm_mem; + + r = ioctl(fd, KVM_CREATE_VCPU, 0); + if (r == -1) { + fprintf(stderr, "kvm_create_vcpu: %m\n"); + return -1; + } + kvm->vcpu_fd[0] = r; + return 0; +} + +void *kvm_create_phys_mem(kvm_context_t kvm, unsigned long phys_start, + unsigned long len, int slot, int log, int writable) +{ + void *ptr; + int r; + int fd = kvm->vm_fd; + int prot = PROT_READ; + struct kvm_memory_region memory = { + .slot = slot, + .memory_size = len, + .guest_phys_addr = phys_start, + .flags = log ? 
KVM_MEM_LOG_DIRTY_PAGES : 0, + }; + + r = ioctl(fd, KVM_SET_MEMORY_REGION, &memory); + if (r == -1) + return 0; + + kvm_memory_region_save_params(kvm, &memory); + + if (writable) + prot |= PROT_WRITE; + + ptr = mmap(0, len, prot, MAP_SHARED, fd, phys_start); + if (ptr == MAP_FAILED) + return 0; + return ptr; +} + +void kvm_destroy_phys_mem(kvm_context_t kvm, unsigned long phys_start, + unsigned long len) +{ + //for each memory region in (phys_start, phys_start+len) do + // kvm_memory_region_clear_params(kvm, region); + kvm_memory_region_clear_params(kvm, 0); /* avoid compiler warning */ + printf("kvm_destroy_phys_mem: implement me\n"); + exit(1); +} + + +int kvm_get_dirty_pages(kvm_context_t kvm, int slot, void *buf) +{ + int r; + struct kvm_dirty_log log = { + .slot = slot, + }; + + log.dirty_bitmap = buf; + + r = ioctl(kvm->vm_fd, KVM_GET_DIRTY_LOG, &log); + if (r == -1) + return -errno; + return 0; +} + +static int more_io(struct kvm_run *run, int first_time) +{ + if (!run->io.rep) + return first_time; + else + return run->io.count != 0; +} + +static int handle_io(kvm_context_t kvm, struct kvm_run *run, int vcpu) +{ + uint16_t addr = run->io.port; + struct kvm_regs regs; + int first_time = 1; + int delta; + struct translation_cache tr; + int _in = (run->io.direction == KVM_EXIT_IO_IN); + int r; + + translation_cache_init(&tr); + + if (run->io.string || _in) { + r = ioctl(kvm->vcpu_fd[vcpu], KVM_GET_REGS, ®s); + if (r == -1) + return -errno; + } + + delta = run->io.string_down ? 
-run->io.size : run->io.size; + + while (more_io(run, first_time)) { + void *value_addr; + + if (!run->io.string) { + if (_in) + value_addr = ®s.rax; + else + value_addr = &run->io.value; + } else { + r = translate(kvm, vcpu, &tr, run->io.address, + &value_addr); + if (r) { + fprintf(stderr, "failed translating I/O address %llx\n", + run->io.address); + return r; + } + } + + switch (run->io.direction) { + case KVM_EXIT_IO_IN: { + switch (run->io.size) { + case 1: { + uint8_t value; + r = kvm->callbacks->inb(kvm->opaque, addr, &value); + *(uint8_t *)value_addr = value; + break; + } + case 2: { + uint16_t value; + r = kvm->callbacks->inw(kvm->opaque, addr, &value); + *(uint16_t *)value_addr = value; + break; + } + case 4: { + uint32_t value; + r = kvm->callbacks->inl(kvm->opaque, addr, &value); + *(uint32_t *)value_addr = value; + break; + } + default: + fprintf(stderr, "bad I/O size %d\n", run->io.size); + return -EMSGSIZE; + } + break; + } + case KVM_EXIT_IO_OUT: + switch (run->io.size) { + case 1: + r = kvm->callbacks->outb(kvm->opaque, addr, + *(uint8_t *)value_addr); + break; + case 2: + r = kvm->callbacks->outw(kvm->opaque, addr, + *(uint16_t *)value_addr); + break; + case 4: + r = kvm->callbacks->outl(kvm->opaque, addr, + *(uint32_t *)value_addr); + break; + default: + fprintf(stderr, "bad I/O size %d\n", run->io.size); + return -EMSGSIZE; + } + break; + default: + fprintf(stderr, "bad I/O direction %d\n", run->io.direction); + return -EPROTO; + } + if (run->io.string) { + run->io.address += delta; + switch (run->io.direction) { + case KVM_EXIT_IO_IN: regs.rdi += delta; break; + case KVM_EXIT_IO_OUT: regs.rsi += delta; break; + } + if (run->io.rep) { + --regs.rcx; + --run->io.count; + } + } + first_time = 0; + if (r) { + int savedret = r; + r = ioctl(kvm->vcpu_fd[vcpu], KVM_SET_REGS, ®s); + if (r == -1) + return -errno; + + return savedret; + } + } + + if (run->io.string || _in) { + r = ioctl(kvm->vcpu_fd[vcpu], KVM_SET_REGS, ®s); + if (r == -1) + return 
-errno; + + } + + run->emulated = 1; + return 0; +} + +int handle_debug(kvm_context_t kvm, struct kvm_run *run, int vcpu) +{ + return kvm->callbacks->debug(kvm->opaque, vcpu); +} + +int kvm_get_regs(kvm_context_t kvm, int vcpu, struct kvm_regs *regs) +{ + return ioctl(kvm->vcpu_fd[vcpu], KVM_GET_REGS, regs); +} + +int kvm_set_regs(kvm_context_t kvm, int vcpu, struct kvm_regs *regs) +{ + return ioctl(kvm->vcpu_fd[vcpu], KVM_SET_REGS, regs); +} + +int kvm_get_sregs(kvm_context_t kvm, int vcpu, struct kvm_sregs *sregs) +{ + return ioctl(kvm->vcpu_fd[vcpu], KVM_GET_SREGS, sregs); +} + +int kvm_set_sregs(kvm_context_t kvm, int vcpu, struct kvm_sregs *sregs) +{ + return ioctl(kvm->vcpu_fd[vcpu], KVM_SET_SREGS, sregs); +} + +/* + * Returns available msr list. User must free. + */ +struct kvm_msr_list *kvm_get_msr_list(kvm_context_t kvm) +{ + struct kvm_msr_list sizer, *msrs; + int r, e; + + sizer.nmsrs = 0; + r = ioctl(kvm->fd, KVM_GET_MSR_INDEX_LIST, &sizer); + if (r == -1 && errno != E2BIG) + return 0; + msrs = malloc(sizeof *msrs + sizer.nmsrs * sizeof *msrs->indices); + if (!msrs) { + errno = ENOMEM; + return 0; + } + msrs->nmsrs = sizer.nmsrs; + r = ioctl(kvm->fd, KVM_GET_MSR_INDEX_LIST, msrs); + if (r == -1) { + e = errno; + free(msrs); + errno = e; + return 0; + } + return msrs; +} + +int kvm_get_msrs(kvm_context_t kvm, int vcpu, struct kvm_msr_entry *msrs, + int n) +{ + struct kvm_msrs *kmsrs = malloc(sizeof *kmsrs + n * sizeof *msrs); + int r, e; + + if (!kmsrs) { + errno = ENOMEM; + return -1; + } + kmsrs->nmsrs = n; + memcpy(kmsrs->entries, msrs, n * sizeof *msrs); + r = ioctl(kvm->vcpu_fd[vcpu], KVM_GET_MSRS, kmsrs); + e = errno; + memcpy(msrs, kmsrs->entries, n * sizeof *msrs); + free(kmsrs); + errno = e; + return r; +} + +int kvm_set_msrs(kvm_context_t kvm, int vcpu, struct kvm_msr_entry *msrs, + int n) +{ + struct kvm_msrs *kmsrs = malloc(sizeof *kmsrs + n * sizeof *msrs); + int r, e; + + if (!kmsrs) { + errno = ENOMEM; + return -1; + } + kmsrs->nmsrs = n; 
+ memcpy(kmsrs->entries, msrs, n * sizeof *msrs); + r = ioctl(kvm->vcpu_fd[vcpu], KVM_SET_MSRS, kmsrs); + e = errno; + free(kmsrs); + errno = e; + return r; +} + +static void print_seg(FILE *file, const char *name, struct kvm_segment *seg) +{ + fprintf(stderr, + "%s %04x (%08llx/%08x p %d dpl %d db %d s %d type %x l %d" + " g %d avl %d)\n", + name, seg->selector, seg->base, seg->limit, seg->present, + seg->dpl, seg->db, seg->s, seg->type, seg->l, seg->g, + seg->avl); +} + +static void print_dt(FILE *file, const char *name, struct kvm_dtable *dt) +{ + fprintf(stderr, "%s %llx/%x\n", name, dt->base, dt->limit); +} + +void kvm_show_regs(kvm_context_t kvm, int vcpu) +{ + int fd = kvm->vcpu_fd[vcpu]; + struct kvm_regs regs; + struct kvm_sregs sregs; + int r; + + r = ioctl(fd, KVM_GET_REGS, ®s); + if (r == -1) { + perror("KVM_GET_REGS"); + return; + } + fprintf(stderr, + "rax %016llx rbx %016llx rcx %016llx rdx %016llx\n" + "rsi %016llx rdi %016llx rsp %016llx rbp %016llx\n" + "r8 %016llx r9 %016llx r10 %016llx r11 %016llx\n" + "r12 %016llx r13 %016llx r14 %016llx r15 %016llx\n" + "rip %016llx rflags %08llx\n", + regs.rax, regs.rbx, regs.rcx, regs.rdx, + regs.rsi, regs.rdi, regs.rsp, regs.rbp, + regs.r8, regs.r9, regs.r10, regs.r11, + regs.r12, regs.r13, regs.r14, regs.r15, + regs.rip, regs.rflags); + r = ioctl(fd, KVM_GET_SREGS, &sregs); + if (r == -1) { + perror("KVM_GET_SREGS"); + return; + } + print_seg(stderr, "cs", &sregs.cs); + print_seg(stderr, "ds", &sregs.ds); + print_seg(stderr, "es", &sregs.es); + print_seg(stderr, "ss", &sregs.ss); + print_seg(stderr, "fs", &sregs.fs); + print_seg(stderr, "gs", &sregs.gs); + print_seg(stderr, "tr", &sregs.tr); + print_seg(stderr, "ldt", &sregs.ldt); + print_dt(stderr, "gdt", &sregs.gdt); + print_dt(stderr, "idt", &sregs.idt); + fprintf(stderr, "cr0 %llx cr2 %llx cr3 %llx cr4 %llx cr8 %llx" + " efer %llx\n", + sregs.cr0, sregs.cr2, sregs.cr3, sregs.cr4, sregs.cr8, + sregs.efer); +} + +static int handle_cpuid(kvm_context_t 
kvm, struct kvm_run *run, int vcpu) +{ + struct kvm_regs regs; + uint32_t orig_eax; + uint64_t rax, rbx, rcx, rdx; + int r; + + kvm_get_regs(kvm, vcpu, ®s); + orig_eax = regs.rax; + rax = regs.rax; + rbx = regs.rbx; + rcx = regs.rcx; + rdx = regs.rdx; + r = kvm->callbacks->cpuid(kvm->opaque, &rax, &rbx, &rcx, &rdx); + regs.rax = rax; + regs.rbx = rbx; + regs.rcx = rcx; + regs.rdx = rdx; + if (orig_eax == 1) + regs.rdx &= ~(1ull << 12); /* disable mtrr support */ + kvm_set_regs(kvm, vcpu, ®s); + run->emulated = 1; + return r; +} + +static int handle_mmio(kvm_context_t kvm, struct kvm_run *kvm_run) +{ + unsigned long addr = kvm_run->mmio.phys_addr; + void *data = kvm_run->mmio.data; + int r = -1; + + if (kvm_run->mmio.is_write) { + switch (kvm_run->mmio.len) { + case 1: + r = kvm->callbacks->writeb(kvm->opaque, addr, *(uint8_t *)data); + break; + case 2: + r = kvm->callbacks->writew(kvm->opaque, addr, *(uint16_t *)data); + break; + case 4: + r = kvm->callbacks->writel(kvm->opaque, addr, *(uint32_t *)data); + break; + case 8: + r = kvm->callbacks->writeq(kvm->opaque, addr, *(uint64_t *)data); + break; + } + } else { + switch (kvm_run->mmio.len) { + case 1: + r = kvm->callbacks->readb(kvm->opaque, addr, (uint8_t *)data); + break; + case 2: + r = kvm->callbacks->readw(kvm->opaque, addr, (uint16_t *)data); + break; + case 4: + r = kvm->callbacks->readl(kvm->opaque, addr, (uint32_t *)data); + break; + case 8: + r = kvm->callbacks->readq(kvm->opaque, addr, (uint64_t *)data); + break; + } + kvm_run->mmio_completed = 1; + } + return r; +} + +static int handle_io_window(kvm_context_t kvm, struct kvm_run *kvm_run) +{ + return kvm->callbacks->io_window(kvm->opaque); +} + +static int handle_halt(kvm_context_t kvm, struct kvm_run *kvm_run, int vcpu) +{ + return kvm->callbacks->halt(kvm->opaque, vcpu); +} + +static int handle_shutdown(kvm_context_t kvm, struct kvm_run *kvm_run, + int vcpu) +{ + return kvm->callbacks->shutdown(kvm->opaque, vcpu); +} + +int 
try_push_interrupts(kvm_context_t kvm) +{ + return kvm->callbacks->try_push_interrupts(kvm->opaque); +} + +static void post_kvm_run(kvm_context_t kvm, struct kvm_run *kvm_run) +{ + kvm->callbacks->post_kvm_run(kvm->opaque, kvm_run); +} + +static void pre_kvm_run(kvm_context_t kvm, struct kvm_run *kvm_run) +{ + kvm->callbacks->pre_kvm_run(kvm->opaque, kvm_run); +} + +int kvm_run(kvm_context_t kvm, int vcpu) +{ + int r; + int fd = kvm->vcpu_fd[vcpu]; + struct kvm_run kvm_run = { + .emulated = 0, + .mmio_completed = 0, + }; + +again: + kvm_run.request_interrupt_window = try_push_interrupts(kvm); + pre_kvm_run(kvm, &kvm_run); + r = ioctl(fd, KVM_RUN, &kvm_run); + post_kvm_run(kvm, &kvm_run); + + kvm_run.emulated = 0; + kvm_run.mmio_completed = 0; + if (r == -1 && errno != EINTR) { + r = -errno; + printf("kvm_run: %m\n"); + return r; + } + if (r == -1) { + r = handle_io_window(kvm, &kvm_run); + goto more; + } + switch (kvm_run.exit_type) { + case KVM_EXIT_TYPE_FAIL_ENTRY: + fprintf(stderr, "kvm_run: failed entry, reason %u\n", + kvm_run.exit_reason & 0xffff); + return -ENOEXEC; + break; + case KVM_EXIT_TYPE_VM_EXIT: + switch (kvm_run.exit_reason) { + case KVM_EXIT_UNKNOWN: + fprintf(stderr, "unhandled vm exit: 0x%x\n", + kvm_run.hw.hardware_exit_reason); + kvm_show_regs(kvm, vcpu); + abort(); + break; + case KVM_EXIT_EXCEPTION: + fprintf(stderr, "exception %d (%x)\n", + kvm_run.ex.exception, + kvm_run.ex.error_code); + kvm_show_regs(kvm, vcpu); + abort(); + break; + case KVM_EXIT_IO: + r = handle_io(kvm, &kvm_run, vcpu); + break; + case KVM_EXIT_CPUID: + r = handle_cpuid(kvm, &kvm_run, vcpu); + break; + case KVM_EXIT_DEBUG: + r = handle_debug(kvm, &kvm_run, vcpu); + break; + case KVM_EXIT_MMIO: + r = handle_mmio(kvm, &kvm_run); + break; + case KVM_EXIT_HLT: + r = handle_halt(kvm, &kvm_run, vcpu); + break; + case KVM_EXIT_IRQ_WINDOW_OPEN: + break; + case KVM_EXIT_SHUTDOWN: + r = handle_shutdown(kvm, &kvm_run, vcpu); + break; + default: + fprintf(stderr, "unhandled vm 
exit: 0x%x\n", kvm_run.exit_reason); + kvm_show_regs(kvm, vcpu); + abort(); + break; + } + } +more: + if (!r) + goto again; + return r; +} + +int kvm_inject_irq(kvm_context_t kvm, int vcpu, unsigned irq) +{ + struct kvm_interrupt intr; + + intr.irq = irq; + return ioctl(kvm->vcpu_fd[vcpu], KVM_INTERRUPT, &intr); +} + +int kvm_guest_debug(kvm_context_t kvm, int vcpu, struct kvm_debug_guest *dbg) +{ + return ioctl(kvm->vcpu_fd[vcpu], KVM_DEBUG_GUEST, dbg); +} --- qemu-0.9.0/kvmctl.h +++ qemu-0.9.0/kvmctl.h @@ -0,0 +1,269 @@ +/** \file kvmctl.h + * libkvm API + */ + +#ifndef KVMCTL_H +#define KVMCTL_H + +#define __user /* temporary, until installed via make headers_install */ +#include "kvm.h" +#include + +struct kvm_context; + +typedef struct kvm_context *kvm_context_t; + +/*! + * \brief KVM callbacks structure + * + * This structure holds pointers to various functions that KVM will call + * when it encounters something that cannot be virtualized, such as + * accessing hardware devices via MMIO or regular IO. 
+ */ +struct kvm_callbacks { + int (*cpuid)(void *opaque, + uint64_t *rax, uint64_t *rbx, uint64_t *rcx, uint64_t *rdx); + /// For 8bit IO reads from the guest (Usually when executing 'inb') + int (*inb)(void *opaque, uint16_t addr, uint8_t *data); + /// For 16bit IO reads from the guest (Usually when executing 'inw') + int (*inw)(void *opaque, uint16_t addr, uint16_t *data); + /// For 32bit IO reads from the guest (Usually when executing 'inl') + int (*inl)(void *opaque, uint16_t addr, uint32_t *data); + /// For 8bit IO writes from the guest (Usually when executing 'outb') + int (*outb)(void *opaque, uint16_t addr, uint8_t data); + /// For 16bit IO writes from the guest (Usually when executing 'outw') + int (*outw)(void *opaque, uint16_t addr, uint16_t data); + /// For 32bit IO writes from the guest (Usually when executing 'outl') + int (*outl)(void *opaque, uint16_t addr, uint32_t data); + /// For 8bit memory reads from unmapped memory (For MMIO devices) + int (*readb)(void *opaque, uint64_t addr, uint8_t *data); + /// For 16bit memory reads from unmapped memory (For MMIO devices) + int (*readw)(void *opaque, uint64_t addr, uint16_t *data); + /// For 32bit memory reads from unmapped memory (For MMIO devices) + int (*readl)(void *opaque, uint64_t addr, uint32_t *data); + /// For 64bit memory reads from unmapped memory (For MMIO devices) + int (*readq)(void *opaque, uint64_t addr, uint64_t *data); + /// For 8bit memory writes to unmapped memory (For MMIO devices) + int (*writeb)(void *opaque, uint64_t addr, uint8_t data); + /// For 16bit memory writes to unmapped memory (For MMIO devices) + int (*writew)(void *opaque, uint64_t addr, uint16_t data); + /// For 32bit memory writes to unmapped memory (For MMIO devices) + int (*writel)(void *opaque, uint64_t addr, uint32_t data); + /// For 64bit memory writes to unmapped memory (For MMIO devices) + int (*writeq)(void *opaque, uint64_t addr, uint64_t data); + int (*debug)(void *opaque, int vcpu); + /*! 
+ * \brief Called when the VCPU issues an 'hlt' instruction. + * + * Typically, you should yeild here to prevent 100% CPU utilization + * on the host CPU. + */ + int (*halt)(void *opaque, int vcpu); + int (*shutdown)(void *opaque, int vcpu); + int (*io_window)(void *opaque); + int (*try_push_interrupts)(void *opaque); + void (*post_kvm_run)(void *opaque, struct kvm_run *kvm_run); + void (*pre_kvm_run)(void *opaque, struct kvm_run *kvm_run); +}; + +/*! + * \brief Create new KVM context + * + * This creates a new kvm_context. A KVM context is a small area of data that + * holds information about the KVM instance that gets created by this call.\n + * This should always be your first call to KVM. + * + * \param callbacks Pointer to a valid kvm_callbacks structure + * \param opaque Not used + * \return NULL on failure + */ +kvm_context_t kvm_init(struct kvm_callbacks *callbacks, + void *opaque); + +/*! + * \brief Cleanup the KVM context + * + * Should always be called when closing down KVM.\n + * Exception: If kvm_init() fails, this function should not be called, as the + * context would be invalid + * + * \param kvm Pointer to the kvm_context that is to be freed + */ +void kvm_finalize(kvm_context_t kvm); + +/*! + * \brief Create new virtual machine + * + * This creates a new virtual machine, maps physical RAM to it, and creates a + * virtual CPU for it.\n + * \n + * Memory gets mapped for addresses 0->0xA0000, 0xC0000->phys_mem_bytes + * + * \param kvm Pointer to the current kvm_context + * \param phys_mem_bytes The amount of physical ram you want the VM to have + * \param phys_mem This pointer will be set to point to the memory that + * kvm_create allocates for physical RAM + * \return 0 on success + */ +int kvm_create(kvm_context_t kvm, + unsigned long phys_mem_bytes, + void **phys_mem); + +/*! 
+ * \brief Start the VCPU + * + * This starts the VCPU and virtualization is started.\n + * \n + * This function will not return until any of these conditions are met: + * - An IO/MMIO handler does not return "0" + * - An exception that neither the guest OS, nor KVM can handle occurs + * + * \note This function will call the callbacks registered in kvm_init() + * to emulate those functions + * \note If you at any point want to interrupt the VCPU, kvm_run() will + * listen to the EINTR signal. This allows you to simulate external interrupts + * and asyncronous IO. + * + * \param kvm Pointer to the current kvm_context + * \param vcpu Which virtual CPU should be started + * \return 0 on success, but you really shouldn't expect this function to + * return except for when an error has occured, or when you have sent it + * an EINTR signal. + */ +int kvm_run(kvm_context_t kvm, int vcpu); + +/*! + * \brief Read VCPU registers + * + * This gets the GP registers from the VCPU and outputs them + * into a kvm_regs structure + * + * \note This function returns a \b copy of the VCPUs registers.\n + * If you wish to modify the VCPUs GP registers, you should call kvm_set_regs() + * + * \param kvm Pointer to the current kvm_context + * \param vcpu Which virtual CPU should get dumped + * \param regs Pointer to a kvm_regs which will be populated with the VCPUs + * registers values + * \return 0 on success + */ +int kvm_get_regs(kvm_context_t kvm, int vcpu, struct kvm_regs *regs); + +/*! 
+ * \brief Write VCPU registers + * + * This sets the GP registers on the VCPU from a kvm_regs structure + * + * \note When this function returns, the regs pointer and the data it points to + * can be discarded + * \param kvm Pointer to the current kvm_context + * \param vcpu Which virtual CPU should get dumped + * \param regs Pointer to a kvm_regs which will be populated with the VCPUs + * registers values + * \return 0 on success + */ +int kvm_set_regs(kvm_context_t kvm, int vcpu, struct kvm_regs *regs); + +/*! + * \brief Read VCPU system registers + * + * This gets the non-GP registers from the VCPU and outputs them + * into a kvm_sregs structure + * + * \note This function returns a \b copy of the VCPUs registers.\n + * If you wish to modify the VCPUs non-GP registers, you should call + * kvm_set_sregs() + * + * \param kvm Pointer to the current kvm_context + * \param vcpu Which virtual CPU should get dumped + * \param regs Pointer to a kvm_sregs which will be populated with the VCPUs + * registers values + * \return 0 on success + */ +int kvm_get_sregs(kvm_context_t kvm, int vcpu, struct kvm_sregs *regs); + +/*! + * \brief Write VCPU system registers + * + * This sets the non-GP registers on the VCPU from a kvm_sregs structure + * + * \note When this function returns, the regs pointer and the data it points to + * can be discarded + * \param kvm Pointer to the current kvm_context + * \param vcpu Which virtual CPU should get dumped + * \param regs Pointer to a kvm_sregs which will be populated with the VCPUs + * registers values + * \return 0 on success + */ +int kvm_set_sregs(kvm_context_t kvm, int vcpu, struct kvm_sregs *regs); + +struct kvm_msr_list *kvm_get_msr_list(kvm_context_t); +int kvm_get_msrs(kvm_context_t, int vcpu, struct kvm_msr_entry *msrs, int n); +int kvm_set_msrs(kvm_context_t, int vcpu, struct kvm_msr_entry *msrs, int n); + +/*! 
+ * \brief Simulate an external vectored interrupt + * + * This allows you to simulate an external vectored interrupt. + * + * \param kvm Pointer to the current kvm_context + * \param vcpu Which virtual CPU should get dumped + * \param irq Vector number + * \return 0 on success + */ +int kvm_inject_irq(kvm_context_t kvm, int vcpu, unsigned irq); +int kvm_guest_debug(kvm_context_t, int vcpu, struct kvm_debug_guest *dbg); + +/*! + * \brief Dump all VCPU information + * + * This dumps \b all the information that KVM has about a virtual CPU, namely: + * - GP Registers + * - System registers (selectors, descriptors, etc) + * - VMCS Data + * - MSRS + * - Pending interrupts + * + * \param kvm Pointer to the current kvm_context + * \param vcpu Which virtual CPU should get dumped + * \return 0 on success + */ +int kvm_dump_vcpu(kvm_context_t kvm, int vcpu); + +/*! + * \brief Dump VCPU registers + * + * This dumps some of the information that KVM has about a virtual CPU, namely: + * - GP Registers + * + * A much more verbose version of this is available as kvm_dump_vcpu() + * + * \param kvm Pointer to the current kvm_context + * \param vcpu Which virtual CPU should get dumped + * \return 0 on success + */ +void kvm_show_regs(kvm_context_t kvm, int vcpu); + +void *kvm_create_phys_mem(kvm_context_t, unsigned long phys_start, + unsigned long len, int slot, int log, int writable); +void kvm_destroy_phys_mem(kvm_context_t, unsigned long phys_start, + unsigned long len); +int kvm_get_dirty_pages(kvm_context_t, int slot, void *buf); + +/*! + * \brief Enable dirty-pages-logging for all memory regions + * + * \param kvm Pointer to the current kvm_context + */ +int kvm_dirty_pages_log_enable_all(kvm_context_t kvm); + +/*! + * \brief Disable dirty-page-logging for some memory regions + * + * Disable dirty-pages-logging for those memory regions that were + * created with dirty-page-logging disabled. 
+ * + * \param kvm Pointer to the current kvm_context + */ +int kvm_dirty_pages_log_reset(kvm_context_t kvm); +#endif --- qemu-0.9.0/migration.c +++ qemu-0.9.0/migration.c @@ -24,6 +24,9 @@ #include "vl.h" #include "qemu_socket.h" +#ifdef USE_KVM +#include "qemu-kvm.h" +#endif #include @@ -172,6 +175,10 @@ int dirty_count = 0; for (addr = 0; addr < phys_ram_size; addr += TARGET_PAGE_SIZE) { +#ifdef USE_KVM + if (kvm_allowed && (addr>=0xa0000) && (addr<0xc0000)) /* do not access video-addresses */ + continue; +#endif if (cpu_physical_memory_get_dirty(addr, MIGRATION_DIRTY_FLAG)) dirty_count++; } @@ -186,6 +193,11 @@ if (migrate_write_buffer(s)) return; +#ifdef USE_KVM + if (kvm_allowed && !*s->has_error) + *s->has_error = kvm_update_dirty_pages_log(); +#endif + if (migrate_check_convergence(s) || *s->has_error) { qemu_del_timer(s->timer); qemu_free_timer(s->timer); @@ -195,6 +207,11 @@ } while (s->addr < phys_ram_size) { +#ifdef USE_KVM + if (kvm_allowed && (s->addr>=0xa0000) && (s->addr<0xc0000)) /* do not access video-addresses */ + s->addr = 0xc0000; +#endif + if (cpu_physical_memory_get_dirty(s->addr, MIGRATION_DIRTY_FLAG)) { uint32_t value = cpu_to_be32(s->addr); @@ -254,6 +271,10 @@ fcntl(s->fd, F_SETFL, O_NONBLOCK); for (addr = 0; addr < phys_ram_size; addr += TARGET_PAGE_SIZE) { +#ifdef USE_KVM + if (kvm_allowed && (addr>=0xa0000) && (addr<0xc0000)) /* do not access video-addresses */ + continue; +#endif if (!cpu_physical_memory_get_dirty(addr, MIGRATION_DIRTY_FLAG)) cpu_physical_memory_set_dirty(addr); } @@ -723,6 +744,10 @@ unsigned int sum; for (addr = 0; addr < phys_ram_size; addr += TARGET_PAGE_SIZE) { +#ifdef USE_KVM + if (kvm_allowed && (addr>=0xa0000) && (addr<0xc0000)) /* do not access video-addresses */ + continue; +#endif sum = calc_page_checksum(addr); qemu_put_be32(f, addr); qemu_put_be32(f, sum); @@ -737,6 +762,10 @@ int num_errors = 0; for (addr = 0; addr < phys_ram_size; addr += TARGET_PAGE_SIZE) { +#ifdef USE_KVM + if (kvm_allowed && 
(addr>=0xa0000) && (addr<0xc0000)) /* do not access video-addresses */ + continue; +#endif sum = calc_page_checksum(addr); raddr = qemu_get_be32(f); rsum = qemu_get_be32(f); --- qemu-0.9.0/qemu-kvm.c +++ qemu-0.9.0/qemu-kvm.c @@ -0,0 +1,793 @@ + +#include "config.h" +#include "config-host.h" + +#ifdef USE_KVM + +#include "exec.h" + +#include "qemu-kvm.h" +#include +#include + +#define MSR_IA32_TSC 0x10 + +extern void perror(const char *s); + +int kvm_allowed = 1; +kvm_context_t kvm_context; +static struct kvm_msr_list *kvm_msr_list; +static int kvm_has_msr_star; + +#define NR_CPU 16 +static CPUState *saved_env[NR_CPU]; + +static void set_msr_entry(struct kvm_msr_entry *entry, uint32_t index, + uint64_t data) +{ + entry->index = index; + entry->data = data; +} + +/* returns 0 on success, non-0 on failure */ +static int get_msr_entry(struct kvm_msr_entry *entry, CPUState *env) +{ + switch (entry->index) { + case MSR_IA32_SYSENTER_CS: + env->sysenter_cs = entry->data; + break; + case MSR_IA32_SYSENTER_ESP: + env->sysenter_esp = entry->data; + break; + case MSR_IA32_SYSENTER_EIP: + env->sysenter_eip = entry->data; + break; + case MSR_STAR: + env->star = entry->data; + break; +#ifdef TARGET_X86_64 + case MSR_CSTAR: + env->cstar = entry->data; + break; + case MSR_KERNELGSBASE: + env->kernelgsbase = entry->data; + break; + case MSR_FMASK: + env->fmask = entry->data; + break; + case MSR_LSTAR: + env->lstar = entry->data; + break; +#endif + case MSR_IA32_TSC: + env->tsc = entry->data; + break; + default: + printf("Warning unknown msr index 0x%x\n", entry->index); + return 1; + } + return 0; +} + +#ifdef TARGET_X86_64 +#define MSR_COUNT 9 +#else +#define MSR_COUNT 5 +#endif + +static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs) +{ + lhs->selector = rhs->selector; + lhs->base = rhs->base; + lhs->limit = rhs->limit; + lhs->type = 3; + lhs->present = 1; + lhs->dpl = 3; + lhs->db = 0; + lhs->s = 1; + lhs->l = 0; + lhs->g = 0; + lhs->avl = 0; + 
lhs->unusable = 0; +} + +static void set_seg(struct kvm_segment *lhs, const SegmentCache *rhs) +{ + unsigned flags = rhs->flags; + lhs->selector = rhs->selector; + lhs->base = rhs->base; + lhs->limit = rhs->limit; + lhs->type = (flags >> DESC_TYPE_SHIFT) & 15; + lhs->present = (flags & DESC_P_MASK) != 0; + lhs->dpl = rhs->selector & 3; + lhs->db = (flags >> DESC_B_SHIFT) & 1; + lhs->s = (flags & DESC_S_MASK) != 0; + lhs->l = (flags >> DESC_L_SHIFT) & 1; + lhs->g = (flags & DESC_G_MASK) != 0; + lhs->avl = (flags & DESC_AVL_MASK) != 0; + lhs->unusable = 0; +} + +static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs) +{ + lhs->selector = rhs->selector; + lhs->base = rhs->base; + lhs->limit = rhs->limit; + lhs->flags = + (rhs->type << DESC_TYPE_SHIFT) + | (rhs->present * DESC_P_MASK) + | (rhs->dpl << DESC_DPL_SHIFT) + | (rhs->db << DESC_B_SHIFT) + | (rhs->s * DESC_S_MASK) + | (rhs->l << DESC_L_SHIFT) + | (rhs->g * DESC_G_MASK) + | (rhs->avl * DESC_AVL_MASK); +} + +/* the reset values of qemu are not compatible to SVM + * this function is used to fix the segment descriptor values */ +static void fix_realmode_dataseg(struct kvm_segment *seg) +{ + seg->type = 0x02; + seg->present = 1; + seg->s = 1; +} + +static void load_regs(CPUState *env) +{ + struct kvm_regs regs; + struct kvm_sregs sregs; + struct kvm_msr_entry msrs[MSR_COUNT]; + int rc, n; + + /* hack: save env */ + if (!saved_env[0]) + saved_env[0] = env; + + regs.rax = env->regs[R_EAX]; + regs.rbx = env->regs[R_EBX]; + regs.rcx = env->regs[R_ECX]; + regs.rdx = env->regs[R_EDX]; + regs.rsi = env->regs[R_ESI]; + regs.rdi = env->regs[R_EDI]; + regs.rsp = env->regs[R_ESP]; + regs.rbp = env->regs[R_EBP]; +#ifdef TARGET_X86_64 + regs.r8 = env->regs[8]; + regs.r9 = env->regs[9]; + regs.r10 = env->regs[10]; + regs.r11 = env->regs[11]; + regs.r12 = env->regs[12]; + regs.r13 = env->regs[13]; + regs.r14 = env->regs[14]; + regs.r15 = env->regs[15]; +#endif + + regs.rflags = env->eflags; + regs.rip = env->eip; + 
+ kvm_set_regs(kvm_context, 0, &regs); + + memcpy(sregs.interrupt_bitmap, env->kvm_interrupt_bitmap, sizeof(sregs.interrupt_bitmap)); + + if ((env->eflags & VM_MASK)) { + set_v8086_seg(&sregs.cs, &env->segs[R_CS]); + set_v8086_seg(&sregs.ds, &env->segs[R_DS]); + set_v8086_seg(&sregs.es, &env->segs[R_ES]); + set_v8086_seg(&sregs.fs, &env->segs[R_FS]); + set_v8086_seg(&sregs.gs, &env->segs[R_GS]); + set_v8086_seg(&sregs.ss, &env->segs[R_SS]); + } else { + set_seg(&sregs.cs, &env->segs[R_CS]); + set_seg(&sregs.ds, &env->segs[R_DS]); + set_seg(&sregs.es, &env->segs[R_ES]); + set_seg(&sregs.fs, &env->segs[R_FS]); + set_seg(&sregs.gs, &env->segs[R_GS]); + set_seg(&sregs.ss, &env->segs[R_SS]); + + if (env->cr[0] & CR0_PE_MASK) { + /* force ss cpl to cs cpl */ + sregs.ss.selector = (sregs.ss.selector & ~3) | + (sregs.cs.selector & 3); + sregs.ss.dpl = sregs.ss.selector & 3; + } + + if (!(env->cr[0] & CR0_PG_MASK)) { + fix_realmode_dataseg(&sregs.ds); + fix_realmode_dataseg(&sregs.es); + fix_realmode_dataseg(&sregs.fs); + fix_realmode_dataseg(&sregs.gs); + fix_realmode_dataseg(&sregs.ss); + } + } + + set_seg(&sregs.tr, &env->tr); + set_seg(&sregs.ldt, &env->ldt); + + sregs.idt.limit = env->idt.limit; + sregs.idt.base = env->idt.base; + sregs.gdt.limit = env->gdt.limit; + sregs.gdt.base = env->gdt.base; + + sregs.cr0 = env->cr[0]; + sregs.cr2 = env->cr[2]; + sregs.cr3 = env->cr[3]; + sregs.cr4 = env->cr[4]; + + sregs.apic_base = cpu_get_apic_base(env); + sregs.efer = env->efer; + sregs.cr8 = cpu_get_apic_tpr(env); + + kvm_set_sregs(kvm_context, 0, &sregs); + + /* msrs */ + n = 0; + set_msr_entry(&msrs[n++], MSR_IA32_SYSENTER_CS, env->sysenter_cs); + set_msr_entry(&msrs[n++], MSR_IA32_SYSENTER_ESP, env->sysenter_esp); + set_msr_entry(&msrs[n++], MSR_IA32_SYSENTER_EIP, env->sysenter_eip); + if (kvm_has_msr_star) + set_msr_entry(&msrs[n++], MSR_STAR, env->star); + set_msr_entry(&msrs[n++], MSR_IA32_TSC, env->tsc); +#ifdef TARGET_X86_64 + set_msr_entry(&msrs[n++], MSR_CSTAR,
env->cstar); + set_msr_entry(&msrs[n++], MSR_KERNELGSBASE, env->kernelgsbase); + set_msr_entry(&msrs[n++], MSR_FMASK, env->fmask); + set_msr_entry(&msrs[n++], MSR_LSTAR , env->lstar); +#endif + + rc = kvm_set_msrs(kvm_context, 0, msrs, n); + if (rc == -1) + perror("kvm_set_msrs FAILED"); +} + + +static void save_regs(CPUState *env) +{ + struct kvm_regs regs; + struct kvm_sregs sregs; + struct kvm_msr_entry msrs[MSR_COUNT]; + uint32_t hflags; + uint32_t i, n, rc; + + kvm_get_regs(kvm_context, 0, &regs); + + env->regs[R_EAX] = regs.rax; + env->regs[R_EBX] = regs.rbx; + env->regs[R_ECX] = regs.rcx; + env->regs[R_EDX] = regs.rdx; + env->regs[R_ESI] = regs.rsi; + env->regs[R_EDI] = regs.rdi; + env->regs[R_ESP] = regs.rsp; + env->regs[R_EBP] = regs.rbp; +#ifdef TARGET_X86_64 + env->regs[8] = regs.r8; + env->regs[9] = regs.r9; + env->regs[10] = regs.r10; + env->regs[11] = regs.r11; + env->regs[12] = regs.r12; + env->regs[13] = regs.r13; + env->regs[14] = regs.r14; + env->regs[15] = regs.r15; +#endif + + env->eflags = regs.rflags; + env->eip = regs.rip; + + kvm_get_sregs(kvm_context, 0, &sregs); + + memcpy(env->kvm_interrupt_bitmap, sregs.interrupt_bitmap, sizeof(env->kvm_interrupt_bitmap)); + + get_seg(&env->segs[R_CS], &sregs.cs); + get_seg(&env->segs[R_DS], &sregs.ds); + get_seg(&env->segs[R_ES], &sregs.es); + get_seg(&env->segs[R_FS], &sregs.fs); + get_seg(&env->segs[R_GS], &sregs.gs); + get_seg(&env->segs[R_SS], &sregs.ss); + + get_seg(&env->tr, &sregs.tr); + get_seg(&env->ldt, &sregs.ldt); + + env->idt.limit = sregs.idt.limit; + env->idt.base = sregs.idt.base; + env->gdt.limit = sregs.gdt.limit; + env->gdt.base = sregs.gdt.base; + + env->cr[0] = sregs.cr0; + env->cr[2] = sregs.cr2; + env->cr[3] = sregs.cr3; + env->cr[4] = sregs.cr4; + + cpu_set_apic_base(env, sregs.apic_base); + + env->efer = sregs.efer; + cpu_set_apic_tpr(env, sregs.cr8); + +#define HFLAG_COPY_MASK ~( \ + HF_CPL_MASK | HF_PE_MASK | HF_MP_MASK | HF_EM_MASK | \ + HF_TS_MASK | HF_TF_MASK | HF_VM_MASK |
HF_IOPL_MASK | \ + HF_OSFXSR_MASK | HF_LMA_MASK | HF_CS32_MASK | \ + HF_SS32_MASK | HF_CS64_MASK | HF_ADDSEG_MASK) + + + + hflags = (env->segs[R_CS].flags >> DESC_DPL_SHIFT) & HF_CPL_MASK; + hflags |= (env->cr[0] & CR0_PE_MASK) << (HF_PE_SHIFT - CR0_PE_SHIFT); + hflags |= (env->cr[0] << (HF_MP_SHIFT - CR0_MP_SHIFT)) & + (HF_MP_MASK | HF_EM_MASK | HF_TS_MASK); + hflags |= (env->eflags & (HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK)); + hflags |= (env->cr[4] & CR4_OSFXSR_MASK) << + (HF_OSFXSR_SHIFT - CR4_OSFXSR_SHIFT); + + if (env->efer & MSR_EFER_LMA) { + hflags |= HF_LMA_MASK; + } + + if ((hflags & HF_LMA_MASK) && (env->segs[R_CS].flags & DESC_L_MASK)) { + hflags |= HF_CS32_MASK | HF_SS32_MASK | HF_CS64_MASK; + } else { + hflags |= (env->segs[R_CS].flags & DESC_B_MASK) >> + (DESC_B_SHIFT - HF_CS32_SHIFT); + hflags |= (env->segs[R_SS].flags & DESC_B_MASK) >> + (DESC_B_SHIFT - HF_SS32_SHIFT); + if (!(env->cr[0] & CR0_PE_MASK) || + (env->eflags & VM_MASK) || + !(hflags & HF_CS32_MASK)) { + hflags |= HF_ADDSEG_MASK; + } else { + hflags |= ((env->segs[R_DS].base | + env->segs[R_ES].base | + env->segs[R_SS].base) != 0) << + HF_ADDSEG_SHIFT; + } + } + env->hflags = (env->hflags & HFLAG_COPY_MASK) | hflags; + CC_SRC = env->eflags & (CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C); + DF = 1 - (2 * ((env->eflags >> 10) & 1)); + CC_OP = CC_OP_EFLAGS; + env->eflags &= ~(DF_MASK | CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C); + + tlb_flush(env, 1); + + /* msrs */ + n = 0; + msrs[n++].index = MSR_IA32_SYSENTER_CS; + msrs[n++].index = MSR_IA32_SYSENTER_ESP; + msrs[n++].index = MSR_IA32_SYSENTER_EIP; + if (kvm_has_msr_star) + msrs[n++].index = MSR_STAR; + msrs[n++].index = MSR_IA32_TSC; +#ifdef TARGET_X86_64 + msrs[n++].index = MSR_CSTAR; + msrs[n++].index = MSR_KERNELGSBASE; + msrs[n++].index = MSR_FMASK; + msrs[n++].index = MSR_LSTAR; +#endif + rc = kvm_get_msrs(kvm_context, 0, msrs, n); + if (rc == -1) { + perror("kvm_get_msrs FAILED"); + } + else { + n = rc; /* actual number of MSRs */ + for 
(i=0 ; i<n; i++) { + switch (msrs[i].index) { + case MSR_IA32_SYSENTER_CS: + env->sysenter_cs = msrs[i].data; + break; + case MSR_IA32_SYSENTER_ESP: + env->sysenter_esp = msrs[i].data; + break; + case MSR_IA32_SYSENTER_EIP: + env->sysenter_eip = msrs[i].data; + break; + case MSR_STAR: + env->star = msrs[i].data; + break; + case MSR_IA32_TSC: + env->tsc = msrs[i].data; + break; +#ifdef TARGET_X86_64 + case MSR_CSTAR: + env->cstar = msrs[i].data; + break; + case MSR_KERNELGSBASE: + env->kernelgsbase = msrs[i].data; + break; + case MSR_FMASK: + env->fmask = msrs[i].data; + break; + case MSR_LSTAR: + env->lstar = msrs[i].data; + break; +#endif + } + } + } +} + + +static int try_push_interrupts(void *opaque) +{ + CPUState **envs = opaque, *env; + env = envs[0]; + + if (env->ready_for_interrupt_injection && + (env->interrupt_request & CPU_INTERRUPT_HARD) && + (env->eflags & IF_MASK)) { + env->interrupt_request &= ~CPU_INTERRUPT_HARD; + // for now using cpu 0 + kvm_inject_irq(kvm_context, 0, cpu_get_pic_interrupt(env)); + } + + return (env->interrupt_request & CPU_INTERRUPT_HARD) != 0; +} + +static void post_kvm_run(void *opaque, struct kvm_run *kvm_run) +{ + CPUState **envs = opaque, *env; + env = envs[0]; + + env->eflags = (kvm_run->if_flag) ? env->eflags | IF_MASK:env->eflags & ~IF_MASK; + env->ready_for_interrupt_injection = kvm_run->ready_for_interrupt_injection; + cpu_set_apic_tpr(env, kvm_run->cr8); + cpu_set_apic_base(env, kvm_run->apic_base); +} + +static void pre_kvm_run(void *opaque, struct kvm_run *kvm_run) +{ + CPUState **envs = opaque, *env; + env = envs[0]; + + kvm_run->cr8 = cpu_get_apic_tpr(env); +} + +void kvm_load_registers(CPUState *env) +{ + load_regs(env); +} + +void kvm_save_registers(CPUState *env) +{ + save_regs(env); +} + +int kvm_cpu_exec(CPUState *env) +{ + int r; + int pending = (!env->ready_for_interrupt_injection || + ((env->interrupt_request & CPU_INTERRUPT_HARD) && + (env->eflags & IF_MASK))); + + if (!pending && (env->interrupt_request & CPU_INTERRUPT_EXIT)) { + env->interrupt_request &= ~CPU_INTERRUPT_EXIT; + env->exception_index = EXCP_INTERRUPT; + cpu_loop_exit(); + } + + + if (!saved_env[0]) + saved_env[0] = env; + + r = kvm_run(kvm_context, 0); + if (r < 0) { + printf("kvm_run returned %d\n", r); + exit(1); + } + + return 0; +} + + +static int kvm_cpuid(void *opaque, uint64_t *rax, uint64_t *rbx, + uint64_t *rcx, uint64_t *rdx) +{ + CPUState **envs = opaque; + CPUState *saved_env; + uint32_t eax = *rax; + + saved_env = env; + env = envs[0]; + + env->regs[R_EAX] = *rax; + env->regs[R_EBX] = *rbx; + env->regs[R_ECX] = *rcx; + env->regs[R_EDX] = *rdx; + helper_cpuid(); + *rdx =
env->regs[R_EDX]; + *rcx = env->regs[R_ECX]; + *rbx = env->regs[R_EBX]; + *rax = env->regs[R_EAX]; + // don't report long mode/syscall/nx if no native support + if (eax == 0x80000001) { + unsigned long h_eax = eax, h_edx; + + + // push/pop hack to workaround gcc 3 register pressure trouble + asm ( +#ifdef __x86_64__ + "push %%rbx; push %%rcx; cpuid; pop %%rcx; pop %%rbx" +#else + "push %%ebx; push %%ecx; cpuid; pop %%ecx; pop %%ebx" +#endif + : "+a"(h_eax), "=d"(h_edx)); + + // long mode + if ((h_edx & 0x20000000) == 0) + *rdx &= ~0x20000000ull; + // syscall + if ((h_edx & 0x00000800) == 0) + *rdx &= ~0x00000800ull; + // nx + if ((h_edx & 0x00100000) == 0) + *rdx &= ~0x00100000ull; + } + env = saved_env; + return 0; +} + +static int kvm_debug(void *opaque, int vcpu) +{ + CPUState **envs = opaque; + + env = envs[0]; + env->exception_index = EXCP_DEBUG; + return 1; +} + +static int kvm_inb(void *opaque, uint16_t addr, uint8_t *data) +{ + *data = cpu_inb(0, addr); + return 0; +} + +static int kvm_inw(void *opaque, uint16_t addr, uint16_t *data) +{ + *data = cpu_inw(0, addr); + return 0; +} + +static int kvm_inl(void *opaque, uint16_t addr, uint32_t *data) +{ + *data = cpu_inl(0, addr); + return 0; +} + +static int kvm_outb(void *opaque, uint16_t addr, uint8_t data) +{ + if (addr == 0xb2 && data == 0) { + struct kvm_regs regs; + + kvm_get_regs(kvm_context, 0, &regs); + + /* hack around smm entry: kvm doesn't emulate smm at this time */ + if (regs.rip == 0x409f4) + regs.rip += 0x4b; + kvm_set_regs(kvm_context, 0, &regs); + + return 0; + } + cpu_outb(0, addr, data); + return 0; +} + +static int kvm_outw(void *opaque, uint16_t addr, uint16_t data) +{ + cpu_outw(0, addr, data); + return 0; +} + +static int kvm_outl(void *opaque, uint16_t addr, uint32_t data) +{ + cpu_outl(0, addr, data); + return 0; +} + +static int kvm_readb(void *opaque, uint64_t addr, uint8_t *data) +{ + *data = ldub_phys(addr); + return 0; +} + +static int kvm_readw(void *opaque, uint64_t addr, uint16_t
*data) +{ + *data = lduw_phys(addr); + return 0; +} + +static int kvm_readl(void *opaque, uint64_t addr, uint32_t *data) +{ + *data = ldl_phys(addr); + return 0; +} + +static int kvm_readq(void *opaque, uint64_t addr, uint64_t *data) +{ + *data = ldq_phys(addr); + return 0; +} + +static int kvm_writeb(void *opaque, uint64_t addr, uint8_t data) +{ + stb_phys(addr, data); + return 0; +} + +static int kvm_writew(void *opaque, uint64_t addr, uint16_t data) +{ + stw_phys(addr, data); + return 0; +} + +static int kvm_writel(void *opaque, uint64_t addr, uint32_t data) +{ + stl_phys(addr, data); + return 0; +} + +static int kvm_writeq(void *opaque, uint64_t addr, uint64_t data) +{ + stq_phys(addr, data); + return 0; +} + +static int kvm_io_window(void *opaque) +{ + return 1; +} + + +static int kvm_halt(void *opaque, int vcpu) +{ + CPUState **envs = opaque, *env; + + env = envs[0]; + if (!((env->interrupt_request & CPU_INTERRUPT_HARD) && + (env->eflags & IF_MASK))) { + env->hflags |= HF_HALTED_MASK; + env->exception_index = EXCP_HLT; + } + + return 1; +} + +static int kvm_shutdown(void *opaque, int vcpu) +{ + qemu_system_reset_request(); + return 1; +} + +static struct kvm_callbacks qemu_kvm_ops = { + .cpuid = kvm_cpuid, + .debug = kvm_debug, + .inb = kvm_inb, + .inw = kvm_inw, + .inl = kvm_inl, + .outb = kvm_outb, + .outw = kvm_outw, + .outl = kvm_outl, + .readb = kvm_readb, + .readw = kvm_readw, + .readl = kvm_readl, + .readq = kvm_readq, + .writeb = kvm_writeb, + .writew = kvm_writew, + .writel = kvm_writel, + .writeq = kvm_writeq, + .halt = kvm_halt, + .shutdown = kvm_shutdown, + .io_window = kvm_io_window, + .try_push_interrupts = try_push_interrupts, + .post_kvm_run = post_kvm_run, + .pre_kvm_run = pre_kvm_run, +}; + +int kvm_qemu_init() +{ + /* Try to initialize kvm */ + kvm_context = kvm_init(&qemu_kvm_ops, saved_env); + if (!kvm_context) { + return -1; + } + + return 0; +} + +int kvm_qemu_create_context(void) +{ + int i; + + if (kvm_create(kvm_context, 
phys_ram_size, (void**)&phys_ram_base) < 0) { + kvm_qemu_destroy(); + return -1; + } + kvm_msr_list = kvm_get_msr_list(kvm_context); + if (!kvm_msr_list) { + kvm_qemu_destroy(); + return -1; + } + for (i = 0; i < kvm_msr_list->nmsrs; ++i) + if (kvm_msr_list->indices[i] == MSR_STAR) + kvm_has_msr_star = 1; + return 0; +} + +void kvm_qemu_destroy(void) +{ + kvm_finalize(kvm_context); +} + +int kvm_update_debugger(CPUState *env) +{ + struct kvm_debug_guest dbg; + int i; + + dbg.enabled = 0; + if (env->nb_breakpoints || env->singlestep_enabled) { + dbg.enabled = 1; + for (i = 0; i < 4 && i < env->nb_breakpoints; ++i) { + dbg.breakpoints[i].enabled = 1; + dbg.breakpoints[i].address = env->breakpoints[i]; + } + dbg.singlestep = env->singlestep_enabled; + } + return kvm_guest_debug(kvm_context, 0, &dbg); +} + + +/* + * dirty pages logging + */ +/* FIXME: use unsigned long pointer instead of unsigned char */ +unsigned char *kvm_dirty_bitmap = NULL; +int kvm_physical_memory_set_dirty_tracking(int enable) +{ + int r = 0; + + if (!kvm_allowed) + return 0; + + if (enable) { + if (!kvm_dirty_bitmap) { + unsigned bitmap_size = BITMAP_SIZE(phys_ram_size); + kvm_dirty_bitmap = qemu_malloc(bitmap_size); + if (kvm_dirty_bitmap == NULL) { + perror("Failed to allocate dirty pages bitmap"); + r=-1; + } + else { + r = kvm_dirty_pages_log_enable_all(kvm_context); + } + } + } + else { + if (kvm_dirty_bitmap) { + r = kvm_dirty_pages_log_reset(kvm_context); + qemu_free(kvm_dirty_bitmap); + kvm_dirty_bitmap = NULL; + } + } + return r; +} + +/* get kvm's dirty pages bitmap and update qemu's */ +int kvm_get_dirty_pages_log_slot(int slot, + unsigned char *bitmap, + unsigned int offset, + unsigned int len) +{ + int r; + unsigned int i, j, n=0; + unsigned char c; + unsigned page_number, addr, addr1; + + memset(bitmap, 0, len); + r = kvm_get_dirty_pages(kvm_context, slot, bitmap); + if (r) + return r; + + /* + * bitmap-traveling is faster than memory-traveling (for addr...) 
+ * especially when most of the memory is not dirty. + */ + for (i=0; i<len; i++) { + c = bitmap[i]; + while (c>0) { + j = ffsl(c) - 1; + c &= ~(1u<<j); + page_number = i * 8 + j; + addr1 = page_number * TARGET_PAGE_SIZE; + addr = offset + addr1; + cpu_physical_memory_set_dirty(addr); + n++; + } + } + return 0; +} --- qemu-0.9.0/qemu-kvm.h +++ qemu-0.9.0/qemu-kvm.h @@ -0,0 +1,20 @@ +#ifndef QEMU_KVM_H +#define QEMU_KVM_H + +#include "kvmctl.h" + +extern kvm_context_t kvm_context; + +int kvm_qemu_init(void); +int kvm_qemu_create_context(void); +void kvm_qemu_destroy(void); +void kvm_load_registers(CPUState *env); +void kvm_save_registers(CPUState *env); +int kvm_cpu_exec(CPUState *env); +int kvm_update_debugger(CPUState *env); +int kvm_physical_memory_set_dirty_tracking(int enable); +int kvm_get_dirty_pages_log_slot(int slot, unsigned char *bitmap, + unsigned int offset, unsigned int len); + +#define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1)) +#define BITMAP_SIZE(m) (ALIGN(((m)>>TARGET_PAGE_BITS), HOST_LONG_BITS) / 8) +#endif --- qemu-0.9.0/sdl.c +++ qemu-0.9.0/sdl.c @@ -214,6 +214,11 @@ { char buf[1024]; strcpy(buf, "QEMU"); +#if USE_KVM + if (kvm_allowed) { + strcat(buf, "/KVM"); + } +#endif if (!vm_running) { strcat(buf, " [Stopped]"); } --- qemu-0.9.0/target-i386/cpu.h +++ qemu-0.9.0/target-i386/cpu.h @@ -161,12 +161,17 @@ #define HF_MP_MASK (1 << HF_MP_SHIFT) #define HF_EM_MASK (1 << HF_EM_SHIFT) #define HF_TS_MASK (1 << HF_TS_SHIFT) +#define HF_IOPL_MASK (3 << HF_IOPL_SHIFT) #define HF_LMA_MASK (1 << HF_LMA_SHIFT) #define HF_CS64_MASK (1 << HF_CS64_SHIFT) #define HF_OSFXSR_MASK (1 << HF_OSFXSR_SHIFT) +#define HF_VM_MASK (1 << HF_VM_SHIFT) #define HF_HALTED_MASK (1 << HF_HALTED_SHIFT) #define HF_SMM_MASK (1 << HF_SMM_SHIFT) +#define CR0_PE_SHIFT 0 +#define CR0_MP_SHIFT 1 + #define CR0_PE_MASK (1 << 0) #define CR0_MP_MASK (1 << 1) #define CR0_EM_MASK (1 << 2) @@ -185,7 +190,8 @@ #define CR4_PAE_MASK (1 << 5) #define CR4_PGE_MASK (1 << 7) #define CR4_PCE_MASK (1 << 8) -#define CR4_OSFXSR_MASK (1 << 9) +#define CR4_OSFXSR_SHIFT 9 +#define CR4_OSFXSR_MASK (1 << CR4_OSFXSR_SHIFT) #define CR4_OSXMMEXCPT_MASK (1 << 10) #define PG_PRESENT_BIT 0 @@ -496,6 +502,10 @@ target_ulong kernelgsbase; #endif +#ifdef USE_KVM + uint64_t tsc; /* time stamp counter */ + uint8_t ready_for_interrupt_injection; +#endif uint64_t pat; /* temporary data for USE_CODE_COPY mode */ @@ -534,6 +544,13 @@ int kqemu_enabled; int last_io_time; #endif + +#ifdef USE_KVM +#define BITS_PER_LONG (8 * sizeof (long)) +#define NR_IRQ_WORDS (256/ BITS_PER_LONG) + unsigned long kvm_interrupt_bitmap[NR_IRQ_WORDS]; +#endif + /* in order to simplify APIC support, we leave this pointer to the user */ struct APICState *apic_state; --- qemu-0.9.0/target-i386/helper.c +++ qemu-0.9.0/target-i386/helper.c @@ -18,7 +18,9 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA */ #include "exec.h" - +#ifdef USE_KVM +extern int kvm_allowed; +#endif //#define DEBUG_PCALL #if 0 @@ -839,6 +841,13 @@ uint32_t e1, e2, e3, ss; target_ulong old_eip, esp, offset; +#ifdef USE_KVM + if (kvm_allowed) { + printf("%s: unexpect\n", __FUNCTION__); + exit(-1); + } +#endif + has_error_code = 0; if (!is_int && !is_hw) { switch(intno) { @@ -1122,6 +1131,12 @@ int dpl, cpl; uint32_t e2; +#ifdef USE_KVM + if (kvm_allowed) { + printf("%s: unexpect\n", __FUNCTION__); + exit(-1); + } +#endif dt = &env->idt; ptr = dt->base + (intno * 8); e2 = ldl_kernel(ptr + 4); @@ -1147,6 +1162,12 @@ void do_interrupt(int intno, int is_int, int error_code, target_ulong next_eip, int is_hw) { +#ifdef USE_KVM + if (kvm_allowed) { + printf("%s: unexpect\n", __FUNCTION__); + exit(-1); + } +#endif if (loglevel & CPU_LOG_INT) { if ((env->cr[0] & CR0_PE_MASK)) { static int count; @@ -1958,6 +1979,12 @@ cpu_x86_load_seg_cache(env, R_CS, (new_cs & 0xfffc) | cpl, get_seg_base(e1, e2), limit, e2); EIP = new_eip; +#ifdef USE_KVM + if (kvm_allowed && (e2 & DESC_L_MASK)) { + env->exception_index = -1; + cpu_loop_exit(); + } +#endif } else { /* jump to call or task gate */ dpl = (e2 >> DESC_DPL_SHIFT) & 3; --- qemu-0.9.0/target-i386/helper2.c +++ qemu-0.9.0/target-i386/helper2.c @@ -143,6 +143,9 @@ #ifdef USE_KQEMU kqemu_init(env); #endif +#ifdef USE_KVM + env->ready_for_interrupt_injection = 1; +#endif return env; } --- qemu-0.9.0/vl.c +++ qemu-0.9.0/vl.c @@ -88,6 +88,10 @@ #include "exec-all.h" +#if USE_KVM +#include "qemu-kvm.h" +#endif + #define DEFAULT_NETWORK_SCRIPT "/etc/qemu-ifup" #ifdef __sun__ #define SMBD_COMMAND "/usr/sfw/sbin/smbd" @@ -149,6 +153,9 @@ int graphic_depth = 15; int full_screen = 0; int no_quit = 0; +#ifdef USE_KVM +CharDriverState *vmchannel_hds[MAX_VMCHANNEL_DEVICES]; +#endif CharDriverState *serial_hds[MAX_SERIAL_PORTS]; CharDriverState *parallel_hds[MAX_PARALLEL_PORTS]; #ifdef TARGET_I386 @@ -5407,6 +5414,15 @@ /* XXX: compute hflags from scratch, 
except for CPL and IIF */ env->hflags = hflags; tlb_flush(env, 1); +#ifdef USE_KVM + if (kvm_allowed) { + for (i = 0; i < NR_IRQ_WORDS ; i++) { + qemu_get_betls(f, &env->kvm_interrupt_bitmap[i]); + } + qemu_get_be64s(f, &env->tsc); + kvm_load_registers(env); + } +#endif return 0; } @@ -5555,6 +5571,10 @@ if (qemu_get_be32(f) != phys_ram_size) return -EINVAL; for(i = 0; i < phys_ram_size; i+= TARGET_PAGE_SIZE) { +#ifdef USE_KVM + if (kvm_allowed && (i>=0xa0000) && (i<0xc0000)) /* do not access video-addresses */ + continue; +#endif ret = ram_get_page(f, phys_ram_base + i, TARGET_PAGE_SIZE); if (ret) return ret; @@ -5689,6 +5709,10 @@ target_ulong addr; for (addr = 0; addr < phys_ram_size; addr += TARGET_PAGE_SIZE) { +#ifdef USE_KVM + if (kvm_allowed && (addr>=0xa0000) && (addr<0xc0000)) /* do not access video-addresses */ + continue; +#endif if (cpu_physical_memory_get_dirty(addr, MIGRATION_DIRTY_FLAG)) { qemu_put_be32(f, addr); qemu_put_buffer(f, phys_ram_base + addr, TARGET_PAGE_SIZE); @@ -6237,6 +6261,10 @@ if (reset_requested) { reset_requested = 0; qemu_system_reset(); +#ifdef USE_KVM + if (kvm_allowed) + kvm_load_registers(env); +#endif ret = EXCP_INTERRUPT; } if (powerdown_requested) { @@ -6354,6 +6382,9 @@ "\n" "Debug/Expert options:\n" "-monitor dev redirect the monitor to char device 'dev'\n" +#ifdef USE_KVM + "-vmchannel di:DI,dev redirect the hypercall device with device id DI, to char device 'dev'\n" +#endif "-serial dev redirect the serial port to char device 'dev'\n" "-parallel dev redirect the parallel port to char device 'dev'\n" "-pidfile file Write PID to 'file'\n" @@ -6368,6 +6399,9 @@ "-kernel-kqemu enable KQEMU full virtualization (default is user mode only)\n" "-no-kqemu disable KQEMU kernel module usage\n" #endif +#ifdef USE_KVM + "-no-kvm disable KVM hardware virtualization\n" +#endif #ifdef USE_CODE_COPY "-no-code-copy disable code copy acceleration\n" #endif @@ -6448,6 +6482,9 @@ QEMU_OPTION_g, QEMU_OPTION_std_vga, QEMU_OPTION_monitor, 
+#ifdef USE_KVM + QEMU_OPTION_vmchannel, +#endif QEMU_OPTION_serial, QEMU_OPTION_parallel, QEMU_OPTION_loadvm, @@ -6462,6 +6499,7 @@ QEMU_OPTION_smp, QEMU_OPTION_vnc, QEMU_OPTION_no_acpi, + QEMU_OPTION_no_kvm, QEMU_OPTION_no_reboot, QEMU_OPTION_daemonize, QEMU_OPTION_option_rom, @@ -6524,12 +6562,18 @@ { "no-kqemu", 0, QEMU_OPTION_no_kqemu }, { "kernel-kqemu", 0, QEMU_OPTION_kernel_kqemu }, #endif +#ifdef USE_KVM + { "no-kvm", 0, QEMU_OPTION_no_kvm }, +#endif #if defined(TARGET_PPC) || defined(TARGET_SPARC) { "g", 1, QEMU_OPTION_g }, #endif { "localtime", 0, QEMU_OPTION_localtime }, { "std-vga", 0, QEMU_OPTION_std_vga }, { "monitor", 1, QEMU_OPTION_monitor }, +#ifdef USE_KVM + { "vmchannel", 1, QEMU_OPTION_vmchannel }, +#endif { "serial", 1, QEMU_OPTION_serial }, { "parallel", 1, QEMU_OPTION_parallel }, { "loadvm", HAS_ARG, QEMU_OPTION_loadvm }, @@ -6787,6 +6831,10 @@ const char *r, *optarg; CharDriverState *monitor_hd; char monitor_device[128]; +#ifdef USE_KVM + char vmchannel_devices[MAX_VMCHANNEL_DEVICES][128]; + int vmchannel_device_index; +#endif char serial_devices[MAX_SERIAL_PORTS][128]; int serial_device_index; char parallel_devices[MAX_PARALLEL_PORTS][128]; @@ -6858,6 +6906,12 @@ translation = BIOS_ATA_TRANSLATION_AUTO; pstrcpy(monitor_device, sizeof(monitor_device), "vc"); +#ifdef USE_KVM + for(i = 0; i < MAX_VMCHANNEL_DEVICES; i++) + vmchannel_devices[i][0] = '\0'; + vmchannel_device_index = 0; +#endif + pstrcpy(serial_devices[0], sizeof(serial_devices[0]), "vc"); for(i = 1; i < MAX_SERIAL_PORTS; i++) serial_devices[i][0] = '\0'; @@ -7145,6 +7199,17 @@ case QEMU_OPTION_monitor: pstrcpy(monitor_device, sizeof(monitor_device), optarg); break; +#ifdef USE_KVM + case QEMU_OPTION_vmchannel: + if (vmchannel_device_index >= MAX_VMCHANNEL_DEVICES) { + fprintf(stderr, "qemu: too many vmchannel devices\n"); + exit(1); + } + pstrcpy(vmchannel_devices[vmchannel_device_index], + sizeof(vmchannel_devices[0]), optarg); + vmchannel_device_index++; + break; +#endif case 
QEMU_OPTION_serial: if (serial_device_index >= MAX_SERIAL_PORTS) { fprintf(stderr, "qemu: too many serial ports\n"); @@ -7193,6 +7258,11 @@ kqemu_allowed = 2; break; #endif +#ifdef USE_KVM + case QEMU_OPTION_no_kvm: + kvm_allowed = 0; + break; +#endif case QEMU_OPTION_usb: usb_enabled = 1; break; @@ -7283,6 +7353,15 @@ } #endif +#if USE_KVM + if (kvm_allowed) { + if (kvm_qemu_init() < 0) { + fprintf(stderr, "Could not initialize KVM, will disable KVM support\n"); + kvm_allowed = 0; + } + } +#endif + #ifdef USE_KQEMU if (smp_cpus > 1) kqemu_allowed = 0; @@ -7362,11 +7441,28 @@ phys_ram_size += ret; } +#if USE_KVM + /* Initialize kvm */ + if (kvm_allowed) { + phys_ram_size += KVM_EXTRA_PAGES * 4096; + if (kvm_qemu_create_context() < 0) { + fprintf(stderr, "Could not create KVM context\n"); + exit(1); + } + } else { + phys_ram_base = qemu_vmalloc(phys_ram_size); + if (!phys_ram_base) { + fprintf(stderr, "Could not allocate physical memory\n"); + exit(1); + } + } +#else phys_ram_base = qemu_vmalloc(phys_ram_size); if (!phys_ram_base) { fprintf(stderr, "Could not allocate physical memory\n"); exit(1); } +#endif /* we always create the cdrom drive, even if no disk is there */ bdrv_init(); @@ -7445,6 +7541,33 @@ } monitor_init(monitor_hd, !nographic); +#ifdef USE_KVM + for(i = 0; i < MAX_VMCHANNEL_DEVICES; i++) { + const char *devname = vmchannel_devices[i]; + if (devname[0] != '\0' && strcmp(devname, "none")) { + int devid; + char *termn; + + if (strstart(devname, "di:", &devname)) { + devid = strtol(devname, &termn, 16); + devname = termn + 1; + } + else { + fprintf(stderr, "qemu: could not find vmchannel device id '%s'\n", + devname); + exit(1); + } + vmchannel_hds[i] = qemu_chr_open(devname); + if (!vmchannel_hds[i]) { + fprintf(stderr, "qemu: could not open vmchannel device '%s'\n", + devname); + exit(1); + } + vmchannel_init(vmchannel_hds[i], devid, i); + } + } +#endif + for(i = 0; i < MAX_SERIAL_PORTS; i++) { const char *devname = serial_devices[i]; if (devname[0] 
!= '\0' && strcmp(devname, "none")) { --- qemu-0.9.0/vl.h +++ qemu-0.9.0/vl.h @@ -157,6 +157,7 @@ extern int graphic_depth; extern const char *keyboard_layout; extern int kqemu_allowed; +extern int kvm_allowed; extern int win2k_install_hack; extern int usb_enabled; extern int smp_cpus; @@ -177,6 +178,10 @@ #define BIOS_SIZE ((256 + 64) * 1024) #endif +#if USE_KVM +#define KVM_EXTRA_PAGES 3 +#endif + /* keyboard/mouse support */ #define MOUSE_EVENT_LBUTTON 0x01 @@ -342,6 +347,10 @@ CharDriverState *text_console_init(DisplayState *ds); void console_select(unsigned int index); +/* vmchannel devices */ + +#define MAX_VMCHANNEL_DEVICES 4 + /* serial ports */ #define MAX_SERIAL_PORTS 4 @@ -1220,6 +1229,11 @@ typedef struct ADBDevice ADBDevice; +/* hypercall.c */ + +void pci_hypercall_init(PCIBus *bus); +void vmchannel_init(CharDriverState *hd, uint32_t deviceid, uint32_t index); + /* buf = NULL means polling */ typedef int ADBDeviceRequest(ADBDevice *d, uint8_t *buf_out, const uint8_t *buf, int len);