| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | /*
 | 
					
						
							|  |  |  |  * Copyright (c) 2003-2004 Fabrice Bellard | 
					
						
							|  |  |  |  * Copyright (c) 2019 Red Hat, Inc. | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * Permission is hereby granted, free of charge, to any person obtaining a copy | 
					
						
							|  |  |  |  * of this software and associated documentation files (the "Software"), to deal | 
					
						
							|  |  |  |  * in the Software without restriction, including without limitation the rights | 
					
						
							|  |  |  |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | 
					
						
							|  |  |  |  * copies of the Software, and to permit persons to whom the Software is | 
					
						
							|  |  |  |  * furnished to do so, subject to the following conditions: | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * The above copyright notice and this permission notice shall be included in | 
					
						
							|  |  |  |  * all copies or substantial portions of the Software. | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | 
					
						
							|  |  |  |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | 
					
						
							|  |  |  |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | 
					
						
							|  |  |  |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | 
					
						
							|  |  |  |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | 
					
						
							|  |  |  |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | 
					
						
							|  |  |  |  * THE SOFTWARE. | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | #include "qemu/osdep.h"
 | 
					
						
							|  |  |  | #include "qemu/error-report.h"
 | 
					
						
							|  |  |  | #include "qemu/option.h"
 | 
					
						
							|  |  |  | #include "qemu/cutils.h"
 | 
					
						
							|  |  |  | #include "qemu/units.h"
 | 
					
						
							| 
									
										
										
										
											2020-10-28 07:36:57 -04:00
										 |  |  | #include "qemu/datadir.h"
 | 
					
						
							| 
									
										
										
										
											2022-07-21 14:56:36 +02:00
										 |  |  | #include "qemu/guest-random.h"
 | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | #include "qapi/error.h"
 | 
					
						
							|  |  |  | #include "qapi/qmp/qerror.h"
 | 
					
						
							|  |  |  | #include "qapi/qapi-visit-common.h"
 | 
					
						
							| 
									
										
											  
											
												vl: Add sgx compound properties to expose SGX EPC sections to guest
Because SGX EPC is enumerated through CPUID, EPC "devices" need to be
realized prior to realizing the vCPUs themselves, i.e. long before
generic devices are parsed and realized.  From a virtualization
perspective, the CPUID aspect also means that EPC sections cannot be
hotplugged without paravirtualizing the guest kernel (hardware does
not support hotplugging as EPC sections must be locked down during
pre-boot to provide EPC's security properties).
So even though EPC sections could be realized through the generic
-devices command, they need to be created much earlier for them to
actually be usable by the guest.  Place all EPC sections in a
contiguous block, somewhat arbitrarily starting after RAM above 4g.
Ensuring EPC is in a contiguous region simplifies calculations, e.g.
device memory base, PCI hole, etc..., allows dynamic calculation of the
total EPC size, e.g. exposing EPC to guests does not require -maxmem,
and last but not least allows all of EPC to be enumerated in a single
ACPI entry, which is expected by some kernels, e.g. Windows 7 and 8.
The new compound properties command for sgx like below:
 ......
 -object memory-backend-epc,id=mem1,size=28M,prealloc=on \
 -object memory-backend-epc,id=mem2,size=10M \
 -M sgx-epc.0.memdev=mem1,sgx-epc.1.memdev=mem2
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Yang Zhong <yang.zhong@intel.com>
Message-Id: <20210719112136.57018-6-yang.zhong@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
											
										 
											2021-09-28 10:40:58 +02:00
										 |  |  | #include "qapi/clone-visitor.h"
 | 
					
						
							|  |  |  | #include "qapi/qapi-visit-machine.h"
 | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | #include "qapi/visitor.h"
 | 
					
						
							|  |  |  | #include "sysemu/qtest.h"
 | 
					
						
							| 
									
										
										
										
											2020-10-28 02:23:19 +00:00
										 |  |  | #include "sysemu/whpx.h"
 | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | #include "sysemu/numa.h"
 | 
					
						
							|  |  |  | #include "sysemu/replay.h"
 | 
					
						
							| 
									
										
											  
											
												x86: return modified setup_data only if read as memory, not as file
If setup_data is being read into a specific memory location, then
generally the setup_data address parameter is read first, so that the
caller knows where to read it into. In that case, we should return
setup_data containing the absolute addresses that are hard coded and
determined a priori. This is the case when kernels are loaded by BIOS,
for example. In contrast, when setup_data is read as a file, then we
shouldn't modify setup_data, since the absolute address will be wrong by
definition. This is the case when OVMF loads the image.
This allows setup_data to be used like normal, without crashing when EFI
tries to use it.
(As a small development note, strangely, fw_cfg_add_file_callback() was
exported but fw_cfg_add_bytes_callback() wasn't, so this makes that
consistent.)
Cc: Gerd Hoffmann <kraxel@redhat.com>
Cc: Laurent Vivier <laurent@vivier.eu>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Maydell <peter.maydell@linaro.org>
Cc: Philippe Mathieu-Daudé <f4bug@amsat.org>
Cc: Richard Henderson <richard.henderson@linaro.org>
Suggested-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Message-Id: <20220921093134.2936487-1-Jason@zx2c4.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
											
										 
											2022-09-21 11:31:31 +02:00
										 |  |  | #include "sysemu/reset.h"
 | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | #include "sysemu/sysemu.h"
 | 
					
						
							| 
									
										
										
										
											2020-08-19 13:17:19 +02:00
										 |  |  | #include "sysemu/cpu-timers.h"
 | 
					
						
							| 
									
										
										
										
											2022-03-14 14:25:41 +00:00
										 |  |  | #include "sysemu/xen.h"
 | 
					
						
							| 
									
										
										
										
											2019-12-12 14:14:40 +01:00
										 |  |  | #include "trace.h"
 | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | #include "hw/i386/x86.h"
 | 
					
						
							|  |  |  | #include "target/i386/cpu.h"
 | 
					
						
							|  |  |  | #include "hw/i386/topology.h"
 | 
					
						
							|  |  |  | #include "hw/i386/fw_cfg.h"
 | 
					
						
							| 
									
										
										
										
											2019-12-12 17:15:43 +01:00
										 |  |  | #include "hw/intc/i8259.h"
 | 
					
						
							| 
									
										
										
										
											2020-09-15 14:09:03 +02:00
										 |  |  | #include "hw/rtc/mc146818rtc.h"
 | 
					
						
							| 
									
										
										
										
											2021-10-07 18:17:07 +02:00
										 |  |  | #include "target/i386/sev.h"
 | 
					
						
							| 
									
										
											  
											
												x86: don't let decompressed kernel image clobber setup_data
The setup_data links are appended to the compressed kernel image. Since
the kernel image is typically loaded at 0x100000, setup_data lives at
`0x100000 + compressed_size`, which does not get relocated during the
kernel's boot process.
The kernel typically decompresses the image starting at address
0x1000000 (note: there's one more zero there than the compressed image
above). This usually is fine for most kernels.
However, if the compressed image is actually quite large, then
setup_data will live at a `0x100000 + compressed_size` that extends into
the decompressed zone at 0x1000000. In other words, if compressed_size
is larger than `0x1000000 - 0x100000`, then the decompression step will
clobber setup_data, resulting in crashes.
Visually, what happens now is that QEMU appends setup_data to the kernel
image:
          kernel image            setup_data
   |--------------------------||----------------|
0x100000                  0x100000+l1     0x100000+l1+l2
The problem is that this decompresses to 0x1000000 (one more zero). So
if l1 is > (0x1000000-0x100000), then this winds up looking like:
          kernel image            setup_data
   |--------------------------||----------------|
0x100000                  0x100000+l1     0x100000+l1+l2
                                 d e c o m p r e s s e d   k e r n e l
                     |-------------------------------------------------------------|
                0x1000000                                                     0x1000000+l3
The decompressed kernel seemingly overwriting the compressed kernel
image isn't a problem, because that gets relocated to a higher address
early on in the boot process, at the end of startup_64. setup_data,
however, stays in the same place, since those links are self referential
and nothing fixes them up.  So the decompressed kernel clobbers it.
Fix this by appending setup_data to the cmdline blob rather than the
kernel image blob, which remains at a lower address that won't get
clobbered.
This could have been done by overwriting the initrd blob instead, but
that poses big difficulties, such as no longer being able to use memory
mapped files for initrd, hurting performance, and, more importantly, the
initrd address calculation is hard coded in qboot, and it always grows
down rather than up, which means lots of brittle semantics would have to
be changed around, incurring more complexity. In contrast, using cmdline
is simple and doesn't interfere with anything.
The microvm machine has a gross hack where it fiddles with fw_cfg data
after the fact. So this hack is updated to account for this appending,
by reserving some bytes.
Fixup-by: Michael S. Tsirkin <mst@redhat.com>
Cc: x86@kernel.org
Cc: Philippe Mathieu-Daudé <philmd@linaro.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Message-Id: <20221230220725.618763-1-Jason@zx2c4.com>
Message-ID: <20230128061015-mutt-send-email-mst@kernel.org>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Tested-by: Eric Biggers <ebiggers@google.com>
Tested-by: Mathias Krause <minipli@grsecurity.net>
											
										 
											2022-12-30 23:07:25 +01:00
										 |  |  | #include "hw/i386/microvm.h"
 | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | #include "hw/acpi/cpu_hotplug.h"
 | 
					
						
							| 
									
										
										
										
											2019-12-12 14:14:40 +01:00
										 |  |  | #include "hw/irq.h"
 | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | #include "hw/nmi.h"
 | 
					
						
							|  |  |  | #include "hw/loader.h"
 | 
					
						
							|  |  |  | #include "multiboot.h"
 | 
					
						
							|  |  |  | #include "elf.h"
 | 
					
						
							|  |  |  | #include "standard-headers/asm-x86/bootparam.h"
 | 
					
						
							| 
									
										
										
										
											2020-02-03 11:42:03 +01:00
										 |  |  | #include CONFIG_DEVICES
 | 
					
						
							| 
									
										
										
										
											2020-12-12 16:55:08 +01:00
										 |  |  | #include "kvm/kvm_i386.h"
 | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | /* Physical Address of PVH entry point read from kernel ELF NOTE */ | 
					
						
							|  |  |  | static size_t pvh_start_addr; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-03-11 17:52:52 -05:00
										 |  |  | inline void init_topo_info(X86CPUTopoInfo *topo_info, | 
					
						
							|  |  |  |                            const X86MachineState *x86ms) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     MachineState *ms = MACHINE(x86ms); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-17 17:53:03 +02:00
										 |  |  |     topo_info->dies_per_pkg = ms->smp.dies; | 
					
						
							| 
									
										
										
										
											2020-03-11 17:52:52 -05:00
										 |  |  |     topo_info->cores_per_die = ms->smp.cores; | 
					
						
							|  |  |  |     topo_info->threads_per_core = ms->smp.threads; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | /*
 | 
					
						
							|  |  |  |  * Calculates initial APIC ID for a specific CPU index | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * Currently we need to be able to calculate the APIC ID from the CPU index | 
					
						
							|  |  |  |  * alone (without requiring a CPU object), as the QEMU<->Seabios interfaces have | 
					
						
							|  |  |  |  * no concept of "CPU index", and the NUMA tables on fw_cfg need the APIC ID of | 
					
						
							|  |  |  |  * all CPUs up to max_cpus. | 
					
						
							|  |  |  |  */ | 
					
						
							| 
									
										
										
										
											2019-09-30 17:26:29 +02:00
										 |  |  | uint32_t x86_cpu_apic_id_from_index(X86MachineState *x86ms, | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  |                                     unsigned int cpu_index) | 
					
						
							|  |  |  | { | 
					
						
							| 
									
										
										
										
											2020-03-11 17:52:52 -05:00
										 |  |  |     X86CPUTopoInfo topo_info; | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-03-11 17:52:52 -05:00
										 |  |  |     init_topo_info(&topo_info, x86ms); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-02-28 08:16:34 -05:00
										 |  |  |     return x86_apicid_from_cpu_idx(&topo_info, cpu_index); | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-09-30 17:26:29 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | void x86_cpu_new(X86MachineState *x86ms, int64_t apic_id, Error **errp) | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | { | 
					
						
							| 
									
										
										
										
											2020-06-30 11:03:45 +02:00
										 |  |  |     Object *cpu = object_new(MACHINE(x86ms)->cpu_type); | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-07-07 18:06:04 +02:00
										 |  |  |     if (!object_property_set_uint(cpu, "apic-id", apic_id, errp)) { | 
					
						
							| 
									
										
										
										
											2020-06-30 11:03:45 +02:00
										 |  |  |         goto out; | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2020-07-07 18:06:04 +02:00
										 |  |  |     qdev_realize(DEVICE(cpu), NULL, errp); | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-06-30 11:03:45 +02:00
										 |  |  | out: | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  |     object_unref(cpu); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-09-30 17:26:29 +02:00
										 |  |  | void x86_cpus_init(X86MachineState *x86ms, int default_cpu_version) | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | { | 
					
						
							|  |  |  |     int i; | 
					
						
							|  |  |  |     const CPUArchIdList *possible_cpus; | 
					
						
							| 
									
										
										
										
											2019-09-30 17:26:29 +02:00
										 |  |  |     MachineState *ms = MACHINE(x86ms); | 
					
						
							|  |  |  |     MachineClass *mc = MACHINE_GET_CLASS(x86ms); | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-09-30 17:26:29 +02:00
										 |  |  |     x86_cpu_set_default_version(default_cpu_version); | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     /*
 | 
					
						
							|  |  |  |      * Calculates the limit to CPU APIC ID values | 
					
						
							|  |  |  |      * | 
					
						
							|  |  |  |      * Limit for the APIC ID value, so that all | 
					
						
							| 
									
										
										
										
											2019-09-30 17:26:29 +02:00
										 |  |  |      * CPU APIC IDs are < x86ms->apic_id_limit. | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  |      * | 
					
						
							|  |  |  |      * This is used for FW_CFG_MAX_CPUS. See comments on fw_cfg_arch_create(). | 
					
						
							|  |  |  |      */ | 
					
						
							| 
									
										
										
										
											2019-09-30 17:26:29 +02:00
										 |  |  |     x86ms->apic_id_limit = x86_cpu_apic_id_from_index(x86ms, | 
					
						
							| 
									
										
										
										
											2019-10-22 09:39:50 +02:00
										 |  |  |                                                       ms->smp.max_cpus - 1) + 1; | 
					
						
							| 
									
										
										
										
											2022-03-14 14:25:41 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     /*
 | 
					
						
							|  |  |  |      * Can we support APIC ID 255 or higher? | 
					
						
							|  |  |  |      * | 
					
						
							|  |  |  |      * Under Xen: yes. | 
					
						
							|  |  |  |      * With userspace emulated lapic: no | 
					
						
							|  |  |  |      * With KVM's in-kernel lapic: only if X2APIC API is enabled. | 
					
						
							|  |  |  |      */ | 
					
						
							|  |  |  |     if (x86ms->apic_id_limit > 255 && !xen_enabled() && | 
					
						
							|  |  |  |         (!kvm_irqchip_in_kernel() || !kvm_enable_x2apic())) { | 
					
						
							|  |  |  |         error_report("current -smp configuration requires kernel " | 
					
						
							|  |  |  |                      "irqchip and X2APIC API support."); | 
					
						
							|  |  |  |         exit(EXIT_FAILURE); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-08-25 10:52:46 +08:00
										 |  |  |     if (kvm_enabled()) { | 
					
						
							|  |  |  |         kvm_set_max_apic_id(x86ms->apic_id_limit); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  |     possible_cpus = mc->possible_cpu_arch_ids(ms); | 
					
						
							|  |  |  |     for (i = 0; i < ms->smp.cpus; i++) { | 
					
						
							| 
									
										
										
										
											2019-09-30 17:26:29 +02:00
										 |  |  |         x86_cpu_new(x86ms, possible_cpus->cpus[i].arch_id, &error_fatal); | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2020-09-15 14:09:03 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | void x86_rtc_set_cpus_count(ISADevice *rtc, uint16_t cpus_count) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     if (cpus_count > 0xff) { | 
					
						
							|  |  |  |         /*
 | 
					
						
							|  |  |  |          * If the number of CPUs can't be represented in 8 bits, the | 
					
						
							|  |  |  |          * BIOS must use "FW_CFG_NB_CPUS". Set RTC field to 0 just | 
					
						
							|  |  |  |          * to make old BIOSes fail more predictably. | 
					
						
							|  |  |  |          */ | 
					
						
							|  |  |  |         rtc_set_memory(rtc, 0x5f, 0); | 
					
						
							|  |  |  |     } else { | 
					
						
							|  |  |  |         rtc_set_memory(rtc, 0x5f, cpus_count - 1); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static int x86_apic_cmp(const void *a, const void *b) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |    CPUArchId *apic_a = (CPUArchId *)a; | 
					
						
							|  |  |  |    CPUArchId *apic_b = (CPUArchId *)b; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |    return apic_a->arch_id - apic_b->arch_id; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /*
 | 
					
						
							|  |  |  |  * returns pointer to CPUArchId descriptor that matches CPU's apic_id | 
					
						
							|  |  |  |  * in ms->possible_cpus->cpus, if ms->possible_cpus->cpus has no | 
					
						
							|  |  |  |  * entry corresponding to CPU's apic_id returns NULL. | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | CPUArchId *x86_find_cpu_slot(MachineState *ms, uint32_t id, int *idx) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     CPUArchId apic_id, *found_cpu; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     apic_id.arch_id = id; | 
					
						
							|  |  |  |     found_cpu = bsearch(&apic_id, ms->possible_cpus->cpus, | 
					
						
							|  |  |  |         ms->possible_cpus->len, sizeof(*ms->possible_cpus->cpus), | 
					
						
							|  |  |  |         x86_apic_cmp); | 
					
						
							|  |  |  |     if (found_cpu && idx) { | 
					
						
							|  |  |  |         *idx = found_cpu - ms->possible_cpus->cpus; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     return found_cpu; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | void x86_cpu_plug(HotplugHandler *hotplug_dev, | 
					
						
							|  |  |  |                   DeviceState *dev, Error **errp) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     CPUArchId *found_cpu; | 
					
						
							|  |  |  |     Error *local_err = NULL; | 
					
						
							|  |  |  |     X86CPU *cpu = X86_CPU(dev); | 
					
						
							|  |  |  |     X86MachineState *x86ms = X86_MACHINE(hotplug_dev); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (x86ms->acpi_dev) { | 
					
						
							|  |  |  |         hotplug_handler_plug(x86ms->acpi_dev, dev, &local_err); | 
					
						
							|  |  |  |         if (local_err) { | 
					
						
							|  |  |  |             goto out; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     /* increment the number of CPUs */ | 
					
						
							|  |  |  |     x86ms->boot_cpus++; | 
					
						
							|  |  |  |     if (x86ms->rtc) { | 
					
						
							|  |  |  |         x86_rtc_set_cpus_count(x86ms->rtc, x86ms->boot_cpus); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     if (x86ms->fw_cfg) { | 
					
						
							|  |  |  |         fw_cfg_modify_i16(x86ms->fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     found_cpu = x86_find_cpu_slot(MACHINE(x86ms), cpu->apic_id, NULL); | 
					
						
							|  |  |  |     found_cpu->cpu = OBJECT(dev); | 
					
						
							|  |  |  | out: | 
					
						
							|  |  |  |     error_propagate(errp, local_err); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | void x86_cpu_unplug_request_cb(HotplugHandler *hotplug_dev, | 
					
						
							|  |  |  |                                DeviceState *dev, Error **errp) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     int idx = -1; | 
					
						
							|  |  |  |     X86CPU *cpu = X86_CPU(dev); | 
					
						
							|  |  |  |     X86MachineState *x86ms = X86_MACHINE(hotplug_dev); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (!x86ms->acpi_dev) { | 
					
						
							|  |  |  |         error_setg(errp, "CPU hot unplug not supported without ACPI"); | 
					
						
							|  |  |  |         return; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     x86_find_cpu_slot(MACHINE(x86ms), cpu->apic_id, &idx); | 
					
						
							|  |  |  |     assert(idx != -1); | 
					
						
							|  |  |  |     if (idx == 0) { | 
					
						
							|  |  |  |         error_setg(errp, "Boot CPU is unpluggable"); | 
					
						
							|  |  |  |         return; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     hotplug_handler_unplug_request(x86ms->acpi_dev, dev, | 
					
						
							|  |  |  |                                    errp); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | void x86_cpu_unplug_cb(HotplugHandler *hotplug_dev, | 
					
						
							|  |  |  |                        DeviceState *dev, Error **errp) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     CPUArchId *found_cpu; | 
					
						
							|  |  |  |     Error *local_err = NULL; | 
					
						
							|  |  |  |     X86CPU *cpu = X86_CPU(dev); | 
					
						
							|  |  |  |     X86MachineState *x86ms = X86_MACHINE(hotplug_dev); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     hotplug_handler_unplug(x86ms->acpi_dev, dev, &local_err); | 
					
						
							|  |  |  |     if (local_err) { | 
					
						
							|  |  |  |         goto out; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     found_cpu = x86_find_cpu_slot(MACHINE(x86ms), cpu->apic_id, NULL); | 
					
						
							|  |  |  |     found_cpu->cpu = NULL; | 
					
						
							|  |  |  |     qdev_unrealize(dev); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     /* decrement the number of CPUs */ | 
					
						
							|  |  |  |     x86ms->boot_cpus--; | 
					
						
							|  |  |  |     /* Update the number of CPUs in CMOS */ | 
					
						
							|  |  |  |     x86_rtc_set_cpus_count(x86ms->rtc, x86ms->boot_cpus); | 
					
						
							|  |  |  |     fw_cfg_modify_i16(x86ms->fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus); | 
					
						
							|  |  |  |  out: | 
					
						
							|  |  |  |     error_propagate(errp, local_err); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | void x86_cpu_pre_plug(HotplugHandler *hotplug_dev, | 
					
						
							|  |  |  |                       DeviceState *dev, Error **errp) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     int idx; | 
					
						
							|  |  |  |     CPUState *cs; | 
					
						
							|  |  |  |     CPUArchId *cpu_slot; | 
					
						
							|  |  |  |     X86CPUTopoIDs topo_ids; | 
					
						
							|  |  |  |     X86CPU *cpu = X86_CPU(dev); | 
					
						
							|  |  |  |     CPUX86State *env = &cpu->env; | 
					
						
							|  |  |  |     MachineState *ms = MACHINE(hotplug_dev); | 
					
						
							|  |  |  |     X86MachineState *x86ms = X86_MACHINE(hotplug_dev); | 
					
						
							|  |  |  |     unsigned int smp_cores = ms->smp.cores; | 
					
						
							|  |  |  |     unsigned int smp_threads = ms->smp.threads; | 
					
						
							|  |  |  |     X86CPUTopoInfo topo_info; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (!object_dynamic_cast(OBJECT(cpu), ms->cpu_type)) { | 
					
						
							|  |  |  |         error_setg(errp, "Invalid CPU type, expected cpu type: '%s'", | 
					
						
							|  |  |  |                    ms->cpu_type); | 
					
						
							|  |  |  |         return; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-09-23 05:46:41 -04:00
										 |  |  |     if (x86ms->acpi_dev) { | 
					
						
							|  |  |  |         Error *local_err = NULL; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         hotplug_handler_pre_plug(HOTPLUG_HANDLER(x86ms->acpi_dev), dev, | 
					
						
							|  |  |  |                                  &local_err); | 
					
						
							|  |  |  |         if (local_err) { | 
					
						
							|  |  |  |             error_propagate(errp, local_err); | 
					
						
							|  |  |  |             return; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-09-15 14:09:03 +02:00
										 |  |  |     init_topo_info(&topo_info, x86ms); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-17 17:53:03 +02:00
										 |  |  |     env->nr_dies = ms->smp.dies; | 
					
						
							| 
									
										
										
										
											2020-09-15 14:09:03 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     /*
 | 
					
						
							|  |  |  |      * If APIC ID is not set, | 
					
						
							|  |  |  |      * set it based on socket/die/core/thread properties. | 
					
						
							|  |  |  |      */ | 
					
						
							|  |  |  |     if (cpu->apic_id == UNASSIGNED_APIC_ID) { | 
					
						
							|  |  |  |         int max_socket = (ms->smp.max_cpus - 1) / | 
					
						
							| 
									
										
										
										
											2021-06-17 17:53:03 +02:00
										 |  |  |                                 smp_threads / smp_cores / ms->smp.dies; | 
					
						
							| 
									
										
										
										
											2020-09-15 14:09:03 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |         /*
 | 
					
						
							|  |  |  |          * die-id was optional in QEMU 4.0 and older, so keep it optional | 
					
						
							|  |  |  |          * if there's only one die per socket. | 
					
						
							|  |  |  |          */ | 
					
						
							| 
									
										
										
										
											2021-06-17 17:53:03 +02:00
										 |  |  |         if (cpu->die_id < 0 && ms->smp.dies == 1) { | 
					
						
							| 
									
										
										
										
											2020-09-15 14:09:03 +02:00
										 |  |  |             cpu->die_id = 0; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if (cpu->socket_id < 0) { | 
					
						
							|  |  |  |             error_setg(errp, "CPU socket-id is not set"); | 
					
						
							|  |  |  |             return; | 
					
						
							|  |  |  |         } else if (cpu->socket_id > max_socket) { | 
					
						
							|  |  |  |             error_setg(errp, "Invalid CPU socket-id: %u must be in range 0:%u", | 
					
						
							|  |  |  |                        cpu->socket_id, max_socket); | 
					
						
							|  |  |  |             return; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         if (cpu->die_id < 0) { | 
					
						
							|  |  |  |             error_setg(errp, "CPU die-id is not set"); | 
					
						
							|  |  |  |             return; | 
					
						
							| 
									
										
										
										
											2021-06-17 17:53:03 +02:00
										 |  |  |         } else if (cpu->die_id > ms->smp.dies - 1) { | 
					
						
							| 
									
										
										
										
											2020-09-15 14:09:03 +02:00
										 |  |  |             error_setg(errp, "Invalid CPU die-id: %u must be in range 0:%u", | 
					
						
							| 
									
										
										
										
											2021-06-17 17:53:03 +02:00
										 |  |  |                        cpu->die_id, ms->smp.dies - 1); | 
					
						
							| 
									
										
										
										
											2020-09-15 14:09:03 +02:00
										 |  |  |             return; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         if (cpu->core_id < 0) { | 
					
						
							|  |  |  |             error_setg(errp, "CPU core-id is not set"); | 
					
						
							|  |  |  |             return; | 
					
						
							|  |  |  |         } else if (cpu->core_id > (smp_cores - 1)) { | 
					
						
							|  |  |  |             error_setg(errp, "Invalid CPU core-id: %u must be in range 0:%u", | 
					
						
							|  |  |  |                        cpu->core_id, smp_cores - 1); | 
					
						
							|  |  |  |             return; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         if (cpu->thread_id < 0) { | 
					
						
							|  |  |  |             error_setg(errp, "CPU thread-id is not set"); | 
					
						
							|  |  |  |             return; | 
					
						
							|  |  |  |         } else if (cpu->thread_id > (smp_threads - 1)) { | 
					
						
							|  |  |  |             error_setg(errp, "Invalid CPU thread-id: %u must be in range 0:%u", | 
					
						
							|  |  |  |                        cpu->thread_id, smp_threads - 1); | 
					
						
							|  |  |  |             return; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         topo_ids.pkg_id = cpu->socket_id; | 
					
						
							|  |  |  |         topo_ids.die_id = cpu->die_id; | 
					
						
							|  |  |  |         topo_ids.core_id = cpu->core_id; | 
					
						
							|  |  |  |         topo_ids.smt_id = cpu->thread_id; | 
					
						
							|  |  |  |         cpu->apic_id = x86_apicid_from_topo_ids(&topo_info, &topo_ids); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     cpu_slot = x86_find_cpu_slot(MACHINE(x86ms), cpu->apic_id, &idx); | 
					
						
							|  |  |  |     if (!cpu_slot) { | 
					
						
							|  |  |  |         MachineState *ms = MACHINE(x86ms); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         x86_topo_ids_from_apicid(cpu->apic_id, &topo_info, &topo_ids); | 
					
						
							|  |  |  |         error_setg(errp, | 
					
						
							|  |  |  |             "Invalid CPU [socket: %u, die: %u, core: %u, thread: %u] with" | 
					
						
							|  |  |  |             " APIC ID %" PRIu32 ", valid index range 0:%d", | 
					
						
							|  |  |  |             topo_ids.pkg_id, topo_ids.die_id, topo_ids.core_id, topo_ids.smt_id, | 
					
						
							|  |  |  |             cpu->apic_id, ms->possible_cpus->len - 1); | 
					
						
							|  |  |  |         return; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (cpu_slot->cpu) { | 
					
						
							|  |  |  |         error_setg(errp, "CPU[%d] with APIC ID %" PRIu32 " exists", | 
					
						
							|  |  |  |                    idx, cpu->apic_id); | 
					
						
							|  |  |  |         return; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     /* if 'address' properties socket-id/core-id/thread-id are not set, set them
 | 
					
						
							|  |  |  |      * so that machine_query_hotpluggable_cpus would show correct values | 
					
						
							|  |  |  |      */ | 
					
						
							|  |  |  |     /* TODO: move socket_id/core_id/thread_id checks into x86_cpu_realizefn()
 | 
					
						
							|  |  |  |      * once -smp refactoring is complete and there will be CPU private | 
					
						
							|  |  |  |      * CPUState::nr_cores and CPUState::nr_threads fields instead of globals */ | 
					
						
							|  |  |  |     x86_topo_ids_from_apicid(cpu->apic_id, &topo_info, &topo_ids); | 
					
						
							|  |  |  |     if (cpu->socket_id != -1 && cpu->socket_id != topo_ids.pkg_id) { | 
					
						
							|  |  |  |         error_setg(errp, "property socket-id: %u doesn't match set apic-id:" | 
					
						
							|  |  |  |             " 0x%x (socket-id: %u)", cpu->socket_id, cpu->apic_id, | 
					
						
							|  |  |  |             topo_ids.pkg_id); | 
					
						
							|  |  |  |         return; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     cpu->socket_id = topo_ids.pkg_id; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (cpu->die_id != -1 && cpu->die_id != topo_ids.die_id) { | 
					
						
							|  |  |  |         error_setg(errp, "property die-id: %u doesn't match set apic-id:" | 
					
						
							|  |  |  |             " 0x%x (die-id: %u)", cpu->die_id, cpu->apic_id, topo_ids.die_id); | 
					
						
							|  |  |  |         return; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     cpu->die_id = topo_ids.die_id; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (cpu->core_id != -1 && cpu->core_id != topo_ids.core_id) { | 
					
						
							|  |  |  |         error_setg(errp, "property core-id: %u doesn't match set apic-id:" | 
					
						
							|  |  |  |             " 0x%x (core-id: %u)", cpu->core_id, cpu->apic_id, | 
					
						
							|  |  |  |             topo_ids.core_id); | 
					
						
							|  |  |  |         return; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     cpu->core_id = topo_ids.core_id; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (cpu->thread_id != -1 && cpu->thread_id != topo_ids.smt_id) { | 
					
						
							|  |  |  |         error_setg(errp, "property thread-id: %u doesn't match set apic-id:" | 
					
						
							|  |  |  |             " 0x%x (thread-id: %u)", cpu->thread_id, cpu->apic_id, | 
					
						
							|  |  |  |             topo_ids.smt_id); | 
					
						
							|  |  |  |         return; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     cpu->thread_id = topo_ids.smt_id; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX) && | 
					
						
							|  |  |  |         !kvm_hv_vpindex_settable()) { | 
					
						
							|  |  |  |         error_setg(errp, "kernel doesn't allow setting HyperV VP_INDEX"); | 
					
						
							|  |  |  |         return; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     cs = CPU(cpu); | 
					
						
							|  |  |  |     cs->cpu_index = idx; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     numa_cpu_pre_plug(cpu_slot, dev, errp); | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | CpuInstanceProperties | 
					
						
							|  |  |  | x86_cpu_index_to_props(MachineState *ms, unsigned cpu_index) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     MachineClass *mc = MACHINE_GET_CLASS(ms); | 
					
						
							|  |  |  |     const CPUArchIdList *possible_cpus = mc->possible_cpu_arch_ids(ms); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     assert(cpu_index < possible_cpus->len); | 
					
						
							|  |  |  |     return possible_cpus->cpus[cpu_index].props; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | int64_t x86_get_default_cpu_node_id(const MachineState *ms, int idx) | 
					
						
							|  |  |  | { | 
					
						
							| 
									
										
										
										
											2020-03-03 13:56:58 -06:00
										 |  |  |    X86CPUTopoIDs topo_ids; | 
					
						
							| 
									
										
										
										
											2019-10-22 09:39:50 +02:00
										 |  |  |    X86MachineState *x86ms = X86_MACHINE(ms); | 
					
						
							| 
									
										
										
										
											2020-03-11 17:52:52 -05:00
										 |  |  |    X86CPUTopoInfo topo_info; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |    init_topo_info(&topo_info, x86ms); | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |    assert(idx < ms->possible_cpus->len); | 
					
						
							| 
									
										
										
										
											2020-08-31 13:42:23 -05:00
										 |  |  |    x86_topo_ids_from_apicid(ms->possible_cpus->cpus[idx].arch_id, | 
					
						
							|  |  |  |                             &topo_info, &topo_ids); | 
					
						
							| 
									
										
										
										
											2020-03-03 13:56:58 -06:00
										 |  |  |    return topo_ids.pkg_id % ms->numa_state->num_nodes; | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | const CPUArchIdList *x86_possible_cpu_arch_ids(MachineState *ms) | 
					
						
							|  |  |  | { | 
					
						
							| 
									
										
										
										
											2019-10-22 09:39:50 +02:00
										 |  |  |     X86MachineState *x86ms = X86_MACHINE(ms); | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  |     unsigned int max_cpus = ms->smp.max_cpus; | 
					
						
							| 
									
										
										
										
											2020-03-11 17:52:52 -05:00
										 |  |  |     X86CPUTopoInfo topo_info; | 
					
						
							|  |  |  |     int i; | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     if (ms->possible_cpus) { | 
					
						
							|  |  |  |         /*
 | 
					
						
							|  |  |  |          * make sure that max_cpus hasn't changed since the first use, i.e. | 
					
						
							|  |  |  |          * -smp hasn't been parsed after it | 
					
						
							|  |  |  |          */ | 
					
						
							|  |  |  |         assert(ms->possible_cpus->len == max_cpus); | 
					
						
							|  |  |  |         return ms->possible_cpus; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     ms->possible_cpus = g_malloc0(sizeof(CPUArchIdList) + | 
					
						
							|  |  |  |                                   sizeof(CPUArchId) * max_cpus); | 
					
						
							|  |  |  |     ms->possible_cpus->len = max_cpus; | 
					
						
							| 
									
										
										
										
											2020-03-11 17:52:52 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  |     init_topo_info(&topo_info, x86ms); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  |     for (i = 0; i < ms->possible_cpus->len; i++) { | 
					
						
							| 
									
										
										
										
											2020-03-03 13:56:58 -06:00
										 |  |  |         X86CPUTopoIDs topo_ids; | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |         ms->possible_cpus->cpus[i].type = ms->cpu_type; | 
					
						
							|  |  |  |         ms->possible_cpus->cpus[i].vcpus_count = 1; | 
					
						
							| 
									
										
										
										
											2020-08-31 13:42:23 -05:00
										 |  |  |         ms->possible_cpus->cpus[i].arch_id = | 
					
						
							|  |  |  |             x86_cpu_apic_id_from_index(x86ms, i); | 
					
						
							|  |  |  |         x86_topo_ids_from_apicid(ms->possible_cpus->cpus[i].arch_id, | 
					
						
							|  |  |  |                                  &topo_info, &topo_ids); | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  |         ms->possible_cpus->cpus[i].props.has_socket_id = true; | 
					
						
							| 
									
										
										
										
											2020-03-03 13:56:58 -06:00
										 |  |  |         ms->possible_cpus->cpus[i].props.socket_id = topo_ids.pkg_id; | 
					
						
							| 
									
										
										
										
											2021-06-17 17:53:03 +02:00
										 |  |  |         if (ms->smp.dies > 1) { | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  |             ms->possible_cpus->cpus[i].props.has_die_id = true; | 
					
						
							| 
									
										
										
										
											2020-03-03 13:56:58 -06:00
										 |  |  |             ms->possible_cpus->cpus[i].props.die_id = topo_ids.die_id; | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  |         } | 
					
						
							|  |  |  |         ms->possible_cpus->cpus[i].props.has_core_id = true; | 
					
						
							| 
									
										
										
										
											2020-03-03 13:56:58 -06:00
										 |  |  |         ms->possible_cpus->cpus[i].props.core_id = topo_ids.core_id; | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  |         ms->possible_cpus->cpus[i].props.has_thread_id = true; | 
					
						
							| 
									
										
										
										
											2020-03-03 13:56:58 -06:00
										 |  |  |         ms->possible_cpus->cpus[i].props.thread_id = topo_ids.smt_id; | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  |     } | 
					
						
							|  |  |  |     return ms->possible_cpus; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-10-22 09:39:50 +02:00
										 |  |  | static void x86_nmi(NMIState *n, int cpu_index, Error **errp) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     /* cpu index isn't used */ | 
					
						
							|  |  |  |     CPUState *cs; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     CPU_FOREACH(cs) { | 
					
						
							|  |  |  |         X86CPU *cpu = X86_CPU(cs); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if (!cpu->apic_state) { | 
					
						
							|  |  |  |             cpu_interrupt(cs, CPU_INTERRUPT_NMI); | 
					
						
							|  |  |  |         } else { | 
					
						
							|  |  |  |             apic_deliver_nmi(cpu->apic_state); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | static long get_file_size(FILE *f) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     long where, size; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     /* XXX: on Unix systems, using fstat() probably makes more sense */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     where = ftell(f); | 
					
						
							|  |  |  |     fseek(f, 0, SEEK_END); | 
					
						
							|  |  |  |     size = ftell(f); | 
					
						
							|  |  |  |     fseek(f, where, SEEK_SET); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return size; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-12-12 14:14:40 +01:00
										 |  |  | /* TSC handling */ | 
					
						
							|  |  |  | uint64_t cpu_get_tsc(CPUX86State *env) | 
					
						
							|  |  |  | { | 
					
						
							| 
									
										
										
										
											2020-07-31 12:23:42 +02:00
										 |  |  |     return cpus_get_elapsed_ticks(); | 
					
						
							| 
									
										
										
										
											2019-12-12 14:14:40 +01:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* IRQ handling */ | 
					
						
							|  |  |  | static void pic_irq_request(void *opaque, int irq, int level) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     CPUState *cs = first_cpu; | 
					
						
							|  |  |  |     X86CPU *cpu = X86_CPU(cs); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     trace_x86_pic_interrupt(irq, level); | 
					
						
							| 
									
										
										
										
											2020-10-28 02:23:19 +00:00
										 |  |  |     if (cpu->apic_state && !kvm_irqchip_in_kernel() && | 
					
						
							|  |  |  |         !whpx_apic_in_platform()) { | 
					
						
							| 
									
										
										
										
											2019-12-12 14:14:40 +01:00
										 |  |  |         CPU_FOREACH(cs) { | 
					
						
							|  |  |  |             cpu = X86_CPU(cs); | 
					
						
							|  |  |  |             if (apic_accept_pic_intr(cpu->apic_state)) { | 
					
						
							|  |  |  |                 apic_deliver_pic_intr(cpu->apic_state, level); | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } else { | 
					
						
							|  |  |  |         if (level) { | 
					
						
							|  |  |  |             cpu_interrupt(cs, CPU_INTERRUPT_HARD); | 
					
						
							|  |  |  |         } else { | 
					
						
							|  |  |  |             cpu_reset_interrupt(cs, CPU_INTERRUPT_HARD); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | qemu_irq x86_allocate_cpu_irq(void) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     return qemu_allocate_irq(pic_irq_request, NULL, 0); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | int cpu_get_pic_interrupt(CPUX86State *env) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     X86CPU *cpu = env_archcpu(env); | 
					
						
							|  |  |  |     int intno; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-28 02:23:19 +00:00
										 |  |  |     if (!kvm_irqchip_in_kernel() && !whpx_apic_in_platform()) { | 
					
						
							| 
									
										
										
										
											2019-12-12 14:14:40 +01:00
										 |  |  |         intno = apic_get_interrupt(cpu->apic_state); | 
					
						
							|  |  |  |         if (intno >= 0) { | 
					
						
							|  |  |  |             return intno; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         /* read the irq from the PIC */ | 
					
						
							|  |  |  |         if (!apic_accept_pic_intr(cpu->apic_state)) { | 
					
						
							|  |  |  |             return -1; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     intno = pic_read_irq(isa_pic); | 
					
						
							|  |  |  |     return intno; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | DeviceState *cpu_get_current_apic(void) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     if (current_cpu) { | 
					
						
							|  |  |  |         X86CPU *cpu = X86_CPU(current_cpu); | 
					
						
							|  |  |  |         return cpu->apic_state; | 
					
						
							|  |  |  |     } else { | 
					
						
							|  |  |  |         return NULL; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | void gsi_handler(void *opaque, int n, int level) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     GSIState *s = opaque; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     trace_x86_gsi_interrupt(n, level); | 
					
						
							| 
									
										
										
										
											2020-12-03 11:54:13 +01:00
										 |  |  |     switch (n) { | 
					
						
							|  |  |  |     case 0 ... ISA_NUM_IRQS - 1: | 
					
						
							|  |  |  |         if (s->i8259_irq[n]) { | 
					
						
							|  |  |  |             /* Under KVM, Kernel will forward to both PIC and IOAPIC */ | 
					
						
							|  |  |  |             qemu_set_irq(s->i8259_irq[n], level); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         /* fall through */ | 
					
						
							|  |  |  |     case ISA_NUM_IRQS ... IOAPIC_NUM_PINS - 1: | 
					
						
							|  |  |  |         qemu_set_irq(s->ioapic_irq[n], level); | 
					
						
							|  |  |  |         break; | 
					
						
							| 
									
										
										
										
											2020-12-03 11:54:14 +01:00
										 |  |  |     case IO_APIC_SECONDARY_IRQBASE | 
					
						
							|  |  |  |         ... IO_APIC_SECONDARY_IRQBASE + IOAPIC_NUM_PINS - 1: | 
					
						
							|  |  |  |         qemu_set_irq(s->ioapic2_irq[n - IO_APIC_SECONDARY_IRQBASE], level); | 
					
						
							|  |  |  |         break; | 
					
						
							| 
									
										
										
										
											2019-12-12 14:14:40 +01:00
										 |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | void ioapic_init_gsi(GSIState *gsi_state, const char *parent_name) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     DeviceState *dev; | 
					
						
							|  |  |  |     SysBusDevice *d; | 
					
						
							|  |  |  |     unsigned int i; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-12-13 12:11:45 +01:00
										 |  |  |     assert(parent_name); | 
					
						
							| 
									
										
										
										
											2019-12-12 14:14:40 +01:00
										 |  |  |     if (kvm_ioapic_in_kernel()) { | 
					
						
							| 
									
										
											  
											
												qdev: Convert uses of qdev_create() with Coccinelle
This is the transformation explained in the commit before previous.
Takes care of just one pattern that needs conversion.  More to come in
this series.
Coccinelle script:
    @ depends on !(file in "hw/arm/highbank.c")@
    expression bus, type_name, dev, expr;
    @@
    -    dev = qdev_create(bus, type_name);
    +    dev = qdev_new(type_name);
         ... when != dev = expr
    -    qdev_init_nofail(dev);
    +    qdev_realize_and_unref(dev, bus, &error_fatal);
    @@
    expression bus, type_name, dev, expr;
    identifier DOWN;
    @@
    -    dev = DOWN(qdev_create(bus, type_name));
    +    dev = DOWN(qdev_new(type_name));
         ... when != dev = expr
    -    qdev_init_nofail(DEVICE(dev));
    +    qdev_realize_and_unref(DEVICE(dev), bus, &error_fatal);
    @@
    expression bus, type_name, expr;
    identifier dev;
    @@
    -    DeviceState *dev = qdev_create(bus, type_name);
    +    DeviceState *dev = qdev_new(type_name);
         ... when != dev = expr
    -    qdev_init_nofail(dev);
    +    qdev_realize_and_unref(dev, bus, &error_fatal);
    @@
    expression bus, type_name, dev, expr, errp;
    symbol true;
    @@
    -    dev = qdev_create(bus, type_name);
    +    dev = qdev_new(type_name);
         ... when != dev = expr
    -    object_property_set_bool(OBJECT(dev), true, "realized", errp);
    +    qdev_realize_and_unref(dev, bus, errp);
    @@
    expression bus, type_name, expr, errp;
    identifier dev;
    symbol true;
    @@
    -    DeviceState *dev = qdev_create(bus, type_name);
    +    DeviceState *dev = qdev_new(type_name);
         ... when != dev = expr
    -    object_property_set_bool(OBJECT(dev), true, "realized", errp);
    +    qdev_realize_and_unref(dev, bus, errp);
The first rule exempts hw/arm/highbank.c, because it matches along two
control flow paths there, with different @type_name.  Covered by the
next commit's manual conversions.
Missing #include "qapi/error.h" added manually.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20200610053247.1583243-10-armbru@redhat.com>
[Conflicts in hw/misc/empty_slot.c and hw/sparc/leon3.c resolved]
											
										 
											2020-06-10 07:31:58 +02:00
										 |  |  |         dev = qdev_new(TYPE_KVM_IOAPIC); | 
					
						
							| 
									
										
										
										
											2019-12-12 14:14:40 +01:00
										 |  |  |     } else { | 
					
						
							| 
									
										
											  
											
												qdev: Convert uses of qdev_create() with Coccinelle
This is the transformation explained in the commit before previous.
Takes care of just one pattern that needs conversion.  More to come in
this series.
Coccinelle script:
    @ depends on !(file in "hw/arm/highbank.c")@
    expression bus, type_name, dev, expr;
    @@
    -    dev = qdev_create(bus, type_name);
    +    dev = qdev_new(type_name);
         ... when != dev = expr
    -    qdev_init_nofail(dev);
    +    qdev_realize_and_unref(dev, bus, &error_fatal);
    @@
    expression bus, type_name, dev, expr;
    identifier DOWN;
    @@
    -    dev = DOWN(qdev_create(bus, type_name));
    +    dev = DOWN(qdev_new(type_name));
         ... when != dev = expr
    -    qdev_init_nofail(DEVICE(dev));
    +    qdev_realize_and_unref(DEVICE(dev), bus, &error_fatal);
    @@
    expression bus, type_name, expr;
    identifier dev;
    @@
    -    DeviceState *dev = qdev_create(bus, type_name);
    +    DeviceState *dev = qdev_new(type_name);
         ... when != dev = expr
    -    qdev_init_nofail(dev);
    +    qdev_realize_and_unref(dev, bus, &error_fatal);
    @@
    expression bus, type_name, dev, expr, errp;
    symbol true;
    @@
    -    dev = qdev_create(bus, type_name);
    +    dev = qdev_new(type_name);
         ... when != dev = expr
    -    object_property_set_bool(OBJECT(dev), true, "realized", errp);
    +    qdev_realize_and_unref(dev, bus, errp);
    @@
    expression bus, type_name, expr, errp;
    identifier dev;
    symbol true;
    @@
    -    DeviceState *dev = qdev_create(bus, type_name);
    +    DeviceState *dev = qdev_new(type_name);
         ... when != dev = expr
    -    object_property_set_bool(OBJECT(dev), true, "realized", errp);
    +    qdev_realize_and_unref(dev, bus, errp);
The first rule exempts hw/arm/highbank.c, because it matches along two
control flow paths there, with different @type_name.  Covered by the
next commit's manual conversions.
Missing #include "qapi/error.h" added manually.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20200610053247.1583243-10-armbru@redhat.com>
[Conflicts in hw/misc/empty_slot.c and hw/sparc/leon3.c resolved]
											
										 
											2020-06-10 07:31:58 +02:00
										 |  |  |         dev = qdev_new(TYPE_IOAPIC); | 
					
						
							| 
									
										
										
										
											2019-12-12 14:14:40 +01:00
										 |  |  |     } | 
					
						
							| 
									
										
										
										
											2019-12-13 12:11:45 +01:00
										 |  |  |     object_property_add_child(object_resolve_path(parent_name, NULL), | 
					
						
							| 
									
										
											  
											
												qom: Drop parameter @errp of object_property_add() & friends
The only way object_property_add() can fail is when a property with
the same name already exists.  Since our property names are all
hardcoded, failure is a programming error, and the appropriate way to
handle it is passing &error_abort.
Same for its variants, except for object_property_add_child(), which
additionally fails when the child already has a parent.  Parentage is
also under program control, so this is a programming error, too.
We have a bit over 500 callers.  Almost half of them pass
&error_abort, slightly fewer ignore errors, one test case handles
errors, and the remaining few callers pass them to their own callers.
The previous few commits demonstrated once again that ignoring
programming errors is a bad idea.
Of the few ones that pass on errors, several violate the Error API.
The Error ** argument must be NULL, &error_abort, &error_fatal, or a
pointer to a variable containing NULL.  Passing an argument of the
latter kind twice without clearing it in between is wrong: if the
first call sets an error, it no longer points to NULL for the second
call.  ich9_pm_add_properties(), sparc32_ledma_realize(),
sparc32_dma_realize(), xilinx_axidma_realize(), xilinx_enet_realize()
are wrong that way.
When the one appropriate choice of argument is &error_abort, letting
users pick the argument is a bad idea.
Drop parameter @errp and assert the preconditions instead.
There's one exception to "duplicate property name is a programming
error": the way object_property_add() implements the magic (and
undocumented) "automatic arrayification".  Don't drop @errp there.
Instead, rename object_property_add() to object_property_try_add(),
and add the obvious wrapper object_property_add().
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20200505152926.18877-15-armbru@redhat.com>
[Two semantic rebase conflicts resolved]
											
										 
											2020-05-05 17:29:22 +02:00
										 |  |  |                               "ioapic", OBJECT(dev)); | 
					
						
							| 
									
										
										
										
											2019-12-12 14:14:40 +01:00
										 |  |  |     d = SYS_BUS_DEVICE(dev); | 
					
						
							| 
									
										
											  
											
												sysbus: Convert to sysbus_realize() etc. with Coccinelle
Convert from qdev_realize(), qdev_realize_and_unref() with null @bus
argument to sysbus_realize(), sysbus_realize_and_unref().
Coccinelle script:
    @@
    expression dev, errp;
    @@
    -    qdev_realize(DEVICE(dev), NULL, errp);
    +    sysbus_realize(SYS_BUS_DEVICE(dev), errp);
    @@
    expression sysbus_dev, dev, errp;
    @@
    +    sysbus_dev = SYS_BUS_DEVICE(dev);
    -    qdev_realize_and_unref(dev, NULL, errp);
    +    sysbus_realize_and_unref(sysbus_dev, errp);
    -    sysbus_dev = SYS_BUS_DEVICE(dev);
    @@
    expression sysbus_dev, dev, errp;
    expression expr;
    @@
         sysbus_dev = SYS_BUS_DEVICE(dev);
         ... when != dev = expr;
    -    qdev_realize_and_unref(dev, NULL, errp);
    +    sysbus_realize_and_unref(sysbus_dev, errp);
    @@
    expression dev, errp;
    @@
    -    qdev_realize_and_unref(DEVICE(dev), NULL, errp);
    +    sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), errp);
    @@
    expression dev, errp;
    @@
    -    qdev_realize_and_unref(dev, NULL, errp);
    +    sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), errp);
Whitespace changes minimized manually.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Acked-by: Alistair Francis <alistair.francis@wdc.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20200610053247.1583243-46-armbru@redhat.com>
[Conflicts in hw/misc/empty_slot.c and hw/sparc/leon3.c resolved]
											
										 
											2020-06-10 07:32:34 +02:00
										 |  |  |     sysbus_realize_and_unref(d, &error_fatal); | 
					
						
							| 
									
										
										
										
											2019-12-12 14:14:40 +01:00
										 |  |  |     sysbus_mmio_map(d, 0, IO_APIC_DEFAULT_ADDRESS); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for (i = 0; i < IOAPIC_NUM_PINS; i++) { | 
					
						
							|  |  |  |         gsi_state->ioapic_irq[i] = qdev_get_gpio_in(dev, i); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-12-03 11:54:14 +01:00
										 |  |  | DeviceState *ioapic_init_secondary(GSIState *gsi_state) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     DeviceState *dev; | 
					
						
							|  |  |  |     SysBusDevice *d; | 
					
						
							|  |  |  |     unsigned int i; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     dev = qdev_new(TYPE_IOAPIC); | 
					
						
							|  |  |  |     d = SYS_BUS_DEVICE(dev); | 
					
						
							|  |  |  |     sysbus_realize_and_unref(d, &error_fatal); | 
					
						
							|  |  |  |     sysbus_mmio_map(d, 0, IO_APIC_SECONDARY_ADDRESS); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for (i = 0; i < IOAPIC_NUM_PINS; i++) { | 
					
						
							|  |  |  |         gsi_state->ioapic2_irq[i] = qdev_get_gpio_in(dev, i); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     return dev; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-09-21 11:31:32 +02:00
										 |  |  | typedef struct SetupData { | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  |     uint64_t next; | 
					
						
							|  |  |  |     uint32_t type; | 
					
						
							|  |  |  |     uint32_t len; | 
					
						
							| 
									
										
											  
											
												misc: Replace zero-length arrays with flexible array member (automatic)
Description copied from Linux kernel commit from Gustavo A. R. Silva
(see [3]):
--v-- description start --v--
  The current codebase makes use of the zero-length array language
  extension to the C90 standard, but the preferred mechanism to
  declare variable-length types such as these ones is a flexible
  array member [1], introduced in C99:
  struct foo {
      int stuff;
      struct boo array[];
  };
  By making use of the mechanism above, we will get a compiler
  warning in case the flexible array does not occur last in the
  structure, which will help us prevent some kind of undefined
  behavior bugs from being unadvertenly introduced [2] to the
  Linux codebase from now on.
--^-- description end --^--
Do the similar housekeeping in the QEMU codebase (which uses
C99 since commit 7be41675f7cb).
All these instances of code were found with the help of the
following Coccinelle script:
  @@
  identifier s, m, a;
  type t, T;
  @@
   struct s {
      ...
      t m;
  -   T a[0];
  +   T a[];
  };
  @@
  identifier s, m, a;
  type t, T;
  @@
   struct s {
      ...
      t m;
  -   T a[0];
  +   T a[];
   } QEMU_PACKED;
[1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html
[2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=76497732932f
[3] https://git.kernel.org/pub/scm/linux/kernel/git/gustavoars/linux.git/commit/?id=17642a2fbd2c1
Inspired-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
											
										 
											2020-03-04 16:38:15 +01:00
										 |  |  |     uint8_t data[]; | 
					
						
							| 
									
										
										
										
											2022-09-21 11:31:32 +02:00
										 |  |  | } __attribute__((packed)) SetupData; | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /*
 | 
					
						
							|  |  |  |  * The entry point into the kernel for PVH boot is different from | 
					
						
							|  |  |  |  * the native entry point.  The PVH entry is defined by the x86/HVM | 
					
						
							|  |  |  |  * direct boot ABI and is available in an ELFNOTE in the kernel binary. | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * This function is passed to load_elf() when it is called from | 
					
						
							|  |  |  |  * load_elfboot() which then additionally checks for an ELF Note of | 
					
						
							|  |  |  |  * type XEN_ELFNOTE_PHYS32_ENTRY and passes it to this function to | 
					
						
							|  |  |  |  * parse the PVH entry address from the ELF Note. | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * Due to trickery in elf_opts.h, load_elf() is actually available as | 
					
						
							|  |  |  |  * load_elf32() or load_elf64() and this routine needs to be able | 
					
						
							|  |  |  |  * to deal with being called as 32 or 64 bit. | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * The address of the PVH entry point is saved to the 'pvh_start_addr' | 
					
						
							|  |  |  |  * global variable.  (although the entry point is 32-bit, the kernel | 
					
						
							|  |  |  |  * binary can be either 32-bit or 64-bit). | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | static uint64_t read_pvh_start_addr(void *arg1, void *arg2, bool is64) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     size_t *elf_note_data_addr; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     /* Check if ELF Note header passed in is valid */ | 
					
						
							|  |  |  |     if (arg1 == NULL) { | 
					
						
							|  |  |  |         return 0; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (is64) { | 
					
						
							|  |  |  |         struct elf64_note *nhdr64 = (struct elf64_note *)arg1; | 
					
						
							|  |  |  |         uint64_t nhdr_size64 = sizeof(struct elf64_note); | 
					
						
							|  |  |  |         uint64_t phdr_align = *(uint64_t *)arg2; | 
					
						
							|  |  |  |         uint64_t nhdr_namesz = nhdr64->n_namesz; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         elf_note_data_addr = | 
					
						
							|  |  |  |             ((void *)nhdr64) + nhdr_size64 + | 
					
						
							|  |  |  |             QEMU_ALIGN_UP(nhdr_namesz, phdr_align); | 
					
						
							| 
									
										
										
										
											2021-03-02 09:03:15 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         pvh_start_addr = *elf_note_data_addr; | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  |     } else { | 
					
						
							|  |  |  |         struct elf32_note *nhdr32 = (struct elf32_note *)arg1; | 
					
						
							|  |  |  |         uint32_t nhdr_size32 = sizeof(struct elf32_note); | 
					
						
							|  |  |  |         uint32_t phdr_align = *(uint32_t *)arg2; | 
					
						
							|  |  |  |         uint32_t nhdr_namesz = nhdr32->n_namesz; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         elf_note_data_addr = | 
					
						
							|  |  |  |             ((void *)nhdr32) + nhdr_size32 + | 
					
						
							|  |  |  |             QEMU_ALIGN_UP(nhdr_namesz, phdr_align); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-03-02 09:03:15 +00:00
										 |  |  |         pvh_start_addr = *(uint32_t *)elf_note_data_addr; | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     return pvh_start_addr; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static bool load_elfboot(const char *kernel_filename, | 
					
						
							|  |  |  |                          int kernel_file_size, | 
					
						
							|  |  |  |                          uint8_t *header, | 
					
						
							|  |  |  |                          size_t pvh_xen_start_addr, | 
					
						
							|  |  |  |                          FWCfgState *fw_cfg) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     uint32_t flags = 0; | 
					
						
							|  |  |  |     uint32_t mh_load_addr = 0; | 
					
						
							|  |  |  |     uint32_t elf_kernel_size = 0; | 
					
						
							|  |  |  |     uint64_t elf_entry; | 
					
						
							|  |  |  |     uint64_t elf_low, elf_high; | 
					
						
							|  |  |  |     int kernel_size; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (ldl_p(header) != 0x464c457f) { | 
					
						
							|  |  |  |         return false; /* no elfboot */ | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     bool elf_is64 = header[EI_CLASS] == ELFCLASS64; | 
					
						
							|  |  |  |     flags = elf_is64 ? | 
					
						
							|  |  |  |         ((Elf64_Ehdr *)header)->e_flags : ((Elf32_Ehdr *)header)->e_flags; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (flags & 0x00010004) { /* LOAD_ELF_HEADER_HAS_ADDR */ | 
					
						
							|  |  |  |         error_report("elfboot unsupported flags = %x", flags); | 
					
						
							|  |  |  |         exit(1); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     uint64_t elf_note_type = XEN_ELFNOTE_PHYS32_ENTRY; | 
					
						
							|  |  |  |     kernel_size = load_elf(kernel_filename, read_pvh_start_addr, | 
					
						
							|  |  |  |                            NULL, &elf_note_type, &elf_entry, | 
					
						
							| 
									
										
										
										
											2020-01-26 23:55:04 +01:00
										 |  |  |                            &elf_low, &elf_high, NULL, 0, I386_ELF_MACHINE, | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  |                            0, 0); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (kernel_size < 0) { | 
					
						
							|  |  |  |         error_report("Error while loading elf kernel"); | 
					
						
							|  |  |  |         exit(1); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     mh_load_addr = elf_low; | 
					
						
							|  |  |  |     elf_kernel_size = elf_high - elf_low; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (pvh_start_addr == 0) { | 
					
						
							|  |  |  |         error_report("Error loading uncompressed kernel without PVH ELF Note"); | 
					
						
							|  |  |  |         exit(1); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ENTRY, pvh_start_addr); | 
					
						
							|  |  |  |     fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ADDR, mh_load_addr); | 
					
						
							|  |  |  |     fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_SIZE, elf_kernel_size); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return true; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												x86: return modified setup_data only if read as memory, not as file
If setup_data is being read into a specific memory location, then
generally the setup_data address parameter is read first, so that the
caller knows where to read it into. In that case, we should return
setup_data containing the absolute addresses that are hard coded and
determined a priori. This is the case when kernels are loaded by BIOS,
for example. In contrast, when setup_data is read as a file, then we
shouldn't modify setup_data, since the absolute address will be wrong by
definition. This is the case when OVMF loads the image.
This allows setup_data to be used like normal, without crashing when EFI
tries to use it.
(As a small development note, strangely, fw_cfg_add_file_callback() was
exported but fw_cfg_add_bytes_callback() wasn't, so this makes that
consistent.)
Cc: Gerd Hoffmann <kraxel@redhat.com>
Cc: Laurent Vivier <laurent@vivier.eu>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Maydell <peter.maydell@linaro.org>
Cc: Philippe Mathieu-Daudé <f4bug@amsat.org>
Cc: Richard Henderson <richard.henderson@linaro.org>
Suggested-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Message-Id: <20220921093134.2936487-1-Jason@zx2c4.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
											
										 
											2022-09-21 11:31:31 +02:00
										 |  |  | typedef struct SetupDataFixup { | 
					
						
							|  |  |  |     void *pos; | 
					
						
							|  |  |  |     hwaddr orig_val, new_val; | 
					
						
							|  |  |  |     uint32_t addr; | 
					
						
							|  |  |  | } SetupDataFixup; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static void fixup_setup_data(void *opaque) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     SetupDataFixup *fixup = opaque; | 
					
						
							|  |  |  |     stq_p(fixup->pos, fixup->new_val); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static void reset_setup_data(void *opaque) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     SetupDataFixup *fixup = opaque; | 
					
						
							|  |  |  |     stq_p(fixup->pos, fixup->orig_val); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-09-21 11:31:33 +02:00
										 |  |  | static void reset_rng_seed(void *opaque) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     SetupData *setup_data = opaque; | 
					
						
							|  |  |  |     qemu_guest_getrandom_nofail(setup_data->data, le32_to_cpu(setup_data->len)); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-09-30 17:26:29 +02:00
										 |  |  | void x86_load_linux(X86MachineState *x86ms, | 
					
						
							|  |  |  |                     FWCfgState *fw_cfg, | 
					
						
							|  |  |  |                     int acpi_data_size, | 
					
						
							| 
									
										
										
										
											2022-07-21 14:56:36 +02:00
										 |  |  |                     bool pvh_enabled, | 
					
						
							|  |  |  |                     bool legacy_no_rng_seed) | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | { | 
					
						
							| 
									
										
										
										
											2021-10-20 14:48:10 +02:00
										 |  |  |     bool linuxboot_dma_enabled = X86_MACHINE_GET_CLASS(x86ms)->fwcfg_dma_enabled; | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  |     uint16_t protocol; | 
					
						
							|  |  |  |     int setup_size, kernel_size, cmdline_size; | 
					
						
							|  |  |  |     int dtb_size, setup_data_offset; | 
					
						
							|  |  |  |     uint32_t initrd_max; | 
					
						
							|  |  |  |     uint8_t header[8192], *setup, *kernel; | 
					
						
							| 
									
										
										
										
											2022-07-21 14:56:36 +02:00
										 |  |  |     hwaddr real_addr, prot_addr, cmdline_addr, initrd_addr = 0, first_setup_data = 0; | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  |     FILE *f; | 
					
						
							|  |  |  |     char *vmode; | 
					
						
							| 
									
										
										
										
											2019-09-30 17:26:29 +02:00
										 |  |  |     MachineState *machine = MACHINE(x86ms); | 
					
						
							| 
									
										
										
										
											2022-09-21 11:31:32 +02:00
										 |  |  |     SetupData *setup_data; | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  |     const char *kernel_filename = machine->kernel_filename; | 
					
						
							|  |  |  |     const char *initrd_filename = machine->initrd_filename; | 
					
						
							|  |  |  |     const char *dtb_filename = machine->dtb; | 
					
						
							| 
									
										
											  
											
												x86: don't let decompressed kernel image clobber setup_data
The setup_data links are appended to the compressed kernel image. Since
the kernel image is typically loaded at 0x100000, setup_data lives at
`0x100000 + compressed_size`, which does not get relocated during the
kernel's boot process.
The kernel typically decompresses the image starting at address
0x1000000 (note: there's one more zero there than the compressed image
above). This usually is fine for most kernels.
However, if the compressed image is actually quite large, then
setup_data will live at a `0x100000 + compressed_size` that extends into
the decompressed zone at 0x1000000. In other words, if compressed_size
is larger than `0x1000000 - 0x100000`, then the decompression step will
clobber setup_data, resulting in crashes.
Visually, what happens now is that QEMU appends setup_data to the kernel
image:
          kernel image            setup_data
   |--------------------------||----------------|
0x100000                  0x100000+l1     0x100000+l1+l2
The problem is that this decompresses to 0x1000000 (one more zero). So
if l1 is > (0x1000000-0x100000), then this winds up looking like:
          kernel image            setup_data
   |--------------------------||----------------|
0x100000                  0x100000+l1     0x100000+l1+l2
                                 d e c o m p r e s s e d   k e r n e l
                     |-------------------------------------------------------------|
                0x1000000                                                     0x1000000+l3
The decompressed kernel seemingly overwriting the compressed kernel
image isn't a problem, because that gets relocated to a higher address
early on in the boot process, at the end of startup_64. setup_data,
however, stays in the same place, since those links are self referential
and nothing fixes them up.  So the decompressed kernel clobbers it.
Fix this by appending setup_data to the cmdline blob rather than the
kernel image blob, which remains at a lower address that won't get
clobbered.
This could have been done by overwriting the initrd blob instead, but
that poses big difficulties, such as no longer being able to use memory
mapped files for initrd, hurting performance, and, more importantly, the
initrd address calculation is hard coded in qboot, and it always grows
down rather than up, which means lots of brittle semantics would have to
be changed around, incurring more complexity. In contrast, using cmdline
is simple and doesn't interfere with anything.
The microvm machine has a gross hack where it fiddles with fw_cfg data
after the fact. So this hack is updated to account for this appending,
by reserving some bytes.
Fixup-by: Michael S. Tsirkin <mst@redhat.com>
Cc: x86@kernel.org
Cc: Philippe Mathieu-Daudé <philmd@linaro.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Message-Id: <20221230220725.618763-1-Jason@zx2c4.com>
Message-ID: <20230128061015-mutt-send-email-mst@kernel.org>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Tested-by: Eric Biggers <ebiggers@google.com>
Tested-by: Mathias Krause <minipli@grsecurity.net>
											
										 
											2022-12-30 23:07:25 +01:00
										 |  |  |     char *kernel_cmdline; | 
					
						
							| 
									
										
										
										
											2021-09-30 08:49:15 +03:00
										 |  |  |     SevKernelLoaderContext sev_load_ctx = {}; | 
					
						
							| 
									
										
										
										
											2022-07-21 14:56:36 +02:00
										 |  |  |     enum { RNG_SEED_LENGTH = 32 }; | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
											  
											
												x86: don't let decompressed kernel image clobber setup_data
The setup_data links are appended to the compressed kernel image. Since
the kernel image is typically loaded at 0x100000, setup_data lives at
`0x100000 + compressed_size`, which does not get relocated during the
kernel's boot process.
The kernel typically decompresses the image starting at address
0x1000000 (note: there's one more zero there than the compressed image
above). This usually is fine for most kernels.
However, if the compressed image is actually quite large, then
setup_data will live at a `0x100000 + compressed_size` that extends into
the decompressed zone at 0x1000000. In other words, if compressed_size
is larger than `0x1000000 - 0x100000`, then the decompression step will
clobber setup_data, resulting in crashes.
Visually, what happens now is that QEMU appends setup_data to the kernel
image:
          kernel image            setup_data
   |--------------------------||----------------|
0x100000                  0x100000+l1     0x100000+l1+l2
The problem is that this decompresses to 0x1000000 (one more zero). So
if l1 is > (0x1000000-0x100000), then this winds up looking like:
          kernel image            setup_data
   |--------------------------||----------------|
0x100000                  0x100000+l1     0x100000+l1+l2
                                 d e c o m p r e s s e d   k e r n e l
                     |-------------------------------------------------------------|
                0x1000000                                                     0x1000000+l3
The decompressed kernel seemingly overwriting the compressed kernel
image isn't a problem, because that gets relocated to a higher address
early on in the boot process, at the end of startup_64. setup_data,
however, stays in the same place, since those links are self referential
and nothing fixes them up.  So the decompressed kernel clobbers it.
Fix this by appending setup_data to the cmdline blob rather than the
kernel image blob, which remains at a lower address that won't get
clobbered.
This could have been done by overwriting the initrd blob instead, but
that poses big difficulties, such as no longer being able to use memory
mapped files for initrd, hurting performance, and, more importantly, the
initrd address calculation is hard coded in qboot, and it always grows
down rather than up, which means lots of brittle semantics would have to
be changed around, incurring more complexity. In contrast, using cmdline
is simple and doesn't interfere with anything.
The microvm machine has a gross hack where it fiddles with fw_cfg data
after the fact. So this hack is updated to account for this appending,
by reserving some bytes.
Fixup-by: Michael S. Tsirkin <mst@redhat.com>
Cc: x86@kernel.org
Cc: Philippe Mathieu-Daudé <philmd@linaro.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Message-Id: <20221230220725.618763-1-Jason@zx2c4.com>
Message-ID: <20230128061015-mutt-send-email-mst@kernel.org>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Tested-by: Eric Biggers <ebiggers@google.com>
Tested-by: Mathias Krause <minipli@grsecurity.net>
											
										 
											2022-12-30 23:07:25 +01:00
										 |  |  |     /*
 | 
					
						
							|  |  |  |      * Add the NUL terminator, some padding for the microvm cmdline fiddling | 
					
						
							|  |  |  |      * hack, and then align to 16 bytes as a paranoia measure | 
					
						
							|  |  |  |      */ | 
					
						
							|  |  |  |     cmdline_size = (strlen(machine->kernel_cmdline) + 1 + | 
					
						
							|  |  |  |                     VIRTIO_CMDLINE_TOTAL_MAX_LEN + 16) & ~15; | 
					
						
							|  |  |  |     /* Make a copy, since we might append arbitrary bytes to it later. */ | 
					
						
							|  |  |  |     kernel_cmdline = g_strndup(machine->kernel_cmdline, cmdline_size); | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     /* load the kernel header */ | 
					
						
							|  |  |  |     f = fopen(kernel_filename, "rb"); | 
					
						
							|  |  |  |     if (!f) { | 
					
						
							|  |  |  |         fprintf(stderr, "qemu: could not open kernel file '%s': %s\n", | 
					
						
							|  |  |  |                 kernel_filename, strerror(errno)); | 
					
						
							|  |  |  |         exit(1); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     kernel_size = get_file_size(f); | 
					
						
							|  |  |  |     if (!kernel_size || | 
					
						
							|  |  |  |         fread(header, 1, MIN(ARRAY_SIZE(header), kernel_size), f) != | 
					
						
							|  |  |  |         MIN(ARRAY_SIZE(header), kernel_size)) { | 
					
						
							|  |  |  |         fprintf(stderr, "qemu: could not load kernel '%s': %s\n", | 
					
						
							|  |  |  |                 kernel_filename, strerror(errno)); | 
					
						
							|  |  |  |         exit(1); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     /* kernel protocol version */ | 
					
						
							|  |  |  |     if (ldl_p(header + 0x202) == 0x53726448) { | 
					
						
							|  |  |  |         protocol = lduw_p(header + 0x206); | 
					
						
							|  |  |  |     } else { | 
					
						
							|  |  |  |         /*
 | 
					
						
							|  |  |  |          * This could be a multiboot kernel. If it is, let's stop treating it | 
					
						
							|  |  |  |          * like a Linux kernel. | 
					
						
							|  |  |          * Note: some multiboot images could be in the ELF format (the same as | 
					
						
							|  |  |  |          * PVH), so we try multiboot first since we check the multiboot magic | 
					
						
							|  |  |          * header before loading it. | 
					
						
							|  |  |  |          */ | 
					
						
							| 
									
										
										
										
											2021-10-20 15:59:44 +02:00
										 |  |  |         if (load_multiboot(x86ms, fw_cfg, f, kernel_filename, initrd_filename, | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  |                            kernel_cmdline, kernel_size, header)) { | 
					
						
							|  |  |  |             return; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         /*
 | 
					
						
							|  |  |  |          * Check if the file is an uncompressed kernel file (ELF) and load it, | 
					
						
							|  |  |  |          * saving the PVH entry point used by the x86/HVM direct boot ABI. | 
					
						
							|  |  |  |          * If load_elfboot() is successful, populate the fw_cfg info. | 
					
						
							|  |  |  |          */ | 
					
						
							| 
									
										
										
										
											2019-09-30 17:26:29 +02:00
										 |  |  |         if (pvh_enabled && | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  |             load_elfboot(kernel_filename, kernel_size, | 
					
						
							|  |  |  |                          header, pvh_start_addr, fw_cfg)) { | 
					
						
							|  |  |  |             fclose(f); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE, | 
					
						
							|  |  |  |                 strlen(kernel_cmdline) + 1); | 
					
						
							|  |  |  |             fw_cfg_add_string(fw_cfg, FW_CFG_CMDLINE_DATA, kernel_cmdline); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_SIZE, sizeof(header)); | 
					
						
							|  |  |  |             fw_cfg_add_bytes(fw_cfg, FW_CFG_SETUP_DATA, | 
					
						
							|  |  |  |                              header, sizeof(header)); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             /* load initrd */ | 
					
						
							|  |  |  |             if (initrd_filename) { | 
					
						
							|  |  |  |                 GMappedFile *mapped_file; | 
					
						
							|  |  |  |                 gsize initrd_size; | 
					
						
							|  |  |  |                 gchar *initrd_data; | 
					
						
							|  |  |  |                 GError *gerr = NULL; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                 mapped_file = g_mapped_file_new(initrd_filename, false, &gerr); | 
					
						
							|  |  |  |                 if (!mapped_file) { | 
					
						
							|  |  |  |                     fprintf(stderr, "qemu: error reading initrd %s: %s\n", | 
					
						
							|  |  |  |                             initrd_filename, gerr->message); | 
					
						
							|  |  |  |                     exit(1); | 
					
						
							|  |  |  |                 } | 
					
						
							| 
									
										
										
										
											2019-10-22 09:39:50 +02:00
										 |  |  |                 x86ms->initrd_mapped_file = mapped_file; | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |                 initrd_data = g_mapped_file_get_contents(mapped_file); | 
					
						
							|  |  |  |                 initrd_size = g_mapped_file_get_length(mapped_file); | 
					
						
							| 
									
										
										
										
											2019-09-30 17:26:29 +02:00
										 |  |  |                 initrd_max = x86ms->below_4g_mem_size - acpi_data_size - 1; | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  |                 if (initrd_size >= initrd_max) { | 
					
						
							|  |  |  |                     fprintf(stderr, "qemu: initrd is too large, cannot support." | 
					
						
							|  |  |  |                             "(max: %"PRIu32", need %"PRId64")\n", | 
					
						
							|  |  |  |                             initrd_max, (uint64_t)initrd_size); | 
					
						
							|  |  |  |                     exit(1); | 
					
						
							|  |  |  |                 } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                 initrd_addr = (initrd_max - initrd_size) & ~4095; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                 fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_ADDR, initrd_addr); | 
					
						
							|  |  |  |                 fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_SIZE, initrd_size); | 
					
						
							|  |  |  |                 fw_cfg_add_bytes(fw_cfg, FW_CFG_INITRD_DATA, initrd_data, | 
					
						
							|  |  |  |                                  initrd_size); | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             option_rom[nb_option_roms].bootindex = 0; | 
					
						
							|  |  |  |             option_rom[nb_option_roms].name = "pvh.bin"; | 
					
						
							|  |  |  |             nb_option_roms++; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             return; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         protocol = 0; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (protocol < 0x200 || !(header[0x211] & 0x01)) { | 
					
						
							|  |  |  |         /* Low kernel */ | 
					
						
							|  |  |  |         real_addr    = 0x90000; | 
					
						
							|  |  |  |         cmdline_addr = 0x9a000 - cmdline_size; | 
					
						
							|  |  |  |         prot_addr    = 0x10000; | 
					
						
							|  |  |  |     } else if (protocol < 0x202) { | 
					
						
							|  |  |  |         /* High but ancient kernel */ | 
					
						
							|  |  |  |         real_addr    = 0x90000; | 
					
						
							|  |  |  |         cmdline_addr = 0x9a000 - cmdline_size; | 
					
						
							|  |  |  |         prot_addr    = 0x100000; | 
					
						
							|  |  |  |     } else { | 
					
						
							|  |  |  |         /* High and recent kernel */ | 
					
						
							|  |  |  |         real_addr    = 0x10000; | 
					
						
							|  |  |  |         cmdline_addr = 0x20000; | 
					
						
							|  |  |  |         prot_addr    = 0x100000; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     /* highest address for loading the initrd */ | 
					
						
							|  |  |  |     if (protocol >= 0x20c && | 
					
						
							|  |  |  |         lduw_p(header + 0x236) & XLF_CAN_BE_LOADED_ABOVE_4G) { | 
					
						
							|  |  |  |         /*
 | 
					
						
							|  |  |  |          * Linux has supported initrd up to 4 GB for a very long time (2007, | 
					
						
							|  |  |  |          * long before XLF_CAN_BE_LOADED_ABOVE_4G which was added in 2013), | 
					
						
							|  |  |  |          * though it only sets initrd_max to 2 GB to "work around bootloader | 
					
						
							|  |  |  |          * bugs". Luckily, QEMU firmware (which acts like a bootloader) | 
					
						
							|  |  |  |          * has supported this. | 
					
						
							|  |  |  |          * | 
					
						
							|  |  |  |          * It's believed that if XLF_CAN_BE_LOADED_ABOVE_4G is set, initrd can | 
					
						
							|  |  |  |          * be loaded into any address. | 
					
						
							|  |  |  |          * | 
					
						
							|  |  |  |          * In addition, initrd_max is uint32_t simply because QEMU doesn't | 
					
						
							|  |  |  |          * support the 64-bit boot protocol (specifically the ext_ramdisk_image | 
					
						
							|  |  |  |          * field). | 
					
						
							|  |  |  |          * | 
					
						
							|  |  |  |          * Therefore, simply limit initrd_max to UINT32_MAX here as well. | 
					
						
							|  |  |  |          */ | 
					
						
							|  |  |  |         initrd_max = UINT32_MAX; | 
					
						
							|  |  |  |     } else if (protocol >= 0x203) { | 
					
						
							|  |  |  |         initrd_max = ldl_p(header + 0x22c); | 
					
						
							|  |  |  |     } else { | 
					
						
							|  |  |  |         initrd_max = 0x37ffffff; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-09-30 17:26:29 +02:00
										 |  |  |     if (initrd_max >= x86ms->below_4g_mem_size - acpi_data_size) { | 
					
						
							|  |  |  |         initrd_max = x86ms->below_4g_mem_size - acpi_data_size - 1; | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (protocol >= 0x202) { | 
					
						
							|  |  |  |         stl_p(header + 0x228, cmdline_addr); | 
					
						
							|  |  |  |     } else { | 
					
						
							|  |  |  |         stw_p(header + 0x20, 0xA33F); | 
					
						
							|  |  |  |         stw_p(header + 0x22, cmdline_addr - real_addr); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     /* handle vga= parameter */ | 
					
						
							|  |  |  |     vmode = strstr(kernel_cmdline, "vga="); | 
					
						
							|  |  |  |     if (vmode) { | 
					
						
							|  |  |  |         unsigned int video_mode; | 
					
						
							| 
									
										
										
										
											2019-12-21 17:21:24 +01:00
										 |  |  |         const char *end; | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  |         int ret; | 
					
						
							|  |  |  |         /* skip "vga=" */ | 
					
						
							|  |  |  |         vmode += 4; | 
					
						
							|  |  |  |         if (!strncmp(vmode, "normal", 6)) { | 
					
						
							|  |  |  |             video_mode = 0xffff; | 
					
						
							|  |  |  |         } else if (!strncmp(vmode, "ext", 3)) { | 
					
						
							|  |  |  |             video_mode = 0xfffe; | 
					
						
							|  |  |  |         } else if (!strncmp(vmode, "ask", 3)) { | 
					
						
							|  |  |  |             video_mode = 0xfffd; | 
					
						
							|  |  |  |         } else { | 
					
						
							| 
									
										
										
										
											2019-12-21 17:21:24 +01:00
										 |  |  |             ret = qemu_strtoui(vmode, &end, 0, &video_mode); | 
					
						
							|  |  |  |             if (ret != 0 || (*end && *end != ' ')) { | 
					
						
							|  |  |  |                 fprintf(stderr, "qemu: invalid 'vga=' kernel parameter.\n"); | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  |                 exit(1); | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         stw_p(header + 0x1fa, video_mode); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     /* loader type */ | 
					
						
							|  |  |  |     /*
 | 
					
						
							|  |  |  |      * High nybble = B reserved for QEMU; low nybble is revision number. | 
					
						
							|  |  |  |      * If this code is substantially changed, you may want to consider | 
					
						
							|  |  |  |      * incrementing the revision. | 
					
						
							|  |  |  |      */ | 
					
						
							|  |  |  |     if (protocol >= 0x200) { | 
					
						
							|  |  |  |         header[0x210] = 0xB0; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     /* heap */ | 
					
						
							|  |  |  |     if (protocol >= 0x201) { | 
					
						
							|  |  |  |         header[0x211] |= 0x80; /* CAN_USE_HEAP */ | 
					
						
							|  |  |  |         stw_p(header + 0x224, cmdline_addr - real_addr - 0x200); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     /* load initrd */ | 
					
						
							|  |  |  |     if (initrd_filename) { | 
					
						
							|  |  |  |         GMappedFile *mapped_file; | 
					
						
							|  |  |  |         gsize initrd_size; | 
					
						
							|  |  |  |         gchar *initrd_data; | 
					
						
							|  |  |  |         GError *gerr = NULL; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if (protocol < 0x200) { | 
					
						
							|  |  |  |             fprintf(stderr, "qemu: linux kernel too old to load a ram disk\n"); | 
					
						
							|  |  |  |             exit(1); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         mapped_file = g_mapped_file_new(initrd_filename, false, &gerr); | 
					
						
							|  |  |  |         if (!mapped_file) { | 
					
						
							|  |  |  |             fprintf(stderr, "qemu: error reading initrd %s: %s\n", | 
					
						
							|  |  |  |                     initrd_filename, gerr->message); | 
					
						
							|  |  |  |             exit(1); | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2019-10-22 09:39:50 +02:00
										 |  |  |         x86ms->initrd_mapped_file = mapped_file; | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |         initrd_data = g_mapped_file_get_contents(mapped_file); | 
					
						
							|  |  |  |         initrd_size = g_mapped_file_get_length(mapped_file); | 
					
						
							|  |  |  |         if (initrd_size >= initrd_max) { | 
					
						
							|  |  |  |             fprintf(stderr, "qemu: initrd is too large, cannot support." | 
					
						
							|  |  |  |                     "(max: %"PRIu32", need %"PRId64")\n", | 
					
						
							|  |  |  |                     initrd_max, (uint64_t)initrd_size); | 
					
						
							|  |  |  |             exit(1); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         initrd_addr = (initrd_max - initrd_size) & ~4095; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_ADDR, initrd_addr); | 
					
						
							|  |  |  |         fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_SIZE, initrd_size); | 
					
						
							|  |  |  |         fw_cfg_add_bytes(fw_cfg, FW_CFG_INITRD_DATA, initrd_data, initrd_size); | 
					
						
							| 
									
										
										
										
											2021-09-30 08:49:15 +03:00
										 |  |  |         sev_load_ctx.initrd_data = initrd_data; | 
					
						
							|  |  |  |         sev_load_ctx.initrd_size = initrd_size; | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |         stl_p(header + 0x218, initrd_addr); | 
					
						
							|  |  |  |         stl_p(header + 0x21c, initrd_size); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     /* load kernel and setup */ | 
					
						
							|  |  |  |     setup_size = header[0x1f1]; | 
					
						
							|  |  |  |     if (setup_size == 0) { | 
					
						
							|  |  |  |         setup_size = 4; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     setup_size = (setup_size + 1) * 512; | 
					
						
							|  |  |  |     if (setup_size > kernel_size) { | 
					
						
							|  |  |  |         fprintf(stderr, "qemu: invalid kernel header\n"); | 
					
						
							|  |  |  |         exit(1); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     kernel_size -= setup_size; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     setup  = g_malloc(setup_size); | 
					
						
							|  |  |  |     kernel = g_malloc(kernel_size); | 
					
						
							|  |  |  |     fseek(f, 0, SEEK_SET); | 
					
						
							|  |  |  |     if (fread(setup, 1, setup_size, f) != setup_size) { | 
					
						
							|  |  |  |         fprintf(stderr, "fread() failed\n"); | 
					
						
							|  |  |  |         exit(1); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     if (fread(kernel, 1, kernel_size, f) != kernel_size) { | 
					
						
							|  |  |  |         fprintf(stderr, "fread() failed\n"); | 
					
						
							|  |  |  |         exit(1); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     fclose(f); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     /* append dtb to kernel */ | 
					
						
							|  |  |  |     if (dtb_filename) { | 
					
						
							|  |  |  |         if (protocol < 0x209) { | 
					
						
							|  |  |  |             fprintf(stderr, "qemu: Linux kernel too old to load a dtb\n"); | 
					
						
							|  |  |  |             exit(1); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         dtb_size = get_image_size(dtb_filename); | 
					
						
							|  |  |  |         if (dtb_size <= 0) { | 
					
						
							|  |  |  |             fprintf(stderr, "qemu: error reading dtb %s: %s\n", | 
					
						
							|  |  |  |                     dtb_filename, strerror(errno)); | 
					
						
							|  |  |  |             exit(1); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												x86: don't let decompressed kernel image clobber setup_data
The setup_data links are appended to the compressed kernel image. Since
the kernel image is typically loaded at 0x100000, setup_data lives at
`0x100000 + compressed_size`, which does not get relocated during the
kernel's boot process.
The kernel typically decompresses the image starting at address
0x1000000 (note: there's one more zero there than the compressed image
above). This usually is fine for most kernels.
However, if the compressed image is actually quite large, then
setup_data will live at a `0x100000 + compressed_size` that extends into
the decompressed zone at 0x1000000. In other words, if compressed_size
is larger than `0x1000000 - 0x100000`, then the decompression step will
clobber setup_data, resulting in crashes.
Visually, what happens now is that QEMU appends setup_data to the kernel
image:
          kernel image            setup_data
   |--------------------------||----------------|
0x100000                  0x100000+l1     0x100000+l1+l2
The problem is that this decompresses to 0x1000000 (one more zero). So
if l1 is > (0x1000000-0x100000), then this winds up looking like:
          kernel image            setup_data
   |--------------------------||----------------|
0x100000                  0x100000+l1     0x100000+l1+l2
                                 d e c o m p r e s s e d   k e r n e l
                     |-------------------------------------------------------------|
                0x1000000                                                     0x1000000+l3
The decompressed kernel seemingly overwriting the compressed kernel
image isn't a problem, because that gets relocated to a higher address
early on in the boot process, at the end of startup_64. setup_data,
however, stays in the same place, since those links are self referential
and nothing fixes them up.  So the decompressed kernel clobbers it.
Fix this by appending setup_data to the cmdline blob rather than the
kernel image blob, which remains at a lower address that won't get
clobbered.
This could have been done by overwriting the initrd blob instead, but
that poses big difficulties, such as no longer being able to use memory
mapped files for initrd, hurting performance, and, more importantly, the
initrd address calculation is hard coded in qboot, and it always grows
down rather than up, which means lots of brittle semantics would have to
be changed around, incurring more complexity. In contrast, using cmdline
is simple and doesn't interfere with anything.
The microvm machine has a gross hack where it fiddles with fw_cfg data
after the fact. So this hack is updated to account for this appending,
by reserving some bytes.
Fixup-by: Michael S. Tsirkin <mst@redhat.com>
Cc: x86@kernel.org
Cc: Philippe Mathieu-Daudé <philmd@linaro.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Message-Id: <20221230220725.618763-1-Jason@zx2c4.com>
Message-ID: <20230128061015-mutt-send-email-mst@kernel.org>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Tested-by: Eric Biggers <ebiggers@google.com>
Tested-by: Mathias Krause <minipli@grsecurity.net>
											
										 
											2022-12-30 23:07:25 +01:00
										 |  |  |         setup_data_offset = cmdline_size; | 
					
						
							|  |  |  |         cmdline_size += sizeof(SetupData) + dtb_size; | 
					
						
							|  |  |  |         kernel_cmdline = g_realloc(kernel_cmdline, cmdline_size); | 
					
						
							|  |  |  |         setup_data = (void *)kernel_cmdline + setup_data_offset; | 
					
						
							| 
									
										
										
										
											2022-07-21 14:56:36 +02:00
										 |  |  |         setup_data->next = cpu_to_le64(first_setup_data); | 
					
						
							| 
									
										
											  
											
												x86: don't let decompressed kernel image clobber setup_data
The setup_data links are appended to the compressed kernel image. Since
the kernel image is typically loaded at 0x100000, setup_data lives at
`0x100000 + compressed_size`, which does not get relocated during the
kernel's boot process.
The kernel typically decompresses the image starting at address
0x1000000 (note: there's one more zero there than the compressed image
above). This usually is fine for most kernels.
However, if the compressed image is actually quite large, then
setup_data will live at a `0x100000 + compressed_size` that extends into
the decompressed zone at 0x1000000. In other words, if compressed_size
is larger than `0x1000000 - 0x100000`, then the decompression step will
clobber setup_data, resulting in crashes.
Visually, what happens now is that QEMU appends setup_data to the kernel
image:
          kernel image            setup_data
   |--------------------------||----------------|
0x100000                  0x100000+l1     0x100000+l1+l2
The problem is that this decompresses to 0x1000000 (one more zero). So
if l1 is > (0x1000000-0x100000), then this winds up looking like:
          kernel image            setup_data
   |--------------------------||----------------|
0x100000                  0x100000+l1     0x100000+l1+l2
                                 d e c o m p r e s s e d   k e r n e l
                     |-------------------------------------------------------------|
                0x1000000                                                     0x1000000+l3
The decompressed kernel seemingly overwriting the compressed kernel
image isn't a problem, because that gets relocated to a higher address
early on in the boot process, at the end of startup_64. setup_data,
however, stays in the same place, since those links are self referential
and nothing fixes them up.  So the decompressed kernel clobbers it.
Fix this by appending setup_data to the cmdline blob rather than the
kernel image blob, which remains at a lower address that won't get
clobbered.
This could have been done by overwriting the initrd blob instead, but
that poses big difficulties, such as no longer being able to use memory
mapped files for initrd, hurting performance, and, more importantly, the
initrd address calculation is hard coded in qboot, and it always grows
down rather than up, which means lots of brittle semantics would have to
be changed around, incurring more complexity. In contrast, using cmdline
is simple and doesn't interfere with anything.
The microvm machine has a gross hack where it fiddles with fw_cfg data
after the fact. So this hack is updated to account for this appending,
by reserving some bytes.
Fixup-by: Michael S. Tsirkin <mst@redhat.com>
Cc: x86@kernel.org
Cc: Philippe Mathieu-Daudé <philmd@linaro.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Message-Id: <20221230220725.618763-1-Jason@zx2c4.com>
Message-ID: <20230128061015-mutt-send-email-mst@kernel.org>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Tested-by: Eric Biggers <ebiggers@google.com>
Tested-by: Mathias Krause <minipli@grsecurity.net>
											
										 
											2022-12-30 23:07:25 +01:00
										 |  |  |         first_setup_data = cmdline_addr + setup_data_offset; | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  |         setup_data->type = cpu_to_le32(SETUP_DTB); | 
					
						
							|  |  |  |         setup_data->len = cpu_to_le32(dtb_size); | 
					
						
							|  |  |  |         load_image_size(dtb_filename, setup_data->data, dtb_size); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												x86: don't let decompressed kernel image clobber setup_data
The setup_data links are appended to the compressed kernel image. Since
the kernel image is typically loaded at 0x100000, setup_data lives at
`0x100000 + compressed_size`, which does not get relocated during the
kernel's boot process.
The kernel typically decompresses the image starting at address
0x1000000 (note: there's one more zero there than the compressed image
above). This usually is fine for most kernels.
However, if the compressed image is actually quite large, then
setup_data will live at a `0x100000 + compressed_size` that extends into
the decompressed zone at 0x1000000. In other words, if compressed_size
is larger than `0x1000000 - 0x100000`, then the decompression step will
clobber setup_data, resulting in crashes.
Visually, what happens now is that QEMU appends setup_data to the kernel
image:
          kernel image            setup_data
   |--------------------------||----------------|
0x100000                  0x100000+l1     0x100000+l1+l2
The problem is that this decompresses to 0x1000000 (one more zero). So
if l1 is > (0x1000000-0x100000), then this winds up looking like:
          kernel image            setup_data
   |--------------------------||----------------|
0x100000                  0x100000+l1     0x100000+l1+l2
                                 d e c o m p r e s s e d   k e r n e l
                     |-------------------------------------------------------------|
                0x1000000                                                     0x1000000+l3
The decompressed kernel seemingly overwriting the compressed kernel
image isn't a problem, because that gets relocated to a higher address
early on in the boot process, at the end of startup_64. setup_data,
however, stays in the same place, since those links are self referential
and nothing fixes them up.  So the decompressed kernel clobbers it.
Fix this by appending setup_data to the cmdline blob rather than the
kernel image blob, which remains at a lower address that won't get
clobbered.
This could have been done by overwriting the initrd blob instead, but
that poses big difficulties, such as no longer being able to use memory
mapped files for initrd, hurting performance, and, more importantly, the
initrd address calculation is hard coded in qboot, and it always grows
down rather than up, which means lots of brittle semantics would have to
be changed around, incurring more complexity. In contrast, using cmdline
is simple and doesn't interfere with anything.
The microvm machine has a gross hack where it fiddles with fw_cfg data
after the fact. So this hack is updated to account for this appending,
by reserving some bytes.
Fixup-by: Michael S. Tsirkin <mst@redhat.com>
Cc: x86@kernel.org
Cc: Philippe Mathieu-Daudé <philmd@linaro.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Message-Id: <20221230220725.618763-1-Jason@zx2c4.com>
Message-ID: <20230128061015-mutt-send-email-mst@kernel.org>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Tested-by: Eric Biggers <ebiggers@google.com>
Tested-by: Mathias Krause <minipli@grsecurity.net>
											
										 
											2022-12-30 23:07:25 +01:00
										 |  |  |     if (!legacy_no_rng_seed && protocol >= 0x209) { | 
					
						
							|  |  |  |         setup_data_offset = cmdline_size; | 
					
						
							|  |  |  |         cmdline_size += sizeof(SetupData) + RNG_SEED_LENGTH; | 
					
						
							|  |  |  |         kernel_cmdline = g_realloc(kernel_cmdline, cmdline_size); | 
					
						
							|  |  |  |         setup_data = (void *)kernel_cmdline + setup_data_offset; | 
					
						
							| 
									
										
										
										
											2022-07-21 14:56:36 +02:00
										 |  |  |         setup_data->next = cpu_to_le64(first_setup_data); | 
					
						
							| 
									
										
											  
											
												x86: don't let decompressed kernel image clobber setup_data
The setup_data links are appended to the compressed kernel image. Since
the kernel image is typically loaded at 0x100000, setup_data lives at
`0x100000 + compressed_size`, which does not get relocated during the
kernel's boot process.
The kernel typically decompresses the image starting at address
0x1000000 (note: there's one more zero there than the compressed image
above). This usually is fine for most kernels.
However, if the compressed image is actually quite large, then
setup_data will live at a `0x100000 + compressed_size` that extends into
the decompressed zone at 0x1000000. In other words, if compressed_size
is larger than `0x1000000 - 0x100000`, then the decompression step will
clobber setup_data, resulting in crashes.
Visually, what happens now is that QEMU appends setup_data to the kernel
image:
          kernel image            setup_data
   |--------------------------||----------------|
0x100000                  0x100000+l1     0x100000+l1+l2
The problem is that this decompresses to 0x1000000 (one more zero). So
if l1 is > (0x1000000-0x100000), then this winds up looking like:
          kernel image            setup_data
   |--------------------------||----------------|
0x100000                  0x100000+l1     0x100000+l1+l2
                                 d e c o m p r e s s e d   k e r n e l
                     |-------------------------------------------------------------|
                0x1000000                                                     0x1000000+l3
The decompressed kernel seemingly overwriting the compressed kernel
image isn't a problem, because that gets relocated to a higher address
early on in the boot process, at the end of startup_64. setup_data,
however, stays in the same place, since those links are self referential
and nothing fixes them up.  So the decompressed kernel clobbers it.
Fix this by appending setup_data to the cmdline blob rather than the
kernel image blob, which remains at a lower address that won't get
clobbered.
This could have been done by overwriting the initrd blob instead, but
that poses big difficulties, such as no longer being able to use memory
mapped files for initrd, hurting performance, and, more importantly, the
initrd address calculation is hard coded in qboot, and it always grows
down rather than up, which means lots of brittle semantics would have to
be changed around, incurring more complexity. In contrast, using cmdline
is simple and doesn't interfere with anything.
The microvm machine has a gross hack where it fiddles with fw_cfg data
after the fact. So this hack is updated to account for this appending,
by reserving some bytes.
Fixup-by: Michael S. Tsirkin <mst@redhat.com>
Cc: x86@kernel.org
Cc: Philippe Mathieu-Daudé <philmd@linaro.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Message-Id: <20221230220725.618763-1-Jason@zx2c4.com>
Message-ID: <20230128061015-mutt-send-email-mst@kernel.org>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Tested-by: Eric Biggers <ebiggers@google.com>
Tested-by: Mathias Krause <minipli@grsecurity.net>
											
										 
											2022-12-30 23:07:25 +01:00
										 |  |  |         first_setup_data = cmdline_addr + setup_data_offset; | 
					
						
							| 
									
										
										
										
											2022-07-21 14:56:36 +02:00
										 |  |  |         setup_data->type = cpu_to_le32(SETUP_RNG_SEED); | 
					
						
							|  |  |  |         setup_data->len = cpu_to_le32(RNG_SEED_LENGTH); | 
					
						
							|  |  |  |         qemu_guest_getrandom_nofail(setup_data->data, RNG_SEED_LENGTH); | 
					
						
							| 
									
										
										
										
											2022-10-25 02:43:19 +02:00
										 |  |  |         qemu_register_reset_nosnapshotload(reset_rng_seed, setup_data); | 
					
						
							| 
									
										
										
										
											2022-09-22 17:28:47 +02:00
										 |  |  |         fw_cfg_add_bytes_callback(fw_cfg, FW_CFG_KERNEL_DATA, reset_rng_seed, NULL, | 
					
						
							|  |  |  |                                   setup_data, kernel, kernel_size, true); | 
					
						
							|  |  |  |     } else { | 
					
						
							|  |  |  |         fw_cfg_add_bytes(fw_cfg, FW_CFG_KERNEL_DATA, kernel, kernel_size); | 
					
						
							| 
									
										
										
										
											2022-07-21 14:56:36 +02:00
										 |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												x86: don't let decompressed kernel image clobber setup_data
The setup_data links are appended to the compressed kernel image. Since
the kernel image is typically loaded at 0x100000, setup_data lives at
`0x100000 + compressed_size`, which does not get relocated during the
kernel's boot process.
The kernel typically decompresses the image starting at address
0x1000000 (note: there's one more zero there than the compressed image
above). This usually is fine for most kernels.
However, if the compressed image is actually quite large, then
setup_data will live at a `0x100000 + compressed_size` that extends into
the decompressed zone at 0x1000000. In other words, if compressed_size
is larger than `0x1000000 - 0x100000`, then the decompression step will
clobber setup_data, resulting in crashes.
Visually, what happens now is that QEMU appends setup_data to the kernel
image:
          kernel image            setup_data
   |--------------------------||----------------|
0x100000                  0x100000+l1     0x100000+l1+l2
The problem is that this decompresses to 0x1000000 (one more zero). So
if l1 is > (0x1000000-0x100000), then this winds up looking like:
          kernel image            setup_data
   |--------------------------||----------------|
0x100000                  0x100000+l1     0x100000+l1+l2
                                 d e c o m p r e s s e d   k e r n e l
                     |-------------------------------------------------------------|
                0x1000000                                                     0x1000000+l3
The decompressed kernel seemingly overwriting the compressed kernel
image isn't a problem, because that gets relocated to a higher address
early on in the boot process, at the end of startup_64. setup_data,
however, stays in the same place, since those links are self referential
and nothing fixes them up.  So the decompressed kernel clobbers it.
Fix this by appending setup_data to the cmdline blob rather than the
kernel image blob, which remains at a lower address that won't get
clobbered.
This could have been done by overwriting the initrd blob instead, but
that poses big difficulties, such as no longer being able to use memory
mapped files for initrd, hurting performance, and, more importantly, the
initrd address calculation is hard coded in qboot, and it always grows
down rather than up, which means lots of brittle semantics would have to
be changed around, incurring more complexity. In contrast, using cmdline
is simple and doesn't interfere with anything.
The microvm machine has a gross hack where it fiddles with fw_cfg data
after the fact. So this hack is updated to account for this appending,
by reserving some bytes.
Fixup-by: Michael S. Tsirkin <mst@redhat.com>
Cc: x86@kernel.org
Cc: Philippe Mathieu-Daudé <philmd@linaro.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Message-Id: <20221230220725.618763-1-Jason@zx2c4.com>
Message-ID: <20230128061015-mutt-send-email-mst@kernel.org>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Tested-by: Eric Biggers <ebiggers@google.com>
Tested-by: Mathias Krause <minipli@grsecurity.net>
											
										 
											2022-12-30 23:07:25 +01:00
										 |  |  |     fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_ADDR, cmdline_addr); | 
					
						
							|  |  |  |     fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE, cmdline_size); | 
					
						
							|  |  |  |     fw_cfg_add_bytes(fw_cfg, FW_CFG_CMDLINE_DATA, kernel_cmdline, cmdline_size); | 
					
						
							|  |  |  |     sev_load_ctx.cmdline_data = (char *)kernel_cmdline; | 
					
						
							|  |  |  |     sev_load_ctx.cmdline_size = cmdline_size; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												x86: return modified setup_data only if read as memory, not as file
If setup_data is being read into a specific memory location, then
generally the setup_data address parameter is read first, so that the
caller knows where to read it into. In that case, we should return
setup_data containing the absolute addresses that are hard coded and
determined a priori. This is the case when kernels are loaded by BIOS,
for example. In contrast, when setup_data is read as a file, then we
shouldn't modify setup_data, since the absolute address will be wrong by
definition. This is the case when OVMF loads the image.
This allows setup_data to be used like normal, without crashing when EFI
tries to use it.
(As a small development note, strangely, fw_cfg_add_file_callback() was
exported but fw_cfg_add_bytes_callback() wasn't, so this makes that
consistent.)
Cc: Gerd Hoffmann <kraxel@redhat.com>
Cc: Laurent Vivier <laurent@vivier.eu>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Maydell <peter.maydell@linaro.org>
Cc: Philippe Mathieu-Daudé <f4bug@amsat.org>
Cc: Richard Henderson <richard.henderson@linaro.org>
Suggested-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Message-Id: <20220921093134.2936487-1-Jason@zx2c4.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
											
										 
											2022-09-21 11:31:31 +02:00
										 |  |  |     fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ADDR, prot_addr); | 
					
						
							|  |  |  |     fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_SIZE, kernel_size); | 
					
						
							|  |  |  |     sev_load_ctx.kernel_data = (char *)kernel; | 
					
						
							|  |  |  |     sev_load_ctx.kernel_size = kernel_size; | 
					
						
							| 
									
										
										
										
											2022-07-21 14:56:36 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-30 08:49:15 +03:00
										 |  |  |     /*
 | 
					
						
							|  |  |  |      * If we're starting an encrypted VM, it will be OVMF based, which uses the | 
					
						
							|  |  |  |      * efi stub for booting and doesn't require any values to be placed in the | 
					
						
							|  |  |  |      * kernel header.  We therefore don't update the header so the hash of the | 
					
						
							|  |  |  |      * kernel on the other side of the fw_cfg interface matches the hash of the | 
					
						
							|  |  |  |      * file the user passed in. | 
					
						
							|  |  |  |      */ | 
					
						
							| 
									
										
											  
											
												x86: don't let decompressed kernel image clobber setup_data
The setup_data links are appended to the compressed kernel image. Since
the kernel image is typically loaded at 0x100000, setup_data lives at
`0x100000 + compressed_size`, which does not get relocated during the
kernel's boot process.
The kernel typically decompresses the image starting at address
0x1000000 (note: there's one more zero there than the compressed image
above). This usually is fine for most kernels.
However, if the compressed image is actually quite large, then
setup_data will live at a `0x100000 + compressed_size` that extends into
the decompressed zone at 0x1000000. In other words, if compressed_size
is larger than `0x1000000 - 0x100000`, then the decompression step will
clobber setup_data, resulting in crashes.
Visually, what happens now is that QEMU appends setup_data to the kernel
image:
          kernel image            setup_data
   |--------------------------||----------------|
0x100000                  0x100000+l1     0x100000+l1+l2
The problem is that this decompresses to 0x1000000 (one more zero). So
if l1 is > (0x1000000-0x100000), then this winds up looking like:
          kernel image            setup_data
   |--------------------------||----------------|
0x100000                  0x100000+l1     0x100000+l1+l2
                                 d e c o m p r e s s e d   k e r n e l
                     |-------------------------------------------------------------|
                0x1000000                                                     0x1000000+l3
The decompressed kernel seemingly overwriting the compressed kernel
image isn't a problem, because that gets relocated to a higher address
early on in the boot process, at the end of startup_64. setup_data,
however, stays in the same place, since those links are self referential
and nothing fixes them up.  So the decompressed kernel clobbers it.
Fix this by appending setup_data to the cmdline blob rather than the
kernel image blob, which remains at a lower address that won't get
clobbered.
This could have been done by overwriting the initrd blob instead, but
that poses big difficulties, such as no longer being able to use memory
mapped files for initrd, hurting performance, and, more importantly, the
initrd address calculation is hard coded in qboot, and it always grows
down rather than up, which means lots of brittle semantics would have to
be changed around, incurring more complexity. In contrast, using cmdline
is simple and doesn't interfere with anything.
The microvm machine has a gross hack where it fiddles with fw_cfg data
after the fact. So this hack is updated to account for this appending,
by reserving some bytes.
Fixup-by: Michael S. Tsirkin <mst@redhat.com>
Cc: x86@kernel.org
Cc: Philippe Mathieu-Daudé <philmd@linaro.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Message-Id: <20221230220725.618763-1-Jason@zx2c4.com>
Message-ID: <20230128061015-mutt-send-email-mst@kernel.org>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Tested-by: Eric Biggers <ebiggers@google.com>
Tested-by: Mathias Krause <minipli@grsecurity.net>
											
										 
											2022-12-30 23:07:25 +01:00
										 |  |  |     if (!sev_enabled() && first_setup_data) { | 
					
						
							| 
									
										
											  
											
												x86: return modified setup_data only if read as memory, not as file
If setup_data is being read into a specific memory location, then
generally the setup_data address parameter is read first, so that the
caller knows where to read it into. In that case, we should return
setup_data containing the absolute addresses that are hard coded and
determined a priori. This is the case when kernels are loaded by BIOS,
for example. In contrast, when setup_data is read as a file, then we
shouldn't modify setup_data, since the absolute address will be wrong by
definition. This is the case when OVMF loads the image.
This allows setup_data to be used like normal, without crashing when EFI
tries to use it.
(As a small development note, strangely, fw_cfg_add_file_callback() was
exported but fw_cfg_add_bytes_callback() wasn't, so this makes that
consistent.)
Cc: Gerd Hoffmann <kraxel@redhat.com>
Cc: Laurent Vivier <laurent@vivier.eu>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Maydell <peter.maydell@linaro.org>
Cc: Philippe Mathieu-Daudé <f4bug@amsat.org>
Cc: Richard Henderson <richard.henderson@linaro.org>
Suggested-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Message-Id: <20220921093134.2936487-1-Jason@zx2c4.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
											
										 
											2022-09-21 11:31:31 +02:00
										 |  |  |         SetupDataFixup *fixup = g_malloc(sizeof(*fixup)); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-30 08:49:15 +03:00
										 |  |  |         memcpy(setup, header, MIN(sizeof(header), setup_size)); | 
					
						
							| 
									
										
											  
											
												x86: return modified setup_data only if read as memory, not as file
If setup_data is being read into a specific memory location, then
generally the setup_data address parameter is read first, so that the
caller knows where to read it into. In that case, we should return
setup_data containing the absolute addresses that are hard coded and
determined a priori. This is the case when kernels are loaded by BIOS,
for example. In contrast, when setup_data is read as a file, then we
shouldn't modify setup_data, since the absolute address will be wrong by
definition. This is the case when OVMF loads the image.
This allows setup_data to be used like normal, without crashing when EFI
tries to use it.
(As a small development note, strangely, fw_cfg_add_file_callback() was
exported but fw_cfg_add_bytes_callback() wasn't, so this makes that
consistent.)
Cc: Gerd Hoffmann <kraxel@redhat.com>
Cc: Laurent Vivier <laurent@vivier.eu>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Maydell <peter.maydell@linaro.org>
Cc: Philippe Mathieu-Daudé <f4bug@amsat.org>
Cc: Richard Henderson <richard.henderson@linaro.org>
Suggested-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Message-Id: <20220921093134.2936487-1-Jason@zx2c4.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
											
										 
											2022-09-21 11:31:31 +02:00
										 |  |  |         /* Offset 0x250 is a pointer to the first setup_data link. */ | 
					
						
							|  |  |  |         fixup->pos = setup + 0x250; | 
					
						
							|  |  |  |         fixup->orig_val = ldq_p(fixup->pos); | 
					
						
							|  |  |  |         fixup->new_val = first_setup_data; | 
					
						
							|  |  |  |         fixup->addr = cpu_to_le32(real_addr); | 
					
						
							|  |  |  |         fw_cfg_add_bytes_callback(fw_cfg, FW_CFG_SETUP_ADDR, fixup_setup_data, NULL, | 
					
						
							|  |  |  |                                   fixup, &fixup->addr, sizeof(fixup->addr), true); | 
					
						
							|  |  |  |         qemu_register_reset(reset_setup_data, fixup); | 
					
						
							|  |  |  |     } else { | 
					
						
							|  |  |  |         fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_ADDR, real_addr); | 
					
						
							| 
									
										
										
										
											2021-09-30 08:49:15 +03:00
										 |  |  |     } | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  |     fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_SIZE, setup_size); | 
					
						
							|  |  |  |     fw_cfg_add_bytes(fw_cfg, FW_CFG_SETUP_DATA, setup, setup_size); | 
					
						
							| 
									
										
										
										
											2021-09-30 08:49:15 +03:00
										 |  |  |     sev_load_ctx.setup_data = (char *)setup; | 
					
						
							|  |  |  |     sev_load_ctx.setup_size = setup_size; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (sev_enabled()) { | 
					
						
							|  |  |  |         sev_add_kernel_loader_hashes(&sev_load_ctx, &error_fatal); | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     option_rom[nb_option_roms].bootindex = 0; | 
					
						
							|  |  |  |     option_rom[nb_option_roms].name = "linuxboot.bin"; | 
					
						
							| 
									
										
										
										
											2019-09-30 17:26:29 +02:00
										 |  |  |     if (linuxboot_dma_enabled && fw_cfg_dma_enabled(fw_cfg)) { | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  |         option_rom[nb_option_roms].name = "linuxboot_dma.bin"; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     nb_option_roms++; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-26 10:30:18 -04:00
										 |  |  | void x86_bios_rom_init(MachineState *ms, const char *default_firmware, | 
					
						
							|  |  |  |                        MemoryRegion *rom_memory, bool isapc_ram_fw) | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | { | 
					
						
							| 
									
										
										
										
											2020-10-26 10:30:18 -04:00
										 |  |  |     const char *bios_name; | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  |     char *filename; | 
					
						
							|  |  |  |     MemoryRegion *bios, *isa_bios; | 
					
						
							|  |  |  |     int bios_size, isa_bios_size; | 
					
						
							| 
									
										
										
										
											2021-11-11 14:11:40 +00:00
										 |  |  |     ssize_t ret; | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     /* BIOS load */ | 
					
						
							| 
									
										
										
										
											2020-10-26 10:30:18 -04:00
										 |  |  |     bios_name = ms->firmware ?: default_firmware; | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  |     filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name); | 
					
						
							|  |  |  |     if (filename) { | 
					
						
							|  |  |  |         bios_size = get_image_size(filename); | 
					
						
							|  |  |  |     } else { | 
					
						
							|  |  |  |         bios_size = -1; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     if (bios_size <= 0 || | 
					
						
							|  |  |  |         (bios_size % 65536) != 0) { | 
					
						
							|  |  |  |         goto bios_error; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     bios = g_malloc(sizeof(*bios)); | 
					
						
							|  |  |  |     memory_region_init_ram(bios, NULL, "pc.bios", bios_size, &error_fatal); | 
					
						
							| 
									
										
										
										
											2022-04-25 15:50:51 +02:00
										 |  |  |     if (sev_enabled()) { | 
					
						
							|  |  |  |         /*
 | 
					
						
							|  |  |  |          * The concept of a "reset" simply doesn't exist for | 
					
						
							|  |  |  |          * confidential computing guests, we have to destroy and | 
					
						
							|  |  |  |          * re-launch them instead.  So there is no need to register | 
					
						
							|  |  |  |          * the firmware as rom to properly re-initialize on reset. | 
					
						
							|  |  |  |          * Just go for a straight file load instead. | 
					
						
							|  |  |  |          */ | 
					
						
							|  |  |  |         void *ptr = memory_region_get_ram_ptr(bios); | 
					
						
							|  |  |  |         load_image_size(filename, ptr, bios_size); | 
					
						
							|  |  |  |         x86_firmware_configure(ptr, bios_size); | 
					
						
							|  |  |  |     } else { | 
					
						
							|  |  |  |         if (!isapc_ram_fw) { | 
					
						
							|  |  |  |             memory_region_set_readonly(bios, true); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         ret = rom_add_file_fixed(bios_name, (uint32_t)(-bios_size), -1); | 
					
						
							|  |  |  |         if (ret != 0) { | 
					
						
							|  |  |  |             goto bios_error; | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  |     } | 
					
						
							|  |  |  |     g_free(filename); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     /* map the last 128KB of the BIOS in ISA space */ | 
					
						
							|  |  |  |     isa_bios_size = MIN(bios_size, 128 * KiB); | 
					
						
							|  |  |  |     isa_bios = g_malloc(sizeof(*isa_bios)); | 
					
						
							|  |  |  |     memory_region_init_alias(isa_bios, NULL, "isa-bios", bios, | 
					
						
							|  |  |  |                              bios_size - isa_bios_size, isa_bios_size); | 
					
						
							|  |  |  |     memory_region_add_subregion_overlap(rom_memory, | 
					
						
							|  |  |  |                                         0x100000 - isa_bios_size, | 
					
						
							|  |  |  |                                         isa_bios, | 
					
						
							|  |  |  |                                         1); | 
					
						
							|  |  |  |     if (!isapc_ram_fw) { | 
					
						
							|  |  |  |         memory_region_set_readonly(isa_bios, true); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     /* map all the bios at the top of memory */ | 
					
						
							|  |  |  |     memory_region_add_subregion(rom_memory, | 
					
						
							|  |  |  |                                 (uint32_t)(-bios_size), | 
					
						
							|  |  |  |                                 bios); | 
					
						
							| 
									
										
										
										
											2022-04-25 15:50:49 +02:00
										 |  |  |     return; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | bios_error: | 
					
						
							|  |  |  |     fprintf(stderr, "qemu: could not load PC BIOS '%s'\n", bios_name); | 
					
						
							|  |  |  |     exit(1); | 
					
						
							| 
									
										
										
										
											2019-10-08 11:56:49 +02:00
										 |  |  | } | 
					
						
							| 
									
										
										
										
											2019-10-22 09:39:50 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-09-15 14:09:01 +02:00
										 |  |  | bool x86_machine_is_smm_enabled(const X86MachineState *x86ms) | 
					
						
							| 
									
										
										
										
											2019-12-12 17:28:01 +01:00
										 |  |  | { | 
					
						
							|  |  |  |     bool smm_available = false; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (x86ms->smm == ON_OFF_AUTO_OFF) { | 
					
						
							|  |  |  |         return false; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (tcg_enabled() || qtest_enabled()) { | 
					
						
							|  |  |  |         smm_available = true; | 
					
						
							|  |  |  |     } else if (kvm_enabled()) { | 
					
						
							|  |  |  |         smm_available = kvm_has_smm(); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (smm_available) { | 
					
						
							|  |  |  |         return true; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (x86ms->smm == ON_OFF_AUTO_ON) { | 
					
						
							|  |  |  |         error_report("System Management Mode not supported by this hypervisor."); | 
					
						
							|  |  |  |         exit(1); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     return false; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static void x86_machine_get_smm(Object *obj, Visitor *v, const char *name, | 
					
						
							|  |  |  |                                void *opaque, Error **errp) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     X86MachineState *x86ms = X86_MACHINE(obj); | 
					
						
							|  |  |  |     OnOffAuto smm = x86ms->smm; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     visit_type_OnOffAuto(v, name, &smm, errp); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static void x86_machine_set_smm(Object *obj, Visitor *v, const char *name, | 
					
						
							|  |  |  |                                void *opaque, Error **errp) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     X86MachineState *x86ms = X86_MACHINE(obj); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     visit_type_OnOffAuto(v, name, &x86ms->smm, errp); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-09-15 14:09:01 +02:00
										 |  |  | bool x86_machine_is_acpi_enabled(const X86MachineState *x86ms) | 
					
						
							| 
									
										
										
										
											2020-03-20 11:01:36 +01:00
										 |  |  | { | 
					
						
							|  |  |  |     if (x86ms->acpi == ON_OFF_AUTO_OFF) { | 
					
						
							|  |  |  |         return false; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     return true; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static void x86_machine_get_acpi(Object *obj, Visitor *v, const char *name, | 
					
						
							|  |  |  |                                  void *opaque, Error **errp) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     X86MachineState *x86ms = X86_MACHINE(obj); | 
					
						
							|  |  |  |     OnOffAuto acpi = x86ms->acpi; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     visit_type_OnOffAuto(v, name, &acpi, errp); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static void x86_machine_set_acpi(Object *obj, Visitor *v, const char *name, | 
					
						
							|  |  |  |                                  void *opaque, Error **errp) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     X86MachineState *x86ms = X86_MACHINE(obj); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     visit_type_OnOffAuto(v, name, &x86ms->acpi, errp); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-03-10 20:28:10 +08:00
										 |  |  | static void x86_machine_get_pit(Object *obj, Visitor *v, const char *name, | 
					
						
							|  |  |  |                                     void *opaque, Error **errp) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     X86MachineState *x86ms = X86_MACHINE(obj); | 
					
						
							|  |  |  |     OnOffAuto pit = x86ms->pit; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     visit_type_OnOffAuto(v, name, &pit, errp); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static void x86_machine_set_pit(Object *obj, Visitor *v, const char *name, | 
					
						
							|  |  |  |                                     void *opaque, Error **errp) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     X86MachineState *x86ms = X86_MACHINE(obj);; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     visit_type_OnOffAuto(v, name, &x86ms->pit, errp); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-03-10 20:28:11 +08:00
										 |  |  | static void x86_machine_get_pic(Object *obj, Visitor *v, const char *name, | 
					
						
							|  |  |  |                                 void *opaque, Error **errp) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     X86MachineState *x86ms = X86_MACHINE(obj); | 
					
						
							|  |  |  |     OnOffAuto pic = x86ms->pic; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     visit_type_OnOffAuto(v, name, &pic, errp); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static void x86_machine_set_pic(Object *obj, Visitor *v, const char *name, | 
					
						
							|  |  |  |                                 void *opaque, Error **errp) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     X86MachineState *x86ms = X86_MACHINE(obj); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     visit_type_OnOffAuto(v, name, &x86ms->pic, errp); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-02-21 02:17:36 +02:00
										 |  |  | static char *x86_machine_get_oem_id(Object *obj, Error **errp) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     X86MachineState *x86ms = X86_MACHINE(obj); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return g_strdup(x86ms->oem_id); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static void x86_machine_set_oem_id(Object *obj, const char *value, Error **errp) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     X86MachineState *x86ms = X86_MACHINE(obj); | 
					
						
							|  |  |  |     size_t len = strlen(value); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (len > 6) { | 
					
						
							|  |  |  |         error_setg(errp, | 
					
						
							|  |  |  |                    "User specified "X86_MACHINE_OEM_ID" value is bigger than " | 
					
						
							|  |  |  |                    "6 bytes in size"); | 
					
						
							|  |  |  |         return; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     strncpy(x86ms->oem_id, value, 6); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static char *x86_machine_get_oem_table_id(Object *obj, Error **errp) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     X86MachineState *x86ms = X86_MACHINE(obj); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return g_strdup(x86ms->oem_table_id); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static void x86_machine_set_oem_table_id(Object *obj, const char *value, | 
					
						
							|  |  |  |                                          Error **errp) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     X86MachineState *x86ms = X86_MACHINE(obj); | 
					
						
							|  |  |  |     size_t len = strlen(value); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (len > 8) { | 
					
						
							|  |  |  |         error_setg(errp, | 
					
						
							|  |  |  |                    "User specified "X86_MACHINE_OEM_TABLE_ID | 
					
						
							|  |  |  |                    " value is bigger than " | 
					
						
							|  |  |  |                    "8 bytes in size"); | 
					
						
							|  |  |  |         return; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     strncpy(x86ms->oem_table_id, value, 8); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-05-21 12:38:20 +08:00
										 |  |  | static void x86_machine_get_bus_lock_ratelimit(Object *obj, Visitor *v, | 
					
						
							|  |  |  |                                 const char *name, void *opaque, Error **errp) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     X86MachineState *x86ms = X86_MACHINE(obj); | 
					
						
							|  |  |  |     uint64_t bus_lock_ratelimit = x86ms->bus_lock_ratelimit; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     visit_type_uint64(v, name, &bus_lock_ratelimit, errp); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static void x86_machine_set_bus_lock_ratelimit(Object *obj, Visitor *v, | 
					
						
							|  |  |  |                                const char *name, void *opaque, Error **errp) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     X86MachineState *x86ms = X86_MACHINE(obj); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     visit_type_uint64(v, name, &x86ms->bus_lock_ratelimit, errp); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												vl: Add sgx compound properties to expose SGX EPC sections to guest
Because SGX EPC is enumerated through CPUID, EPC "devices" need to be
realized prior to realizing the vCPUs themselves, i.e. long before
generic devices are parsed and realized.  From a virtualization
perspective, the CPUID aspect also means that EPC sections cannot be
hotplugged without paravirtualizing the guest kernel (hardware does
not support hotplugging as EPC sections must be locked down during
pre-boot to provide EPC's security properties).
So even though EPC sections could be realized through the generic
-devices command, they need to be created much earlier for them to
actually be usable by the guest.  Place all EPC sections in a
contiguous block, somewhat arbitrarily starting after RAM above 4g.
Ensuring EPC is in a contiguous region simplifies calculations, e.g.
device memory base, PCI hole, etc..., allows dynamic calculation of the
total EPC size, e.g. exposing EPC to guests does not require -maxmem,
and last but not least allows all of EPC to be enumerated in a single
ACPI entry, which is expected by some kernels, e.g. Windows 7 and 8.
The new compound properties command for sgx like below:
 ......
 -object memory-backend-epc,id=mem1,size=28M,prealloc=on \
 -object memory-backend-epc,id=mem2,size=10M \
 -M sgx-epc.0.memdev=mem1,sgx-epc.1.memdev=mem2
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Yang Zhong <yang.zhong@intel.com>
Message-Id: <20210719112136.57018-6-yang.zhong@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
											
										 
											2021-09-28 10:40:58 +02:00
										 |  |  | static void machine_get_sgx_epc(Object *obj, Visitor *v, const char *name, | 
					
						
							|  |  |  |                                 void *opaque, Error **errp) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     X86MachineState *x86ms = X86_MACHINE(obj); | 
					
						
							|  |  |  |     SgxEPCList *list = x86ms->sgx_epc_list; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     visit_type_SgxEPCList(v, name, &list, errp); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static void machine_set_sgx_epc(Object *obj, Visitor *v, const char *name, | 
					
						
							|  |  |  |                                 void *opaque, Error **errp) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     X86MachineState *x86ms = X86_MACHINE(obj); | 
					
						
							|  |  |  |     SgxEPCList *list; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     list = x86ms->sgx_epc_list; | 
					
						
							|  |  |  |     visit_type_SgxEPCList(v, name, &x86ms->sgx_epc_list, errp); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     qapi_free_SgxEPCList(list); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-10-22 09:39:50 +02:00
										 |  |  | static void x86_machine_initfn(Object *obj) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     X86MachineState *x86ms = X86_MACHINE(obj); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-12-12 17:28:01 +01:00
										 |  |  |     x86ms->smm = ON_OFF_AUTO_AUTO; | 
					
						
							| 
									
										
										
										
											2020-03-20 11:01:36 +01:00
										 |  |  |     x86ms->acpi = ON_OFF_AUTO_AUTO; | 
					
						
							| 
									
										
										
										
											2022-03-10 20:28:10 +08:00
										 |  |  |     x86ms->pit = ON_OFF_AUTO_AUTO; | 
					
						
							| 
									
										
										
										
											2022-03-10 20:28:11 +08:00
										 |  |  |     x86ms->pic = ON_OFF_AUTO_AUTO; | 
					
						
							| 
									
										
										
										
											2020-10-16 13:38:31 +02:00
										 |  |  |     x86ms->pci_irq_mask = ACPI_BUILD_PCI_IRQS; | 
					
						
							| 
									
										
										
										
											2021-02-21 02:17:36 +02:00
										 |  |  |     x86ms->oem_id = g_strndup(ACPI_BUILD_APPNAME6, 6); | 
					
						
							|  |  |  |     x86ms->oem_table_id = g_strndup(ACPI_BUILD_APPNAME8, 8); | 
					
						
							| 
									
										
										
										
											2021-05-21 12:38:20 +08:00
										 |  |  |     x86ms->bus_lock_ratelimit = 0; | 
					
						
							| 
									
										
										
										
											2022-07-19 18:00:04 +01:00
										 |  |  |     x86ms->above_4g_mem_start = 4 * GiB; | 
					
						
							| 
									
										
										
										
											2019-10-22 09:39:50 +02:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static void x86_machine_class_init(ObjectClass *oc, void *data) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     MachineClass *mc = MACHINE_CLASS(oc); | 
					
						
							|  |  |  |     X86MachineClass *x86mc = X86_MACHINE_CLASS(oc); | 
					
						
							|  |  |  |     NMIClass *nc = NMI_CLASS(oc); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     mc->cpu_index_to_instance_props = x86_cpu_index_to_props; | 
					
						
							|  |  |  |     mc->get_default_cpu_node_id = x86_get_default_cpu_node_id; | 
					
						
							|  |  |  |     mc->possible_cpu_arch_ids = x86_possible_cpu_arch_ids; | 
					
						
							| 
									
										
										
										
											2019-11-18 11:13:25 +00:00
										 |  |  |     x86mc->save_tsc_khz = true; | 
					
						
							| 
									
										
										
										
											2021-10-20 14:48:10 +02:00
										 |  |  |     x86mc->fwcfg_dma_enabled = true; | 
					
						
							| 
									
										
										
										
											2019-10-22 09:39:50 +02:00
										 |  |  |     nc->nmi_monitor_handler = x86_nmi; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-12-12 17:28:01 +01:00
										 |  |  |     object_class_property_add(oc, X86_MACHINE_SMM, "OnOffAuto", | 
					
						
							|  |  |  |         x86_machine_get_smm, x86_machine_set_smm, | 
					
						
							| 
									
										
											  
											
												qom: Drop parameter @errp of object_property_add() & friends
The only way object_property_add() can fail is when a property with
the same name already exists.  Since our property names are all
hardcoded, failure is a programming error, and the appropriate way to
handle it is passing &error_abort.
Same for its variants, except for object_property_add_child(), which
additionally fails when the child already has a parent.  Parentage is
also under program control, so this is a programming error, too.
We have a bit over 500 callers.  Almost half of them pass
&error_abort, slightly fewer ignore errors, one test case handles
errors, and the remaining few callers pass them to their own callers.
The previous few commits demonstrated once again that ignoring
programming errors is a bad idea.
Of the few ones that pass on errors, several violate the Error API.
The Error ** argument must be NULL, &error_abort, &error_fatal, or a
pointer to a variable containing NULL.  Passing an argument of the
latter kind twice without clearing it in between is wrong: if the
first call sets an error, it no longer points to NULL for the second
call.  ich9_pm_add_properties(), sparc32_ledma_realize(),
sparc32_dma_realize(), xilinx_axidma_realize(), xilinx_enet_realize()
are wrong that way.
When the one appropriate choice of argument is &error_abort, letting
users pick the argument is a bad idea.
Drop parameter @errp and assert the preconditions instead.
There's one exception to "duplicate property name is a programming
error": the way object_property_add() implements the magic (and
undocumented) "automatic arrayification".  Don't drop @errp there.
Instead, rename object_property_add() to object_property_try_add(),
and add the obvious wrapper object_property_add().
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20200505152926.18877-15-armbru@redhat.com>
[Two semantic rebase conflicts resolved]
											
										 
											2020-05-05 17:29:22 +02:00
										 |  |  |         NULL, NULL); | 
					
						
							| 
									
										
										
										
											2019-12-12 17:28:01 +01:00
										 |  |  |     object_class_property_set_description(oc, X86_MACHINE_SMM, | 
					
						
							| 
									
										
										
										
											2020-05-05 17:29:15 +02:00
										 |  |  |         "Enable SMM"); | 
					
						
							| 
									
										
										
										
											2020-03-20 11:01:36 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  |     object_class_property_add(oc, X86_MACHINE_ACPI, "OnOffAuto", | 
					
						
							|  |  |  |         x86_machine_get_acpi, x86_machine_set_acpi, | 
					
						
							| 
									
										
											  
											
												qom: Drop parameter @errp of object_property_add() & friends
The only way object_property_add() can fail is when a property with
the same name already exists.  Since our property names are all
hardcoded, failure is a programming error, and the appropriate way to
handle it is passing &error_abort.
Same for its variants, except for object_property_add_child(), which
additionally fails when the child already has a parent.  Parentage is
also under program control, so this is a programming error, too.
We have a bit over 500 callers.  Almost half of them pass
&error_abort, slightly fewer ignore errors, one test case handles
errors, and the remaining few callers pass them to their own callers.
The previous few commits demonstrated once again that ignoring
programming errors is a bad idea.
Of the few ones that pass on errors, several violate the Error API.
The Error ** argument must be NULL, &error_abort, &error_fatal, or a
pointer to a variable containing NULL.  Passing an argument of the
latter kind twice without clearing it in between is wrong: if the
first call sets an error, it no longer points to NULL for the second
call.  ich9_pm_add_properties(), sparc32_ledma_realize(),
sparc32_dma_realize(), xilinx_axidma_realize(), xilinx_enet_realize()
are wrong that way.
When the one appropriate choice of argument is &error_abort, letting
users pick the argument is a bad idea.
Drop parameter @errp and assert the preconditions instead.
There's one exception to "duplicate property name is a programming
error": the way object_property_add() implements the magic (and
undocumented) "automatic arrayification".  Don't drop @errp there.
Instead, rename object_property_add() to object_property_try_add(),
and add the obvious wrapper object_property_add().
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20200505152926.18877-15-armbru@redhat.com>
[Two semantic rebase conflicts resolved]
											
										 
											2020-05-05 17:29:22 +02:00
										 |  |  |         NULL, NULL); | 
					
						
							| 
									
										
										
										
											2020-03-20 11:01:36 +01:00
										 |  |  |     object_class_property_set_description(oc, X86_MACHINE_ACPI, | 
					
						
							| 
									
										
										
										
											2020-05-05 17:29:15 +02:00
										 |  |  |         "Enable ACPI"); | 
					
						
							| 
									
										
										
										
											2021-02-21 02:17:36 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-03-10 20:28:10 +08:00
										 |  |  |     object_class_property_add(oc, X86_MACHINE_PIT, "OnOffAuto", | 
					
						
							|  |  |  |                               x86_machine_get_pit, | 
					
						
							|  |  |  |                               x86_machine_set_pit, | 
					
						
							|  |  |  |                               NULL, NULL); | 
					
						
							|  |  |  |     object_class_property_set_description(oc, X86_MACHINE_PIT, | 
					
						
							|  |  |  |         "Enable i8254 PIT"); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-03-10 20:28:11 +08:00
										 |  |  |     object_class_property_add(oc, X86_MACHINE_PIC, "OnOffAuto", | 
					
						
							|  |  |  |                               x86_machine_get_pic, | 
					
						
							|  |  |  |                               x86_machine_set_pic, | 
					
						
							|  |  |  |                               NULL, NULL); | 
					
						
							|  |  |  |     object_class_property_set_description(oc, X86_MACHINE_PIC, | 
					
						
							|  |  |  |         "Enable i8259 PIC"); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-02-21 02:17:36 +02:00
										 |  |  |     object_class_property_add_str(oc, X86_MACHINE_OEM_ID, | 
					
						
							|  |  |  |                                   x86_machine_get_oem_id, | 
					
						
							|  |  |  |                                   x86_machine_set_oem_id); | 
					
						
							|  |  |  |     object_class_property_set_description(oc, X86_MACHINE_OEM_ID, | 
					
						
							|  |  |  |                                           "Override the default value of field OEMID " | 
					
						
							|  |  |  |                                           "in ACPI table header." | 
					
						
							|  |  |  |                                           "The string may be up to 6 bytes in size"); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     object_class_property_add_str(oc, X86_MACHINE_OEM_TABLE_ID, | 
					
						
							|  |  |  |                                   x86_machine_get_oem_table_id, | 
					
						
							|  |  |  |                                   x86_machine_set_oem_table_id); | 
					
						
							|  |  |  |     object_class_property_set_description(oc, X86_MACHINE_OEM_TABLE_ID, | 
					
						
							|  |  |  |                                           "Override the default value of field OEM Table ID " | 
					
						
							|  |  |  |                                           "in ACPI table header." | 
					
						
							|  |  |  |                                           "The string may be up to 8 bytes in size"); | 
					
						
							| 
									
										
										
										
											2021-05-21 12:38:20 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     object_class_property_add(oc, X86_MACHINE_BUS_LOCK_RATELIMIT, "uint64_t", | 
					
						
							|  |  |  |                                 x86_machine_get_bus_lock_ratelimit, | 
					
						
							|  |  |  |                                 x86_machine_set_bus_lock_ratelimit, NULL, NULL); | 
					
						
							|  |  |  |     object_class_property_set_description(oc, X86_MACHINE_BUS_LOCK_RATELIMIT, | 
					
						
							|  |  |  |             "Set the ratelimit for the bus locks acquired in VMs"); | 
					
						
							| 
									
										
											  
											
												vl: Add sgx compound properties to expose SGX EPC sections to guest
Because SGX EPC is enumerated through CPUID, EPC "devices" need to be
realized prior to realizing the vCPUs themselves, i.e. long before
generic devices are parsed and realized.  From a virtualization
perspective, the CPUID aspect also means that EPC sections cannot be
hotplugged without paravirtualizing the guest kernel (hardware does
not support hotplugging as EPC sections must be locked down during
pre-boot to provide EPC's security properties).
So even though EPC sections could be realized through the generic
-devices command, they need to be created much earlier for them to
actually be usable by the guest.  Place all EPC sections in a
contiguous block, somewhat arbitrarily starting after RAM above 4g.
Ensuring EPC is in a contiguous region simplifies calculations, e.g.
device memory base, PCI hole, etc..., allows dynamic calculation of the
total EPC size, e.g. exposing EPC to guests does not require -maxmem,
and last but not least allows all of EPC to be enumerated in a single
ACPI entry, which is expected by some kernels, e.g. Windows 7 and 8.
The new compound properties command for sgx like below:
 ......
 -object memory-backend-epc,id=mem1,size=28M,prealloc=on \
 -object memory-backend-epc,id=mem2,size=10M \
 -M sgx-epc.0.memdev=mem1,sgx-epc.1.memdev=mem2
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Yang Zhong <yang.zhong@intel.com>
Message-Id: <20210719112136.57018-6-yang.zhong@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
											
										 
											2021-09-28 10:40:58 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     object_class_property_add(oc, "sgx-epc", "SgxEPC", | 
					
						
							|  |  |  |         machine_get_sgx_epc, machine_set_sgx_epc, | 
					
						
							|  |  |  |         NULL, NULL); | 
					
						
							|  |  |  |     object_class_property_set_description(oc, "sgx-epc", | 
					
						
							|  |  |  |         "SGX EPC device"); | 
					
						
							| 
									
										
										
										
											2019-10-22 09:39:50 +02:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static const TypeInfo x86_machine_info = { | 
					
						
							|  |  |  |     .name = TYPE_X86_MACHINE, | 
					
						
							|  |  |  |     .parent = TYPE_MACHINE, | 
					
						
							|  |  |  |     .abstract = true, | 
					
						
							|  |  |  |     .instance_size = sizeof(X86MachineState), | 
					
						
							|  |  |  |     .instance_init = x86_machine_initfn, | 
					
						
							|  |  |  |     .class_size = sizeof(X86MachineClass), | 
					
						
							|  |  |  |     .class_init = x86_machine_class_init, | 
					
						
							|  |  |  |     .interfaces = (InterfaceInfo[]) { | 
					
						
							|  |  |  |          { TYPE_NMI }, | 
					
						
							|  |  |  |          { } | 
					
						
							|  |  |  |     }, | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static void x86_machine_register_types(void) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     type_register_static(&x86_machine_info); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | type_init(x86_machine_register_types) |