Sync from SUSE:ALP:Source:Standard:1.0 xen revision 2e30689375849799569c0a388d847125

This commit is contained in:
Adrian Schröter 2023-12-21 13:48:34 +01:00
commit e97948ff44
91 changed files with 26737 additions and 0 deletions

23
.gitattributes vendored Normal file
View File

@ -0,0 +1,23 @@
## Default LFS
*.7z filter=lfs diff=lfs merge=lfs -text
*.bsp filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.gem filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.jar filter=lfs diff=lfs merge=lfs -text
*.lz filter=lfs diff=lfs merge=lfs -text
*.lzma filter=lfs diff=lfs merge=lfs -text
*.obscpio filter=lfs diff=lfs merge=lfs -text
*.oxt filter=lfs diff=lfs merge=lfs -text
*.pdf filter=lfs diff=lfs merge=lfs -text
*.png filter=lfs diff=lfs merge=lfs -text
*.rpm filter=lfs diff=lfs merge=lfs -text
*.tbz filter=lfs diff=lfs merge=lfs -text
*.tbz2 filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.ttf filter=lfs diff=lfs merge=lfs -text
*.txz filter=lfs diff=lfs merge=lfs -text
*.whl filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text

View File

@ -0,0 +1,31 @@
# Commit 26a449ce32cef33f2cb50602be19fcc0c4223ba9
# Date 2023-11-02 10:50:26 +0100
# Author Roger Pau Monné <roger.pau@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/x2apic: remove usage of ACPI_FADT_APIC_CLUSTER
The ACPI FADT APIC_CLUSTER flag mandates that when the interrupt delivery is
Logical mode APIC must be configured for Cluster destination model. However in
apic_x2apic_probe() such flag is incorrectly used to gate whether Physical mode
can be used.
Since Xen when in x2APIC mode only uses Logical mode together with Cluster
model completely remove checking for ACPI_FADT_APIC_CLUSTER, as Xen always
fulfills the requirement signaled by the flag.
Fixes: eb40ae41b658 ('x86/Kconfig: add option for default x2APIC destination mode')
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/arch/x86/genapic/x2apic.c
+++ b/xen/arch/x86/genapic/x2apic.c
@@ -231,8 +231,7 @@ const struct genapic *__init apic_x2apic
*/
x2apic_phys = iommu_intremap != iommu_intremap_full ||
(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL) ||
- (IS_ENABLED(CONFIG_X2APIC_PHYSICAL) &&
- !(acpi_gbl_FADT.flags & ACPI_FADT_APIC_CLUSTER));
+ IS_ENABLED(CONFIG_X2APIC_PHYSICAL);
}
else if ( !x2apic_phys )
switch ( iommu_intremap )

View File

@ -0,0 +1,103 @@
# Commit 87f37449d586b4d407b75235bb0a171e018e25ec
# Date 2023-11-02 10:50:59 +0100
# Author Roger Pau Monné <roger.pau@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/i8259: do not assume interrupts always target CPU0
Sporadically we have seen the following during AP bringup on AMD platforms
only:
microcode: CPU59 updated from revision 0x830107a to 0x830107a, date = 2023-05-17
microcode: CPU60 updated from revision 0x830104d to 0x830107a, date = 2023-05-17
CPU60: No irq handler for vector 27 (IRQ -2147483648)
microcode: CPU61 updated from revision 0x830107a to 0x830107a, date = 2023-05-17
This is similar to the issue raised on Linux commit 36e9e1eab777e, where they
observed i8259 (active) vectors getting delivered to CPUs different than 0.
On AMD or Hygon platforms adjust the target CPU mask of i8259 interrupt
descriptors to contain all possible CPUs, so that APs will reserve the vector
at startup if any legacy IRQ is still delivered through the i8259. Note that
if the IO-APIC takes over those interrupt descriptors the CPU mask will be
reset.
Spurious i8259 interrupt vectors however (IRQ7 and IRQ15) can be injected even
when all i8259 pins are masked, and hence would need to be handled on all CPUs.
Continue to reserve PIC vectors on CPU0 only, but do check for such spurious
interrupts on all CPUs if the vendor is AMD or Hygon. Note that once the
vectors get used by devices detecting PIC spurious interrupts will no longer be
possible, however the device driver should be able to cope with spurious
interrupts. Such PIC spurious interrupts occurring when the vector is in use
by a local APIC routed source will lead to an extra EOI, which might
unintentionally clear a different vector from ISR. Note this is already the
current behavior, so assume it's infrequent enough to not cause real issues.
Finally, adjust the printed message to display the CPU where the spurious
interrupt has been received, so it looks like:
microcode: CPU1 updated from revision 0x830107a to 0x830107a, date = 2023-05-17
cpu1: spurious 8259A interrupt: IRQ7
microcode: CPU2 updated from revision 0x830104d to 0x830107a, date = 2023-05-17
Amends: 3fba06ba9f8b ('x86/IRQ: re-use legacy vector ranges on APs')
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/arch/x86/i8259.c
+++ b/xen/arch/x86/i8259.c
@@ -222,7 +222,8 @@ static bool _mask_and_ack_8259A_irq(unsi
is_real_irq = false;
/* Report spurious IRQ, once per IRQ line. */
if (!(spurious_irq_mask & irqmask)) {
- printk("spurious 8259A interrupt: IRQ%d.\n", irq);
+ printk("cpu%u: spurious 8259A interrupt: IRQ%u\n",
+ smp_processor_id(), irq);
spurious_irq_mask |= irqmask;
}
/*
@@ -349,7 +350,23 @@ void __init init_IRQ(void)
continue;
desc->handler = &i8259A_irq_type;
per_cpu(vector_irq, cpu)[LEGACY_VECTOR(irq)] = irq;
- cpumask_copy(desc->arch.cpu_mask, cpumask_of(cpu));
+
+ /*
+ * The interrupt affinity logic never targets interrupts to offline
+ * CPUs, hence it's safe to use cpumask_all here.
+ *
+ * Legacy PIC interrupts are only targeted to CPU0, but depending on
+ * the platform they can be distributed to any online CPU in hardware.
+ * Note this behavior has only been observed on AMD hardware. In order
+ * to cope install all active legacy vectors on all CPUs.
+ *
+ * IO-APIC will change the destination mask if/when taking ownership of
+ * the interrupt.
+ */
+ cpumask_copy(desc->arch.cpu_mask,
+ (boot_cpu_data.x86_vendor &
+ (X86_VENDOR_AMD | X86_VENDOR_HYGON) ? &cpumask_all
+ : cpumask_of(cpu)));
desc->arch.vector = LEGACY_VECTOR(irq);
}
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -1920,7 +1920,16 @@ void do_IRQ(struct cpu_user_regs *regs)
kind = "";
if ( !(vector >= FIRST_LEGACY_VECTOR &&
vector <= LAST_LEGACY_VECTOR &&
- !smp_processor_id() &&
+ (!smp_processor_id() ||
+ /*
+ * For AMD/Hygon do spurious PIC interrupt
+ * detection on all CPUs, as it has been observed
+ * that during unknown circumstances spurious PIC
+ * interrupts have been delivered to CPUs
+ * different than the BSP.
+ */
+ (boot_cpu_data.x86_vendor & (X86_VENDOR_AMD |
+ X86_VENDOR_HYGON))) &&
bogus_8259A_irq(vector - FIRST_LEGACY_VECTOR)) )
{
printk("CPU%u: No irq handler for vector %02x (IRQ %d%s)\n",

View File

@ -0,0 +1,70 @@
# Commit 4709ec82917668c2df958ef91b4f21c049c76bee
# Date 2023-11-20 10:49:29 +0100
# Author Juergen Gross <jgross@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
xen/sched: fix sched_move_domain()
When moving a domain out of a cpupool running with the credit2
scheduler and having multiple run-queues, the following ASSERT() can
be observed:
(XEN) Xen call trace:
(XEN) [<ffff82d04023a700>] R credit2.c#csched2_unit_remove+0xe3/0xe7
(XEN) [<ffff82d040246adb>] S sched_move_domain+0x2f3/0x5b1
(XEN) [<ffff82d040234cf7>] S cpupool.c#cpupool_move_domain_locked+0x1d/0x3b
(XEN) [<ffff82d040236025>] S cpupool_move_domain+0x24/0x35
(XEN) [<ffff82d040206513>] S domain_kill+0xa5/0x116
(XEN) [<ffff82d040232b12>] S do_domctl+0xe5f/0x1951
(XEN) [<ffff82d0402276ba>] S timer.c#timer_lock+0x69/0x143
(XEN) [<ffff82d0402dc71b>] S pv_hypercall+0x44e/0x4a9
(XEN) [<ffff82d0402012b7>] S lstar_enter+0x137/0x140
(XEN)
(XEN)
(XEN) ****************************************
(XEN) Panic on CPU 1:
(XEN) Assertion 'svc->rqd == c2rqd(sched_unit_master(unit))' failed at common/sched/credit2.c:1159
(XEN) ****************************************
This is happening as sched_move_domain() is setting a different cpu
for a scheduling unit without telling the scheduler. When this unit is
removed from the scheduler, the ASSERT() will trigger.
In non-debug builds the result is usually a clobbered pointer, leading
to another crash a short time later.
Fix that by swapping the two involved actions (setting another cpu and
removing the unit from the scheduler).
Link: https://github.com/Dasharo/dasharo-issues/issues/488
Fixes: 70fadc41635b ("xen/cpupool: support moving domain between cpupools with different granularity")
Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: George Dunlap <george.dunlap@cloud.com>
--- a/xen/common/sched/core.c
+++ b/xen/common/sched/core.c
@@ -732,18 +732,20 @@ int sched_move_domain(struct domain *d,
old_domdata = d->sched_priv;
/*
- * Temporarily move all units to same processor to make locking
- * easier when moving the new units to the new processors.
+ * Remove all units from the old scheduler, and temporarily move them to
+ * the same processor to make locking easier when moving the new units to
+ * new processors.
*/
new_p = cpumask_first(d->cpupool->cpu_valid);
for_each_sched_unit ( d, unit )
{
- spinlock_t *lock = unit_schedule_lock_irq(unit);
+ spinlock_t *lock;
+ sched_remove_unit(old_ops, unit);
+
+ lock = unit_schedule_lock_irq(unit);
sched_set_res(unit, get_sched_res(new_p));
spin_unlock_irq(lock);
-
- sched_remove_unit(old_ops, unit);
}
old_units = d->sched_unit_list;

704
README.SUSE Normal file
View File

@ -0,0 +1,704 @@
README for the Xen packages
===========================
This file contains SUSE-specific instructions and suggestions for using Xen.
For more in-depth documentation of using Xen on SUSE, consult the
virtualization chapter in the SLES or SUSE Linux manual, or read up-to-date
virtualization information, at
https://www.suse.com/documentation/sles11/singlehtml/book_xen/book_xen.html
For more complete documentation on Xen itself, please install the xen-doc-html
package and read the documentation installed into /usr/share/doc/packages/xen/.
About
-----
Xen allows you to run multiple virtual machines on a single physical machine.
See the Xen homepage for more information:
http://www.xenproject.org/
If you want to use Xen, you need to install the Xen hypervisor and a number of
supporting packages. During the initial SUSE installation (or when installing
from YaST) check-mark the "Xen Virtual Machine Host Server" pattern. If,
instead, you wish to install Xen manually later, click on the "Install
Hypervisor and Tools" icon in YaST.
If you want to install and manage VMs graphically, be sure to install a
graphical desktop environment like KDE or GNOME. The following optional
packages are needed to manage VMs graphically. Note that "Install Hypervisor
and Tools" installs all the packages below:
virt-install (Optional, to install VMs)
virt-manager (Optional, to manage VMs graphically)
virt-viewer (Optional, to view VMs outside virt-manager)
vm-install (Optional, to install VMs with xl only)
You then need to reboot your machine. Instead of booting a normal Linux
kernel, you will boot the Xen hypervisor and a slightly changed Linux kernel.
This Linux kernel runs in the first virtual machine and will drive most of
your hardware.
This approach is called paravirtualization, since it is a partial
virtualization (the Linux kernel needs to be changed slightly, to make the
virtualization easier). It results in very good performance (consult
http://www.cl.cam.ac.uk/research/srg/netos/xen/performance.html) but has the
downside of unchanged operating systems not being supported. However, new
hardware features (e.g., Intel VT and AMD-V) are overcoming this limitation.
Terminology
-----------
The Xen open-source community has a number of terms that you should be
familiar with.
A "domain" is Xen's term for a virtual machine.
"Domain 0" is the first virtual machine. It can control all other virtual
machines. It also (usually) controls the physical hardware. A kernel used in
domain 0 may sometimes be referred to as a dom0 kernel.
"Domain U" is any virtual machine other than domain 0. The "U" indicates it
is unprivileged (that is, it cannot control other domains). A kernel used in
an unprivileged domain may be referred to as a domU kernel.
SUSE documentation will use the more industry-standard term "virtual
machine", or "VM", rather than "domain" where possible. And to that end,
domain 0 will be called the "virtual machine server", since it is essentially the
server on which the other VMs run. All other domains are simply "virtual
machines".
The acronym "HVM" refers to a hardware-assisted virtual machine. These are
VMs that have not been modified (e.g., Windows) and therefore need hardware
support such as Intel VT or AMD-V to run on Xen.
Kernels
-------
Xen supports two kinds of kernels: A privileged kernel (which boots the
machine, controls other VMs, and usually controls all your physical hardware)
and unprivileged kernels (which can't control other VMs, and usually don't need
drivers for physical hardware). The privileged kernel boots first (as the VM
server); an unprivileged kernel is used in all subsequent VMs.
The VM server takes control of the boot process after Xen has initialized the
CPU and the memory. This VM contains a privileged kernel and all the hardware
drivers.
For the other virtual machines, you usually don't need the hardware drivers.
(It is possible to hide a PCI device from the VM server and re-assign it to
another VM for direct access, but that is a more advanced topic.) Instead you
use virtual network and block device drivers in the unprivileged VMs to access
the physical network and block drivers in the VM server.
For simplicity, SUSE ships a single Xen-enabled Linux kernel, rather than
separate privileged and unprivileged kernels. As most of the hardware drivers
are modules anyway, using this kernel as an unprivileged kernel has very
little extra overhead.
The kernel is contained in the kernel-xen package, which you need to install to
use Xen.
Booting
-------
If you installed Xen during the initial SUSE installation, or installed one
of the kernel-xen* packages later, a "XEN" option should exist in your Grub
bootloader. Select that to boot SUSE on top of Xen.
If you want to add additional entries, or modify the existing ones, you may
run the YaST2 Boot Loader program.
Once you have booted this configuration successfully, you are running Xen with
a privileged kernel on top of it.
Xen Boot Parameters
-------------------
Normally, xen.gz requires no parameters. However, in special cases (such as
debugging or a dedicated VM server) you may wish to pass it parameters.
Adding parameters to xen.gz can be done by editing the /etc/default/grub file.
Add the following line to this file: GRUB_CMDLINE_XEN_DEFAULT="<parameters>". The
parameters may be valid options passed to xen.gz (the hypervisor). After
editing this file, you must first run 'grub2-mkconfig -o /boot/grub2/grub.cfg'
and then reboot for the changes to take effect.
For more information on how to add options to the hypervisor, see the sections
below called "Dom0 Memory Ballooning" and "Troubleshooting".
For a more complete discussion of possible parameters, see the user
documentation in the xen-doc-html package.
Creating a VM with virt-install
-------------------------------
The virt-install program (part of the virt-install package, and accessible
through YaST's Control Center) is the recommended method to create VMs. This
program handles creating both the VM's libvirt XML definition and disk(s).
It can help install any operating system, not just SUSE. virt-install has both
a command line only mode and a graphical wizard mode that may be used to define
and start VM installations.
virt-install may be launched from the virt-manager VM management tool. Start
virt-manager either from the YaST Control Center or from the command line.
The installation icon from the main virt-manager screen may be selected to
begin the virt-install installation wizard.
The use of virt-install or virt-manager requires the installation of the
libvirt packages and the libvirt daemon must be running on the host unless
you are managing a remote host.
Each VM needs to have its own root filesystem. The root filesystem can live
on a block device (e.g., a hard disk partition, or an LVM2 or EVMS volume) or
in a file that holds the filesystem image.
VMs can share filesystems, such as /usr or /opt, that are mounted read-only
from _all_ VMs. Never try to share a filesystem that is mounted read-write;
filesystem corruption will result. For sharing writable data between VMs, use
NFS or other networked or cluster filesystems.
When defining the virtual network adapter(s), we recommend using a static MAC
for the VM rather than allowing Xen to randomly select one each time the VM
boots. (See "Network Troubleshooting" below.) The Xen Project has been
allocated a range of MAC addresses with the OUI of 00-16-3E. By using MACs
from this range you can be sure they will not conflict with any physical
adapters.
When the VM shuts down (because the installation -- or at least the first
stage of it -- is done), the wizard finalizes the VM's configuration and
restarts the VM.
The creation of VMs can be automated; read the virt-install man page for more
details. The installation of an OS within the VM can be automated if the OS
supports it.
Creating a VM with vm-install
-----------------------------
The vm-install program is also provided to create VMs. Like virt-install,
this optional program handles creating both the VM's libvirt XML definition
and disk(s). It also creates a legacy configuration file for use with 'xl'.
It can help install any operating system, not just SUSE.
From the command line, run "vm-install". If the DISPLAY environment variable
is set and the supporting packages (python-gtk) are installed, a graphical
wizard will start. Otherwise, a text wizard will start. If vm-install is
started with the '--use-xl' flag, it will not require libvirt nor attempt
to communicate with libvirt when creating a VM and instead will only use the
'xl' toolstack to start VM installations.
Once you have the VM configured, click "OK". The wizard will now create a
configuration file for the VM, and create a disk image. The disk image will
exist in /var/lib/xen/images, and a corresponding configuration file will exist
in /etc/xen/vm. The operating system's installation program will then run
within the VM.
When the VM shuts down (because the installation -- or at least the first
stage of it -- is done), the wizard finalizes the VM's configuration and
restarts the VM.
The creation of VMs can be automated; read the vm-install man page for more
details. The installation of an OS within the VM can be automated if the OS
supports it.
Creating a VM Manually
----------------------
If you create a VM manually (as opposed to using virt-install, which is the
recommended way), you will need to create a disk (or reuse an existing one)
and a configuration file.
If you are using a disk or disk image that is already installed with an
operating system and you want the VM to run in paravirtual mode, you'll
probably need to replace its kernel with a Xen-enabled kernel.
The kernel and ramdisk used to bootstrap the VM must match any kernel modules
that might be present in the VM's disk. It is possible to manually copy the
kernel and ramdisk from the VM's disk (for example, after updating the kernel
within that VM) to the VM server's filesystem. However, an easier (and less
error-prone) method is to use /usr/lib/grub2/x86_64-xen/grub.xen as the VM
kernel. When the new VM is started, it runs grub.xen to read the grub
configuration from the VM disk, selecting the configured kernel and ramdisk
so that it can be used to bootstrap the new VM.
Next, make a copy of one of the /etc/xen/examples/* files, and modify it to
suit your needs. You'll need to change (at very least) the "name" and "disk"
parameters. See /etc/xen/examples/ for example configuration files.
Managing Virtual Machines
-------------------------
VMs can be managed from the command line using 'virsh' or from virt-manager.
VMs created by virt-install or vm-install (without vm-install's --use-xl flag)
will automatically be defined in libvirt. VMs defined in libvirt may be managed
by virt-manager or from the command line using the 'virsh' command. However,
if you copy a VM from another machine and manually create a VM XML configuration
file, you will need to import it into libvirt with a command like:
virsh define <path to>/my-vm.xml
This imports the configuration into libvirt (and therefore virt-manager becomes
aware of it, also).
Now to start the VM:
virsh start my-vm
or start it from virt-manager's graphical menu.
Have a look at running VMs with "virsh list". Attach to the VM's text console
with "virsh console <vm-name>". Attaching to multiple VM consoles is most
conveniently done with the terminal multiplexer "screen".
Have a look at the other virsh commands by typing "virsh help". Note that most
virsh commands must be done as root.
Changes in the Xen VM Management Toolstack
------------------------------------------
With SUSE Linux Enterprise Server 12, the way VMs are managed has changed
when compared with older SLES versions. Users familiar with the 'xm' command
and the xend management daemon will notice that these are absent. The xm/xend
toolstack has been replaced with the xl toolstack. The xl toolstack is
intended to remain backwards compatible with existing xm domain configuration
files. Most 'xm' commands can simply be replaced with 'xl'. One significant
difference is that xl does not support the concept of Managed Domains. The xl
command can only modify running VMs. Once the VM is shut down, there is no
preserved state information other than what is saved in the configuration
file used to start the VM. In order to provide Managed Domains, users are
encouraged to use libvirt and its tools to create and modify VMs. These
tools include the command line tool 'virsh' and the graphical tools
virt-manager and virt-install.
Warning: Using xl commands to modify libvirt managed domains will result in
errors when virsh or virt-manager is used. Please use only virsh or
virt-manager to manage libvirt managed domains. If you are not using libvirt
managed domains then using xl commands is the correct way to modify running
domains.
Using the Mouse via VNC in Fully Virtual Mode
---------------------------------------------
In a fully virtualized VM, the mouse may be emulated as a PS/2 mouse, USB
mouse, or USB tablet. The virt-install tool selects the best emulation that is
known to be automatically detected and supported by the operating system.
However, when accessing some fully virtualized operating systems via VNC, the
mouse may be difficult to control if the VM is emulating a PS/2 mouse. PS/2
provides mouse deltas, but VNC only provides absolute coordinates. In such
cases, you may want to manually switch the operating system and VM to use a
USB tablet.
Emulation of a SummaSketch graphics tablet is provided for this reason. To
use the Summa emulation, you will need to configure your fully virtualized OS.
Note that the virtual tablet is connected to the second virtual serial port
(/dev/ttyS1 or COM2).
Most Linux distributions ship with appropriate drivers, and only need to be
configured. To configure gpm, edit /etc/sysconfig/mouse and add these lines:
MOUSETYPE="summa"
XMOUSETYPE="SUMMA"
DEVICE=/dev/ttyS1
The format and location of your configuration file could vary depending upon
your Linux distribution. The goal is to run the gpm daemon as follows:
gpm -t summa -m /dev/ttyS1
X also needs to be configured to use the Summa emulation. Add the following
stanza to /etc/X11/xorg.conf, or use your distribution's tools to add these
settings:
Section "InputDevice"
Identifier "Mouse0"
Driver "summa"
Option "Device" "/dev/ttyS1"
Option "InputFashion" "Tablet"
Option "Mode" "Absolute"
Option "Name" "EasyPen"
Option "Compatible" "True"
Option "Protocol" "Auto"
Option "SendCoreEvents" "on"
Option "Vendor" "GENIUS"
EndSection
After making these changes, restart gpm and X.
HVM Console in Fully Virtual Mode
---------------------------------
When running a VM in fully virtual mode, a special console is available that
provides some additional ways to control the VM. Press Ctrl-Alt-2 to access
the console; press Ctrl-Alt-1 to return to the VM. While at the console,
type "help" for help.
The two most important commands are "send-key" and "change". The "send-key"
command allows you to send any key sequence to the VM, which might otherwise
be intercepted by your local window manager.
The "change" command allows the target of a block device to be changed; for
example, use it to change from one CD ISO to another. Some versions of Xen
have this command disabled for security reasons. Consult the online
documentation for workarounds.
Networking
----------
Your virtual machines become much more useful if you can reach them via the
network. Starting with openSUSE 11.1 and SLE 11, networking in domain 0 is
configured and managed via YaST. The yast2-networking module can be used
to create and manage bridged networks. During initial installation, a bridged
networking proposal will be presented if the "Xen Virtual Machine Host Server"
pattern is selected. The proposal will also be presented if you install Xen
after initial installation using the "Install Hypervisor and Tools" module in
YaST.
The default proposal creates a virtual bridge in domain 0 for each active
ethernet device, enslaving the device to the bridge. Consider a machine
containing two ethernet devices (eth0 and eth1), both with active carriers.
YaST will create br0 and br1, enslaving the eth0 and eth1 devices respectively.
VMs get a virtual network interface (e.g. eth0), which is visible in domain 0
as vifN.0 and connected to the bridge. This means that if you set up an IP
address in the VMs belonging to the same subnet as br0 from your domain 0,
you'll be able to communicate not only with the other slave VMs, but also with
domain 0 and with the external network. If you have a DHCP server running in
your network, your VMs should succeed in getting an IP address.
Be aware that this may have unwanted security implications. You may want to
opt for routing instead of bridging, so you can set up firewalling rules in
domain 0.
Please read about the network configuration in the Xen manual. You can set up
bridging or routing for other interfaces also.
For debugging, here's what happens on bootup of a domU:
- xenstored saves the device setup in xenstore
- domU is created
- vifN.0 shows up in domain 0 and a hotplug event is triggered
- hotplug is /sbin/udev; udev looks at /etc/udev/rules.d/40-xen.rules and
calls /etc/xen/scripts/vif-bridge online
- vif-bridge sets the vifN.0 device up and enslaves it to the bridge
- eth0 shows up in domU (hotplug event triggered)
Similar things happen for block devices, except that /etc/xen/scripts/block is
called.
It's not recommended to use ifplugd nor NetworkManager for managing the
interfaces if you use bridging mode. Use routing with nat or proxy-arp
in that case. You also need to do that in case you want to send out packets
on wireless; you can't bridge Xen "ethernet" packets into 802.11 packets.
Network Troubleshooting
-----------------------
First ensure the VM server is configured correctly and can access the network.
Do not use ifplugd or NetworkManager; neither is bridge-aware.
Specify a static virtual MAC in the VM's configuration file. Random MACs can
be problematic, since with each boot of the VM it appears that some hardware
has been removed (the previous random MAC) and new hardware is present (the
new random MAC). This can cause network configuration files (which were
intended for the old MAC) to not be matched up with the new virtual hardware.
In the VM's filesystem, ensure the ifcfg-eth* files are named appropriately.
For example, if you do decide to use a randomly-selected MAC for the VM, the
ifcfg-eth* file must not include the MAC in its name; name it generically
("ifcfg-eth0") instead. If you use a static virtual MAC for the VM, be sure
that is reflected in the file's name.
Thread-Local Storage
--------------------
For some time now, the glibc thread library (NPTL) has used a shortcut to
access thread-local variables at a negative segment offset from the segment
selector GS instead of reading the linear address from the TDB (offset 0).
Unfortunately, this optimization has been made the default by the glibc and
gcc maintainers, as it saves one indirection. For Xen this is bad: The access
to these variables will trap, and Xen will need to use some tricks to make the
access work. It does work, but it's very slow.
SUSE Linux 9.1 and SLES 9 were prior to this change, and thus are not
affected. SUSE Linux 9.2 and 9.3 are affected. For SUSE Linux 10.x and SLES
10, we have disabled negative segment references in gcc and glibc, and so
these are not affected. Other non-SUSE Linux distributions may be affected.
For affected distributions, one way to work around the problem is to rename
the /lib/tls directory, so the pre-i686 version gets used, where no such
tricks are done. An example LSB-compliant init script which automates these
steps is installed at /usr/share/doc/packages/xen/boot.xen. This script
renames /lib/tls when running on Xen, and restores it when not running on Xen.
Modify this script to work with your specific distribution.
Mono has a similar problem, but this has been fixed in SUSE Linux 10.1 and
SLES 10. Older or non-SUSE versions of Mono may have a performance impact.
Security
--------
Domain 0 has control over all domains. This means that care should be taken to
keep domain 0 safe; ideally you strip it down to only do as little there as
possible, preferably with no local users except for the system administrator.
Most commands in domain 0 can only be performed as root, but this protection
scheme only has moderate security and might be defeated. In case domain 0 is
compromised, all other domains are compromised as well.
To allow relocation of VMs (migration), the receiving machine listens on TCP
port 8002. You might want to put firewall rules in place in domain 0 to
restrict this to machines which you trust. Relocating VMs with sensitive data
is not a good idea in untrusted networks, since the data is not sent encrypted.
The memory protections for the domUs are effective; so far no way to break out
of a virtual machine is known. A VM is an effective jail.
Limitations
-----------
When booting, Linux reserves data structures matching the amount of RAM found.
This has the side-effect that you can't dynamically grow the memory beyond
what the kernel has been booted with. But you can trick domU Linux to prepare
for a larger amount of RAM by passing the mem= boot parameter.
The export of virtual hard disks from files in Xen can be handled via the
loopback driver (although in Xen >= 3.0.4, this can be replaced by the
"blktap" user-space driver.) If you are still using loopback, it may be
possible to run out of loopback devices, as by default only 64 are supported.
You can change this by inserting:
options loop max_loop=128
into /etc/modprobe.conf.local in domain 0.
Upgrading the Host Operating System
-----------------------------------
When upgrading the host operating system from one major release to another
(for example, SLES 11 to SLES 12 or openSUSE 12.3 to openSUSE 13.1) or when
applying a service pack like SLES 11 SP3 to SLES 11 SP2, all running VMs must
be shut down before the upgrade process is begun.
On versions of SLES 11 and openSUSE 12 you are using the xm/xend toolstack.
After upgrading to SLES 12 and newer openSUSE versions this toolstack will be
replaced with the xl toolstack. The xl toolstack does not support Managed
Domains. If you wish to continue using Managed Domains you must switch to
using libvirt and its command line interface 'virsh'. You may also use
virt-manager as a GUI interface to libvirt. After upgrading the host but
before you can begin using libvirt on VMs that were previously managed by
xm/xend, you must run a conversion tool called /usr/sbin/xen2libvirt for all
VMs.
For example, to convert all domains previously managed by xend:
xen2libvirt -r /var/lib/xend/domains/
Now typing 'virsh list --all' will show your previously xend managed domains
being managed by libvirt. Run 'xen2libvirt -h' to see additional options for
using this tool.
Memory Ballooning in VMs
------------------------
Setting a VM's maximum memory value greater than the initial memory value
requires support for memory ballooning in the VM's operating system. Modern SLES
and openSUSE guests have this capability built-in. Windows installation media
does not support memory ballooning so you must first install the VM without
memory ballooning (maxmem equal to initial memory). After the installation, the
Virtual Machine Driver Pack (vmdp) must be installed. After this, the VM's
maxmem value may be increased. A reboot of the VM is required for this action
to take effect.
Dom0 Memory Ballooning
----------------------
It is strongly recommended that you dedicate a fixed amount of RAM to dom0
rather than relying on dom0 auto ballooning. Doing so will ensure your dom0
has enough resources to operate well and will improve startup times for your
VMs. The amount of RAM dedicated to dom0 should never be less than the
recommended minimum amount for running your SUSE distribution in native mode.
The actual amount of RAM needed for dom0 depends on several factors including
how much physical RAM is on the host, the number of physical CPUs, and the
number of VMs running simultaneously where each VM has a specific requirement
for RAM. The following example shows the syntax for doing this. This would be
added to your grub1 or grub2 configuration:
Grub2 Example:
Edit /etc/default/grub and add,
GRUB_CMDLINE_XEN_DEFAULT="dom0_mem=1024M,max:1024M"
and then run
grub2-mkconfig -o /boot/grub2/grub.cfg
Grub1 Example:
Edit /boot/grub/menu.lst and edit the line containing xen.gz
kernel /boot/xen.gz dom0_mem=1024M,max:1024M
After modifying your grub configuration, you will need to edit /etc/xen/xl.conf
and set autoballoon="off". This will prevent xl from automatically adjusting
the amount of memory assigned to dom0. Reboot the host for these changes to
take effect.
Adjusting LIBXL_HOTPLUG_TIMEOUT at runtime
------------------------------------------
A domU with a large number of disks may run into the hardcoded
LIBXL_HOTPLUG_TIMEOUT limit, which is 40 seconds. This happens if the
preparation for each disk takes an unexpectedly large amount of time. Then
the sum of all configured disks and the individual preparation time will
be larger than 40 seconds. The hotplug script which does the preparation
takes a lock before doing the actual preparation. Since the hotplug
scripts for each disk are spawned at nearly the same time, each one has
to wait for the lock. Due to this contention, the total execution time
of a script can easily exceed the timeout. In this case libxl will
terminate the script because it has to assume an error condition.
Example:
10 configured disks, each one takes 3 seconds within the critical
section. The total execution time will be 30 seconds, which is still
within the limit. With 5 additional configured disks, the total
execution time will be 45 seconds, which would trigger the timeout.
To handle such setup without a recompile of libxl, a special key/value
has to be created in xenstore prior to domain creation. This can be done
either manually, or at system startup. A dedicated systemd service file
exists to set the required value. To enable it, run these commands:
/etc/systemd/system # systemctl enable xen-LIBXL_HOTPLUG_TIMEOUT.service
/etc/systemd/system # systemctl start xen-LIBXL_HOTPLUG_TIMEOUT.service
In case the value in this service file needs to be changed, a copy with
the exact same name must be created in the /etc/systemd/system directory:
/etc/systemd/system # cat xen-LIBXL_HOTPLUG_TIMEOUT.service
[Unit]
Description=set global LIBXL_HOTPLUG_TIMEOUT
ConditionPathExists=/proc/xen/capabilities
Requires=xenstored.service
After=xenstored.service
Requires=xen-init-dom0.service
After=xen-init-dom0.service
Before=xencommons.service
[Service]
Type=oneshot
RemainAfterExit=true
ExecStartPre=/bin/grep -q control_d /proc/xen/capabilities
ExecStart=/usr/bin/xenstore-write /libxl/suse/per-device-LIBXL_HOTPLUG_TIMEOUT 10
[Install]
WantedBy=multi-user.target
In this example the per-device value will be set to 10 seconds.
The change for libxl which handles this xenstore value will enable
additional logging if the key is found. That extra logging will show
the execution time of each script.
Troubleshooting
---------------
First try to get Linux running on bare metal before trying with Xen.
Be sure your Xen hypervisor (xen) and VM kernels (kernel-xen) are compatible.
The hypervisor and domain 0 kernel are a matched set, and usually must be
upgraded together. Consult the online documentation for a matrix of supported
32- and 64-bit combinations.
If you have trouble early in the boot, try passing pnpacpi=off to the Linux
kernel. If you have trouble with interrupts or timers, passing lapic to Xen
may help. Xen and Linux understand similar ACPI boot parameters. Try the
options acpi=off,force,ht,noirq or acpi_skip_timer_override.
Other useful debugging options to Xen may be nosmp, noreboot, mem=4096M,
sync_console, noirqbalance (Dell). For a complete list of Xen boot options,
consult the "Xen Hypervisor Command Line Options" documentation.
If domain 0 Linux crashes on X11 startup, please try to boot into runlevel 3.
1) As a first step in debugging Xen you should add the following hypervisor
options to the xen.gz line in your grub configuration file. After rebooting,
the 'xl dmesg' command will produce more output to better analyze problems.
Grub2 Example:
Edit /etc/default/grub and add,
GRUB_CMDLINE_XEN_DEFAULT="loglvl=all guest_loglvl=all"
and then run,
grub2-mkconfig -o /boot/grub2/grub.cfg
Grub1 Example:
Edit /boot/grub/menu.lst and edit the line containing xen.gz
kernel /boot/xen.gz loglvl=all guest_loglvl=all
2) With the log levels specified above and the host rebooted, more useful
information about domain 0 and running VMs can be obtained using the
'xl dmesg' and 'xl debug-keys' commands. For example, from the command line
run:
xl debug-keys h
and then run:
xl dmesg
Note that at the end of the output from 'xl dmesg' it includes help on a
series of commands that may be passed to 'xl debug-keys'. For example, by
passing the letter 'q' to 'xl debug-keys' it will "dump domain (and guest
debug) info".
xl debug-keys q
Now you can again run 'xl dmesg' to see the domain and guest debug info.
3) Sometimes it is useful to attach a serial terminal and direct Xen to send
its output not only to the screen, but also to that terminal. First you need
to attach a serial cable from the serial port on the server to a second
machine's serial port. That second machine could be running minicom (or some
other program that can be setup to read from the serial port). Do the
following to prepare Xen to send its output over this serial line.
Grub2 Example:
Edit /etc/default/grub and add,
GRUB_CMDLINE_XEN_DEFAULT="loglvl=all guest_loglvl=all console=com1 com1=115200,8n1"
Also append additional serial flags to the option below such that it appears as,
GRUB_CMDLINE_LINUX_DEFAULT="<pre-existing flags> console=ttyS0,115200"
where pre-existing flags are those options already present and then run,
grub2-mkconfig -o /boot/grub2/grub.cfg
Grub1 Example:
Edit the /boot/grub/menu.lst file and add the following to the Xen entry,
kernel /boot/xen.gz loglvl=all guest_loglvl=all console=com1 com1=115200,8n1
module /boot/vmlinuz-xen <pre-existing flags> console=ttyS0,115200
Once the hardware and software are configured correctly the server is rebooted
and its output should appear on the other terminal as the server boots up.
4) To further debug Xen or domain 0 Linux crashes or hangs, it may be useful to
use the debug-enabled hypervisor, and/or to prevent automatic rebooting.
Grub2 Example:
Edit /etc/default/grub and add,
GRUB_CMDLINE_XEN_DEFAULT="noreboot loglvl=all guest_loglvl=all"
Edit /boot/grub2/grub.cfg and look for these lines:
multiboot /boot/xen-<version>.gz ...
and replace them with:
multiboot /boot/xen-dbg-<version>.gz ...
Replace <version> with the appropriate version string contained in the
filename. Note that running grub2-mkconfig -o /boot/grub2/grub.cfg will
overwrite all manual changes made to grub.cfg.
Grub1 Example:
Edit your menu.lst configuration from something like this:
kernel (hd0,5)/xen.gz
To something like this:
kernel (hd0,5)/xen-dbg.gz noreboot loglvl=all guest_loglvl=all
All hypervisor options require a reboot to take effect. After rebooting, the
Xen hypervisor will write any error messages to the log file (viewable with
the "xl dmesg" command).
If problems persist, check if a newer version is available. Well-tested
versions will be shipped with SUSE and via YaST Online Update.
Resources
---------
https://www.suse.com/documentation/sles11/singlehtml/book_xen/book_xen.html
http://doc.opensuse.org/products/draft/SLES/SLES-xen_sd_draft/cha.xen.basics.html
Feedback
--------
In case you have remarks about, problems with, ideas for, or praise for Xen,
please report it back to the xen-devel list:
xen-devel@lists.xen.org
If you find issues with the packaging or setup done by SUSE, please report
it through bugzilla:
https://bugzilla.suse.com
ENJOY!
Your SUSE Team.

1
baselibs.conf Normal file
View File

@ -0,0 +1 @@
xen-libs

View File

@ -0,0 +1,50 @@
Index: xen-4.18.0-testing/tools/misc/xencov_split
===================================================================
--- xen-4.18.0-testing.orig/tools/misc/xencov_split
+++ xen-4.18.0-testing/tools/misc/xencov_split
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
from __future__ import print_function
from builtins import str
Index: xen-4.18.0-testing/tools/python/scripts/convert-legacy-stream
===================================================================
--- xen-4.18.0-testing.orig/tools/python/scripts/convert-legacy-stream
+++ xen-4.18.0-testing/tools/python/scripts/convert-legacy-stream
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Index: xen-4.18.0-testing/tools/python/scripts/verify-stream-v2
===================================================================
--- xen-4.18.0-testing.orig/tools/python/scripts/verify-stream-v2
+++ xen-4.18.0-testing/tools/python/scripts/verify-stream-v2
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
# -*- coding: utf-8 -*-
""" Verify a v2 format migration stream """
Index: xen-4.18.0-testing/tools/xenmon/xenmon.py
===================================================================
--- xen-4.18.0-testing.orig/tools/xenmon/xenmon.py
+++ xen-4.18.0-testing/tools/xenmon/xenmon.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
#####################################################################
# xenmon is a front-end for xenbaked.
Index: xen-4.18.0-testing/tools/xentrace/xentrace_format
===================================================================
--- xen-4.18.0-testing.orig/tools/xentrace/xentrace_format
+++ xen-4.18.0-testing/tools/xentrace/xentrace_format
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
# by Mark Williamson, (C) 2004 Intel Research Cambridge

384
block-dmmd Normal file
View File

@ -0,0 +1,384 @@
#! /bin/bash
# Usage: block-dmmd [add args | remove args]
#
# the dmmd device syntax (in xl commands/configs) is something like:
# script=block-dmmd,md;/dev/md0;md;/dev/md1;lvm;/dev/vg1/lv1
# or
# script=block-dmmd,lvm;/dev/vg1/lv1;lvm;/dev/vg1/lv2;md;/dev/md0
# device pairs (type;dev) are processed in order, with the last device
# assigned to the VM
#
# Note - When using the libxl stack, the "script=block-dmmd" option
# is required. See man xl-disk-configuration(5) for more information.
#
# md devices can optionally:
# specify a config file through:
# md;/dev/md100(/var/xen/config/mdadm.conf)
# use an array name (mdadm -N option):
# md;My-MD-name;lvm;/dev/vg1/lv1
#
# Completely expressive syntax should be similar to:
# "format=raw, vdev=xvdb, access=rw, script=block-dmmd, \
# target=md;/dev/md0(/etc/mdadm.conf);lvm;/dev/vg1/lv1"
#
##
# History:
# 2017-07-10, mlatimer@suse.com:
# Modification to use syslog for progress messages by ldevulder@suse.com
# 2017-06-12, mlatimer@suse.com:
# Merge LVM improvements by loic.devulder@mpsa.com
# Document libxl "script=block-dmmd" syntax in examples
# Remove xm/xend references (e.g. parsed_timeout from xend-config.sxp)
# 2016-05-27, mlatimer@suse.com:
# Merge improvements by loic.devulder@mpsa.com. Highlights include:
# - Re-write and simplification to speed up the script!
# - Add some (useful) logging messages and comments
# Minor tweaks and logging improvements
# 2016-05-26, mlatimer@suse.com:
# Verify MD activation if mdadm returns 2
# 2016-05-20, mlatimer@suse.com:
# Strip leading "dmmd:" if present in xenstore params value
# 2013-07-03, loic.devulder@mpsa.com:
# Partial rewrite of the script for supporting MD activation by name
# 2009-06-09, mh@novell.com:
# Emit debugging messages into a temporary file; if no longer needed,
# just comment the exec I/O redirection below
# Make variables used in functions local to avoid global overridings
# Use vgscan and vgchange where required
# Use the C locale to avoid dealing with localized messages
# Assign output from assembling an MD device to a variable to aid
# debugging
# We do not want to deal with localized messages, so force the C locale.
# LC_ALL is used because it supersedes LANG, but LANG is exported as well
# because some applications may still only look at LANG.
export LC_ALL=C
export LANG=${LC_ALL}

# Loading common libraries from the directory this script lives in.
# The path is quoted so that a script directory containing whitespace
# still resolves to a single path.
# NOTE(review): log, claim_lock, release_lock and write_dev are used below
# but not defined in this file; presumably block-common.sh provides them.
. "$(dirname "$0")/block-common.sh"

# Constants (read-only and exported)
typeset -rx MDADM_BIN=/sbin/mdadm
typeset -rx LVCHANGE_BIN=/sbin/lvchange
typeset -rx PVSCAN_BIN=/sbin/pvscan
typeset -rx VGSCAN_BIN=/sbin/vgscan
typeset -rx VGCHANGE_BIN=/sbin/vgchange
typeset -rx CLVMD_BIN=/usr/sbin/clvmd
# Command string; it is expanded unquoted where used so that it splits
# into the command "date" and its argument "+%s"
typeset -rx DATE_SEC="date +%s"

# We check for errors ourselves
set +e
function reload_clvm()
{
    # Ask a running clvmd to reload its state.
    # Returns 1 only if clvmd is running and "clvmd -R" fails, else 0.

    # If we are in cluster mode (a clvmd process shows up in "ps -e").
    # The pattern is quoted so the shell can never pathname-expand
    # "[c]lvmd" against a file in the current directory.
    if ps -e | grep -q '[c]lvmd' 2>/dev/null; then
        # Logging message
        log info "Synchronizing cLVM..."

        # Synchronize cLVM
        ${CLVMD_BIN} -R > /dev/null 2>&1 \
            || return 1
    fi

    return 0
}
function run_mdadm()
{
    # Run mdadm with the option string in $1 and map its outcome:
    #   0  - output reports the array "has been started" or "already active"
    #   2  - output reports a component "is already in use"; it might be
    #        used by another device in this domU, so the caller decides
    #        whether that is a real error
    #   otherwise the exit status of mdadm itself
    local mdadm_cmd=$1
    local output status

    # $mdadm_cmd is deliberately unquoted: it carries several options
    output="$(${MDADM_BIN} ${mdadm_cmd} 2>&1)"
    status=$?

    if [[ ${output} == *"has been started"* || ${output} == *"already active"* ]]; then
        return 0
    fi

    if [[ ${output} == *"is already in use"* ]]; then
        return 2
    fi

    return ${status}
}
function activate_md()
{
    # Activate the MD array described by $1.
    #
    # $1 is "<device>" or "<device>(<mdadm config file>)", where <device>
    # is either a full path (e.g. /dev/md0) or an array name (mdadm -N).
    # Returns 0 if the array is, or already was, active; non-zero otherwise.

    # Make it explicitly local
    local par=$1
    local cfg dev dev_path rc t mdadm_opts

    # Both sides are quoted so the right-hand side is compared literally
    # instead of being interpreted as a glob pattern.
    if [[ "${par}" == "${par%%(*}" ]]; then
        # No configuration file specified
        dev=${par}
        cfg=""
    else
        dev=${par%%(*}
        t=${par#*(}
        cfg="-c ${t%%)*}"
    fi

    # Looking for device name or alias
    if [[ ${dev:0:1} == / ]]; then
        dev_path=${dev%/*}
        mdadm_opts=""
    else
        dev_path=/dev/md
        mdadm_opts="-s -N"
    fi

    # Logging message
    log info "Activating MD device ${dev}..."

    # Is MD device already active?
    # We need to use the full path name, an alias is not possible...
    if [ -e "${dev_path}/${dev##*/}" ]; then
        # The regex is quoted so the shell cannot glob-expand it
        ${MDADM_BIN} -Q -D "${dev_path}/${dev##*/}" 2>/dev/null \
            | grep -iq 'state.*:.*inactive' || return 0
    fi

    # Activate MD device; the option string is split again by run_mdadm
    run_mdadm "-A ${mdadm_opts} ${dev} ${cfg}"
    rc=$?

    # A return code of 2 can indicate the array configuration was incorrect
    if [[ ${rc} == 2 ]]; then
        # Logging message
        log info "Verifying MD device ${dev} activation..."

        # If the array is active, return 0, otherwise return an error
        if ${MDADM_BIN} -Q -D "${dev_path}/${dev##*/}" &>/dev/null; then
            return 0
        fi
        return 1
    fi

    return ${rc}
}
function deactivate_md()
{
    # Stop the MD array described by $1.
    #
    # $1 uses the same "<device>" / "<device>(<config file>)" syntax as for
    # activation; any "(...)" suffix is ignored here.
    # Returns the exit status of "mdadm -S".
    local par=$1
    # dev_path is declared local too, so it does not leak into (or clobber)
    # a global variable of the same name
    local dev dev_path

    if [[ "${par}" == "${par%%(*}" ]]; then
        # No configuration file specified
        dev=${par}
    else
        dev=${par%%(*}
    fi

    # Looking for device name or alias
    if [[ ${dev:0:1} == / ]]; then
        dev_path=${dev%/*}
    else
        dev_path=/dev/md
    fi

    # Logging message
    log info "Deactivating MD device ${dev}..."

    # We need the device name only while deactivating
    ${MDADM_BIN} -S "${dev_path}/${dev##*/}" > /dev/null 2>&1

    return $?
}
function lvm_action()
{
    # Activate or deactivate an LVM logical volume, retrying until it
    # succeeds or run_timeout seconds have passed.
    #
    # $1 - "activate" or "deactivate"
    # $2 - logical volume device path (e.g. /dev/vg1/lv1)
    # Returns 0 on success, 1 on timeout.
    local action=$1
    local dev=$2
    local run_timeout=90
    local end_time

    # Logging message
    log info "${action} LVM device ${dev}..."

    # Set end_time for the loop
    (( end_time = $(${DATE_SEC}) + run_timeout ))

    while true; do
        # Action depends on what the user asks
        if [[ ${action} == activate ]]; then
            # First scan for PVs and VGs
            # We need this for using MD device as PV
            ${PVSCAN_BIN} > /dev/null 2>&1

            # Activation only counts once the device node really exists
            ${LVCHANGE_BIN} -aey "${dev}" > /dev/null 2>&1 \
                && [[ -e ${dev} ]] \
                && return 0
        elif [[ ${action} == deactivate ]]; then
            ${LVCHANGE_BIN} -aen "${dev}" > /dev/null 2>&1 \
                && return 0

            # If the LV is already deactivated we may be in an infinite loop
            # So we need to test if the LV is still present
            [[ -e ${dev} ]] || return 0
        fi

        # It seems that we had a problem during lvchange
        # If we are in a cluster the problem may be due to a cLVM locking bug,
        # so try to reload it
        reload_clvm

        # If it takes too long we need to return an error
        if (( $(${DATE_SEC}) >= end_time )); then
            # Report the device that failed (not the action a second time)
            log err "Failed to ${action} ${dev} within ${run_timeout} seconds"
            return 1
        fi

        # Briefly sleep before restarting the loop
        sleep 0.1
    done

    # Normally we should not get here, but if this happens
    # we have to return an error
    return 1
}
# Variables
# Requested action, taken from the first script argument
typeset command=$1
# BP is the fixed base of the stack; SP is the current stack position.
# SP starts equal to BP, and pop() treats SP == BP as "stack empty".
typeset BP=100
typeset SP=${BP}
# VBD receives the entry taken off the stack by pop() ("" when empty)
typeset VBD
# Array holding the stacked entries; push() stores below BP (it
# decrements SP before writing)
typeset -a stack
function push()
{
    # Put $1 on top of the global stack; an empty value is ignored.
    # The stack grows downwards: SP is decremented before storing.
    # Always returns 0.
    local entry="$1"

    if [[ -n "${entry}" ]]; then
        SP=$(( SP - 1 ))
        stack[${SP}]="${entry}"
    fi

    return 0
}
function pop()
{
    # Take the top entry off the global stack and store it in VBD.
    # When the stack is empty (SP equals BP) VBD is set to "".
    # Always returns 0.
    if [[ "${SP}" == "${BP}" ]]; then
        VBD=""
    else
        VBD=${stack[${SP}]}
        SP=$(( SP + 1 ))
    fi

    return 0
}
function activate_dmmd()
{
    # Activate one device of the dmmd chain.
    # $1 - device type, "md" or "lvm"
    # $2 - device specification for that type
    # Returns the status of the type-specific handler, or 1 for an
    # unknown type.
    if [[ "$1" == "md" ]]; then
        activate_md $2
        return $?
    fi

    if [[ "$1" == "lvm" ]]; then
        lvm_action activate $2
        return $?
    fi

    # Unknown device type: report an error
    return 1
}
function deactivate_dmmd()
{
    # Deactivate one device of the dmmd chain.
    # $1 - device type, "md" or "lvm"
    # $2 - device specification for that type
    # Returns the status of the type-specific handler, or 1 for an
    # unknown type.
    if [[ "$1" == "md" ]]; then
        deactivate_md $2
        return $?
    fi

    if [[ "$1" == "lvm" ]]; then
        lvm_action deactivate $2
        return $?
    fi

    # Unknown device type: report an error
    return 1
}
function cleanup_stack()
{
    # Deactivate every device still recorded on the stack, most recently
    # pushed first, until pop() reports an empty stack (VBD == "").
    pop
    while [[ -n "${VBD}" ]]; do
        # VBD holds "type device"; unquoted so it splits into two arguments
        deactivate_dmmd ${VBD}
        pop
    done
}
function parse_par()
{
    # Walk the "type;dev;type;dev..." list in $2 pair by pair.
    # $1 - "activate" to activate each pair while parsing; with any other
    #      value the pairs are only recorded
    # $2 - the dmmd parameter string
    # Every processed pair is pushed on the stack as "type dev".
    # Returns 0 when the list is exhausted, 1 when a type has no device
    # or an activation fails.
    local mode=$1
    # A trailing ";" guarantees that every field is terminated
    local pairs="${2};"
    local dm_type dm_dev

    while true; do
        dm_type=${pairs%%;*}
        if [[ -z "${dm_type}" ]]; then
            return 0
        fi
        pairs=${pairs#*;}

        dm_dev=${pairs%%;*}
        if [[ -z "${dm_dev}" ]]; then
            return 1
        fi
        pairs=${pairs#*;}

        if [[ "${mode}" == "activate" ]] && ! activate_dmmd ${dm_type} ${dm_dev}; then
            return 1
        fi

        push "${dm_type} ${dm_dev}"
    done
}
# Main dispatch on the requested action
case "${command}" in
    "add")
        # Read the device list for this backend from xenstore; a failed
        # read is tolerated here and surfaces as a parse/activation result
        p=$(xenstore-read ${XENBUS_PATH}/params) || true
        claim_lock "dmmd"
        # Strip a leading "dmmd:" if present
        dmmd=${p#dmmd:}
        if ! parse_par activate "${dmmd}"; then
            # Undo whatever was activated before the failure
            cleanup_stack
            release_lock "dmmd"
            exit 1
        fi
        # The last device of the list is the one handed to the VM;
        # drop an optional "(config file)" suffix
        lastparam=${dmmd##*;}
        usedevice=${lastparam%(*}
        xenstore-write ${XENBUS_PATH}/node "${usedevice}"
        write_dev "${usedevice}"
        release_lock "dmmd"
        exit 0
        ;;

    "remove")
        p=$(xenstore-read ${XENBUS_PATH}/params) || true
        claim_lock "dmmd"
        dmmd=${p#dmmd:}
        # Only record the devices on the stack, then deactivate them in
        # reverse order
        parse_par noactivate "${dmmd}"
        cleanup_stack
        release_lock "dmmd"
        exit 0
        ;;
esac

# Normally we should not get here, but if this happens
# we have to report an error. "exit" is used because "return" is only
# valid inside a function or a sourced script.
exit 1

129
block-npiv Normal file
View File

@ -0,0 +1,129 @@
#!/bin/bash
# Usage: block-npiv [add npiv | remove dev]
#
# "add" locates (creating it if necessary) the NPIV vport(s) named in the
# xenstore "params" value, looks up the requested LUN and publishes the
# resulting block device; "remove" tears the vport(s) down again.

dir=$(dirname "$0")
. "$dir/block-npiv-common.sh"
. "$dir/block-common.sh"

#set -x
# NOTE(review): $command is not assigned in this script; it is presumably
# set while sourcing block-common.sh -- confirm.
#command=$1

case "$command" in
    add)
        # Params is one big arg, with fields separated by hyphens:
        #   single path:
        #     VPWWPN-TGTWWPN-LUN#
        #   multipath:
        #     {VPWWPN1.VPWWPN2....VPWWPNx}-{TGTWWPN1.TGTWWPN2....TGTWWPNx}-LUN#
        #   arg 1 - VPORT's WWPN
        #   arg 2 - Target's WWPN
        #   arg 3 - LUN # on Target
        # no wwn contains a leading 0x - it is a 16 character hex value
        # You may want to optionally pick a specific adapter ?
        par=`xenstore-read $XENBUS_PATH/params` || true
        NPIVARGS=(${par//-/ })
        wc=${#NPIVARGS[@]}
        if [ $wc -eq 5 ]; then
            # support old syntax
            # FABRIC-VPWWPN-VPWWNN-TGTWWPN-LUN
            VPORTWWPNS=${NPIVARGS[1]}
            VPORTWWNNS=${NPIVARGS[2]}
            TGTWWPNS=${NPIVARGS[3]}
            LUN=${NPIVARGS[4]}
        elif [ $wc -eq 3 ]; then
            # new syntax
            VPORTWWPNS=${NPIVARGS[0]}
            TGTWWPNS=${NPIVARGS[1]}
            LUN=${NPIVARGS[2]}
        else
            # wrong syntax
            exit 1
        fi

        # Ensure we compare everything using lower-case hex characters
        TGTWWPNS=`echo $TGTWWPNS | tr A-Z a-z |sed 's/[{.}]/ /g'`
        VPORTWWPNS=`echo $VPORTWWPNS | tr A-Z a-z |sed 's/[{.}]/ /g'`
        # Only one VPWWNN is supported
        VPORTWWNN=`echo $VPORTWWNNS | tr A-Z a-z | sed -e 's/\..*//g' -e 's/{//'`

        claim_lock "npiv"

        # Every error exit below releases the lock first, exactly as the
        # success path does, so a failed "add" never leaves it held.
        paths=0
        for VPORTWWPN in $VPORTWWPNS; do
            find_vhost $VPORTWWPN
            if test -z "$vhost" ; then
                create_vport $VPORTWWPN $VPORTWWNN
                if [ $? -ne 0 ] ; then release_lock "npiv"; exit 2; fi
                # Give the new vport time to appear before looking again
                sleep 8
                find_vhost $VPORTWWPN
                if test -z "$vhost" ; then release_lock "npiv"; exit 3; fi
            fi

            for TGTWWPN in $TGTWWPNS; do
                find_sdev $vhost $TGTWWPN $LUN
                if test -z "$dev"; then
                    # Not visible yet: trigger a rescan of the host and retry
                    echo "- - -" > /sys/class/scsi_host/$vhost/scan
                    sleep 2
                    find_sdev $vhost $TGTWWPN $LUN
                fi
                if test -z "$dev"; then
                    release_lock "npiv"
                    exit 4
                fi
                paths=$(($paths+1))
            done
        done
        release_lock "npiv"

        if test $paths -gt 1; then
            # More than one path was found: use the multipath device
            xenstore-write $XENBUS_PATH/multipath 1
            /etc/init.d/multipathd start
            if test $? -ne 0 ; then exit 4; fi
            dm=`multipath -l /dev/$dev | grep dm | cut -f2 -d' '`
        else
            xenstore-write $XENBUS_PATH/multipath 0
            dm=$dev
        fi

        if test ! -z "$dm"; then
            xenstore-write $XENBUS_PATH/node /dev/$dm
            write_dev /dev/$dm
            exit 0
        fi

        exit 4
        ;;

    remove)
        node=`xenstore-read $XENBUS_PATH/node` || true
        multipath=`xenstore-read $XENBUS_PATH/multipath` || true
        # this is really screwy. the first delete of a lun will
        # terminate the entire vport (all luns)
        # $multipath is quoted: it is empty when the xenstore read failed,
        # and an unquoted empty operand makes "test" report a syntax error
        if test "$multipath" = 1; then
            par=`xenstore-read $XENBUS_PATH/params` || true
            NPIVARGS=(${par//-/ })
            wc=${#NPIVARGS[@]}
            if [ $wc -eq 5 ]; then
                # old syntax
                # FABRIC-VPWWPN-VPWWNN-TGTWWPN-LUN
                VPORTWWPNS=${NPIVARGS[1]}
            elif [ $wc -eq 3 ]; then
                # new syntax
                VPORTWWPNS=${NPIVARGS[0]}
            fi
            VPORTWWPNS=`echo $VPORTWWPNS | tr A-Z a-z |sed 's/[{.}]/ /g'`
            for VPORTWWPN in $VPORTWWPNS; do
                find_vhost $VPORTWWPN
                if test -z "$vhost" ; then exit 5; fi
                flush_nodes_on_vhost $vhost
                delete_vhost $vhost
            done
        else
            dev=$node; dev=${dev#/dev/}
            find_vhost_from_dev $dev
            if test -z "$vhost" ; then exit 5; fi
            flush_nodes_on_vhost $vhost
            delete_vhost $vhost
        fi

        exit 0
        ;;
esac

277
block-npiv-common.sh Normal file
View File

@ -0,0 +1,277 @@
# Look for the NPIV vport with the WWPN
# $1 contains the WWPN (assumes it does not contain a leading "0x")
# Result: the global "vhost" is unset on entry and, when a port with that
# WWPN is found, set to the matching scsi host name; otherwise it stays unset.
find_vhost()
{
    unset vhost

    # look in upstream locations
    for fchost in /sys/class/fc_vports/* ; do
        if test -e $fchost/port_name ; then
            wwpn=`cat $fchost/port_name | sed -e s/^0x//`
            # Operands are quoted so an empty value cannot turn the
            # comparison into a "test" syntax error
            if test "$wwpn" = "$1" ; then
                # Note: makes the assumption the vport will always have an scsi_host child
                vhost=`ls -d $fchost/device/host*`
                vhost=`basename $vhost`
                return
            fi
        fi
    done

    # look in vendor-specific locations

    # Emulex - just looks like another scsi_host - so look at fc_hosts...
    for fchost in /sys/class/fc_host/* ; do
        if test -e $fchost/port_name ; then
            wwpn=`cat $fchost/port_name | sed -e s/^0x//`
            if test "$wwpn" = "$1" ; then
                # Here the fc_host entry name itself is used as the host name
                vhost=`basename $fchost`
                return
            fi
        fi
    done
}
# Create a NPIV vport with WWPN
# $1 contains the VPORT WWPN
# $2 may contain the VPORT WWNN
# (assumes no name contains a leading "0x")
# Returns 0 as soon as one adapter accepts the vport_create request and
# 1 when no adapter qualified or every attempt failed.
#
# Values read from sysfs are quoted in every comparison: a port_state such
# as "Not Present" contains a space and would otherwise make "[" fail
# with "too many arguments".
create_vport()
{
    wwpn=$1
    wwnn=$2
    if [ -z "$wwnn" ]; then
        # auto generate wwnn, follow FluidLabUpdateForEmulex.pdf
        # Novell specific identifier
        # byte 6 = 0 indicates WWNN, = 1 indicates WWPN
        # (the character at offset 6 of the WWPN is replaced by "0")
        wwnn=${wwpn:0:6}"0"${wwpn:7}
    fi

    # find a base adapter with npiv support that is on the right fabric

    # Look via upstream interfaces
    for fchost in /sys/class/fc_host/* ; do
        if test -e $fchost/vport_create ; then
            # is the link up, w/ NPIV support ?
            pstate=`cat $fchost/port_state`
            ptype=`cat $fchost/port_type | cut -c 1-5`
            if [ "$pstate" = "Online" ] && [ "$ptype" = "NPort" ] ; then
                vmax=`cat $fchost/max_npiv_vports`
                vinuse=`cat $fchost/npiv_vports_inuse`
                avail=`expr "$vmax" - "$vinuse"`
                if [ "${avail:-0}" -gt 0 ] ; then
                    # create the vport
                    echo "$wwpn:$wwnn" > $fchost/vport_create
                    if [ $? -eq 0 ] ; then
                        return 0
                    fi
                    # failed - so we'll just look for the next adapter
                fi
            fi
        fi
    done

    # Look in vendor-specific locations

    # Emulex: interfaces mirror upstream, but are under adapter scsi_host
    for shost in /sys/class/scsi_host/* ; do
        if [ -e $shost/vport_create ] ; then
            fchost=`ls -d $shost/device/fc_host*`
            # is the link up, w/ NPIV support ?
            if [ -e $fchost/port_state ] ; then
                pstate=`cat $fchost/port_state`
                ptype=`cat $fchost/port_type | cut -c 1-5`
                if [ "$pstate" = "Online" ] && [ "$ptype" = "NPort" ] ; then
                    vmax=`cat $shost/max_npiv_vports`
                    vinuse=`cat $shost/npiv_vports_inuse`
                    avail=`expr "$vmax" - "$vinuse"`
                    if [ "${avail:-0}" -gt 0 ] ; then
                        # create the vport
                        echo "$wwpn:$wwnn" > $shost/vport_create
                        if [ $? -eq 0 ] ; then
                            return 0
                        fi
                        # failed - so we'll just look for the next adapter
                    fi
                fi
            fi
        fi
    done

    # BFA are under adapter scsi_host
    # (no free-vport count is checked in this variant)
    for shost in /sys/class/scsi_host/* ; do
        if [ -e $shost/vport_create ] ; then
            fchost=`ls -d $shost/device/fc_host/*`
            # is the link up, w/ NPIV support ?
            if [ -e $fchost/port_state ] ; then
                pstate=`cat $fchost/port_state`
                ptype=`cat $fchost/port_type | cut -c 1-5`
                if [ "$pstate" = "Online" ] && [ "$ptype" = "NPort" ] ; then
                    # create the vport
                    echo "$wwpn:$wwnn" > $shost/vport_create
                    if [ $? -eq 0 ] ; then
                        return 0
                    fi
                    # failed - so we'll just look for the next adapter
                fi
            fi
        fi
    done

    return 1
}
# Look for the LUN on the indicated scsi_host (which is an NPIV vport)
# $1 is the scsi_host name (normalized to simply the hostX name)
# $2 is the WWPN of the tgt port the lun is on
# Note: this implies we don't support a multipath'd lun, or we
# are explicitly identifying a "path"
# $3 is the LUN number of the scsi device
# Result: the global "dev" is unset on entry and, on the first match, set
# to the last path component of what ls reports for the device's block*
# entry. If nothing matches, "dev" stays unset.
find_sdev()
{
unset dev
# Strip everything up to and including "host", leaving the host number
hostno=${1/*host/}
# Candidates are the scsi_device entries of this host whose last address
# field equals the LUN
for sdev in /sys/class/scsi_device/${hostno}:*:$3 ; do
# NOTE(review): the globs inside "test -e" are expected to expand to a
# single path; more than one match would make test fail -- confirm
if test -e $sdev/device/../fc_trans*/target${hostno}*/port_name ; then
tgtwwpn=`cat $sdev/device/../fc_trans*/target${hostno}*/port_name | sed -e s/^0x//`
if test $tgtwwpn = $2 ; then
if test -e $sdev/device/block* ; then
dev=`ls $sdev/device/block*`
# Keep only the part after the last "/"
dev=${dev##*/}
return
fi
fi
fi
done
}
# Look for the NPIV vhost based on a scsi "sdX" name
# $1 is the "sdX" name
# Result: the global "vhost" is unset on entry and set to "host<N>" when
# a host number can be derived from the /sys/block/<sdX>/device link.
find_vhost_from_dev()
{
    unset vhost

    hostno=$(readlink /sys/block/$1/device)
    # Keep the last path component, then only what precedes the first ":"
    hostno=${hostno##*/}
    hostno=${hostno%%:*}

    if [ -n "$hostno" ]; then
        vhost="host$hostno"
    fi
}
# We're about to terminate a vhost based on a scsi device
# Flush all nodes on that vhost as they are about to go away
# $1 is the vhost
# Does nothing when /sbin/blockdev is not executable.
flush_nodes_on_vhost()
{
    if test ! -x /sbin/blockdev ; then return; fi

    # Strip everything up to and including "host", leaving the host number
    hostno=${1/*host/}
    for sdev in /sys/class/scsi_device/${hostno}:* ; do
        if test -e $sdev/device/block* ; then
            dev=`ls $sdev/device/block*`
            # Check the name before "/dev/" is prepended; afterwards the
            # string could never be empty and the check would be useless
            if test -n "$dev"; then
                dev="/dev/"$dev
                blockdev --flushbufs $dev
            fi
        fi
    done
}
# Helper for delete_vhost: delete vhost $1 through the vport_delete file
# of the scsi_host named by the global "shost". Exits the script with
# status 6 if the write fails.
_npiv_vendor_vport_delete()
{
    vportwwpn=`cat /sys/class/fc_host/$1/port_name | sed s/^0x//`
    vportwwnn=`cat /sys/class/fc_host/$1/node_name | sed s/^0x//`
    echo "$vportwwpn:$vportwwnn" > /sys/class/scsi_host/$shost/vport_delete
    if test $? -ne 0 ; then exit 6; fi
    sleep 4
}

# Terminate a NPIV vhost
# $1 is vhost
# Tries the upstream fc_vports interface first and then the vendor
# specific ones; exits the script with status 6 when a delete request
# fails or no interface matches.
delete_vhost()
{
    # use upstream interface
    for vport in /sys/class/fc_vports/* ; do
        if test -e $vport/device/$1 && test -e $vport/vport_delete ; then
            echo "1" > $vport/vport_delete
            if test $? -ne 0 ; then exit 6; fi
            sleep 4
            return
        fi
    done

    # use vendor specific interface

    # Emulex
    if test -e /sys/class/fc_host/$1/device/../scsi_host*/lpfc_drvr_version ; then
        shost=`ls -1d /sys/class/fc_host/$1/device/../scsi_host* | sed s/.*scsi_host://`
        _npiv_vendor_vport_delete $1
        return
    fi

    # Qlogic
    if test -e /sys/class/fc_host/$1/device/../scsi_host*/driver_version ; then
        shost=`ls -1d /sys/class/fc_host/$1/device/../scsi_host* | sed s/.*scsi_host://`
        _npiv_vendor_vport_delete $1
        return
    fi

    # BFA
    if test -e /sys/class/fc_host/$1/device/../scsi_host/*/driver_name ; then
        shost=`ls -1d /sys/class/fc_host/$1/device/../scsi_host/* | sed s#.*scsi_host/##`
        _npiv_vendor_vport_delete $1
        return
    fi

    exit 6
}
# Print the status of every adapter that offers a vport_create file.
# Always returns 0.
vport_status()
{
    # Look via upstream interfaces
    for fchost in /sys/class/fc_host/* ; do
        [ -e $fchost/vport_create ] || continue
        vport_status_display $fchost $fchost
    done

    # Look in vendor-specific locations

    # Emulex: interfaces mirror upstream, but are under adapter scsi_host
    for shost in /sys/class/scsi_host/* ; do
        [ -e $shost/vport_create ] || continue
        fchost=`ls -d $shost/device/fc_host*`
        vport_status_display $fchost $shost
    done

    return 0
}
# Print one adapter's status block.
# $1 - directory holding port_state, port_type, fabric_name and speed
# $2 - directory holding max_npiv_vports, npiv_vports_inuse and modeldesc;
#      it is also the name shown as "fc_host"
# Always returns 0.
vport_status_display()
{
    local attr

    echo
    echo "fc_host: " $2
    for attr in port_state port_type fabric_name ; do
        echo "$attr: " $(cat $1/$attr)
    done
    for attr in max_npiv_vports npiv_vports_inuse modeldesc ; do
        echo "$attr: " $(cat $2/$attr)
    done
    echo "speed: " $(cat $1/speed)

    return 0
}

79
block-npiv-vport Normal file
View File

@ -0,0 +1,79 @@
#!/bin/bash
# Usage: block-npiv-vport [create npivargs | delete vportwwpn | status]
#
# Command line helper to create, delete or list NPIV vports using the
# functions from block-npiv-common.sh.

dir=$(dirname "$0")
. "$dir/block-npiv-common.sh"

#set -x
command=$1
params=$2

case "$command" in
    create)
        # Params is one big arg, with fields separated by hyphens:
        #   FABRIC-VPWWPN-VPWWNN-TGTWWPN-LUN#
        #   arg 2 - Fabric Name
        #   arg 3 - VPORT's WWPN
        #   arg 4 - VPORT's WWNN
        #   arg 5 - Target's WWPN
        #   arg 6 - LUN # on Target
        # no wwn contains a leading 0x - it is a 16 character hex value
        # You may want to optionally pick a specific adapter ?
        #
        # The fields are peeled off from the right; a field that equals
        # what is left of the string means a separator was missing.
        NPIVARGS=$params;
        LUN=${NPIVARGS##*-*-*-*-}; NPIVARGS=${NPIVARGS%-*}
        if test "$LUN" = "$NPIVARGS" ; then exit 1; fi
        TGTWWPN=${NPIVARGS##*-*-*-}; NPIVARGS=${NPIVARGS%-*}
        if test "$TGTWWPN" = "$NPIVARGS" ; then exit 1; fi
        VPORTWWNN=${NPIVARGS##*-*-}; NPIVARGS=${NPIVARGS%-*}
        if test "$VPORTWWNN" = "$NPIVARGS" ; then exit 1; fi
        VPORTWWPN=${NPIVARGS##*-}; NPIVARGS=${NPIVARGS%-*}
        if test "$VPORTWWPN" = "$NPIVARGS" ; then exit 1; fi
        FABRICNM=$NPIVARGS

        # Ensure we compare everything using lower-case hex characters
        TGTWWPN=`echo $TGTWWPN | tr A-Z a-z`
        VPORTWWPN=`echo $VPORTWWPN | tr A-Z a-z`
        VPORTWWNN=`echo $VPORTWWNN | tr A-Z a-z`
        FABRICNM=`echo $FABRICNM | tr A-Z a-z`

        find_vhost $VPORTWWPN
        if test -z "$vhost" ; then
            # create_vport takes the vport WWPN as $1 and the WWNN as $2;
            # the fabric name is not one of its parameters and must not be
            # passed in front of them.
            create_vport $VPORTWWPN $VPORTWWNN
            if [ $? -ne 0 ] ; then exit 2; fi
            # Give the new vport time to appear before looking again
            sleep 8
            find_vhost $VPORTWWPN
            if test -z "$vhost" ; then exit 3; fi
        fi

        exit 0
        ;;

    delete)
        # Params is VPORT's WWPN
        # no wwn contains a leading 0x - it is a 16 character hex value
        VPORTWWPN=$params

        # Ensure we compare everything using lower-case hex characters
        VPORTWWPN=`echo $VPORTWWPN | tr A-Z a-z`

        find_vhost $VPORTWWPN
        if test -z "$vhost" ; then exit 4; fi
        delete_vhost $vhost

        exit 0
        ;;

    status)
        vport_status

        exit 0
        ;;

    *)
        echo "Usage: block-npiv-vport [create npivargs | delete vportwwpn | status]"
        exit 1
        ;;
esac

79
boot.local.xenU Normal file
View File

@ -0,0 +1,79 @@
#! /bin/sh
#
# Copyright (c) 2014 SUSE GmbH Nuernberg, Germany. All rights reserved.
#
# Author: Werner Fink <werner@suse.de>, 1996
# Burchard Steinbild <bs@suse.de>, 1996
#
# /etc/init.d/boot.local
#
# script with local commands to be executed from init on system startup
#
# This variant does two things:
#   1. If $REDIRECT names a ttyS* or hvc* device, it appends an agetty
#      respawn entry for that device to /etc/inittab, appends the device
#      name to /etc/securetty, and then runs "telinit q".
#   2. If /proc/cmdline contains "ip=", it sets the hostname and configures
#      a network interface from the colon-separated fields of that line.
#
# Here you should add things, that should happen directly after booting
# before we're going to the first run level.
#
date
# echo "$MACHINE: running $0 $*"
# Console device without the leading "/dev/", e.g. /dev/ttyS0 -> ttyS0.
my_REDIRECT="$(echo $REDIRECT | sed 's#^/dev/##')"
# Same name without a leading "tty" (ttyS0 -> S0; hvc0 stays hvc0). Used as
# the id field of the inittab entry below.
my_DEVICE="$(echo $my_REDIRECT | sed 's#^tty##')"
# Line speed as printed by "stty speed".
my_SPEED="$(stty speed)"
# echo REDIRECT $REDIRECT $my_REDIRECT
# echo my_DEVICE $my_DEVICE
# echo my_SPEED $my_SPEED
# compose a line like that for inittab
# S0:12345:respawn:/sbin/agetty -L 9600 ttyS0 vt102
# NOTE(review): both branches append unconditionally (">>"); nothing here
# checks whether the same entry is already present from an earlier boot.
case $my_REDIRECT in
ttyS*)
# Serial console: agetty entry with terminal type vt102.
echo adding this line to inittab
echo "$my_DEVICE:12345:respawn:/sbin/agetty -L $my_SPEED $my_REDIRECT vt102"
echo "$my_DEVICE:12345:respawn:/sbin/agetty -L $my_SPEED $my_REDIRECT vt102" >> /etc/inittab
echo $my_REDIRECT >> /etc/securetty
;;
hvc*)
# hvc console: same entry, but with terminal type vt320.
echo adding this line to inittab
echo "$my_DEVICE:12345:respawn:/sbin/agetty -L $my_SPEED $my_REDIRECT vt320"
echo "$my_DEVICE:12345:respawn:/sbin/agetty -L $my_SPEED $my_REDIRECT vt320" >> /etc/inittab
echo $my_REDIRECT >> /etc/securetty
;;
*)
echo "no modification in inittab needed for: $my_REDIRECT"
;;
esac
# Runs in every case, including the "no modification" one. Presumably this
# makes init re-read /etc/inittab so the new getty entry takes effect.
telinit q
# Changes for Xen
# Generate module dependencies if modules.dep is missing for this kernel.
test -f /lib/modules/`uname -r`/modules.dep || depmod -ae
# Non-empty only when the kernel command line contains "ip=".
CMDLINE=`cat /proc/cmdline | grep 'ip='`
if test ! -z "$CMDLINE"; then
# Split the command line at ":" into positional fields. IFS is saved and
# restored around the read.
# NOTE(review): the whole line is split, so $ip receives everything before
# the first colon -- including any kernel parameters that precede "ip=".
# The sed below removes only the "ip=" text itself, not such a prefix.
# Presumably "ip=" is expected to be the first parameter; confirm.
OLDIFS=$IFS
IFS=":"
read ip oth mask gw hostname dev dhcp rest < /proc/cmdline
IFS=$OLDIFS
# Fifth field is used as the hostname.
hostname $hostname
ip=`echo $ip | sed 's/ip= *//'`
if test ! -z "$ip"; then
if test -z "$mask"; then
# No netmask field: use "ip addr". If $ip carries no "/prefix" suffix, a
# /27 prefix is appended before the address is added.
if [ ${ip%/*} = $ip ]; then
ip="$ip/27"
fi
echo "ip addr add $ip dev $dev"
ip addr add $ip dev $dev
ip link set $dev up
else
# Netmask field given: fall back to ifconfig.
# NOTE(review): ifconfig usually takes the interface name as its first
# argument; here "add" comes first and $dev last -- verify this works.
ifconfig add $ip netmask $mask $dev
fi
fi
# True when the seventh field starts with "dhcp" (stripping that prefix
# changes the value).
if test "${dhcp#dhcp}" != "$dhcp"; then
ifup-dhcp $dev
fi
fi

101
boot.xen Normal file
View File

@ -0,0 +1,101 @@
#! /bin/sh
# Copyright (c) 2005-2006 SUSE Linux AG, Nuernberg, Germany.
# All rights reserved.
#
# /etc/init.d/boot.xen
#
# LSB compatible service control script; see http://www.linuxbase.org/spec/
#
# On "start" the script renames /lib/tls to /lib/tls.save when /proc/xen
# exists, and renames it back when /proc/xen is absent. "status" reports
# which of those states the system is in. The rc_* helpers used below are
# not defined here; presumably they come from /etc/rc.status.
#
### BEGIN INIT INFO
# Provides: Xen
# Required-Start: boot.localfs
# Should-Start: boot.localnet
# Required-Stop: boot.localfs
# Should-Stop:
# Default-Start: B
# Default-Stop:
# Short-Description: Switch on and off TLS depending on whether Xen is running
# Description: Xen gets a major performance hit by the way
# recent glibc (& gcc) set up the TLS offset, as it needs to
# play segmentation tricks. This can be avoided by moving away
# the tls libs.
### END INIT INFO
. /etc/rc.status
# Reset status of this service
rc_reset
case "$1" in
start)
echo -n "Starting Xen setup "
# /proc/xen is used throughout as the "running under Xen" indicator.
if test -d /proc/xen; then
# NOTE(review): this export changes only the environment of this script
# and of commands it starts afterwards -- confirm that is sufficient.
export LD_ASSUME_KERNEL=2.4.21
echo -n "Xen running "
fi
# Under Xen: move the TLS libraries out of the way.
# Not under Xen but a saved copy exists: put them back.
if test -d /proc/xen -a -d /lib/tls; then
echo -n "move /lib/tls away "
mv /lib/tls /lib/tls.save
elif test ! -d /proc/xen -a -d /lib/tls.save; then
echo -n "move back /lib/tls "
mv /lib/tls.save /lib/tls
fi
rc_status -v
;;
stop)
# Intentionally does nothing; /lib/tls is not restored on stop.
# rc_status -v
;;
try-restart|condrestart)
# No check whether the service is "running"; restart is invoked
# unconditionally.
$0 restart
# Remember status and be quiet
rc_status
;;
restart)
## Stop the service and regardless of whether it was
## running or not, start it again.
# Only "start" is invoked, since "stop" is a no-op in this script.
$0 start
# Remember status and be quiet
rc_status
;;
force-reload)
$0 try-restart
rc_status
;;
reload)
# Reload is not supported: always reports failure code 3.
rc_failed 3
rc_status -v
;;
status)
echo -n "Checking for Xen "
# Return value is slightly different for the status command:
# 0 - service up and running
# 1 - service dead, but /var/run/ pid file exists
# 2 - service dead, but /var/lock/ lock file exists
# 3 - service not running (unused)
# 4 - service status unknown :-(
# 5--199 reserved (5--99 LSB, 100--149 distro, 150--199 appl.)
if test -d /proc/xen; then
if test -d /lib/tls; then
# Under Xen but /lib/tls has not been moved away: failure code 1.
echo -n "Xen running, /lib/tls existing "
rc_failed 1
else
# Under Xen and /lib/tls is gone: the only branch without rc_failed.
echo -n "Xen running, /lib/tls not existing "
fi
else
# NOTE(review): this branch tests /lib/tls.save, while both messages
# talk about /lib/tls -- confirm whether the wording is intended.
if test -d /lib/tls.save; then
echo -n "Xen not running, /lib/tls existing "
rc_failed 2
else
echo -n "Xen not running, /lib/tls not existing "
rc_failed 3
fi
fi
rc_status -v
;;
*)
echo "Usage: $0 {start|stop|status|try-restart|restart|force-reload|reload}"
exit 1
;;
esac
rc_exit

View File

@ -0,0 +1,201 @@
Index: xen-4.18.0-testing/Config.mk
===================================================================
--- xen-4.18.0-testing.orig/Config.mk
+++ xen-4.18.0-testing/Config.mk
@@ -77,7 +77,7 @@ EXTRA_INCLUDES += $(EXTRA_PREFIX)/includ
EXTRA_LIB += $(EXTRA_PREFIX)/lib
endif
-PYTHON ?= python
+PYTHON ?= python3
PYTHON_PREFIX_ARG ?= --prefix="$(prefix)"
# The above requires that prefix contains *no spaces*. This variable is here
# to permit the user to set PYTHON_PREFIX_ARG to '' to workaround this bug:
Index: xen-4.18.0-testing/tools/configure
===================================================================
--- xen-4.18.0-testing.orig/tools/configure
+++ xen-4.18.0-testing/tools/configure
@@ -7392,15 +7392,15 @@ if test x"${PYTHONPATH}" = x"no"
then
as_fn_error $? "Unable to find $PYTHON, please install $PYTHON" "$LINENO" 5
fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for python version >= 2.6 " >&5
-$as_echo_n "checking for python version >= 2.6 ... " >&6; }
-`$PYTHON -c 'import sys; sys.exit(eval("sys.version_info < (2, 6)"))'`
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for python3 version >= 3.0 " >&5
+$as_echo_n "checking for python3 version >= 3.0 ... " >&6; }
+`$PYTHON -c 'import sys; sys.exit(eval("sys.version_info < (3, 0)"))'`
if test "$?" != "0"
then
python_version=`$PYTHON -V 2>&1`
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
$as_echo "no" >&6; }
- as_fn_error $? "$python_version is too old, minimum required version is 2.6" "$LINENO" 5
+ as_fn_error $? "$python_version is too old, minimum required version is 3.0" "$LINENO" 5
else
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
$as_echo "yes" >&6; }
Index: xen-4.18.0-testing/tools/configure.ac
===================================================================
--- xen-4.18.0-testing.orig/tools/configure.ac
+++ xen-4.18.0-testing/tools/configure.ac
@@ -385,7 +385,7 @@ PYTHONPATH=$PYTHON
PYTHON=`basename $PYTHONPATH`
AX_PATH_PROG_OR_FAIL([PYTHONPATH], [$PYTHON])
-AX_CHECK_PYTHON_VERSION([2], [6])
+AX_CHECK_PYTHON_VERSION([3], [0])
AS_IF([test "$cross_compiling" != yes], [
AX_CHECK_PYTHON_DEVEL()
Index: xen-4.18.0-testing/tools/libs/light/idl.py
===================================================================
--- xen-4.18.0-testing.orig/tools/libs/light/idl.py
+++ xen-4.18.0-testing/tools/libs/light/idl.py
@@ -271,7 +271,7 @@ class KeyedUnion(Aggregate):
if not isinstance(keyvar_type, Enumeration):
raise ValueError
- kv_kwargs = dict([(x.lstrip('keyvar_'),y) for (x,y) in kwargs.items() if x.startswith('keyvar_')])
+ kv_kwargs = dict([(x.lstrip('keyvar_'),y) for (x,y) in list(kwargs.items()) if x.startswith('keyvar_')])
self.keyvar = Field(keyvar_type, keyvar_name, **kv_kwargs)
@@ -317,7 +317,7 @@ class Array(Type):
kwargs.setdefault('json_parse_type', 'JSON_ARRAY')
Type.__init__(self, namespace=elem_type.namespace, typename=elem_type.rawname + " *", **kwargs)
- lv_kwargs = dict([(x.lstrip('lenvar_'),y) for (x,y) in kwargs.items() if x.startswith('lenvar_')])
+ lv_kwargs = dict([(x.lstrip('lenvar_'),y) for (x,y) in list(kwargs.items()) if x.startswith('lenvar_')])
self.lenvar = Field(integer, lenvar_name, **lv_kwargs)
self.elem_type = elem_type
@@ -353,7 +353,7 @@ def parse(f):
globs = {}
locs = OrderedDict()
- for n,t in globals().items():
+ for n,t in list(globals().items()):
if isinstance(t, Type):
globs[n] = t
elif isinstance(t,type(object)) and issubclass(t, Type):
Index: xen-4.18.0-testing/tools/libs/light/gentest.py
===================================================================
--- xen-4.18.0-testing.orig/tools/libs/light/gentest.py
+++ xen-4.18.0-testing/tools/libs/light/gentest.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3
from __future__ import print_function
Index: xen-4.18.0-testing/tools/libs/light/gentypes.py
===================================================================
--- xen-4.18.0-testing.orig/tools/libs/light/gentypes.py
+++ xen-4.18.0-testing/tools/libs/light/gentypes.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3
from __future__ import print_function
Index: xen-4.18.0-testing/tools/include/xen-foreign/mkheader.py
===================================================================
--- xen-4.18.0-testing.orig/tools/include/xen-foreign/mkheader.py
+++ xen-4.18.0-testing/tools/include/xen-foreign/mkheader.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3
from __future__ import print_function
Index: xen-4.18.0-testing/tools/include/xen-foreign/mkchecker.py
===================================================================
--- xen-4.18.0-testing.orig/tools/include/xen-foreign/mkchecker.py
+++ xen-4.18.0-testing/tools/include/xen-foreign/mkchecker.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3
import sys;
from structs import structs, compat_arches;
Index: xen-4.18.0-testing/xen/tools/gen-cpuid.py
===================================================================
--- xen-4.18.0-testing.orig/xen/tools/gen-cpuid.py
+++ xen-4.18.0-testing/xen/tools/gen-cpuid.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
# -*- coding: utf-8 -*-
import sys, os, re
Index: xen-4.18.0-testing/xen/tools/compat-build-source.py
===================================================================
--- xen-4.18.0-testing.orig/xen/tools/compat-build-source.py
+++ xen-4.18.0-testing/xen/tools/compat-build-source.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
import re,sys
Index: xen-4.18.0-testing/xen/tools/compat-build-header.py
===================================================================
--- xen-4.18.0-testing.orig/xen/tools/compat-build-header.py
+++ xen-4.18.0-testing/xen/tools/compat-build-header.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
import re,sys
Index: xen-4.18.0-testing/tools/misc/xensymoops
===================================================================
--- xen-4.18.0-testing.orig/tools/misc/xensymoops
+++ xen-4.18.0-testing/tools/misc/xensymoops
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
# An oops analyser for Xen
# Usage: xensymoops path-to-xen.s < oops-message
@@ -43,12 +43,12 @@ def read_oops():
return (eip_addr, stack_addresses)
def usage():
- print >> sys.stderr, """Usage: %s path-to-asm < oops-msg
+ print("""Usage: %s path-to-asm < oops-msg
The oops message should be fed to the standard input. The
command-line argument specifies the path to the Xen assembly dump
produced by \"make debug\". The location of EIP and the backtrace
will be output to standard output.
- """ % sys.argv[0]
+ """ % sys.argv[0], file=sys.stderr)
sys.exit()
##### main
@@ -99,7 +99,7 @@ while True:
# if this address was seen as a potential code address in the backtrace then
# record it in the backtrace list
- if stk_addrs.has_key(address):
+ if address in stk_addrs:
backtrace.append((stk_addrs[address], address, func))
# if this was the address that EIP...
@@ -107,12 +107,12 @@ while True:
eip_func = func
-print "EIP %s in function %s" % (eip_addr, eip_func)
-print "Backtrace:"
+print("EIP %s in function %s" % (eip_addr, eip_func))
+print("Backtrace:")
# sorting will order primarily by the first element of each tuple,
# i.e. the order in the original oops
backtrace.sort()
for (i, a, f) in backtrace:
- print "%s in function %s" % ( a, f )
+ print("%s in function %s" % ( a, f ))

7
etc_pam.d_xen-api Normal file
View File

@ -0,0 +1,7 @@
#%PAM-1.0
auth required pam_listfile.so onerr=fail item=user \
sense=allow file=/etc/xen/xenapiusers
auth include common-auth
account include common-account
password include common-password
session include common-session

44
hibernate.patch Normal file
View File

@ -0,0 +1,44 @@
Index: xen-4.18.0-testing/tools/libacpi/ssdt_s3.asl
===================================================================
--- xen-4.18.0-testing.orig/tools/libacpi/ssdt_s3.asl
+++ xen-4.18.0-testing/tools/libacpi/ssdt_s3.asl
@@ -7,13 +7,9 @@
DefinitionBlock ("SSDT_S3.aml", "SSDT", 2, "Xen", "HVM", 0)
{
- /* Must match piix emulation */
- Name (\_S3, Package (0x04)
- {
- 0x01, /* PM1a_CNT.SLP_TYP */
- 0x01, /* PM1b_CNT.SLP_TYP */
- 0x0, /* reserved */
- 0x0 /* reserved */
- })
+ /*
+ * Turn off support for s3 sleep state to deal with SVVP tests.
+ * This is what MSFT does on HyperV.
+ */
}
Index: xen-4.18.0-testing/tools/libacpi/ssdt_s4.asl
===================================================================
--- xen-4.18.0-testing.orig/tools/libacpi/ssdt_s4.asl
+++ xen-4.18.0-testing/tools/libacpi/ssdt_s4.asl
@@ -7,13 +7,9 @@
DefinitionBlock ("SSDT_S4.aml", "SSDT", 2, "Xen", "HVM", 0)
{
- /* Must match piix emulation */
- Name (\_S4, Package (0x04)
- {
- 0x00, /* PM1a_CNT.SLP_TYP */
- 0x00, /* PM1b_CNT.SLP_TYP */
- 0x00, /* reserved */
- 0x00 /* reserved */
- })
+ /*
+ * Turn off support for s4 sleep state to deal with SVVP tests.
+ * This is what MSFT does on HyperV.
+ */
}

View File

@ -0,0 +1,54 @@
References: bsc#1172356
The bug is that virt-manager reports a failure when in fact
the host and guest have added the network interface. The Xen
scripts are failing with an error when in fact that command
is succeeding.
The 'ip' commands seem to abort the script due to a 'set -e' in
xen-script-common.sh with what appears to be an error condition.
However, the command actually succeeds when checked from the
host console or also by inserting a sleep before each ip command
and executing it manually at the command line. This seems to be
an artifact of using 'set -e' everywhere.
Index: xen-4.15.0-testing/tools/hotplug/Linux/xen-network-common.sh
===================================================================
--- xen-4.15.0-testing.orig/tools/hotplug/Linux/xen-network-common.sh
+++ xen-4.15.0-testing/tools/hotplug/Linux/xen-network-common.sh
@@ -90,7 +90,7 @@ _setup_bridge_port() {
local virtual="$2"
# take interface down ...
- ip link set dev ${dev} down
+ (ip link set dev ${dev} down || true)
if [ $virtual -ne 0 ] ; then
# Initialise a dummy MAC address. We choose the numerically
@@ -101,7 +101,7 @@ _setup_bridge_port() {
fi
# ... and configure it
- ip address flush dev ${dev}
+ (ip address flush dev ${dev} || true)
}
setup_physical_bridge_port() {
@@ -136,15 +136,15 @@ add_to_bridge () {
if [ ! -e "/sys/class/net/${bridge}/brif/${dev}" ]; then
log debug "adding $dev to bridge $bridge"
if which brctl >&/dev/null; then
- brctl addif ${bridge} ${dev}
+ (brctl addif ${bridge} ${dev} || true)
else
- ip link set ${dev} master ${bridge}
+ (ip link set ${dev} master ${bridge} || true)
fi
else
log debug "$dev already on bridge $bridge"
fi
- ip link set dev ${dev} up
+ (ip link set dev ${dev} up || true)
}
remove_from_bridge () {

119
init.pciback Normal file
View File

@ -0,0 +1,119 @@
#!/bin/bash
#
# Copyright (c) 2014 SUSE GmbH Nuernberg, Germany. All rights reserved.
#
# /etc/init.d/pciback
#
### BEGIN INIT INFO
# Provides: pciback
# Required-Start: $syslog $network
# Should-Start: $null
# Required-Stop: $syslog $network
# Should-Stop: $null
# Default-Start: 3 5
# Default-Stop: 0 1 2 6
# Description: bind PCI devices to pciback
### END INIT INFO
. /etc/rc.status
. /etc/sysconfig/pciback
rc_reset
# Load the pciback kernel module unless lsmod already lists it.
# Returns 0 when the module was already present, otherwise the exit status
# of modprobe.
load_pciback() {
    # Guard: nothing to do when the module shows up in lsmod.
    lsmod | grep -qi "pciback" && return 0

    echo "Loading pciback ..."
    modprobe pciback
}
# Remove the pciback kernel module if lsmod currently lists it.
# Returns 0 when the module was not loaded, otherwise the exit status of
# modprobe -r.
unload_pciback() {
    # Guard: nothing to do when the module is absent from lsmod.
    lsmod | grep -qi "pciback" || return 0

    echo "Unloading pciback ..."
    modprobe -r pciback
}
# Bind every device named in XEN_PCI_HIDE_LIST to the pciback driver.
#
# Each whitespace-separated entry is read as "<driver>,<pci-id>": field 1 is
# the driver the device may currently be bound to, field 2 the device id used
# under /sys/bus/pci/drivers. XEN_PCI_HIDE_LIST is not set in this function;
# presumably it comes from /etc/sysconfig/pciback.
bind_dev_to_pciback() {
    local driver slot

    for DEVICE in ${XEN_PCI_HIDE_LIST}; do
        driver=$(echo ${DEVICE} | /usr/bin/cut -d "," -f 1)
        slot=$(echo ${DEVICE} | /usr/bin/cut -d "," -f 2)

        # Already listed under pciback: skip this entry.
        if ls /sys/bus/pci/drivers/pciback/${slot} > /dev/null 2>&1; then
            continue
        fi

        echo "Binding ${slot} ..."

        # Detach from the named driver first, but only if it holds the device.
        if ls /sys/bus/pci/drivers/${driver}/${slot} > /dev/null 2>&1; then
            echo -n ${slot} > /sys/bus/pci/drivers/${driver}/unbind
        fi

        # Register the slot with pciback, then bind the device to it.
        echo -n ${slot} > /sys/bus/pci/drivers/pciback/new_slot
        echo -n ${slot} > /sys/bus/pci/drivers/pciback/bind
    done
}
# Unbind every device named in XEN_PCI_HIDE_LIST from the pciback driver.
#
# Each whitespace-separated entry is read as "<driver>,<pci-id>"; only the
# second field (the device id under /sys/bus/pci/drivers/pciback) is needed
# here. XEN_PCI_HIDE_LIST is not set in this function; presumably it comes
# from /etc/sysconfig/pciback.
unbind_dev_from_pciback() {
    for DEVICE in ${XEN_PCI_HIDE_LIST}
    do
        local PCIID=`echo ${DEVICE} | /usr/bin/cut -d "," -f 2`
        # Probe quietly: a device that is not bound to pciback is the normal
        # "nothing to do" case, so the error output of ls is discarded, the
        # same way bind_dev_to_pciback probes.
        if ls /sys/bus/pci/drivers/pciback/${PCIID} > /dev/null 2>&1
        then
            echo "Unbinding ${PCIID} ..."
            echo -n ${PCIID} > /sys/bus/pci/drivers/pciback/unbind
        fi
    done
}
# NOTE(review): `test "uname -r"` only checks that the literal string
# "uname -r" is non-empty and writes nothing to the pipe, so grep never
# matches and this line never exits. Presumably a check of the running kernel
# release (`uname -r | grep xen`) was intended -- confirm the desired
# behavior before changing it.
test "uname -r" | grep xen && exit 0
# Dispatch on the requested init action.
case $1 in
start)
# Load the module, then bind the configured devices to it.
echo "Starting pciback ..."
echo
load_pciback
bind_dev_to_pciback
rc_status -v -r
;;
stop)
# Reverse order of start: release the devices before unloading the module.
echo "Stopping pciback ..."
echo
unbind_dev_from_pciback
unload_pciback
rc_status -v
;;
reload|restart)
# Same steps as stop followed by start, inlined. Unlike those two branches,
# no rc_status call is made here.
echo "Stopping pciback ..."
echo
unbind_dev_from_pciback
unload_pciback
echo "Starting pciback ..."
echo
load_pciback
bind_dev_to_pciback
;;
status)
# Report whether the module is loaded and list the entries in the pciback
# sysfs directory whose names start with "0000".
if lsmod | grep -qi pciback
then
echo
echo "pciback: loaded"
echo
echo "Currently bound devices ..."
echo "-----------------------------"
ls /sys/bus/pci/drivers/pciback | grep ^0000
echo
else
echo "pciback: not loaded"
fi
;;
*)
echo "Usage: $0 [start|stop|restart|reload|status]"
exit 1
;;
esac

64
libxc-bitmap-long.patch Normal file
View File

@ -0,0 +1,64 @@
From: Olaf Hering <olaf@aepfle.de>
Date: Wed, 9 Dec 2020 16:40:00 +0100
Subject: libxc sr bitmap long
tools: add API to work with several bits at once
Introduce new API to test if a fixed number of bits is clear or set,
and clear or set them all at once.
The caller has to make sure the input bitnumber is a multiple of BITS_PER_LONG.
This API avoids the loop over each bit in a known range just to see
if all of them are either clear or set.
Signed-off-by: Olaf Hering <olaf@aepfle.de>
v02:
- change return type from int to bool (jgross)
---
tools/libs/ctrl/xc_bitops.h | 28 ++++++++++++++++++++++++++++
1 file changed, 28 insertions(+)
--- a/tools/libs/ctrl/xc_bitops.h
+++ b/tools/libs/ctrl/xc_bitops.h
@@ -3,6 +3,7 @@
/* bitmap operations for single threaded access */
+#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
@@ -81,4 +82,31 @@ static inline void bitmap_or(void *_dst,
dst[i] |= other[i];
}
+static inline bool test_bit_long_set(unsigned long nr_base, const void *_addr)
+{
+ const unsigned long *addr = _addr;
+ unsigned long val = addr[nr_base / BITS_PER_LONG];
+
+ return val == ~0;
+}
+
+static inline bool test_bit_long_clear(unsigned long nr_base, const void *_addr)
+{
+ const unsigned long *addr = _addr;
+ unsigned long val = addr[nr_base / BITS_PER_LONG];
+
+ return val == 0;
+}
+
+static inline void clear_bit_long(unsigned long nr_base, void *_addr)
+{
+ unsigned long *addr = _addr;
+ addr[nr_base / BITS_PER_LONG] = 0;
+}
+
+static inline void set_bit_long(unsigned long nr_base, void *_addr)
+{
+ unsigned long *addr = _addr;
+ addr[nr_base / BITS_PER_LONG] = ~0;
+}
#endif /* XC_BITOPS_H */

View File

@ -0,0 +1,144 @@
From: Olaf Hering <olaf@aepfle.de>
Date: Thu, 7 Jan 2021 15:58:30 +0100
Subject: libxc sr LIBXL_HAVE_DOMAIN_SUSPEND_PROPS
tools: adjust libxl_domain_suspend to receive a struct props
Upcoming changes will pass more knobs down to xc_domain_save.
Adjust the libxl_domain_suspend API to allow easy adding of additional knobs.
No change in behavior intended.
Signed-off-by: Olaf Hering <olaf@aepfle.de>
Acked-by: Christian Lindig <christian.lindig@citrix.com>
---
tools/include/libxl.h | 26 +++++++++++++++++++++++---
tools/libs/light/libxl_domain.c | 7 ++++---
tools/xl/xl_migrate.c | 9 ++++++---
tools/xl/xl_saverestore.c | 3 ++-
4 files changed, 35 insertions(+), 10 deletions(-)
--- a/tools/include/libxl.h
+++ b/tools/include/libxl.h
@@ -1811,13 +1811,28 @@ static inline int libxl_retrieve_domain_
libxl_retrieve_domain_configuration_0x041200
#endif
-int libxl_domain_suspend(libxl_ctx *ctx, uint32_t domid, int fd,
- int flags, /* LIBXL_SUSPEND_* */
- const libxl_asyncop_how *ao_how)
- LIBXL_EXTERNAL_CALLERS_ONLY;
+/*
+ * LIBXL_HAVE_DOMAIN_SUSPEND_PROPS indicates that the
+ * libxl_domain_suspend_props() function takes a props struct.
+ */
+#define LIBXL_HAVE_DOMAIN_SUSPEND_PROPS 1
+
+typedef struct {
+ uint32_t flags; /* LIBXL_SUSPEND_* */
+} libxl_domain_suspend_suse_properties;
#define LIBXL_SUSPEND_DEBUG 1
#define LIBXL_SUSPEND_LIVE 2
+#define LIBXL_HAVE_DOMAIN_SUSPEND_SUSE
+int libxl_domain_suspend_suse(libxl_ctx *ctx, uint32_t domid, int fd,
+ const libxl_domain_suspend_suse_properties *props, /* optional */
+ const libxl_asyncop_how *ao_how)
+ LIBXL_EXTERNAL_CALLERS_ONLY;
+
+int libxl_domain_suspend(libxl_ctx *ctx, uint32_t domid, int fd, int flags,
+ const libxl_asyncop_how *ao_how)
+ LIBXL_EXTERNAL_CALLERS_ONLY;
+
/*
* Only suspend domain, do not save its state to file, do not destroy it.
* Suspended domain can be resumed with libxl_domain_resume()
--- a/tools/libs/light/libxl_domain.c
+++ b/tools/libs/light/libxl_domain.c
@@ -502,7 +502,8 @@ static void domain_suspend_cb(libxl__egc
}
-int libxl_domain_suspend(libxl_ctx *ctx, uint32_t domid, int fd, int flags,
+static int do_libxl_domain_suspend(libxl_ctx *ctx, uint32_t domid, int fd,
+ const libxl_domain_suspend_suse_properties *props,
const libxl_asyncop_how *ao_how)
{
AO_CREATE(ctx, domid, ao_how);
@@ -523,8 +524,8 @@ int libxl_domain_suspend(libxl_ctx *ctx,
dss->domid = domid;
dss->fd = fd;
dss->type = type;
- dss->live = flags & LIBXL_SUSPEND_LIVE;
- dss->debug = flags & LIBXL_SUSPEND_DEBUG;
+ dss->live = props->flags & LIBXL_SUSPEND_LIVE;
+ dss->debug = props->flags & LIBXL_SUSPEND_DEBUG;
dss->checkpointed_stream = LIBXL_CHECKPOINTED_STREAM_NONE;
rc = libxl__fd_flags_modify_save(gc, dss->fd,
@@ -539,6 +540,21 @@ int libxl_domain_suspend(libxl_ctx *ctx,
return AO_CREATE_FAIL(rc);
}
+int libxl_domain_suspend_suse(libxl_ctx *ctx, uint32_t domid, int fd,
+ const libxl_domain_suspend_suse_properties *props,
+ const libxl_asyncop_how *ao_how)
+{
+ return do_libxl_domain_suspend(ctx, domid, fd, props, ao_how);
+}
+
+int libxl_domain_suspend(libxl_ctx *ctx, uint32_t domid, int fd, int flags,
+ const libxl_asyncop_how *ao_how)
+{
+ libxl_domain_suspend_suse_properties props = { .flags = flags };
+
+ return do_libxl_domain_suspend(ctx, domid, fd, &props, ao_how);
+}
+
static void domain_suspend_empty_cb(libxl__egc *egc,
libxl__domain_suspend_state *dss, int rc)
{
--- a/tools/xl/xl_migrate.c
+++ b/tools/xl/xl_migrate.c
@@ -186,7 +186,10 @@ static void migrate_domain(uint32_t domi
char *away_domname;
char rc_buf;
uint8_t *config_data;
- int config_len, flags = LIBXL_SUSPEND_LIVE;
+ int config_len;
+ libxl_domain_suspend_suse_properties props = {
+ .flags = LIBXL_SUSPEND_LIVE,
+ };
save_domain_core_begin(domid, preserve_domid, override_config_file,
&config_data, &config_len);
@@ -205,8 +208,8 @@ static void migrate_domain(uint32_t domi
xtl_stdiostream_adjust_flags(logger, XTL_STDIOSTREAM_HIDE_PROGRESS, 0);
if (debug)
- flags |= LIBXL_SUSPEND_DEBUG;
- rc = libxl_domain_suspend(ctx, domid, send_fd, flags, NULL);
+ props.flags |= LIBXL_SUSPEND_DEBUG;
+ rc = libxl_domain_suspend_suse(ctx, domid, send_fd, &props, NULL);
if (rc) {
fprintf(stderr, "migration sender: libxl_domain_suspend failed"
" (rc=%d)\n", rc);
--- a/tools/xl/xl_saverestore.c
+++ b/tools/xl/xl_saverestore.c
@@ -130,6 +130,7 @@ static int save_domain(uint32_t domid, i
int fd;
uint8_t *config_data;
int config_len;
+ libxl_domain_suspend_suse_properties props = {};
save_domain_core_begin(domid, preserve_domid, override_config_file,
&config_data, &config_len);
@@ -146,7 +147,7 @@ static int save_domain(uint32_t domid, i
save_domain_core_writeconfig(fd, filename, config_data, config_len);
- int rc = libxl_domain_suspend(ctx, domid, fd, 0, NULL);
+ int rc = libxl_domain_suspend_suse(ctx, domid, fd, &props, NULL);
close(fd);
if (rc < 0) {

View File

@ -0,0 +1,238 @@
From: Olaf Hering <olaf@aepfle.de>
Date: Thu, 7 Jan 2021 20:25:28 +0100
Subject: libxc sr abort_if_busy
tools: add --abort_if_busy to libxl_domain_suspend
Provide a knob to the host admin to abort the live migration of a
running domU if the downtime during final transit will be too long
for the workload within domU.
Adjust error reporting. Add ERROR_MIGRATION_ABORTED to allow callers of
libxl_domain_suspend to distinguish between errors and the requested
constraint.
Adjust precopy_policy to simplify reporting of remaining dirty pages.
The loop in send_memory_live populates ->dirty_count in a different
place than ->iteration. Let it proceed one more time to provide the
desired information before leaving the loop.
This patch adjusts xl(1) and the libxl API.
External users check LIBXL_HAVE_DOMAIN_SUSPEND_PROPS for the availability
of the new .abort_if_busy property.
Signed-off-by: Olaf Hering <olaf@aepfle.de>
---
docs/man/xl.1.pod.in | 8 +++++++
tools/include/libxl.h | 1 +
tools/libs/light/libxl_dom_save.c | 7 ++++++-
tools/libs/light/libxl_domain.c | 1 +
tools/libs/light/libxl_internal.h | 2 ++
tools/libs/light/libxl_stream_write.c | 9 +++++++-
tools/libs/light/libxl_types.idl | 1 +
tools/xl/xl_cmdtable.c | 6 +++++-
tools/xl/xl_migrate.c | 30 ++++++++++++++++++++-------
9 files changed, 55 insertions(+), 10 deletions(-)
--- a/docs/man/xl.1.pod.in
+++ b/docs/man/xl.1.pod.in
@@ -513,6 +513,14 @@ low, the guest is suspended and the domU
This allows the host admin to control for how long the domU will likely
be suspended during transit.
+=item B<--abort_if_busy>
+
+Abort migration instead of doing final suspend/move/resume if the
+guest produced more than I<min_remaining> dirty pages during the number
+of I<max_iters> iterations.
+This avoids long periods of time where the guest is suspended, which
+may confuse the workload within domU.
+
=back
=item B<remus> [I<OPTIONS>] I<domain-id> I<host>
--- a/tools/include/libxl.h
+++ b/tools/include/libxl.h
@@ -1824,6 +1824,7 @@ typedef struct {
} libxl_domain_suspend_suse_properties;
#define LIBXL_SUSPEND_DEBUG 1
#define LIBXL_SUSPEND_LIVE 2
+#define LIBXL_SUSPEND_ABORT_IF_BUSY 4
#define LIBXL_HAVE_DOMAIN_SUSPEND_SUSE
int libxl_domain_suspend_suse(libxl_ctx *ctx, uint32_t domid, int fd,
--- a/tools/libs/light/libxl_dom_save.c
+++ b/tools/libs/light/libxl_dom_save.c
@@ -383,11 +383,16 @@ static int libxl__domain_save_precopy_po
stats.iteration, stats.dirty_count, stats.total_written);
if (stats.dirty_count >= 0 && stats.dirty_count < dss->min_remaining)
goto stop_copy;
- if (stats.iteration >= dss->max_iters)
+ if (stats.dirty_count >= 0 && stats.iteration >= dss->max_iters)
goto stop_copy;
return XGS_POLICY_CONTINUE_PRECOPY;
stop_copy:
+ if (dss->abort_if_busy)
+ {
+ dss->remaining_dirty_pages = stats.dirty_count;
+ return XGS_POLICY_ABORT;
+ }
return XGS_POLICY_STOP_AND_COPY;
}
--- a/tools/libs/light/libxl_domain.c
+++ b/tools/libs/light/libxl_domain.c
@@ -526,6 +526,7 @@ static int do_libxl_domain_suspend(libxl
dss->type = type;
dss->max_iters = props->max_iters ?: LIBXL_XGS_POLICY_MAX_ITERATIONS;
dss->min_remaining = props->min_remaining ?: LIBXL_XGS_POLICY_TARGET_DIRTY_COUNT;
+ dss->abort_if_busy = props->flags & LIBXL_SUSPEND_ABORT_IF_BUSY;
dss->live = props->flags & LIBXL_SUSPEND_LIVE;
dss->debug = props->flags & LIBXL_SUSPEND_DEBUG;
dss->checkpointed_stream = LIBXL_CHECKPOINTED_STREAM_NONE;
--- a/tools/libs/light/libxl_internal.h
+++ b/tools/libs/light/libxl_internal.h
@@ -3655,9 +3655,11 @@ struct libxl__domain_save_state {
libxl_domain_type type;
int live;
int debug;
+ int abort_if_busy;
int checkpointed_stream;
uint32_t max_iters;
uint32_t min_remaining;
+ long remaining_dirty_pages;
const libxl_domain_remus_info *remus;
/* private */
int rc;
--- a/tools/libs/light/libxl_stream_write.c
+++ b/tools/libs/light/libxl_stream_write.c
@@ -344,11 +344,18 @@ void libxl__xc_domain_save_done(libxl__e
goto err;
if (retval) {
+ if (dss->remaining_dirty_pages) {
+ LOGD(NOTICE, dss->domid, "saving domain: aborted,"
+ " %ld remaining dirty pages.", dss->remaining_dirty_pages);
+ } else {
LOGEVD(ERROR, errnoval, dss->domid, "saving domain: %s",
dss->dsps.guest_responded ?
"domain responded to suspend request" :
"domain did not respond to suspend request");
- if (!dss->dsps.guest_responded)
+ }
+ if (dss->remaining_dirty_pages)
+ rc = ERROR_MIGRATION_ABORTED;
+ else if(!dss->dsps.guest_responded)
rc = ERROR_GUEST_TIMEDOUT;
else if (dss->rc)
rc = dss->rc;
--- a/tools/libs/light/libxl_types.idl
+++ b/tools/libs/light/libxl_types.idl
@@ -76,6 +76,7 @@ libxl_error = Enumeration("error", [
(-30, "QMP_DEVICE_NOT_ACTIVE"), # a device has failed to be become active
(-31, "QMP_DEVICE_NOT_FOUND"), # the requested device has not been found
(-32, "QEMU_API"), # QEMU's replies don't contains expected members
+ (-33, "MIGRATION_ABORTED"),
], value_namespace = "")
libxl_domain_type = Enumeration("domain_type", [
--- a/tools/xl/xl_cmdtable.c
+++ b/tools/xl/xl_cmdtable.c
@@ -177,7 +177,11 @@ const struct cmd_spec cmd_table[] = {
"-p Do not unpause domain after migrating it.\n"
"-D Preserve the domain id\n"
"--max_iters N Number of copy iterations before final stop+move\n"
- "--min_remaining N Number of remaining dirty pages before final stop+move"
+ "--min_remaining N Number of remaining dirty pages before final stop+move\n"
+ "--abort_if_busy Abort migration instead of doing final stop+move,\n"
+ " if the number of dirty pages is higher than <min_remaining>\n"
+ " after <max_iters> iterations. Otherwise the amount of memory\n"
+ " to be transfered would exceed maximum allowed domU downtime."
},
{ "restore",
&main_restore, 0, 1,
--- a/tools/xl/xl_migrate.c
+++ b/tools/xl/xl_migrate.c
@@ -177,7 +177,7 @@ static void migrate_do_preamble(int send
}
static void migrate_domain(uint32_t domid, int preserve_domid,
- const char *rune, int debug,
+ const char *rune, int debug, int abort_if_busy,
uint32_t max_iters,
uint32_t min_remaining,
const char *override_config_file)
@@ -213,14 +213,20 @@ static void migrate_domain(uint32_t domi
if (debug)
props.flags |= LIBXL_SUSPEND_DEBUG;
+ if (abort_if_busy)
+ props.flags |= LIBXL_SUSPEND_ABORT_IF_BUSY;
rc = libxl_domain_suspend_suse(ctx, domid, send_fd, &props, NULL);
if (rc) {
fprintf(stderr, "migration sender: libxl_domain_suspend failed"
" (rc=%d)\n", rc);
- if (rc == ERROR_GUEST_TIMEDOUT)
- goto failed_suspend;
- else
- goto failed_resume;
+ switch (rc) {
+ case ERROR_GUEST_TIMEDOUT:
+ goto failed_suspend;
+ case ERROR_MIGRATION_ABORTED:
+ goto failed_busy;
+ default:
+ goto failed_resume;
+ }
}
//fprintf(stderr, "migration sender: Transfer complete.\n");
@@ -302,6 +308,12 @@ static void migrate_domain(uint32_t domi
fprintf(stderr, "Migration failed, failed to suspend at sender.\n");
exit(EXIT_FAILURE);
+ failed_busy:
+ close(send_fd);
+ migration_child_report(recv_fd);
+ fprintf(stderr, "Migration aborted as requested, domain is too busy.\n");
+ exit(EXIT_FAILURE);
+
failed_resume:
close(send_fd);
migration_child_report(recv_fd);
@@ -545,13 +557,14 @@ int main_migrate(int argc, char **argv)
char *rune = NULL;
char *host;
int opt, daemonize = 1, monitor = 1, debug = 0, pause_after_migration = 0;
- int preserve_domid = 0;
+ int preserve_domid = 0, abort_if_busy = 0;
uint32_t max_iters = 0;
uint32_t min_remaining = 0;
static struct option opts[] = {
{"debug", 0, 0, 0x100},
{"max_iters", 1, 0, 0x101},
{"min_remaining", 1, 0, 0x102},
+ {"abort_if_busy", 0, 0, 0x103},
{"live", 0, 0, 0x200},
COMMON_LONG_OPTS
};
@@ -585,6 +598,9 @@ int main_migrate(int argc, char **argv)
case 0x102: /* --min_remaining */
min_remaining = atoi(optarg);
break;
+ case 0x103: /* --abort_if_busy */
+ abort_if_busy = 1;
+ break;
case 0x200: /* --live */
/* ignored for compatibility with xm */
break;
@@ -619,7 +635,7 @@ int main_migrate(int argc, char **argv)
pause_after_migration ? " -p" : "");
}
- migrate_domain(domid, preserve_domid, rune, debug,
+ migrate_domain(domid, preserve_domid, rune, debug, abort_if_busy,
max_iters, min_remaining, config_filename);
return EXIT_SUCCESS;
}

148
libxc-sr-max_iters.patch Normal file
View File

@ -0,0 +1,148 @@
From: Olaf Hering <olaf@aepfle.de>
Date: Sat, 9 Jan 2021 11:32:17 +0100
Subject: libxc sr max_iters
tools: add --max_iters to libxl_domain_suspend
Migrating a large, and potentially busy, domU will take more
time than necessary due to an excessive number of copying iterations.
Allow the host admin to control the number of iterations which
copy accumulated domU dirty pages to the target host.
The default remains 5, which means one initial iteration to copy the
entire domU memory, and up to 4 additional iterations to copy dirty
memory from the still running domU. After the given number of iterations
the domU is suspended, remaining dirty memory is copied and the domU is
finally moved to the target host.
This patch adjusts xl(1) and the libxl API.
External users check LIBXL_HAVE_DOMAIN_SUSPEND_PROPS for the availability
of the new .max_iters property.
Signed-off-by: Olaf Hering <olaf@aepfle.de>
---
docs/man/xl.1.pod.in | 4 ++++
tools/include/libxl.h | 1 +
tools/libs/light/libxl_dom_save.c | 2 +-
tools/libs/light/libxl_domain.c | 1 +
tools/libs/light/libxl_internal.h | 1 +
tools/xl/xl_cmdtable.c | 3 ++-
tools/xl/xl_migrate.c | 10 +++++++++-
7 files changed, 19 insertions(+), 3 deletions(-)
--- a/docs/man/xl.1.pod.in
+++ b/docs/man/xl.1.pod.in
@@ -501,6 +501,10 @@ such that it will be identical on the de
configuration is overridden using the B<-C> option. Note that it is not
possible to use this option for a 'localhost' migration.
+=item B<--max_iters> I<iterations>
+
+Number of copy iterations before final suspend+move (default: 5)
+
=back
=item B<remus> [I<OPTIONS>] I<domain-id> I<host>
--- a/tools/include/libxl.h
+++ b/tools/include/libxl.h
@@ -1819,6 +1819,7 @@ static inline int libxl_retrieve_domain_
typedef struct {
uint32_t flags; /* LIBXL_SUSPEND_* */
+ uint32_t max_iters;
} libxl_domain_suspend_suse_properties;
#define LIBXL_SUSPEND_DEBUG 1
#define LIBXL_SUSPEND_LIVE 2
--- a/tools/libs/light/libxl_dom_save.c
+++ b/tools/libs/light/libxl_dom_save.c
@@ -383,7 +383,7 @@ static int libxl__domain_save_precopy_po
stats.iteration, stats.dirty_count, stats.total_written);
if (stats.dirty_count >= 0 && stats.dirty_count < LIBXL_XGS_POLICY_TARGET_DIRTY_COUNT)
goto stop_copy;
- if (stats.iteration >= LIBXL_XGS_POLICY_MAX_ITERATIONS)
+ if (stats.iteration >= dss->max_iters)
goto stop_copy;
return XGS_POLICY_CONTINUE_PRECOPY;
--- a/tools/libs/light/libxl_domain.c
+++ b/tools/libs/light/libxl_domain.c
@@ -524,6 +524,7 @@ static int do_libxl_domain_suspend(libxl
dss->domid = domid;
dss->fd = fd;
dss->type = type;
+ dss->max_iters = props->max_iters ?: LIBXL_XGS_POLICY_MAX_ITERATIONS;
dss->live = props->flags & LIBXL_SUSPEND_LIVE;
dss->debug = props->flags & LIBXL_SUSPEND_DEBUG;
dss->checkpointed_stream = LIBXL_CHECKPOINTED_STREAM_NONE;
--- a/tools/libs/light/libxl_internal.h
+++ b/tools/libs/light/libxl_internal.h
@@ -3656,6 +3656,7 @@ struct libxl__domain_save_state {
int live;
int debug;
int checkpointed_stream;
+ uint32_t max_iters;
const libxl_domain_remus_info *remus;
/* private */
int rc;
--- a/tools/xl/xl_cmdtable.c
+++ b/tools/xl/xl_cmdtable.c
@@ -175,7 +175,8 @@ const struct cmd_spec cmd_table[] = {
" of the domain.\n"
"--debug Enable verification mode.\n"
"-p Do not unpause domain after migrating it.\n"
- "-D Preserve the domain id"
+ "-D Preserve the domain id\n"
+ "--max_iters N Number of copy iterations before final stop+move"
},
{ "restore",
&main_restore, 0, 1,
--- a/tools/xl/xl_migrate.c
+++ b/tools/xl/xl_migrate.c
@@ -178,6 +178,7 @@ static void migrate_do_preamble(int send
static void migrate_domain(uint32_t domid, int preserve_domid,
const char *rune, int debug,
+ uint32_t max_iters,
const char *override_config_file)
{
pid_t child = -1;
@@ -189,6 +190,7 @@ static void migrate_domain(uint32_t domi
int config_len;
libxl_domain_suspend_suse_properties props = {
.flags = LIBXL_SUSPEND_LIVE,
+ .max_iters = max_iters,
};
save_domain_core_begin(domid, preserve_domid, override_config_file,
@@ -542,8 +544,10 @@ int main_migrate(int argc, char **argv)
char *host;
int opt, daemonize = 1, monitor = 1, debug = 0, pause_after_migration = 0;
int preserve_domid = 0;
+ uint32_t max_iters = 0;
static struct option opts[] = {
{"debug", 0, 0, 0x100},
+ {"max_iters", 1, 0, 0x101},
{"live", 0, 0, 0x200},
COMMON_LONG_OPTS
};
@@ -571,6 +575,9 @@ int main_migrate(int argc, char **argv)
case 0x100: /* --debug */
debug = 1;
break;
+ case 0x101: /* --max_iters */
+ max_iters = atoi(optarg);
+ break;
case 0x200: /* --live */
/* ignored for compatibility with xm */
break;
@@ -605,7 +612,8 @@ int main_migrate(int argc, char **argv)
pause_after_migration ? " -p" : "");
}
- migrate_domain(domid, preserve_domid, rune, debug, config_filename);
+ migrate_domain(domid, preserve_domid, rune, debug,
+ max_iters, config_filename);
return EXIT_SUCCESS;
}

View File

@ -0,0 +1,173 @@
From: Olaf Hering <olaf@aepfle.de>
Date: Thu, 7 Jan 2021 19:39:28 +0100
Subject: libxc sr min_remaining
tools: add --min_remaining to libxl_domain_suspend
The decision to stop+move a domU to the new host must be based on two factors:
- the available network bandwidth for the migration stream
- the maximum time a workload within a domU can be safely suspended
Both values define how many dirty pages a workload may produce prior to the
final stop+move.
The default value of 50 pages is much too low with today's network bandwidths.
On an idle 1GiB link these 200K will be transferred within ~2ms.
Give the admin a knob to adjust the point when the final stop+move will
be done, so he can base this decision on his own needs.
This patch adjusts xl(1) and the libxl API.
External users check LIBXL_HAVE_DOMAIN_SUSPEND_PROPS for the availability
of the new .min_remaining property.
Signed-off-by: Olaf Hering <olaf@aepfle.de>
---
docs/man/xl.1.pod.in | 8 ++++++++
tools/include/libxl.h | 1 +
tools/libs/light/libxl_dom_save.c | 2 +-
tools/libs/light/libxl_domain.c | 1 +
tools/libs/light/libxl_internal.h | 1 +
tools/xl/xl_cmdtable.c | 23 ++++++++++++-----------
tools/xl/xl_migrate.c | 9 ++++++++-
7 files changed, 32 insertions(+), 13 deletions(-)
--- a/docs/man/xl.1.pod.in
+++ b/docs/man/xl.1.pod.in
@@ -505,6 +505,14 @@ possible to use this option for a 'local
Number of copy iterations before final suspend+move (default: 5)
+=item B<--min_remaining> I<pages>
+
+Number of remaining dirty pages. If the number of dirty pages drops that
+low, the guest is suspended and the domU will finally be moved to I<host>.
+
+This allows the host admin to control for how long the domU will likely
+be suspended during transit.
+
=back
=item B<remus> [I<OPTIONS>] I<domain-id> I<host>
--- a/tools/include/libxl.h
+++ b/tools/include/libxl.h
@@ -1820,6 +1820,7 @@ static inline int libxl_retrieve_domain_
typedef struct {
uint32_t flags; /* LIBXL_SUSPEND_* */
uint32_t max_iters;
+ uint32_t min_remaining;
} libxl_domain_suspend_suse_properties;
#define LIBXL_SUSPEND_DEBUG 1
#define LIBXL_SUSPEND_LIVE 2
--- a/tools/libs/light/libxl_dom_save.c
+++ b/tools/libs/light/libxl_dom_save.c
@@ -381,7 +381,7 @@ static int libxl__domain_save_precopy_po
LOGD(DEBUG, shs->domid, "iteration %u dirty_count %ld total_written %lu",
stats.iteration, stats.dirty_count, stats.total_written);
- if (stats.dirty_count >= 0 && stats.dirty_count < LIBXL_XGS_POLICY_TARGET_DIRTY_COUNT)
+ if (stats.dirty_count >= 0 && stats.dirty_count < dss->min_remaining)
goto stop_copy;
if (stats.iteration >= dss->max_iters)
goto stop_copy;
--- a/tools/libs/light/libxl_domain.c
+++ b/tools/libs/light/libxl_domain.c
@@ -525,6 +525,7 @@ static int do_libxl_domain_suspend(libxl
dss->fd = fd;
dss->type = type;
dss->max_iters = props->max_iters ?: LIBXL_XGS_POLICY_MAX_ITERATIONS;
+ dss->min_remaining = props->min_remaining ?: LIBXL_XGS_POLICY_TARGET_DIRTY_COUNT;
dss->live = props->flags & LIBXL_SUSPEND_LIVE;
dss->debug = props->flags & LIBXL_SUSPEND_DEBUG;
dss->checkpointed_stream = LIBXL_CHECKPOINTED_STREAM_NONE;
--- a/tools/libs/light/libxl_internal.h
+++ b/tools/libs/light/libxl_internal.h
@@ -3657,6 +3657,7 @@ struct libxl__domain_save_state {
int debug;
int checkpointed_stream;
uint32_t max_iters;
+ uint32_t min_remaining;
const libxl_domain_remus_info *remus;
/* private */
int rc;
--- a/tools/xl/xl_cmdtable.c
+++ b/tools/xl/xl_cmdtable.c
@@ -166,17 +166,18 @@ const struct cmd_spec cmd_table[] = {
&main_migrate, 0, 1,
"Migrate a domain to another host",
"[options] <Domain> <host>",
- "-h Print this help.\n"
- "-C <config> Send <config> instead of config file from creation.\n"
- "-s <sshcommand> Use <sshcommand> instead of ssh. String will be passed\n"
- " to sh. If empty, run <host> instead of ssh <host> xl\n"
- " migrate-receive [-d -e]\n"
- "-e Do not wait in the background (on <host>) for the death\n"
- " of the domain.\n"
- "--debug Enable verification mode.\n"
- "-p Do not unpause domain after migrating it.\n"
- "-D Preserve the domain id\n"
- "--max_iters N Number of copy iterations before final stop+move"
+ "-h Print this help.\n"
+ "-C <config> Send <config> instead of config file from creation.\n"
+ "-s <sshcommand> Use <sshcommand> instead of ssh. String will be passed\n"
+ " to sh. If empty, run <host> instead of ssh <host> xl\n"
+ " migrate-receive [-d -e]\n"
+ "-e Do not wait in the background (on <host>) for the death\n"
+ " of the domain.\n"
+ "--debug Enable verification mode.\n"
+ "-p Do not unpause domain after migrating it.\n"
+ "-D Preserve the domain id\n"
+ "--max_iters N Number of copy iterations before final stop+move\n"
+ "--min_remaining N Number of remaining dirty pages before final stop+move"
},
{ "restore",
&main_restore, 0, 1,
--- a/tools/xl/xl_migrate.c
+++ b/tools/xl/xl_migrate.c
@@ -179,6 +179,7 @@ static void migrate_do_preamble(int send
static void migrate_domain(uint32_t domid, int preserve_domid,
const char *rune, int debug,
uint32_t max_iters,
+ uint32_t min_remaining,
const char *override_config_file)
{
pid_t child = -1;
@@ -191,6 +192,7 @@ static void migrate_domain(uint32_t domi
libxl_domain_suspend_suse_properties props = {
.flags = LIBXL_SUSPEND_LIVE,
.max_iters = max_iters,
+ .min_remaining = min_remaining,
};
save_domain_core_begin(domid, preserve_domid, override_config_file,
@@ -545,9 +547,11 @@ int main_migrate(int argc, char **argv)
int opt, daemonize = 1, monitor = 1, debug = 0, pause_after_migration = 0;
int preserve_domid = 0;
uint32_t max_iters = 0;
+ uint32_t min_remaining = 0;
static struct option opts[] = {
{"debug", 0, 0, 0x100},
{"max_iters", 1, 0, 0x101},
+ {"min_remaining", 1, 0, 0x102},
{"live", 0, 0, 0x200},
COMMON_LONG_OPTS
};
@@ -578,6 +582,9 @@ int main_migrate(int argc, char **argv)
case 0x101: /* --max_iters */
max_iters = atoi(optarg);
break;
+ case 0x102: /* --min_remaining */
+ min_remaining = atoi(optarg);
+ break;
case 0x200: /* --live */
/* ignored for compatibility with xm */
break;
@@ -613,7 +620,7 @@ int main_migrate(int argc, char **argv)
}
migrate_domain(domid, preserve_domid, rune, debug,
- max_iters, config_filename);
+ max_iters, min_remaining, config_filename);
return EXIT_SUCCESS;
}

View File

@ -0,0 +1,24 @@
From: Olaf Hering <olaf@aepfle.de>
Date: Mon, 4 Jan 2021 20:58:42 +0200
Subject: libxc sr number of iterations
Reduce default value of --max_iters from 5 to 1.
The workload within domU will continue to produce dirty pages.
It is unreasonable to expect any slowdown during migration.
Now there is one initial copy of all memory, one instead of five
iterations for dirty memory, and a final copy iteration prior to the move.
---
tools/libs/light/libxl_internal.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/tools/libs/light/libxl_internal.h
+++ b/tools/libs/light/libxl_internal.h
@@ -124,7 +124,7 @@
#define DOMID_XS_PATH "domid"
#define PVSHIM_BASENAME "xen-shim"
#define PVSHIM_CMDLINE "pv-shim console=xen,pv"
-#define LIBXL_XGS_POLICY_MAX_ITERATIONS 5
+#define LIBXL_XGS_POLICY_MAX_ITERATIONS 1
#define LIBXL_XGS_POLICY_TARGET_DIRTY_COUNT 50
/* Size macros. */

View File

@ -0,0 +1,90 @@
From: Olaf Hering <olaf@aepfle.de>
Date: Fri, 8 Jan 2021 18:19:49 +0100
Subject: libxc sr precopy_policy
tools: add callback to libxl for precopy_policy and precopy_stats
This duplicates simple_precopy_policy. To recap its purpose:
- do up to 5 iterations of copying dirty domU memory to target,
including the initial copying of all domU memory, excluding
the final copying while the domU is suspended
- do fewer iterations in case the domU dirtied less than 50 pages
Take the opportunity to also move xen_pfn_t into qw().
Signed-off-by: Olaf Hering <olaf@aepfle.de>
v02:
- use plain struct precopy_stats instead of inventing
a new precopy_stats_t (anthony)
---
tools/libs/light/libxl_dom_save.c | 19 +++++++++++++++++++
tools/libs/light/libxl_internal.h | 2 ++
tools/libs/light/libxl_save_msgs_gen.pl | 3 ++-
3 files changed, 23 insertions(+), 1 deletion(-)
--- a/tools/libs/light/libxl_dom_save.c
+++ b/tools/libs/light/libxl_dom_save.c
@@ -373,6 +373,24 @@ int libxl__save_emulator_xenstore_data(l
return rc;
}
+static int libxl__domain_save_precopy_policy(struct precopy_stats stats, void *user)
+{
+ libxl__save_helper_state *shs = user;
+ libxl__domain_save_state *dss = shs->caller_state;
+ STATE_AO_GC(dss->ao);
+
+ LOGD(DEBUG, shs->domid, "iteration %u dirty_count %ld total_written %lu",
+ stats.iteration, stats.dirty_count, stats.total_written);
+ if (stats.dirty_count >= 0 && stats.dirty_count < LIBXL_XGS_POLICY_TARGET_DIRTY_COUNT)
+ goto stop_copy;
+ if (stats.iteration >= LIBXL_XGS_POLICY_MAX_ITERATIONS)
+ goto stop_copy;
+ return XGS_POLICY_CONTINUE_PRECOPY;
+
+stop_copy:
+ return XGS_POLICY_STOP_AND_COPY;
+}
+
/*----- main code for saving, in order of execution -----*/
void libxl__domain_save(libxl__egc *egc, libxl__domain_save_state *dss)
@@ -430,6 +448,7 @@ void libxl__domain_save(libxl__egc *egc,
callbacks->suspend = libxl__domain_suspend_callback;
callbacks->switch_qemu_logdirty = libxl__domain_suspend_common_switch_qemu_logdirty;
+ callbacks->precopy_policy = libxl__domain_save_precopy_policy;
dss->sws.ao = dss->ao;
dss->sws.dss = dss;
--- a/tools/libs/light/libxl_internal.h
+++ b/tools/libs/light/libxl_internal.h
@@ -124,6 +124,8 @@
#define DOMID_XS_PATH "domid"
#define PVSHIM_BASENAME "xen-shim"
#define PVSHIM_CMDLINE "pv-shim console=xen,pv"
+#define LIBXL_XGS_POLICY_MAX_ITERATIONS 5
+#define LIBXL_XGS_POLICY_TARGET_DIRTY_COUNT 50
/* Size macros. */
#define MB(_mb) (_AC(_mb, ULL) << 20)
--- a/tools/libs/light/libxl_save_msgs_gen.pl
+++ b/tools/libs/light/libxl_save_msgs_gen.pl
@@ -23,6 +23,7 @@ our @msgs = (
STRING doing_what),
'unsigned long', 'done',
'unsigned long', 'total'] ],
+ [ 'scxW', "precopy_policy", ['struct precopy_stats', 'stats'] ],
[ 'srcxA', "suspend", [] ],
[ 'srcxA', "postcopy", [] ],
[ 'srcxA', "checkpoint", [] ],
@@ -142,7 +143,7 @@ static void bytes_put(unsigned char *con
END
-foreach my $simpletype (qw(int uint16_t uint32_t unsigned), 'unsigned long', 'xen_pfn_t') {
+foreach my $simpletype (qw(int uint16_t uint32_t unsigned xen_pfn_t), 'struct precopy_stats', 'unsigned long') {
my $typeid = typeid($simpletype);
$out_body{'callout'} .= <<END;
static int ${typeid}_get(const unsigned char **msg,

103
libxc-sr-readv_exact.patch Normal file
View File

@ -0,0 +1,103 @@
From: Olaf Hering <olaf@aepfle.de>
Date: Wed, 28 Oct 2020 12:07:36 +0100
Subject: libxc sr readv_exact
tools: add readv_exact to libxenctrl
Read a batch of iovec's.
Short reads are the common case, finish the trailing iov with read_exact.
Signed-off-by: Olaf Hering <olaf@aepfle.de>
v2:
- add comment to short-read handling
---
tools/libs/ctrl/xc_private.c | 57 +++++++++++++++++++++++++++++++++++-
tools/libs/ctrl/xc_private.h | 1 +
2 files changed, 57 insertions(+), 1 deletion(-)
--- a/tools/libs/ctrl/xc_private.c
+++ b/tools/libs/ctrl/xc_private.c
@@ -699,8 +699,23 @@ int write_exact(int fd, const void *data
#if defined(__MINIOS__)
/*
- * MiniOS's libc doesn't know about writev(). Implement it as multiple write()s.
+ * MiniOS's libc doesn't know about readv/writev().
+ * Implement it as multiple read/write()s.
*/
+int readv_exact(int fd, const struct iovec *iov, int iovcnt)
+{
+ int rc, i;
+
+ for ( i = 0; i < iovcnt; ++i )
+ {
+ rc = read_exact(fd, iov[i].iov_base, iov[i].iov_len);
+ if ( rc )
+ return rc;
+ }
+
+ return 0;
+}
+
int writev_exact(int fd, const struct iovec *iov, int iovcnt)
{
int rc, i;
@@ -715,6 +730,46 @@ int writev_exact(int fd, const struct io
return 0;
}
#else
+int readv_exact(int fd, const struct iovec *iov, int iovcnt)
+{
+ int rc = 0, idx = 0;
+ ssize_t len;
+
+ while ( idx < iovcnt )
+ {
+ len = readv(fd, &iov[idx], min(iovcnt - idx, IOV_MAX));
+ if ( len == -1 && errno == EINTR )
+ continue;
+ if ( len <= 0 )
+ {
+ rc = -1;
+ goto out;
+ }
+
+ /* Finish a potential short read in the last iov */
+ while ( len > 0 && idx < iovcnt )
+ {
+ if ( len >= iov[idx].iov_len )
+ {
+ len -= iov[idx].iov_len;
+ }
+ else
+ {
+ void *p = iov[idx].iov_base + len;
+ size_t l = iov[idx].iov_len - len;
+
+ rc = read_exact(fd, p, l);
+ if ( rc )
+ goto out;
+ len = 0;
+ }
+ idx++;
+ }
+ }
+out:
+ return rc;
+}
+
int writev_exact(int fd, const struct iovec *iov, int iovcnt)
{
struct iovec *local_iov = NULL;
--- a/tools/libs/ctrl/xc_private.h
+++ b/tools/libs/ctrl/xc_private.h
@@ -395,6 +395,7 @@ int xc_flush_mmu_updates(xc_interface *x
/* Return 0 on success; -1 on error setting errno. */
int read_exact(int fd, void *data, size_t size); /* EOF => -1, errno=0 */
+int readv_exact(int fd, const struct iovec *iov, int iovcnt);
int write_exact(int fd, const void *data, size_t size);
int writev_exact(int fd, const struct iovec *iov, int iovcnt);

View File

@ -0,0 +1,435 @@
From: Olaf Hering <olaf@aepfle.de>
Date: Tue, 27 Oct 2020 19:21:50 +0100
Subject: libxc sr restore handle_buffered_page_data
tools: restore: split handle_page_data
handle_page_data must be able to read directly into mapped guest memory.
This will avoid unnecessary memcpy calls for data that can be consumed verbatim.
Split the various steps of record processing:
- move processing to handle_buffered_page_data
- adjust xenforeignmemory_map to set errno in case of failure
- adjust verify mode to set errno in case of failure
This change is preparation for future changes in handle_page_data,
no change in behavior is intended.
Signed-off-by: Olaf Hering <olaf@aepfle.de>
---
tools/libs/guest/xg_sr_common.h | 4 +
tools/libs/guest/xg_sr_restore.c | 320 ++++++++++++++++++++-----------
2 files changed, 207 insertions(+), 117 deletions(-)
--- a/tools/libs/guest/xg_sr_common.h
+++ b/tools/libs/guest/xg_sr_common.h
@@ -262,6 +262,10 @@ struct xc_sr_context
int *map_errs;
xen_pfn_t *pp_pfns;
xen_pfn_t *pp_mfns;
+ void **guest_data;
+
+ void *guest_mapping;
+ uint32_t nr_mapped_pages;
int send_back_fd;
unsigned long p2m_size;
--- a/tools/libs/guest/xg_sr_restore.c
+++ b/tools/libs/guest/xg_sr_restore.c
@@ -183,121 +183,18 @@ int populate_pfns(struct xc_sr_context *
return rc;
}
-/*
- * Given a list of pfns, their types, and a block of page data from the
- * stream, populate and record their types, map the relevant subset and copy
- * the data into the guest.
- */
-static int process_page_data(struct xc_sr_context *ctx, unsigned int count,
- xen_pfn_t *pfns, uint32_t *types, void *page_data)
+static int handle_static_data_end_v2(struct xc_sr_context *ctx)
{
- xc_interface *xch = ctx->xch;
- int rc;
- void *mapping = NULL, *guest_page = NULL;
- unsigned int i, /* i indexes the pfns from the record. */
- j, /* j indexes the subset of pfns we decide to map. */
- nr_pages = 0;
-
- rc = populate_pfns(ctx, count, pfns, types);
- if ( rc )
- {
- ERROR("Failed to populate pfns for batch of %u pages", count);
- goto err;
- }
-
- for ( i = 0; i < count; ++i )
- {
- ctx->restore.ops.set_page_type(ctx, pfns[i], types[i]);
-
- if ( page_type_has_stream_data(types[i]) )
- ctx->restore.mfns[nr_pages++] = ctx->restore.ops.pfn_to_gfn(ctx, pfns[i]);
- }
-
- /* Nothing to do? */
- if ( nr_pages == 0 )
- goto done;
-
- mapping = guest_page = xenforeignmemory_map(
- xch->fmem, ctx->domid, PROT_READ | PROT_WRITE,
- nr_pages, ctx->restore.mfns, ctx->restore.map_errs);
- if ( !mapping )
- {
- rc = -1;
- PERROR("Unable to map %u mfns for %u pages of data",
- nr_pages, count);
- goto err;
- }
-
- for ( i = 0, j = 0; i < count; ++i )
- {
- if ( !page_type_has_stream_data(types[i]) )
- continue;
-
- if ( ctx->restore.map_errs[j] )
- {
- rc = -1;
- ERROR("Mapping pfn %#"PRIpfn" (mfn %#"PRIpfn", type %#"PRIx32") failed with %d",
- pfns[i], ctx->restore.mfns[j], types[i], ctx->restore.map_errs[j]);
- goto err;
- }
-
- /* Undo page normalisation done by the saver. */
- rc = ctx->restore.ops.localise_page(ctx, types[i], page_data);
- if ( rc )
- {
- ERROR("Failed to localise pfn %#"PRIpfn" (type %#"PRIx32")",
- pfns[i], types[i] >> XEN_DOMCTL_PFINFO_LTAB_SHIFT);
- goto err;
- }
-
- if ( ctx->restore.verify )
- {
- /* Verify mode - compare incoming data to what we already have. */
- if ( memcmp(guest_page, page_data, PAGE_SIZE) )
- ERROR("verify pfn %#"PRIpfn" failed (type %#"PRIx32")",
- pfns[i], types[i] >> XEN_DOMCTL_PFINFO_LTAB_SHIFT);
- }
- else
- {
- /* Regular mode - copy incoming data into place. */
- memcpy(guest_page, page_data, PAGE_SIZE);
- }
-
- ++j;
- guest_page += PAGE_SIZE;
- page_data += PAGE_SIZE;
- }
-
- done:
- rc = 0;
-
- err:
- if ( mapping )
- xenforeignmemory_unmap(xch->fmem, mapping, nr_pages);
-
- return rc;
-}
+ int rc = 0;
-/*
- * Validate a PAGE_DATA record from the stream, and pass the results to
- * process_page_data() to actually perform the legwork.
- */
-static int handle_page_data(struct xc_sr_context *ctx, struct xc_sr_record *rec)
-{
+#if defined(__i386__) || defined(__x86_64__)
xc_interface *xch = ctx->xch;
- struct xc_sr_rec_page_data_header *pages = rec->data;
- unsigned int i, pages_of_data = 0;
- int rc = -1;
-
- xen_pfn_t pfn;
- uint32_t type;
-
/*
* v2 compatibility only exists for x86 streams. This is a bit of a
* bodge, but it is less bad than duplicating handle_page_data() between
* different architectures.
*/
-#if defined(__i386__) || defined(__x86_64__)
+
/* v2 compat. Infer the position of STATIC_DATA_END. */
if ( ctx->restore.format_version < 3 && !ctx->restore.seen_static_data_end )
{
@@ -315,12 +212,26 @@ static int handle_page_data(struct xc_sr
ERROR("No STATIC_DATA_END seen");
goto err;
}
+
+ rc = 0;
+err:
#endif
- if ( rec->length < sizeof(*pages) )
+ return rc;
+}
+
+static bool verify_rec_page_hdr(struct xc_sr_context *ctx, uint32_t rec_length,
+ struct xc_sr_rec_page_data_header *pages)
+{
+ xc_interface *xch = ctx->xch;
+ bool ret = false;
+
+ errno = EINVAL;
+
+ if ( rec_length < sizeof(*pages) )
{
ERROR("PAGE_DATA record truncated: length %u, min %zu",
- rec->length, sizeof(*pages));
+ rec_length, sizeof(*pages));
goto err;
}
@@ -330,13 +241,28 @@ static int handle_page_data(struct xc_sr
goto err;
}
- if ( rec->length < sizeof(*pages) + (pages->count * sizeof(uint64_t)) )
+ if ( rec_length < sizeof(*pages) + (pages->count * sizeof(uint64_t)) )
{
ERROR("PAGE_DATA record (length %u) too short to contain %u"
- " pfns worth of information", rec->length, pages->count);
+ " pfns worth of information", rec_length, pages->count);
goto err;
}
+ ret = true;
+
+err:
+ return ret;
+}
+
+static bool verify_rec_page_pfns(struct xc_sr_context *ctx, uint32_t rec_length,
+ struct xc_sr_rec_page_data_header *pages)
+{
+ xc_interface *xch = ctx->xch;
+ uint32_t i, pages_of_data = 0;
+ xen_pfn_t pfn;
+ uint32_t type;
+ bool ret = false;
+
for ( i = 0; i < pages->count; ++i )
{
pfn = pages->pfn[i] & PAGE_DATA_PFN_MASK;
@@ -363,19 +289,177 @@ static int handle_page_data(struct xc_sr
ctx->restore.types[i] = type;
}
- if ( rec->length != (sizeof(*pages) +
+ if ( rec_length != (sizeof(*pages) +
(sizeof(uint64_t) * pages->count) +
(PAGE_SIZE * pages_of_data)) )
{
ERROR("PAGE_DATA record wrong size: length %u, expected "
- "%zu + %zu + %lu", rec->length, sizeof(*pages),
+ "%zu + %zu + %lu", rec_length, sizeof(*pages),
(sizeof(uint64_t) * pages->count), (PAGE_SIZE * pages_of_data));
goto err;
}
- rc = process_page_data(ctx, pages->count, ctx->restore.pfns,
- ctx->restore.types, &pages->pfn[pages->count]);
+ ret = true;
+
+err:
+ return ret;
+}
+
+/*
+ * Populate pfns, if required
+ * Fill guest_data with either mapped address or NULL
+ * The caller must unmap guest_mapping
+ */
+static int map_guest_pages(struct xc_sr_context *ctx,
+ struct xc_sr_rec_page_data_header *pages)
+{
+ xc_interface *xch = ctx->xch;
+ uint32_t i, p;
+ int rc;
+
+ rc = populate_pfns(ctx, pages->count, ctx->restore.pfns, ctx->restore.types);
+ if ( rc )
+ {
+ ERROR("Failed to populate pfns for batch of %u pages", pages->count);
+ goto err;
+ }
+
+ ctx->restore.nr_mapped_pages = 0;
+
+ for ( i = 0; i < pages->count; i++ )
+ {
+ ctx->restore.ops.set_page_type(ctx, ctx->restore.pfns[i], ctx->restore.types[i]);
+
+ if ( page_type_has_stream_data(ctx->restore.types[i]) == false )
+ {
+ ctx->restore.guest_data[i] = NULL;
+ continue;
+ }
+
+ ctx->restore.mfns[ctx->restore.nr_mapped_pages++] = ctx->restore.ops.pfn_to_gfn(ctx, ctx->restore.pfns[i]);
+ }
+
+ /* Nothing to do? */
+ if ( ctx->restore.nr_mapped_pages == 0 )
+ goto done;
+
+ ctx->restore.guest_mapping = xenforeignmemory_map(xch->fmem, ctx->domid,
+ PROT_READ | PROT_WRITE, ctx->restore.nr_mapped_pages,
+ ctx->restore.mfns, ctx->restore.map_errs);
+ if ( !ctx->restore.guest_mapping )
+ {
+ rc = -1;
+ PERROR("Unable to map %u mfns for %u pages of data",
+ ctx->restore.nr_mapped_pages, pages->count);
+ goto err;
+ }
+
+ /* Verify mapping, and assign address to pfn data */
+ for ( i = 0, p = 0; i < pages->count; i++ )
+ {
+ if ( !page_type_has_stream_data(ctx->restore.types[i]) )
+ continue;
+
+ if ( ctx->restore.map_errs[p] == 0 )
+ {
+ ctx->restore.guest_data[i] = ctx->restore.guest_mapping + (p * PAGE_SIZE);
+ p++;
+ continue;
+ }
+
+ errno = ctx->restore.map_errs[p];
+ rc = -1;
+ PERROR("Mapping pfn %#"PRIpfn" (mfn %#"PRIpfn", type %#"PRIx32") failed",
+ ctx->restore.pfns[i], ctx->restore.mfns[p], ctx->restore.types[i]);
+ goto err;
+ }
+
+done:
+ rc = 0;
+
+err:
+ return rc;
+}
+
+/*
+ * Handle PAGE_DATA record from an existing buffer
+ * Given a list of pfns, their types, and a block of page data from the
+ * stream, populate and record their types, map the relevant subset and copy
+ * the data into the guest.
+ */
+static int handle_buffered_page_data(struct xc_sr_context *ctx,
+ struct xc_sr_record *rec)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_rec_page_data_header *pages = rec->data;
+ void *p;
+ uint32_t i;
+ int rc = -1, idx;
+
+ rc = handle_static_data_end_v2(ctx);
+ if ( rc )
+ goto err;
+
+ /* First read and verify the header */
+ if ( !verify_rec_page_hdr(ctx, rec->length, pages) )
+ {
+ rc = -1;
+ goto err;
+ }
+
+ /* Then read and verify the pfn numbers */
+ if ( !verify_rec_page_pfns(ctx, rec->length, pages) )
+ {
+ rc = -1;
+ goto err;
+ }
+
+ /* Map the target pfn */
+ rc = map_guest_pages(ctx, pages);
+ if ( rc )
+ goto err;
+
+ for ( i = 0, idx = 0; i < pages->count; i++ )
+ {
+ if ( !ctx->restore.guest_data[i] )
+ continue;
+
+ p = (void *)&pages->pfn[pages->count] + (idx * PAGE_SIZE); /* page data follows the pfn array; offset is in bytes, not uint64_t units */
+ rc = ctx->restore.ops.localise_page(ctx, ctx->restore.types[i], p);
+ if ( rc )
+ {
+ ERROR("Failed to localise pfn %#"PRIpfn" (type %#"PRIx32")",
+ ctx->restore.pfns[i], ctx->restore.types[i] >> XEN_DOMCTL_PFINFO_LTAB_SHIFT);
+ goto err;
+
+ }
+
+ if ( ctx->restore.verify )
+ {
+ if ( memcmp(ctx->restore.guest_data[i], p, PAGE_SIZE) )
+ {
+ errno = EIO;
+ ERROR("verify pfn %#"PRIpfn" failed (type %#"PRIx32")",
+ ctx->restore.pfns[i], ctx->restore.types[i] >> XEN_DOMCTL_PFINFO_LTAB_SHIFT);
+ goto err; /* NOTE(review): rc is 0 here, so a mismatch abandons the batch yet returns success - confirm intent */
+ }
+ }
+ else
+ {
+ memcpy(ctx->restore.guest_data[i], p, PAGE_SIZE);
+ }
+
+ idx++;
+ }
+
+ rc = 0;
+
err:
+ if ( ctx->restore.guest_mapping )
+ {
+ xenforeignmemory_unmap(xch->fmem, ctx->restore.guest_mapping, ctx->restore.nr_mapped_pages);
+ ctx->restore.guest_mapping = NULL;
+ }
return rc;
}
@@ -623,7 +707,7 @@ static int process_buffered_record(struc
break;
case REC_TYPE_PAGE_DATA:
- rc = handle_page_data(ctx, rec);
+ rc = handle_buffered_page_data(ctx, rec);
break;
case REC_TYPE_VERIFY:
@@ -703,9 +787,10 @@ static int setup(struct xc_sr_context *c
ctx->restore.map_errs = malloc(MAX_BATCH_SIZE * sizeof(*ctx->restore.map_errs));
ctx->restore.pp_pfns = malloc(MAX_BATCH_SIZE * sizeof(*ctx->restore.pp_pfns));
ctx->restore.pp_mfns = malloc(MAX_BATCH_SIZE * sizeof(*ctx->restore.pp_mfns));
+ ctx->restore.guest_data = malloc(MAX_BATCH_SIZE * sizeof(*ctx->restore.guest_data));
if ( !ctx->restore.pfns || !ctx->restore.types || !ctx->restore.mfns ||
!ctx->restore.map_errs || !ctx->restore.pp_pfns ||
- !ctx->restore.pp_mfns )
+ !ctx->restore.pp_mfns || !ctx->restore.guest_data )
{
ERROR("Unable to allocate memory");
rc = -1;
@@ -742,6 +827,7 @@ static void cleanup(struct xc_sr_context
free(ctx->restore.buffered_records);
free(ctx->restore.populated_pfns);
+ free(ctx->restore.guest_data);
free(ctx->restore.pp_mfns);
free(ctx->restore.pp_pfns);
free(ctx->restore.map_errs);

View File

@ -0,0 +1,230 @@
From: Olaf Hering <olaf@aepfle.de>
Date: Thu, 29 Oct 2020 16:13:10 +0100
Subject: libxc sr restore handle_incoming_page_data
tools: restore: write data directly into guest
Read incoming migration stream directly into the guest memory.
This avoids the memory allocation and copying, and the resulting
performance penalty.
Signed-off-by: Olaf Hering <olaf@aepfle.de>
---
tools/libs/guest/xg_sr_common.h | 3 +
tools/libs/guest/xg_sr_restore.c | 155 ++++++++++++++++++++++++++++++-
2 files changed, 153 insertions(+), 5 deletions(-)
--- a/tools/libs/guest/xg_sr_common.h
+++ b/tools/libs/guest/xg_sr_common.h
@@ -263,6 +263,8 @@ struct xc_sr_context
xen_pfn_t *pp_pfns;
xen_pfn_t *pp_mfns;
void **guest_data;
+ struct iovec *iov;
+ struct xc_sr_rec_page_data_header *pages;
void *guest_mapping;
uint32_t nr_mapped_pages;
@@ -311,6 +313,7 @@ struct xc_sr_context
/* Sender has invoked verify mode on the stream. */
bool verify;
+ void *verify_buf;
} restore;
};
--- a/tools/libs/guest/xg_sr_restore.c
+++ b/tools/libs/guest/xg_sr_restore.c
@@ -382,6 +382,129 @@ err:
}
/*
+ * Handle PAGE_DATA record from the stream.
+ * Given a list of pfns, their types, and a block of page data from the
+ * stream, populate and record their types, map the relevant subset and copy
+ * the data into the guest.
+ */
+static int handle_incoming_page_data(struct xc_sr_context *ctx,
+ struct xc_sr_rhdr *rhdr)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_rec_page_data_header *pages = ctx->restore.pages;
+ uint64_t *pfn_nums = &pages->pfn[0];
+ uint32_t i;
+ int rc, iov_idx;
+
+ rc = handle_static_data_end_v2(ctx);
+ if ( rc )
+ goto err;
+
+ /* First read and verify the header */
+ rc = read_exact(ctx->fd, pages, sizeof(*pages));
+ if ( rc )
+ {
+ PERROR("Could not read rec_pfn header");
+ goto err;
+ }
+
+ if ( !verify_rec_page_hdr(ctx, rhdr->length, pages) )
+ {
+ rc = -1;
+ goto err;
+ }
+
+ /* Then read and verify the incoming pfn numbers */
+ rc = read_exact(ctx->fd, pfn_nums, sizeof(*pfn_nums) * pages->count);
+ if ( rc )
+ {
+ PERROR("Could not read rec_pfn data");
+ goto err;
+ }
+
+ if ( !verify_rec_page_pfns(ctx, rhdr->length, pages) )
+ {
+ rc = -1;
+ goto err;
+ }
+
+ /* Finally read and verify the incoming pfn data */
+ rc = map_guest_pages(ctx, pages);
+ if ( rc )
+ goto err;
+
+ /* Prepare read buffers, either guest or throw-away memory */
+ for ( i = 0, iov_idx = 0; i < pages->count; i++ )
+ {
+ struct iovec *iov;
+
+ if ( !ctx->restore.guest_data[i] )
+ continue;
+
+ iov = &ctx->restore.iov[iov_idx];
+ iov->iov_len = PAGE_SIZE;
+ if ( ctx->restore.verify )
+ iov->iov_base = ctx->restore.verify_buf + (i * PAGE_SIZE);
+ else
+ iov->iov_base = ctx->restore.guest_data[i];
+ iov_idx++;
+ }
+
+ if ( !iov_idx )
+ goto done;
+
+ rc = readv_exact(ctx->fd, ctx->restore.iov, iov_idx);
+ if ( rc )
+ {
+ PERROR("read of %d pages failed", iov_idx);
+ goto err;
+ }
+
+ /* Post-processing of pfn data */
+ for ( i = 0, iov_idx = 0; i < pages->count; i++ )
+ {
+ void *addr;
+
+ if ( !ctx->restore.guest_data[i] )
+ continue;
+
+ addr = ctx->restore.iov[iov_idx].iov_base;
+ rc = ctx->restore.ops.localise_page(ctx, ctx->restore.types[i], addr);
+ if ( rc )
+ {
+ ERROR("Failed to localise pfn %#"PRIpfn" (type %#"PRIx32")",
+ ctx->restore.pfns[i],
+ ctx->restore.types[i] >> XEN_DOMCTL_PFINFO_LTAB_SHIFT);
+ goto err;
+
+ }
+
+ if ( ctx->restore.verify )
+ {
+ if ( memcmp(ctx->restore.guest_data[i], addr, PAGE_SIZE) )
+ {
+ ERROR("verify pfn %#"PRIpfn" failed (type %#"PRIx32")",
+ ctx->restore.pfns[i],
+ ctx->restore.types[i] >> XEN_DOMCTL_PFINFO_LTAB_SHIFT);
+ }
+ }
+
+ iov_idx++;
+ }
+
+done:
+ rc = 0;
+
+err:
+ if ( ctx->restore.guest_mapping )
+ {
+ xenforeignmemory_unmap(xch->fmem, ctx->restore.guest_mapping, ctx->restore.nr_mapped_pages);
+ ctx->restore.guest_mapping = NULL;
+ }
+ return rc;
+}
+
+/*
* Handle PAGE_DATA record from an existing buffer
* Given a list of pfns, their types, and a block of page data from the
* stream, populate and record their types, map the relevant subset and copy
@@ -713,6 +836,15 @@ static int process_buffered_record(struc
case REC_TYPE_VERIFY:
DPRINTF("Verify mode enabled");
ctx->restore.verify = true;
+ if ( !ctx->restore.verify_buf )
+ {
+ ctx->restore.verify_buf = malloc(MAX_BATCH_SIZE * PAGE_SIZE);
+ if ( !ctx->restore.verify_buf )
+ {
+ PERROR("Unable to allocate verify_buf");
+ rc = -1;
+ }
+ }
break;
case REC_TYPE_CHECKPOINT:
@@ -739,11 +871,19 @@ static int process_incoming_record_heade
struct xc_sr_record rec;
int rc;
- rc = read_record_data(ctx, ctx->fd, rhdr, &rec);
- if ( rc )
- return rc;
+ switch ( rhdr->type )
+ {
+ case REC_TYPE_PAGE_DATA:
+ rc = handle_incoming_page_data(ctx, rhdr);
+ break;
+ default:
+ rc = read_record_data(ctx, ctx->fd, rhdr, &rec);
+ if ( rc == 0 )
+ rc = process_buffered_record(ctx, &rec);;
+ break;
+ }
- return process_buffered_record(ctx, &rec);
+ return rc;
}
@@ -788,9 +928,12 @@ static int setup(struct xc_sr_context *c
ctx->restore.pp_pfns = malloc(MAX_BATCH_SIZE * sizeof(*ctx->restore.pp_pfns));
ctx->restore.pp_mfns = malloc(MAX_BATCH_SIZE * sizeof(*ctx->restore.pp_mfns));
ctx->restore.guest_data = malloc(MAX_BATCH_SIZE * sizeof(*ctx->restore.guest_data));
+ ctx->restore.iov = malloc(MAX_BATCH_SIZE * sizeof(*ctx->restore.iov));
+ ctx->restore.pages = malloc(MAX_BATCH_SIZE * sizeof(*ctx->restore.pages->pfn) + sizeof(*ctx->restore.pages));
if ( !ctx->restore.pfns || !ctx->restore.types || !ctx->restore.mfns ||
!ctx->restore.map_errs || !ctx->restore.pp_pfns ||
- !ctx->restore.pp_mfns || !ctx->restore.guest_data )
+ !ctx->restore.pp_mfns || !ctx->restore.guest_data ||
+ !ctx->restore.iov || !ctx->restore.pages )
{
ERROR("Unable to allocate memory");
rc = -1;
@@ -827,6 +970,8 @@ static void cleanup(struct xc_sr_context
free(ctx->restore.buffered_records);
free(ctx->restore.populated_pfns);
+ free(ctx->restore.pages);
+ free(ctx->restore.iov);
free(ctx->restore.guest_data);
free(ctx->restore.pp_mfns);
free(ctx->restore.pp_pfns);

View File

@ -0,0 +1,701 @@
From: Olaf Hering <olaf@aepfle.de>
Date: Mon, 7 Aug 2017 12:58:02 +0000
Subject: libxc sr restore hvm legacy superpage
tools: use superpages during restore of HVM guest
bsc#1035231 - migration of HVM domU does not use superpages on destination dom0
bsc#1055695 - XEN: 11SP4 and 12SP3 HVM guests can not be restored
During creation of an HVM domU meminit_hvm() tries to map superpages.
After save/restore or migration this mapping is lost, everything is
allocated in single pages. This causes a performance degradation after
migration.
Add necessary code to preallocate a superpage for an incoming chunk of
pfns. In case a pfn was not populated on the sending side, it must be
freed on the receiving side to avoid over-allocation.
The existing code for x86_pv is moved unmodified into its own file.
Signed-off-by: Olaf Hering <olaf@aepfle.de>
---
tools/libs/guest/xg_dom_x86.c | 5 -
tools/libs/guest/xg_private.h | 5 +
tools/libs/guest/xg_sr_common.h | 28 +-
tools/libs/guest/xg_sr_restore.c | 60 +---
tools/libs/guest/xg_sr_restore_x86_hvm.c | 381 ++++++++++++++++++++++-
tools/libs/guest/xg_sr_restore_x86_pv.c | 61 +++-
6 files changed, 467 insertions(+), 73 deletions(-)
--- a/tools/libs/guest/xg_dom_x86.c
+++ b/tools/libs/guest/xg_dom_x86.c
@@ -44,11 +44,6 @@
#define SUPERPAGE_BATCH_SIZE 512
-#define SUPERPAGE_2MB_SHIFT 9
-#define SUPERPAGE_2MB_NR_PFNS (1UL << SUPERPAGE_2MB_SHIFT)
-#define SUPERPAGE_1GB_SHIFT 18
-#define SUPERPAGE_1GB_NR_PFNS (1UL << SUPERPAGE_1GB_SHIFT)
-
#define X86_CR0_PE 0x01
#define X86_CR0_ET 0x10
--- a/tools/libs/guest/xg_private.h
+++ b/tools/libs/guest/xg_private.h
@@ -180,4 +180,9 @@ struct xc_cpu_policy {
};
#endif /* x86 */
+#define SUPERPAGE_2MB_SHIFT 9
+#define SUPERPAGE_2MB_NR_PFNS (1UL << SUPERPAGE_2MB_SHIFT)
+#define SUPERPAGE_1GB_SHIFT 18
+#define SUPERPAGE_1GB_NR_PFNS (1UL << SUPERPAGE_1GB_SHIFT)
+
#endif /* XG_PRIVATE_H */
--- a/tools/libs/guest/xg_sr_common.h
+++ b/tools/libs/guest/xg_sr_common.h
@@ -208,6 +208,16 @@ struct xc_sr_restore_ops
int (*setup)(struct xc_sr_context *ctx);
/**
+ * Populate PFNs
+ *
+ * Given a set of pfns, obtain memory from Xen to fill the physmap for the
+ * unpopulated subset.
+ */
+ int (*populate_pfns)(struct xc_sr_context *ctx, unsigned count,
+ const xen_pfn_t *original_pfns, const uint32_t *types);
+
+
+ /**
* Process an individual record from the stream. The caller shall take
* care of processing common records (e.g. END, PAGE_DATA).
*
@@ -338,6 +348,8 @@ struct xc_sr_context
int send_back_fd;
unsigned long p2m_size;
+ unsigned long max_pages;
+ unsigned long tot_pages;
xc_hypercall_buffer_t dirty_bitmap_hbuf;
/* From Image Header. */
@@ -471,6 +483,14 @@ struct xc_sr_context
{
/* HVM context blob. */
struct xc_sr_blob context;
+
+ /* Bitmap of currently allocated PFNs during restore. */
+ struct sr_bitmap attempted_1g;
+ struct sr_bitmap attempted_2m;
+ struct sr_bitmap allocated_pfns;
+ xen_pfn_t prev_populated_pfn;
+ xen_pfn_t iteration_tracker_pfn;
+ unsigned long iteration;
} restore;
};
} hvm;
@@ -535,14 +555,6 @@ int read_record_header(struct xc_sr_cont
int read_record_data(struct xc_sr_context *ctx, int fd, struct xc_sr_rhdr *rhdr,
struct xc_sr_record *rec);
-/*
- * This would ideally be private in restore.c, but is needed by
- * x86_pv_localise_page() if we receive pagetables frames ahead of the
- * contents of the frames they point at.
- */
-int populate_pfns(struct xc_sr_context *ctx, unsigned int count,
- const xen_pfn_t *original_pfns, const uint32_t *types);
-
/* Handle a STATIC_DATA_END record. */
int handle_static_data_end(struct xc_sr_context *ctx);
--- a/tools/libs/guest/xg_sr_restore.c
+++ b/tools/libs/guest/xg_sr_restore.c
@@ -71,60 +71,6 @@ static int read_headers(struct xc_sr_con
return 0;
}
-/*
- * Given a set of pfns, obtain memory from Xen to fill the physmap for the
- * unpopulated subset. If types is NULL, no page type checking is performed
- * and all unpopulated pfns are populated.
- */
-int populate_pfns(struct xc_sr_context *ctx, unsigned int count,
- const xen_pfn_t *original_pfns, const uint32_t *types)
-{
- xc_interface *xch = ctx->xch;
- unsigned int i, nr_pfns = 0;
- int rc = -1;
-
- for ( i = 0; i < count; ++i )
- {
- if ( (!types || page_type_to_populate(types[i])) &&
- !pfn_is_populated(ctx, original_pfns[i]) )
- {
- rc = pfn_set_populated(ctx, original_pfns[i]);
- if ( rc )
- goto err;
- ctx->restore.pp_pfns[nr_pfns] = ctx->restore.pp_mfns[nr_pfns] = original_pfns[i];
- ++nr_pfns;
- }
- }
-
- if ( nr_pfns )
- {
- rc = xc_domain_populate_physmap_exact(
- xch, ctx->domid, nr_pfns, 0, 0, ctx->restore.pp_mfns);
- if ( rc )
- {
- PERROR("Failed to populate physmap");
- goto err;
- }
-
- for ( i = 0; i < nr_pfns; ++i )
- {
- if ( ctx->restore.pp_mfns[i] == INVALID_MFN )
- {
- ERROR("Populate physmap failed for pfn %u", i);
- rc = -1;
- goto err;
- }
-
- ctx->restore.ops.set_gfn(ctx, ctx->restore.pp_pfns[i], ctx->restore.pp_mfns[i]);
- }
- }
-
- rc = 0;
-
- err:
- return rc;
-}
-
static int handle_static_data_end_v2(struct xc_sr_context *ctx)
{
int rc = 0;
@@ -259,7 +205,8 @@ static int map_guest_pages(struct xc_sr_
uint32_t i, p;
int rc;
- rc = populate_pfns(ctx, pages->count, ctx->restore.pfns, ctx->restore.types);
+ rc = ctx->restore.ops.populate_pfns(ctx, pages->count, ctx->restore.pfns,
+ ctx->restore.types);
if ( rc )
{
ERROR("Failed to populate pfns for batch of %u pages", pages->count);
@@ -1074,6 +1021,9 @@ int xc_domain_restore(xc_interface *xch,
return -1;
}
+ /* See xc_domain_getinfo */
+ ctx.restore.max_pages = ctx.dominfo.max_pages;
+ ctx.restore.tot_pages = ctx.dominfo.tot_pages;
ctx.restore.p2m_size = nr_pfns;
ctx.restore.ops = hvm ? restore_ops_x86_hvm : restore_ops_x86_pv;
--- a/tools/libs/guest/xg_sr_restore_x86_hvm.c
+++ b/tools/libs/guest/xg_sr_restore_x86_hvm.c
@@ -130,6 +130,33 @@ static int x86_hvm_localise_page(struct
return 0;
}
+/*
+ * Grow the three superpage tracking bitmaps so that they can hold max_pfn.
+ * The 1GB and 2MB bitmaps are indexed by superpage number, hence the shifts;
+ * the allocated_pfns bitmap is indexed by pfn.
+ * Evaluation stops at the first sr_bitmap_expand() call that fails, in which
+ * case false is returned.
+ */
+static bool x86_hvm_expand_sp_bitmaps(struct xc_sr_context *ctx, unsigned long max_pfn)
+{
+    return sr_bitmap_expand(&ctx->x86.hvm.restore.attempted_1g,
+                            max_pfn >> SUPERPAGE_1GB_SHIFT) &&
+           sr_bitmap_expand(&ctx->x86.hvm.restore.attempted_2m,
+                            max_pfn >> SUPERPAGE_2MB_SHIFT) &&
+           sr_bitmap_expand(&ctx->x86.hvm.restore.allocated_pfns, max_pfn);
+}
+
+/*
+ * Mark the 1GB and the 2MB superpage slot covering the guest physical
+ * address addr as already attempted, so that the superpage allocators
+ * will not try to place a superpage there.
+ */
+static void x86_hvm_no_superpage(struct xc_sr_context *ctx, unsigned long addr)
+{
+    const unsigned long frame = addr >> XC_PAGE_SHIFT;
+    const unsigned long slot_1g = frame >> SUPERPAGE_1GB_SHIFT;
+    const unsigned long slot_2m = frame >> SUPERPAGE_2MB_SHIFT;
+
+    sr_set_bit(slot_1g, &ctx->x86.hvm.restore.attempted_1g);
+    sr_set_bit(slot_2m, &ctx->x86.hvm.restore.attempted_2m);
+}
+
/*
* restore_ops function. Confirms the stream matches the domain.
*/
@@ -164,12 +191,24 @@ static int x86_hvm_setup(struct xc_sr_co
max_pfn = max(ctx->restore.p2m_size, max_pages);
if ( !sr_bitmap_expand(&ctx->restore.populated_pfns, max_pfn) )
- {
- PERROR("Unable to allocate memory for populated_pfns bitmap");
- return -1;
- }
+ goto out;
+
+ if ( !x86_hvm_expand_sp_bitmaps(ctx, max_pfn) )
+ goto out;
+
+ /* FIXME: distinguish between PVH and HVM */
+ /* No superpage in 1st 2MB due to VGA hole */
+ x86_hvm_no_superpage(ctx, 0xA0000u);
+#define LAPIC_BASE_ADDRESS 0xfee00000u
+#define ACPI_INFO_PHYSICAL_ADDRESS 0xfc000000u
+ x86_hvm_no_superpage(ctx, LAPIC_BASE_ADDRESS);
+ x86_hvm_no_superpage(ctx, ACPI_INFO_PHYSICAL_ADDRESS);
return 0;
+
+out:
+ PERROR("Unable to allocate memory for pfn bitmaps");
+ return -1;
}
/*
@@ -250,6 +289,9 @@ static int x86_hvm_stream_complete(struc
static int x86_hvm_cleanup(struct xc_sr_context *ctx)
{
sr_bitmap_free(&ctx->restore.populated_pfns);
+ sr_bitmap_free(&ctx->x86.hvm.restore.attempted_1g);
+ sr_bitmap_free(&ctx->x86.hvm.restore.attempted_2m);
+ sr_bitmap_free(&ctx->x86.hvm.restore.allocated_pfns);
free(ctx->x86.hvm.restore.context.ptr);
free(ctx->x86.restore.cpuid.ptr);
@@ -258,6 +300,336 @@ static int x86_hvm_cleanup(struct xc_sr_
return 0;
}
+/*
+ * Set a range of pfns as allocated.
+ *
+ * Hands base_pfn to sr_set_long_bit() on the allocated_pfns bitmap. The
+ * caller in x86_hvm_allocate_pfn() accounts BITS_PER_LONG pages per call, so
+ * presumably one long worth of bits starting at base_pfn gets set -- confirm
+ * against the sr_set_long_bit() definition.
+ */
+static void pfn_set_long_allocated(struct xc_sr_context *ctx, xen_pfn_t base_pfn)
+{
+ sr_set_long_bit(base_pfn, &ctx->x86.hvm.restore.allocated_pfns);
+}
+
+/* Set the bit for a single pfn in the allocated_pfns bitmap. */
+static void pfn_set_allocated(struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+ sr_set_bit(pfn, &ctx->x86.hvm.restore.allocated_pfns);
+}
+
+/* State passed between x86_hvm_allocate_pfn() and the x86_hvm_alloc_*() helpers. */
+struct x86_hvm_sp {
+ xen_pfn_t pfn; /* in: the pfn that needs backing memory */
+ xen_pfn_t base_pfn; /* out: pfn rounded down to the attempted allocation order */
+ unsigned long index; /* in: bit index into the attempted_1g/attempted_2m bitmap */
+ unsigned long count; /* out: number of pfns covered by the attempted extent */
+};
+
+/*
+ * First choice: back the pfn with a 1GB superpage.
+ *
+ * Each 1GB slot (sp->index) is attempted at most once. Nothing is requested
+ * from Xen if the extent would push tot_pages beyond max_pages. On success
+ * sp->count and sp->base_pfn describe the extent, and every 2MB slot inside
+ * it is marked as attempted so that no overlapping 2MB allocation follows.
+ *
+ * Returns true only if Xen populated the extent.
+ */
+static bool x86_hvm_alloc_1g(struct xc_sr_context *ctx, struct x86_hvm_sp *sp)
+{
+    xc_interface *xch = ctx->xch;
+    const unsigned int order = SUPERPAGE_1GB_SHIFT;
+    struct sr_bitmap *tried_2m = &ctx->x86.hvm.restore.attempted_2m;
+    xen_pfn_t extent, first_2m;
+    unsigned long n;
+    int nr_done;
+
+    /* Only one attempt to avoid overlapping allocation */
+    if ( sr_test_and_set_bit(sp->index, &ctx->x86.hvm.restore.attempted_1g) )
+        return false;
+
+    /* Allocate only if there is room for another superpage */
+    sp->count = SUPERPAGE_1GB_NR_PFNS;
+    if ( ctx->restore.tot_pages + sp->count > ctx->restore.max_pages )
+        return false;
+
+    sp->base_pfn = (sp->pfn >> order) << order;
+    extent = sp->base_pfn;
+    nr_done = xc_domain_populate_physmap(xch, ctx->domid, 1, order, 0, &extent);
+    if ( nr_done <= 0 )
+    {
+        if ( nr_done < 0 )
+        {
+            PERROR("populate_physmap failed.");
+        }
+        return false;
+    }
+
+    DPRINTF("1G %" PRI_xen_pfn "\n", sp->base_pfn);
+
+    /* Mark all 2MB pages as done to avoid overlapping allocation */
+    first_2m = sp->base_pfn >> SUPERPAGE_2MB_SHIFT;
+    for ( n = 0; n < (SUPERPAGE_1GB_NR_PFNS/SUPERPAGE_2MB_NR_PFNS); n++ )
+        sr_set_bit(first_2m + n, tried_2m);
+
+    return true;
+}
+
+/*
+ * Second choice, used after x86_hvm_alloc_1g() failed: back the pfn with a
+ * 2MB superpage. Each 2MB slot (sp->index) is attempted at most once and
+ * nothing is requested from Xen if tot_pages would exceed max_pages.
+ *
+ * Returns true only if Xen populated the extent.
+ */
+static bool x86_hvm_alloc_2m(struct xc_sr_context *ctx, struct x86_hvm_sp *sp)
+{
+    xc_interface *xch = ctx->xch;
+    const unsigned int order = SUPERPAGE_2MB_SHIFT;
+    xen_pfn_t extent;
+    int nr_done;
+
+    /* Only one attempt to avoid overlapping allocation */
+    if ( sr_test_and_set_bit(sp->index, &ctx->x86.hvm.restore.attempted_2m) )
+        return false;
+
+    /* Allocate only if there is room for another superpage */
+    sp->count = SUPERPAGE_2MB_NR_PFNS;
+    if ( ctx->restore.tot_pages + sp->count > ctx->restore.max_pages )
+        return false;
+
+    sp->base_pfn = (sp->pfn >> order) << order;
+    extent = sp->base_pfn;
+    nr_done = xc_domain_populate_physmap(xch, ctx->domid, 1, order, 0, &extent);
+    if ( nr_done <= 0 )
+    {
+        if ( nr_done < 0 )
+        {
+            PERROR("populate_physmap failed.");
+        }
+        return false;
+    }
+
+    DPRINTF("2M %" PRI_xen_pfn "\n", sp->base_pfn);
+    return true;
+}
+
+/*
+ * Last resort, used after x86_hvm_alloc_2m() failed: back the pfn with a
+ * single page. errno is set to E2BIG if the page would exceed max_pages and
+ * to ENOMEM if Xen populated nothing.
+ *
+ * Returns true only if Xen populated the page.
+ */
+static bool x86_hvm_alloc_4k(struct xc_sr_context *ctx, struct x86_hvm_sp *sp)
+{
+    xc_interface *xch = ctx->xch;
+    const unsigned int order = 0;
+    xen_pfn_t extent;
+    int nr_done;
+
+    /* Allocate only if there is room for another page */
+    sp->count = 1UL;
+    if ( ctx->restore.tot_pages + sp->count > ctx->restore.max_pages )
+    {
+        errno = E2BIG;
+        return false;
+    }
+
+    sp->base_pfn = (sp->pfn >> order) << order;
+    extent = sp->base_pfn;
+    nr_done = xc_domain_populate_physmap(xch, ctx->domid, 1, order, 0, &extent);
+    if ( nr_done < 0 )
+    {
+        PERROR("populate_physmap failed.");
+        return false;
+    }
+    if ( nr_done == 0 )
+    {
+        errno = ENOMEM;
+        return false;
+    }
+
+    DPRINTF("4K %" PRI_xen_pfn "\n", sp->base_pfn);
+    return true;
+}
+/*
+ * Back a pfn with memory, preferring the largest extent available:
+ * a 1GB superpage first, then a 2MB superpage, then a single page.
+ * Every pfn of the extent that was obtained is recorded in allocated_pfns
+ * and added to tot_pages.
+ *
+ * Returns 0 if the pfn is (or already was) allocated, -1 otherwise.
+ */
+static int x86_hvm_allocate_pfn(struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+    struct x86_hvm_sp sp = { .pfn = pfn };
+    bool allocated;
+
+    /* Already covered by an earlier allocation. */
+    if ( sr_test_bit(pfn, &ctx->x86.hvm.restore.allocated_pfns) )
+        return 0;
+
+    sp.index = pfn >> SUPERPAGE_1GB_SHIFT;
+    allocated = x86_hvm_alloc_1g(ctx, &sp);
+
+    if ( !allocated )
+    {
+        sp.index = pfn >> SUPERPAGE_2MB_SHIFT;
+        allocated = x86_hvm_alloc_2m(ctx, &sp);
+    }
+
+    if ( !allocated )
+    {
+        sp.index = 0;
+        allocated = x86_hvm_alloc_4k(ctx, &sp);
+    }
+
+    if ( !allocated )
+        return -1;
+
+    /*
+     * Walk the extent from its end towards base_pfn. A successful helper
+     * leaves sp.count at 1 or at a superpage size, so it is never zero here.
+     */
+    while ( sp.count )
+    {
+        if ( sp.count >= BITS_PER_LONG && (sp.count % BITS_PER_LONG) == 0 )
+        {
+            /* Account a whole long worth of pfns with a single call. */
+            sp.count -= BITS_PER_LONG;
+            ctx->restore.tot_pages += BITS_PER_LONG;
+            pfn_set_long_allocated(ctx, sp.base_pfn + sp.count);
+        }
+        else
+        {
+            sp.count--;
+            ctx->restore.tot_pages++;
+            pfn_set_allocated(ctx, sp.base_pfn + sp.count);
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Deallocate memory.
+ * There was likely an optimistic superpage allocation.
+ * This means more pages may have been allocated past gap_end.
+ * This range is not freed now. Incoming higher pfns will release it.
+ *
+ * Every 1GB and 2MB slot touched by [gap_start, gap_end] is first marked as
+ * attempted. Then each pfn in that inclusive range whose allocated_pfns bit
+ * is set is handed back via xc_domain_decrease_reservation_exact() and
+ * tot_pages is decremented.
+ *
+ * Returns 0 on success, -1 if releasing a pfn failed.
+ */
+static int x86_hvm_punch_hole(struct xc_sr_context *ctx,
+ xen_pfn_t gap_start, xen_pfn_t gap_end)
+{
+ xc_interface *xch = ctx->xch;
+ xen_pfn_t _pfn, pfn;
+ uint32_t domid, freed = 0;
+ int rc;
+
+ /* No 1GB allocation attempt for any slot overlapping the gap. */
+ pfn = gap_start >> SUPERPAGE_1GB_SHIFT;
+ do
+ {
+ sr_set_bit(pfn, &ctx->x86.hvm.restore.attempted_1g);
+ } while (++pfn <= gap_end >> SUPERPAGE_1GB_SHIFT);
+
+ /* Likewise for every 2MB slot overlapping the gap. */
+ pfn = gap_start >> SUPERPAGE_2MB_SHIFT;
+ do
+ {
+ sr_set_bit(pfn, &ctx->x86.hvm.restore.attempted_2m);
+ } while (++pfn <= gap_end >> SUPERPAGE_2MB_SHIFT);
+
+ pfn = gap_start;
+
+ /* Release, one page at a time, whatever is still marked as allocated. */
+ while ( pfn <= gap_end )
+ {
+ if ( sr_test_and_clear_bit(pfn, &ctx->x86.hvm.restore.allocated_pfns) )
+ {
+ domid = ctx->domid;
+ /* Pass a copy, so that pfn itself is not handed to the call. */
+ _pfn = pfn;
+ rc = xc_domain_decrease_reservation_exact(xch, domid, 1, 0, &_pfn);
+ if ( rc )
+ {
+ PERROR("Failed to release pfn %" PRI_xen_pfn, pfn);
+ return -1;
+ }
+ ctx->restore.tot_pages--;
+ freed++;
+ }
+ pfn++;
+ }
+ if ( freed )
+ DPRINTF("freed %u between %" PRI_xen_pfn " %" PRI_xen_pfn "\n",
+ freed, gap_start, gap_end);
+ return 0;
+}
+
+/*
+ * Forget a previously populated pfn: clear its bit in populated_pfns and
+ * punch a single page hole for it. Returns the x86_hvm_punch_hole() result.
+ */
+static int x86_hvm_unpopulate_page(struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+ sr_clear_bit(pfn, &ctx->restore.populated_pfns);
+ return x86_hvm_punch_hole(ctx, pfn, pfn);
+}
+
+/*
+ * Populate a single pfn: release any gap left behind by an optimistic
+ * superpage allocation, make sure the pfn is backed by memory, and record it
+ * as populated.
+ *
+ * Returns 0 on success, or the non-zero result of the step that failed.
+ */
+static int x86_hvm_populate_page(struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+    xen_pfn_t gap_start, gap_end;
+    bool has_gap, first_iteration;
+    int rc;
+
+    /*
+     * Check for a gap between the previous populated pfn and this pfn.
+     * In case a gap exists, it is required to punch a hole to release memory,
+     * starting after the previous pfn and before this pfn.
+     *
+     * But: this can be done only during the first iteration, which is the
+     * only place where superpage allocations are attempted. All following
+     * iterations lack the info to properly maintain prev_populated_pfn.
+     */
+    has_gap = ctx->x86.hvm.restore.prev_populated_pfn + 1 < pfn;
+    first_iteration = ctx->x86.hvm.restore.iteration == 0;
+    if ( has_gap && first_iteration )
+    {
+        gap_start = ctx->x86.hvm.restore.prev_populated_pfn + 1;
+        gap_end = pfn - 1;
+
+        rc = x86_hvm_punch_hole(ctx, gap_start, gap_end);
+        if ( rc )
+            goto err;
+    }
+
+    rc = x86_hvm_allocate_pfn(ctx, pfn);
+    if ( rc )
+        goto err;
+
+    /*
+     * Propagate a failure to record the pfn, as x86_pv_populate_pfns() does,
+     * instead of silently treating the pfn as populated.
+     */
+    rc = pfn_set_populated(ctx, pfn);
+    if ( rc )
+        goto err;
+
+    ctx->x86.hvm.restore.prev_populated_pfn = pfn;
+
+err:
+    return rc;
+}
+
+/*
+ * restore_ops function. Populate or release guest memory for a batch of
+ * pfns, trying to allocate superpages.
+ * This works without memory map because the pfns arrive in incremental order.
+ * All pfn numbers and their type are submitted.
+ * Only pfns with data will have also pfn content transmitted.
+ *
+ * Returns 0 on success, or the first non-zero result of a failing step.
+ */
+static int x86_hvm_populate_pfns(struct xc_sr_context *ctx, unsigned count,
+                                 const xen_pfn_t *original_pfns,
+                                 const uint32_t *types)
+{
+    xc_interface *xch = ctx->xch;
+    /* First and last entry are taken as the bounds of the batch. */
+    const xen_pfn_t min_pfn = count ? original_pfns[0] : 0;
+    const xen_pfn_t max_pfn = count ? original_pfns[count - 1] : 0;
+    unsigned int idx;
+
+    DPRINTF("batch of %u pfns between %" PRI_xen_pfn " %" PRI_xen_pfn "\n",
+            count, min_pfn, max_pfn);
+
+    if ( !x86_hvm_expand_sp_bitmaps(ctx, max_pfn) )
+    {
+        ERROR("Unable to allocate memory for pfn bitmaps");
+        return -1;
+    }
+
+    /*
+     * There is no indicator for a new iteration.
+     * Simulate it by checking if a lower pfn is coming in.
+     * In the end it matters only to know if this iteration is the first one.
+     */
+    if ( min_pfn < ctx->x86.hvm.restore.iteration_tracker_pfn )
+        ctx->x86.hvm.restore.iteration++;
+    ctx->x86.hvm.restore.iteration_tracker_pfn = min_pfn;
+
+    for ( idx = 0; idx < count; ++idx )
+    {
+        const xen_pfn_t pfn = original_pfns[idx];
+        const bool to_populate = page_type_to_populate(types[idx]);
+        const bool populated = pfn_is_populated(ctx, pfn);
+        int rc;
+
+        /*
+         * page has data, pfn populated: nothing to do
+         * page has no data, pfn not populated: nothing to do
+         */
+        if ( to_populate == populated )
+            continue;
+
+        /*
+         * page has data, pfn not populated: likely never seen before
+         * page has no data, pfn populated: likely ballooned out during migration
+         */
+        if ( to_populate )
+            rc = x86_hvm_populate_page(ctx, pfn);
+        else
+            rc = x86_hvm_unpopulate_page(ctx, pfn);
+
+        if ( rc )
+            return rc;
+    }
+
+    return 0;
+}
+
+
struct xc_sr_restore_ops restore_ops_x86_hvm =
{
.pfn_is_valid = x86_hvm_pfn_is_valid,
@@ -266,6 +638,7 @@ struct xc_sr_restore_ops restore_ops_x86
.set_page_type = x86_hvm_set_page_type,
.localise_page = x86_hvm_localise_page,
.setup = x86_hvm_setup,
+ .populate_pfns = x86_hvm_populate_pfns,
.process_record = x86_hvm_process_record,
.static_data_complete = x86_static_data_complete,
.stream_complete = x86_hvm_stream_complete,
--- a/tools/libs/guest/xg_sr_restore_x86_pv.c
+++ b/tools/libs/guest/xg_sr_restore_x86_pv.c
@@ -960,6 +960,64 @@ static void x86_pv_set_gfn(struct xc_sr_
}
/*
+ * restore_ops function. Obtain memory from Xen to fill the physmap for those
+ * pfns of the given set which are not populated yet. Without a types array
+ * every unpopulated pfn is populated; with one, only pfns whose page type
+ * carries stream data are considered.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+static int x86_pv_populate_pfns(struct xc_sr_context *ctx, unsigned count,
+                                const xen_pfn_t *original_pfns,
+                                const uint32_t *types)
+{
+    xc_interface *xch = ctx->xch;
+    unsigned int idx, nr_pfns = 0;
+    int rc;
+
+    /* Collect the subset of pfns which still needs backing memory. */
+    for ( idx = 0; idx < count; ++idx )
+    {
+        const xen_pfn_t pfn = original_pfns[idx];
+
+        if ( types && page_type_has_stream_data(types[idx]) != true )
+            continue;
+
+        if ( pfn_is_populated(ctx, pfn) )
+            continue;
+
+        rc = pfn_set_populated(ctx, pfn);
+        if ( rc )
+            return rc;
+
+        ctx->restore.pp_mfns[nr_pfns] = pfn;
+        ctx->restore.pp_pfns[nr_pfns] = pfn;
+        ++nr_pfns;
+    }
+
+    if ( !nr_pfns )
+        return 0;
+
+    rc = xc_domain_populate_physmap_exact(
+        xch, ctx->domid, nr_pfns, 0, 0, ctx->restore.pp_mfns);
+    if ( rc )
+    {
+        PERROR("Failed to populate physmap");
+        return rc;
+    }
+
+    /* Record the frame that was obtained for each pfn. */
+    for ( idx = 0; idx < nr_pfns; ++idx )
+    {
+        if ( ctx->restore.pp_mfns[idx] == INVALID_MFN )
+        {
+            ERROR("Populate physmap failed for pfn %u", idx);
+            return -1;
+        }
+
+        ctx->restore.ops.set_gfn(ctx, ctx->restore.pp_pfns[idx],
+                                 ctx->restore.pp_mfns[idx]);
+    }
+
+    return 0;
+}
+
+/*
* restore_ops function. Convert pfns back to mfns in pagetables. Possibly
* needs to populate new frames if a PTE is found referring to a frame which
* hasn't yet been seen from PAGE_DATA records.
@@ -1003,7 +1061,7 @@ static int x86_pv_localise_page(struct x
}
}
- if ( to_populate && populate_pfns(ctx, to_populate, pfns, NULL) )
+ if ( to_populate && x86_pv_populate_pfns(ctx, to_populate, pfns, NULL) )
return -1;
for ( i = 0; i < (PAGE_SIZE / sizeof(uint64_t)); ++i )
@@ -1200,6 +1258,7 @@ struct xc_sr_restore_ops restore_ops_x86
.set_gfn = x86_pv_set_gfn,
.localise_page = x86_pv_localise_page,
.setup = x86_pv_setup,
+ .populate_pfns = x86_pv_populate_pfns,
.process_record = x86_pv_process_record,
.static_data_complete = x86_static_data_complete,
.stream_complete = x86_pv_stream_complete,

View File

@ -0,0 +1,101 @@
From: Olaf Hering <olaf@aepfle.de>
Date: Fri, 23 Oct 2020 14:44:09 +0200
Subject: libxc sr restore map_errs
tools: restore: preallocate map_errs array
Remove repeated allocation from migration loop. There will never be
more than MAX_BATCH_SIZE pages to process in an incoming batch.
Allocate the space once.
Signed-off-by: Olaf Hering <olaf@aepfle.de>
---
tools/libs/guest/xg_sr_common.h | 1 +
tools/libs/guest/xg_sr_restore.c | 22 +++++++---------------
2 files changed, 8 insertions(+), 15 deletions(-)
--- a/tools/libs/guest/xg_sr_common.h
+++ b/tools/libs/guest/xg_sr_common.h
@@ -259,6 +259,7 @@ struct xc_sr_context
xen_pfn_t *pfns;
uint32_t *types;
xen_pfn_t *mfns;
+ int *map_errs;
int send_back_fd;
unsigned long p2m_size;
--- a/tools/libs/guest/xg_sr_restore.c
+++ b/tools/libs/guest/xg_sr_restore.c
@@ -204,21 +204,12 @@ static int process_page_data(struct xc_s
xen_pfn_t *pfns, uint32_t *types, void *page_data)
{
xc_interface *xch = ctx->xch;
- int *map_errs = malloc(count * sizeof(*map_errs));
int rc;
void *mapping = NULL, *guest_page = NULL;
unsigned int i, /* i indexes the pfns from the record. */
j, /* j indexes the subset of pfns we decide to map. */
nr_pages = 0;
- if ( !map_errs )
- {
- rc = -1;
- ERROR("Failed to allocate %zu bytes to process page data",
- count * sizeof(*map_errs));
- goto err;
- }
-
rc = populate_pfns(ctx, count, pfns, types);
if ( rc )
{
@@ -240,7 +231,7 @@ static int process_page_data(struct xc_s
mapping = guest_page = xenforeignmemory_map(
xch->fmem, ctx->domid, PROT_READ | PROT_WRITE,
- nr_pages, ctx->restore.mfns, map_errs);
+ nr_pages, ctx->restore.mfns, ctx->restore.map_errs);
if ( !mapping )
{
rc = -1;
@@ -254,11 +245,11 @@ static int process_page_data(struct xc_s
if ( !page_type_has_stream_data(types[i]) )
continue;
- if ( map_errs[j] )
+ if ( ctx->restore.map_errs[j] )
{
rc = -1;
ERROR("Mapping pfn %#"PRIpfn" (mfn %#"PRIpfn", type %#"PRIx32") failed with %d",
- pfns[i], ctx->restore.mfns[j], types[i], map_errs[j]);
+ pfns[i], ctx->restore.mfns[j], types[i], ctx->restore.map_errs[j]);
goto err;
}
@@ -296,8 +287,6 @@ static int process_page_data(struct xc_s
if ( mapping )
xenforeignmemory_unmap(xch->fmem, mapping, nr_pages);
- free(map_errs);
-
return rc;
}
@@ -704,7 +693,9 @@ static int setup(struct xc_sr_context *c
ctx->restore.pfns = malloc(MAX_BATCH_SIZE * sizeof(*ctx->restore.pfns));
ctx->restore.types = malloc(MAX_BATCH_SIZE * sizeof(*ctx->restore.types));
ctx->restore.mfns = malloc(MAX_BATCH_SIZE * sizeof(*ctx->restore.mfns));
- if ( !ctx->restore.pfns || !ctx->restore.types || !ctx->restore.mfns )
+ ctx->restore.map_errs = malloc(MAX_BATCH_SIZE * sizeof(*ctx->restore.map_errs));
+ if ( !ctx->restore.pfns || !ctx->restore.types || !ctx->restore.mfns ||
+ !ctx->restore.map_errs )
{
ERROR("Unable to allocate memory");
rc = -1;
@@ -741,6 +732,7 @@ static void cleanup(struct xc_sr_context
free(ctx->restore.buffered_records);
free(ctx->restore.populated_pfns);
+ free(ctx->restore.map_errs);
free(ctx->restore.mfns);
free(ctx->restore.types);
free(ctx->restore.pfns);

103
libxc-sr-restore-mfns.patch Normal file
View File

@ -0,0 +1,103 @@
From: Olaf Hering <olaf@aepfle.de>
Date: Fri, 23 Oct 2020 14:42:19 +0200
Subject: libxc sr restore mfns
tools: restore: preallocate mfns array
Remove repeated allocation from migration loop. There will never be
more than MAX_BATCH_SIZE pages to process in an incoming batch.
Allocate the space once.
Signed-off-by: Olaf Hering <olaf@aepfle.de>
---
tools/libs/guest/xg_sr_common.h | 1 +
tools/libs/guest/xg_sr_restore.c | 16 ++++++++--------
2 files changed, 9 insertions(+), 8 deletions(-)
--- a/tools/libs/guest/xg_sr_common.h
+++ b/tools/libs/guest/xg_sr_common.h
@@ -258,6 +258,7 @@ struct xc_sr_context
struct restore_callbacks *callbacks;
xen_pfn_t *pfns;
uint32_t *types;
+ xen_pfn_t *mfns;
int send_back_fd;
unsigned long p2m_size;
--- a/tools/libs/guest/xg_sr_restore.c
+++ b/tools/libs/guest/xg_sr_restore.c
@@ -204,7 +204,6 @@ static int process_page_data(struct xc_s
xen_pfn_t *pfns, uint32_t *types, void *page_data)
{
xc_interface *xch = ctx->xch;
- xen_pfn_t *mfns = malloc(count * sizeof(*mfns));
int *map_errs = malloc(count * sizeof(*map_errs));
int rc;
void *mapping = NULL, *guest_page = NULL;
@@ -212,11 +211,11 @@ static int process_page_data(struct xc_s
j, /* j indexes the subset of pfns we decide to map. */
nr_pages = 0;
- if ( !mfns || !map_errs )
+ if ( !map_errs )
{
rc = -1;
ERROR("Failed to allocate %zu bytes to process page data",
- count * (sizeof(*mfns) + sizeof(*map_errs)));
+ count * sizeof(*map_errs));
goto err;
}
@@ -232,7 +231,7 @@ static int process_page_data(struct xc_s
ctx->restore.ops.set_page_type(ctx, pfns[i], types[i]);
if ( page_type_has_stream_data(types[i]) )
- mfns[nr_pages++] = ctx->restore.ops.pfn_to_gfn(ctx, pfns[i]);
+ ctx->restore.mfns[nr_pages++] = ctx->restore.ops.pfn_to_gfn(ctx, pfns[i]);
}
/* Nothing to do? */
@@ -241,7 +240,7 @@ static int process_page_data(struct xc_s
mapping = guest_page = xenforeignmemory_map(
xch->fmem, ctx->domid, PROT_READ | PROT_WRITE,
- nr_pages, mfns, map_errs);
+ nr_pages, ctx->restore.mfns, map_errs);
if ( !mapping )
{
rc = -1;
@@ -259,7 +258,7 @@ static int process_page_data(struct xc_s
{
rc = -1;
ERROR("Mapping pfn %#"PRIpfn" (mfn %#"PRIpfn", type %#"PRIx32") failed with %d",
- pfns[i], mfns[j], types[i], map_errs[j]);
+ pfns[i], ctx->restore.mfns[j], types[i], map_errs[j]);
goto err;
}
@@ -298,7 +297,6 @@ static int process_page_data(struct xc_s
xenforeignmemory_unmap(xch->fmem, mapping, nr_pages);
free(map_errs);
- free(mfns);
return rc;
}
@@ -705,7 +703,8 @@ static int setup(struct xc_sr_context *c
ctx->restore.pfns = malloc(MAX_BATCH_SIZE * sizeof(*ctx->restore.pfns));
ctx->restore.types = malloc(MAX_BATCH_SIZE * sizeof(*ctx->restore.types));
- if ( !ctx->restore.pfns || !ctx->restore.types )
+ ctx->restore.mfns = malloc(MAX_BATCH_SIZE * sizeof(*ctx->restore.mfns));
+ if ( !ctx->restore.pfns || !ctx->restore.types || !ctx->restore.mfns )
{
ERROR("Unable to allocate memory");
rc = -1;
@@ -742,6 +741,7 @@ static void cleanup(struct xc_sr_context
free(ctx->restore.buffered_records);
free(ctx->restore.populated_pfns);
+ free(ctx->restore.mfns);
free(ctx->restore.types);
free(ctx->restore.pfns);

108
libxc-sr-restore-pfns.patch Normal file
View File

@ -0,0 +1,108 @@
From: Olaf Hering <olaf@aepfle.de>
Date: Fri, 23 Oct 2020 14:39:30 +0200
Subject: libxc sr restore pfns
tools: restore: preallocate pfns array
Remove repeated allocation from migration loop. There will never be
more than MAX_BATCH_SIZE pages to process in an incoming batch.
Allocate the space once.
Adjust the verification for page count. It must be at least one page,
but not more than MAX_BATCH_SIZE.
Signed-off-by: Olaf Hering <olaf@aepfle.de>
---
tools/libs/guest/xg_sr_common.h | 1 +
tools/libs/guest/xg_sr_restore.c | 23 +++++++++++++++--------
2 files changed, 16 insertions(+), 8 deletions(-)
--- a/tools/libs/guest/xg_sr_common.h
+++ b/tools/libs/guest/xg_sr_common.h
@@ -256,6 +256,7 @@ struct xc_sr_context
{
struct xc_sr_restore_ops ops;
struct restore_callbacks *callbacks;
+ xen_pfn_t *pfns;
int send_back_fd;
unsigned long p2m_size;
--- a/tools/libs/guest/xg_sr_restore.c
+++ b/tools/libs/guest/xg_sr_restore.c
@@ -314,7 +314,7 @@ static int handle_page_data(struct xc_sr
unsigned int i, pages_of_data = 0;
int rc = -1;
- xen_pfn_t *pfns = NULL, pfn;
+ xen_pfn_t pfn;
uint32_t *types = NULL, type;
/*
@@ -349,9 +349,9 @@ static int handle_page_data(struct xc_sr
goto err;
}
- if ( pages->count < 1 )
+ if ( !pages->count || pages->count > MAX_BATCH_SIZE )
{
- ERROR("Expected at least 1 pfn in PAGE_DATA record");
+ ERROR("Unexpected pfn count %u in PAGE_DATA record", pages->count);
goto err;
}
@@ -362,9 +362,8 @@ static int handle_page_data(struct xc_sr
goto err;
}
- pfns = malloc(pages->count * sizeof(*pfns));
types = malloc(pages->count * sizeof(*types));
- if ( !pfns || !types )
+ if ( !types )
{
ERROR("Unable to allocate enough memory for %u pfns",
pages->count);
@@ -393,7 +392,7 @@ static int handle_page_data(struct xc_sr
* have a page worth of data in the record. */
pages_of_data++;
- pfns[i] = pfn;
+ ctx->restore.pfns[i] = pfn;
types[i] = type;
}
@@ -407,11 +406,10 @@ static int handle_page_data(struct xc_sr
goto err;
}
- rc = process_page_data(ctx, pages->count, pfns, types,
+ rc = process_page_data(ctx, pages->count, ctx->restore.pfns, types,
&pages->pfn[pages->count]);
err:
free(types);
- free(pfns);
return rc;
}
@@ -715,6 +713,14 @@ static int setup(struct xc_sr_context *c
goto err;
}
+ ctx->restore.pfns = malloc(MAX_BATCH_SIZE * sizeof(*ctx->restore.pfns));
+ if ( !ctx->restore.pfns )
+ {
+ ERROR("Unable to allocate memory");
+ rc = -1;
+ goto err;
+ }
+
ctx->restore.buffered_records = malloc(
DEFAULT_BUF_RECORDS * sizeof(struct xc_sr_record));
if ( !ctx->restore.buffered_records )
@@ -745,6 +751,7 @@ static void cleanup(struct xc_sr_context
free(ctx->restore.buffered_records);
free(ctx->restore.populated_pfns);
+ free(ctx->restore.pfns);
if ( ctx->restore.ops.cleanup(ctx) )
PERROR("Failed to clean up");

View File

@ -0,0 +1,111 @@
From: Olaf Hering <olaf@aepfle.de>
Date: Fri, 23 Oct 2020 14:54:12 +0200
Subject: libxc sr restore populate_pfns mfns
tools: restore: preallocate populate_pfns mfns array
Remove repeated allocation from migration loop. There will never be
more than MAX_BATCH_SIZE pages to process in an incoming batch.
Allocate the space once.
Use some prefix to avoid conflict with an array used in handle_page_data.
Signed-off-by: Olaf Hering <olaf@aepfle.de>
---
tools/libs/guest/xg_sr_common.h | 1 +
tools/libs/guest/xg_sr_restore.c | 23 ++++++++---------------
2 files changed, 9 insertions(+), 15 deletions(-)
--- a/tools/libs/guest/xg_sr_common.h
+++ b/tools/libs/guest/xg_sr_common.h
@@ -261,6 +261,7 @@ struct xc_sr_context
xen_pfn_t *mfns;
int *map_errs;
xen_pfn_t *pp_pfns;
+ xen_pfn_t *pp_mfns;
int send_back_fd;
unsigned long p2m_size;
--- a/tools/libs/guest/xg_sr_restore.c
+++ b/tools/libs/guest/xg_sr_restore.c
@@ -138,17 +138,9 @@ int populate_pfns(struct xc_sr_context *
const xen_pfn_t *original_pfns, const uint32_t *types)
{
xc_interface *xch = ctx->xch;
- xen_pfn_t *mfns = malloc(count * sizeof(*mfns));
unsigned int i, nr_pfns = 0;
int rc = -1;
- if ( !mfns )
- {
- ERROR("Failed to allocate %zu bytes for populating the physmap",
- 2 * count * sizeof(*mfns));
- goto err;
- }
-
for ( i = 0; i < count; ++i )
{
if ( (!types || page_type_to_populate(types[i])) &&
@@ -157,7 +149,7 @@ int populate_pfns(struct xc_sr_context *
rc = pfn_set_populated(ctx, original_pfns[i]);
if ( rc )
goto err;
- ctx->restore.pp_pfns[nr_pfns] = mfns[nr_pfns] = original_pfns[i];
+ ctx->restore.pp_pfns[nr_pfns] = ctx->restore.pp_mfns[nr_pfns] = original_pfns[i];
++nr_pfns;
}
}
@@ -165,7 +157,7 @@ int populate_pfns(struct xc_sr_context *
if ( nr_pfns )
{
rc = xc_domain_populate_physmap_exact(
- xch, ctx->domid, nr_pfns, 0, 0, mfns);
+ xch, ctx->domid, nr_pfns, 0, 0, ctx->restore.pp_mfns);
if ( rc )
{
PERROR("Failed to populate physmap");
@@ -174,22 +166,20 @@ int populate_pfns(struct xc_sr_context *
for ( i = 0; i < nr_pfns; ++i )
{
- if ( mfns[i] == INVALID_MFN )
+ if ( ctx->restore.pp_mfns[i] == INVALID_MFN )
{
ERROR("Populate physmap failed for pfn %u", i);
rc = -1;
goto err;
}
- ctx->restore.ops.set_gfn(ctx, ctx->restore.pp_pfns[i], mfns[i]);
+ ctx->restore.ops.set_gfn(ctx, ctx->restore.pp_pfns[i], ctx->restore.pp_mfns[i]);
}
}
rc = 0;
err:
- free(mfns);
-
return rc;
}
@@ -693,8 +683,10 @@ static int setup(struct xc_sr_context *c
ctx->restore.mfns = malloc(MAX_BATCH_SIZE * sizeof(*ctx->restore.mfns));
ctx->restore.map_errs = malloc(MAX_BATCH_SIZE * sizeof(*ctx->restore.map_errs));
ctx->restore.pp_pfns = malloc(MAX_BATCH_SIZE * sizeof(*ctx->restore.pp_pfns));
+ ctx->restore.pp_mfns = malloc(MAX_BATCH_SIZE * sizeof(*ctx->restore.pp_mfns));
if ( !ctx->restore.pfns || !ctx->restore.types || !ctx->restore.mfns ||
- !ctx->restore.map_errs || !ctx->restore.pp_pfns )
+ !ctx->restore.map_errs || !ctx->restore.pp_pfns ||
+ !ctx->restore.pp_mfns )
{
ERROR("Unable to allocate memory");
rc = -1;
@@ -731,6 +723,7 @@ static void cleanup(struct xc_sr_context
free(ctx->restore.buffered_records);
free(ctx->restore.populated_pfns);
+ free(ctx->restore.pp_mfns);
free(ctx->restore.pp_pfns);
free(ctx->restore.map_errs);
free(ctx->restore.mfns);

View File

@ -0,0 +1,89 @@
From: Olaf Hering <olaf@aepfle.de>
Date: Fri, 23 Oct 2020 14:58:53 +0200
Subject: libxc sr restore populate_pfns pfns
tools: restore: preallocate populate_pfns pfns array
Remove repeated allocation from migration loop. There will never be
more than MAX_BATCH_SIZE pages to process in an incoming batch.
Allocate the space once.
Use some prefix to avoid conflict with an array used in handle_page_data.
Signed-off-by: Olaf Hering <olaf@aepfle.de>
---
tools/libs/guest/xg_sr_common.h | 1 +
tools/libs/guest/xg_sr_restore.c | 14 +++++++-------
2 files changed, 8 insertions(+), 7 deletions(-)
--- a/tools/libs/guest/xg_sr_common.h
+++ b/tools/libs/guest/xg_sr_common.h
@@ -260,6 +260,7 @@ struct xc_sr_context
uint32_t *types;
xen_pfn_t *mfns;
int *map_errs;
+ xen_pfn_t *pp_pfns;
int send_back_fd;
unsigned long p2m_size;
--- a/tools/libs/guest/xg_sr_restore.c
+++ b/tools/libs/guest/xg_sr_restore.c
@@ -138,12 +138,11 @@ int populate_pfns(struct xc_sr_context *
const xen_pfn_t *original_pfns, const uint32_t *types)
{
xc_interface *xch = ctx->xch;
- xen_pfn_t *mfns = malloc(count * sizeof(*mfns)),
- *pfns = malloc(count * sizeof(*pfns));
+ xen_pfn_t *mfns = malloc(count * sizeof(*mfns));
unsigned int i, nr_pfns = 0;
int rc = -1;
- if ( !mfns || !pfns )
+ if ( !mfns )
{
ERROR("Failed to allocate %zu bytes for populating the physmap",
2 * count * sizeof(*mfns));
@@ -158,7 +157,7 @@ int populate_pfns(struct xc_sr_context *
rc = pfn_set_populated(ctx, original_pfns[i]);
if ( rc )
goto err;
- pfns[nr_pfns] = mfns[nr_pfns] = original_pfns[i];
+ ctx->restore.pp_pfns[nr_pfns] = mfns[nr_pfns] = original_pfns[i];
++nr_pfns;
}
}
@@ -182,14 +181,13 @@ int populate_pfns(struct xc_sr_context *
goto err;
}
- ctx->restore.ops.set_gfn(ctx, pfns[i], mfns[i]);
+ ctx->restore.ops.set_gfn(ctx, ctx->restore.pp_pfns[i], mfns[i]);
}
}
rc = 0;
err:
- free(pfns);
free(mfns);
return rc;
@@ -694,8 +692,9 @@ static int setup(struct xc_sr_context *c
ctx->restore.types = malloc(MAX_BATCH_SIZE * sizeof(*ctx->restore.types));
ctx->restore.mfns = malloc(MAX_BATCH_SIZE * sizeof(*ctx->restore.mfns));
ctx->restore.map_errs = malloc(MAX_BATCH_SIZE * sizeof(*ctx->restore.map_errs));
+ ctx->restore.pp_pfns = malloc(MAX_BATCH_SIZE * sizeof(*ctx->restore.pp_pfns));
if ( !ctx->restore.pfns || !ctx->restore.types || !ctx->restore.mfns ||
- !ctx->restore.map_errs )
+ !ctx->restore.map_errs || !ctx->restore.pp_pfns )
{
ERROR("Unable to allocate memory");
rc = -1;
@@ -732,6 +731,7 @@ static void cleanup(struct xc_sr_context
free(ctx->restore.buffered_records);
free(ctx->restore.populated_pfns);
+ free(ctx->restore.pp_pfns);
free(ctx->restore.map_errs);
free(ctx->restore.mfns);
free(ctx->restore.types);

View File

@ -0,0 +1,272 @@
From: Olaf Hering <olaf@aepfle.de>
Date: Mon, 26 Oct 2020 12:19:17 +0100
Subject: libxc sr restore read_record
tools: restore: split record processing
handle_page_data must be able to read directly into mapped guest memory.
This will avoid unnecessary memcpy calls for data which can be consumed verbatim.
Rearrange the code to allow decisions based on the incoming record.
This change is preparation for future changes in handle_page_data,
no change in behavior is intended.
Signed-off-by: Olaf Hering <olaf@aepfle.de>
Reviewed-by: Juergen Gross <jgross@suse.com>
---
tools/libs/guest/xg_sr_common.c | 33 ++++++++++++---------
tools/libs/guest/xg_sr_common.h | 4 ++-
tools/libs/guest/xg_sr_restore.c | 49 ++++++++++++++++++++++----------
tools/libs/guest/xg_sr_save.c | 7 ++++-
4 files changed, 63 insertions(+), 30 deletions(-)
--- a/tools/libs/guest/xg_sr_common.c
+++ b/tools/libs/guest/xg_sr_common.c
@@ -91,26 +91,33 @@ int write_split_record(struct xc_sr_cont
return -1;
}
-int read_record(struct xc_sr_context *ctx, int fd, struct xc_sr_record *rec)
+int read_record_header(struct xc_sr_context *ctx, int fd, struct xc_sr_rhdr *rhdr)
{
xc_interface *xch = ctx->xch;
- struct xc_sr_rhdr rhdr;
- size_t datasz;
- if ( read_exact(fd, &rhdr, sizeof(rhdr)) )
+ if ( read_exact(fd, rhdr, sizeof(*rhdr)) )
{
PERROR("Failed to read Record Header from stream");
return -1;
}
- if ( rhdr.length > REC_LENGTH_MAX )
+ if ( rhdr->length > REC_LENGTH_MAX )
{
- ERROR("Record (0x%08x, %s) length %#x exceeds max (%#x)", rhdr.type,
- rec_type_to_str(rhdr.type), rhdr.length, REC_LENGTH_MAX);
+ ERROR("Record (0x%08x, %s) length %#x exceeds max (%#x)", rhdr->type,
+ rec_type_to_str(rhdr->type), rhdr->length, REC_LENGTH_MAX);
return -1;
}
- datasz = ROUNDUP(rhdr.length, REC_ALIGN_ORDER);
+ return 0;
+}
+
+int read_record_data(struct xc_sr_context *ctx, int fd, struct xc_sr_rhdr *rhdr,
+ struct xc_sr_record *rec)
+{
+ xc_interface *xch = ctx->xch;
+ size_t datasz;
+
+ datasz = ROUNDUP(rhdr->length, REC_ALIGN_ORDER);
if ( datasz )
{
@@ -119,7 +126,7 @@ int read_record(struct xc_sr_context *ct
if ( !rec->data )
{
ERROR("Unable to allocate %zu bytes for record data (0x%08x, %s)",
- datasz, rhdr.type, rec_type_to_str(rhdr.type));
+ datasz, rhdr->type, rec_type_to_str(rhdr->type));
return -1;
}
@@ -128,18 +135,18 @@ int read_record(struct xc_sr_context *ct
free(rec->data);
rec->data = NULL;
PERROR("Failed to read %zu bytes of data for record (0x%08x, %s)",
- datasz, rhdr.type, rec_type_to_str(rhdr.type));
+ datasz, rhdr->type, rec_type_to_str(rhdr->type));
return -1;
}
}
else
rec->data = NULL;
- rec->type = rhdr.type;
- rec->length = rhdr.length;
+ rec->type = rhdr->type;
+ rec->length = rhdr->length;
return 0;
-};
+}
static void __attribute__((unused)) build_assertions(void)
{
--- a/tools/libs/guest/xg_sr_common.h
+++ b/tools/libs/guest/xg_sr_common.h
@@ -458,7 +458,9 @@ static inline int write_record(struct xc
*
* On failure, the contents of the record structure are undefined.
*/
-int read_record(struct xc_sr_context *ctx, int fd, struct xc_sr_record *rec);
+int read_record_header(struct xc_sr_context *ctx, int fd, struct xc_sr_rhdr *rhdr);
+int read_record_data(struct xc_sr_context *ctx, int fd, struct xc_sr_rhdr *rhdr,
+ struct xc_sr_record *rec);
/*
* This would ideally be private in restore.c, but is needed by
--- a/tools/libs/guest/xg_sr_restore.c
+++ b/tools/libs/guest/xg_sr_restore.c
@@ -453,7 +453,7 @@ static int send_checkpoint_dirty_pfn_lis
return rc;
}
-static int process_record(struct xc_sr_context *ctx, struct xc_sr_record *rec);
+static int process_buffered_record(struct xc_sr_context *ctx, struct xc_sr_record *rec);
static int handle_checkpoint(struct xc_sr_context *ctx)
{
xc_interface *xch = ctx->xch;
@@ -492,7 +492,7 @@ static int handle_checkpoint(struct xc_s
for ( i = 0; i < ctx->restore.buffered_rec_num; i++ )
{
- rc = process_record(ctx, &ctx->restore.buffered_records[i]);
+ rc = process_buffered_record(ctx, &ctx->restore.buffered_records[i]);
if ( rc )
goto err;
}
@@ -553,10 +553,11 @@ static int handle_checkpoint(struct xc_s
return rc;
}
-static int buffer_record(struct xc_sr_context *ctx, struct xc_sr_record *rec)
+static int buffer_record(struct xc_sr_context *ctx, struct xc_sr_rhdr *rhdr)
{
xc_interface *xch = ctx->xch;
unsigned int new_alloc_num;
+ struct xc_sr_record rec;
struct xc_sr_record *p;
if ( ctx->restore.buffered_rec_num >= ctx->restore.allocated_rec_num )
@@ -574,8 +575,13 @@ static int buffer_record(struct xc_sr_co
ctx->restore.allocated_rec_num = new_alloc_num;
}
+ if ( read_record_data(ctx, ctx->fd, rhdr, &rec) )
+ {
+ return -1;
+ }
+
memcpy(&ctx->restore.buffered_records[ctx->restore.buffered_rec_num++],
- rec, sizeof(*rec));
+ &rec, sizeof(rec));
return 0;
}
@@ -606,7 +612,7 @@ int handle_static_data_end(struct xc_sr_
return rc;
}
-static int process_record(struct xc_sr_context *ctx, struct xc_sr_record *rec)
+static int process_buffered_record(struct xc_sr_context *ctx, struct xc_sr_record *rec)
{
xc_interface *xch = ctx->xch;
int rc = 0;
@@ -644,6 +650,19 @@ static int process_record(struct xc_sr_c
return rc;
}
+static int process_incoming_record_header(struct xc_sr_context *ctx, struct xc_sr_rhdr *rhdr)
+{
+ struct xc_sr_record rec;
+ int rc;
+
+ rc = read_record_data(ctx, ctx->fd, rhdr, &rec);
+ if ( rc )
+ return rc;
+
+ return process_buffered_record(ctx, &rec);
+}
+
+
static int setup(struct xc_sr_context *ctx)
{
xc_interface *xch = ctx->xch;
@@ -740,7 +759,7 @@ static void cleanup(struct xc_sr_context
static int restore(struct xc_sr_context *ctx)
{
xc_interface *xch = ctx->xch;
- struct xc_sr_record rec;
+ struct xc_sr_rhdr rhdr;
int rc, saved_rc = 0, saved_errno = 0;
IPRINTF("Restoring domain");
@@ -751,7 +770,7 @@ static int restore(struct xc_sr_context
do
{
- rc = read_record(ctx, ctx->fd, &rec);
+ rc = read_record_header(ctx, ctx->fd, &rhdr);
if ( rc )
{
if ( ctx->restore.buffer_all_records )
@@ -761,25 +780,25 @@ static int restore(struct xc_sr_context
}
if ( ctx->restore.buffer_all_records &&
- rec.type != REC_TYPE_END &&
- rec.type != REC_TYPE_CHECKPOINT )
+ rhdr.type != REC_TYPE_END &&
+ rhdr.type != REC_TYPE_CHECKPOINT )
{
- rc = buffer_record(ctx, &rec);
+ rc = buffer_record(ctx, &rhdr);
if ( rc )
goto err;
}
else
{
- rc = process_record(ctx, &rec);
+ rc = process_incoming_record_header(ctx, &rhdr);
if ( rc == RECORD_NOT_PROCESSED )
{
- if ( rec.type & REC_TYPE_OPTIONAL )
+ if ( rhdr.type & REC_TYPE_OPTIONAL )
DPRINTF("Ignoring optional record %#x (%s)",
- rec.type, rec_type_to_str(rec.type));
+ rhdr.type, rec_type_to_str(rhdr.type));
else
{
ERROR("Mandatory record %#x (%s) not handled",
- rec.type, rec_type_to_str(rec.type));
+ rhdr.type, rec_type_to_str(rhdr.type));
rc = -1;
goto err;
}
@@ -790,7 +809,7 @@ static int restore(struct xc_sr_context
goto err;
}
- } while ( rec.type != REC_TYPE_END );
+ } while ( rhdr.type != REC_TYPE_END );
remus_failover:
if ( ctx->stream_type == XC_STREAM_COLO )
--- a/tools/libs/guest/xg_sr_save.c
+++ b/tools/libs/guest/xg_sr_save.c
@@ -590,6 +590,7 @@ static int send_memory_live(struct xc_sr
static int colo_merge_secondary_dirty_bitmap(struct xc_sr_context *ctx)
{
xc_interface *xch = ctx->xch;
+ struct xc_sr_rhdr rhdr;
struct xc_sr_record rec;
uint64_t *pfns = NULL;
uint64_t pfn;
@@ -598,7 +599,11 @@ static int colo_merge_secondary_dirty_bi
DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
&ctx->save.dirty_bitmap_hbuf);
- rc = read_record(ctx, ctx->save.recv_fd, &rec);
+ rc = read_record_header(ctx, ctx->save.recv_fd, &rhdr);
+ if ( rc )
+ goto err;
+
+ rc = read_record_data(ctx, ctx->save.recv_fd, &rhdr, &rec);
if ( rc )
goto err;

View File

@ -0,0 +1,93 @@
From: Olaf Hering <olaf@aepfle.de>
Date: Fri, 23 Oct 2020 14:39:31 +0200
Subject: libxc sr restore types
tools: restore: preallocate types array
Remove repeated allocation from migration loop. There will never be
more than MAX_BATCH_SIZE pages to process in an incoming batch.
Allocate the space once.
Signed-off-by: Olaf Hering <olaf@aepfle.de>
---
tools/libs/guest/xg_sr_common.h | 1 +
tools/libs/guest/xg_sr_restore.c | 22 +++++++---------------
2 files changed, 8 insertions(+), 15 deletions(-)
--- a/tools/libs/guest/xg_sr_common.h
+++ b/tools/libs/guest/xg_sr_common.h
@@ -257,6 +257,7 @@ struct xc_sr_context
struct xc_sr_restore_ops ops;
struct restore_callbacks *callbacks;
xen_pfn_t *pfns;
+ uint32_t *types;
int send_back_fd;
unsigned long p2m_size;
--- a/tools/libs/guest/xg_sr_restore.c
+++ b/tools/libs/guest/xg_sr_restore.c
@@ -315,7 +315,7 @@ static int handle_page_data(struct xc_sr
int rc = -1;
xen_pfn_t pfn;
- uint32_t *types = NULL, type;
+ uint32_t type;
/*
* v2 compatibility only exists for x86 streams. This is a bit of a
@@ -362,14 +362,6 @@ static int handle_page_data(struct xc_sr
goto err;
}
- types = malloc(pages->count * sizeof(*types));
- if ( !types )
- {
- ERROR("Unable to allocate enough memory for %u pfns",
- pages->count);
- goto err;
- }
-
for ( i = 0; i < pages->count; ++i )
{
pfn = pages->pfn[i] & PAGE_DATA_PFN_MASK;
@@ -393,7 +385,7 @@ static int handle_page_data(struct xc_sr
pages_of_data++;
ctx->restore.pfns[i] = pfn;
- types[i] = type;
+ ctx->restore.types[i] = type;
}
if ( rec->length != (sizeof(*pages) +
@@ -406,11 +398,9 @@ static int handle_page_data(struct xc_sr
goto err;
}
- rc = process_page_data(ctx, pages->count, ctx->restore.pfns, types,
- &pages->pfn[pages->count]);
+ rc = process_page_data(ctx, pages->count, ctx->restore.pfns,
+ ctx->restore.types, &pages->pfn[pages->count]);
err:
- free(types);
-
return rc;
}
@@ -714,7 +704,8 @@ static int setup(struct xc_sr_context *c
}
ctx->restore.pfns = malloc(MAX_BATCH_SIZE * sizeof(*ctx->restore.pfns));
- if ( !ctx->restore.pfns )
+ ctx->restore.types = malloc(MAX_BATCH_SIZE * sizeof(*ctx->restore.types));
+ if ( !ctx->restore.pfns || !ctx->restore.types )
{
ERROR("Unable to allocate memory");
rc = -1;
@@ -751,6 +742,7 @@ static void cleanup(struct xc_sr_context
free(ctx->restore.buffered_records);
free(ctx->restore.populated_pfns);
+ free(ctx->restore.types);
free(ctx->restore.pfns);
if ( ctx->restore.ops.cleanup(ctx) )

109
libxc-sr-save-errors.patch Normal file
View File

@ -0,0 +1,109 @@
From: Olaf Hering <olaf@aepfle.de>
Date: Fri, 23 Oct 2020 11:26:05 +0200
Subject: libxc sr save errors
tools: save: preallocate errors array
Remove repeated allocation from migration loop. There will never be
more than MAX_BATCH_SIZE pages to process in a batch.
Allocate the space once.
Signed-off-by: Olaf Hering <olaf@aepfle.de>
---
tools/libs/guest/xg_sr_common.h | 1 +
tools/libs/guest/xg_sr_save.c | 20 ++++++++++----------
2 files changed, 11 insertions(+), 10 deletions(-)
--- a/tools/libs/guest/xg_sr_common.h
+++ b/tools/libs/guest/xg_sr_common.h
@@ -246,6 +246,7 @@ struct xc_sr_context
xen_pfn_t *batch_pfns;
xen_pfn_t *mfns;
xen_pfn_t *types;
+ int *errors;
unsigned int nr_batch_pfns;
unsigned long *deferred_pages;
unsigned long nr_deferred_pages;
--- a/tools/libs/guest/xg_sr_save.c
+++ b/tools/libs/guest/xg_sr_save.c
@@ -91,7 +91,7 @@ static int write_batch(struct xc_sr_cont
void *guest_mapping = NULL;
void **guest_data = NULL;
void **local_pages = NULL;
- int *errors = NULL, rc = -1;
+ int rc = -1;
unsigned int i, p, nr_pages = 0, nr_pages_mapped = 0;
unsigned int nr_pfns = ctx->save.nr_batch_pfns;
void *page, *orig_page;
@@ -104,8 +104,6 @@ static int write_batch(struct xc_sr_cont
assert(nr_pfns != 0);
- /* Errors from attempting to map the gfns. */
- errors = malloc(nr_pfns * sizeof(*errors));
/* Pointers to page data to send. Mapped gfns or local allocations. */
guest_data = calloc(nr_pfns, sizeof(*guest_data));
/* Pointers to locally allocated pages. Need freeing. */
@@ -113,7 +111,7 @@ static int write_batch(struct xc_sr_cont
/* iovec[] for writev(). */
iov = malloc((nr_pfns + 4) * sizeof(*iov));
- if ( !errors || !guest_data || !local_pages || !iov )
+ if ( !guest_data || !local_pages || !iov )
{
ERROR("Unable to allocate arrays for a batch of %u pages",
nr_pfns);
@@ -158,8 +156,8 @@ static int write_batch(struct xc_sr_cont
if ( nr_pages > 0 )
{
- guest_mapping = xenforeignmemory_map(
- xch->fmem, ctx->domid, PROT_READ, nr_pages, ctx->save.mfns, errors);
+ guest_mapping = xenforeignmemory_map(xch->fmem, ctx->domid, PROT_READ,
+ nr_pages, ctx->save.mfns, ctx->save.errors);
if ( !guest_mapping )
{
PERROR("Failed to map guest pages");
@@ -172,10 +170,11 @@ static int write_batch(struct xc_sr_cont
if ( !page_type_has_stream_data(ctx->save.types[i]) )
continue;
- if ( errors[p] )
+ if ( ctx->save.errors[p] )
{
ERROR("Mapping of pfn %#"PRIpfn" (mfn %#"PRIpfn") failed %d",
- ctx->save.batch_pfns[i], ctx->save.mfns[p], errors[p]);
+ ctx->save.batch_pfns[i], ctx->save.mfns[p],
+ ctx->save.errors[p]);
goto err;
}
@@ -271,7 +270,6 @@ static int write_batch(struct xc_sr_cont
free(iov);
free(local_pages);
free(guest_data);
- free(errors);
return rc;
}
@@ -846,10 +844,11 @@ static int setup(struct xc_sr_context *c
sizeof(*ctx->save.batch_pfns));
ctx->save.mfns = malloc(MAX_BATCH_SIZE * sizeof(*ctx->save.mfns));
ctx->save.types = malloc(MAX_BATCH_SIZE * sizeof(*ctx->save.types));
+ ctx->save.errors = malloc(MAX_BATCH_SIZE * sizeof(*ctx->save.errors));
ctx->save.deferred_pages = bitmap_alloc(ctx->save.p2m_size);
if ( !ctx->save.batch_pfns || !ctx->save.mfns || !ctx->save.types ||
- !dirty_bitmap || !ctx->save.deferred_pages )
+ !ctx->save.errors || !dirty_bitmap || !ctx->save.deferred_pages )
{
ERROR("Unable to allocate memory for dirty bitmaps, batch pfns and"
" deferred pages");
@@ -880,6 +879,7 @@ static void cleanup(struct xc_sr_context
xc_hypercall_buffer_free_pages(xch, dirty_bitmap,
NRPAGES(bitmap_size(ctx->save.p2m_size)));
free(ctx->save.deferred_pages);
+ free(ctx->save.errors);
free(ctx->save.types);
free(ctx->save.mfns);
free(ctx->save.batch_pfns);

View File

@ -0,0 +1,123 @@
From: Olaf Hering <olaf@aepfle.de>
Date: Fri, 23 Oct 2020 11:40:45 +0200
Subject: libxc sr save guest_data
tools: save: preallocate guest_data array
Remove repeated allocation from migration loop. There will never be
more than MAX_BATCH_SIZE pages to process in a batch.
Allocate the space once.
Because this array was previously zero-initialised by calloc,
adjust the loop to explicitly clear unused entries as needed.
Signed-off-by: Olaf Hering <olaf@aepfle.de>
---
tools/libs/guest/xg_sr_common.h | 1 +
tools/libs/guest/xg_sr_save.c | 20 +++++++++++---------
2 files changed, 12 insertions(+), 9 deletions(-)
--- a/tools/libs/guest/xg_sr_common.h
+++ b/tools/libs/guest/xg_sr_common.h
@@ -249,6 +249,7 @@ struct xc_sr_context
int *errors;
struct iovec *iov;
uint64_t *rec_pfns;
+ void **guest_data;
unsigned int nr_batch_pfns;
unsigned long *deferred_pages;
unsigned long nr_deferred_pages;
--- a/tools/libs/guest/xg_sr_save.c
+++ b/tools/libs/guest/xg_sr_save.c
@@ -89,7 +89,6 @@ static int write_batch(struct xc_sr_cont
{
xc_interface *xch = ctx->xch;
void *guest_mapping = NULL;
- void **guest_data = NULL;
void **local_pages = NULL;
int rc = -1;
unsigned int i, p, nr_pages = 0, nr_pages_mapped = 0;
@@ -103,12 +102,10 @@ static int write_batch(struct xc_sr_cont
assert(nr_pfns != 0);
- /* Pointers to page data to send. Mapped gfns or local allocations. */
- guest_data = calloc(nr_pfns, sizeof(*guest_data));
/* Pointers to locally allocated pages. Need freeing. */
local_pages = calloc(nr_pfns, sizeof(*local_pages));
- if ( !guest_data || !local_pages )
+ if ( !local_pages )
{
ERROR("Unable to allocate arrays for a batch of %u pages",
nr_pfns);
@@ -165,7 +162,10 @@ static int write_batch(struct xc_sr_cont
for ( i = 0, p = 0; i < nr_pfns; ++i )
{
if ( !page_type_has_stream_data(ctx->save.types[i]) )
+ {
+ ctx->save.guest_data[i] = NULL;
continue;
+ }
if ( ctx->save.errors[p] )
{
@@ -183,6 +183,7 @@ static int write_batch(struct xc_sr_cont
if ( rc )
{
+ ctx->save.guest_data[i] = NULL;
if ( rc == -1 && errno == EAGAIN )
{
set_bit(ctx->save.batch_pfns[i], ctx->save.deferred_pages);
@@ -194,7 +195,7 @@ static int write_batch(struct xc_sr_cont
goto err;
}
else
- guest_data[i] = page;
+ ctx->save.guest_data[i] = page;
rc = -1;
++p;
@@ -232,9 +233,9 @@ static int write_batch(struct xc_sr_cont
{
for ( i = 0; i < nr_pfns; ++i )
{
- if ( guest_data[i] )
+ if ( ctx->save.guest_data[i] )
{
- ctx->save.iov[iovcnt].iov_base = guest_data[i];
+ ctx->save.iov[iovcnt].iov_base = ctx->save.guest_data[i];
ctx->save.iov[iovcnt].iov_len = PAGE_SIZE;
iovcnt++;
--nr_pages;
@@ -258,7 +259,6 @@ static int write_batch(struct xc_sr_cont
for ( i = 0; local_pages && i < nr_pfns; ++i )
free(local_pages[i]);
free(local_pages);
- free(guest_data);
return rc;
}
@@ -836,11 +836,12 @@ static int setup(struct xc_sr_context *c
ctx->save.errors = malloc(MAX_BATCH_SIZE * sizeof(*ctx->save.errors));
ctx->save.iov = malloc((4 + MAX_BATCH_SIZE) * sizeof(*ctx->save.iov));
ctx->save.rec_pfns = malloc(MAX_BATCH_SIZE * sizeof(*ctx->save.rec_pfns));
+ ctx->save.guest_data = malloc(MAX_BATCH_SIZE * sizeof(*ctx->save.guest_data));
ctx->save.deferred_pages = bitmap_alloc(ctx->save.p2m_size);
if ( !ctx->save.batch_pfns || !ctx->save.mfns || !ctx->save.types ||
!ctx->save.errors || !ctx->save.iov || !ctx->save.rec_pfns ||
- !dirty_bitmap || !ctx->save.deferred_pages )
+ !ctx->save.guest_data ||!dirty_bitmap || !ctx->save.deferred_pages )
{
ERROR("Unable to allocate memory for dirty bitmaps, batch pfns and"
" deferred pages");
@@ -871,6 +872,7 @@ static void cleanup(struct xc_sr_context
xc_hypercall_buffer_free_pages(xch, dirty_bitmap,
NRPAGES(bitmap_size(ctx->save.p2m_size)));
free(ctx->save.deferred_pages);
+ free(ctx->save.guest_data);
free(ctx->save.rec_pfns);
free(ctx->save.iov);
free(ctx->save.errors);

124
libxc-sr-save-iov.patch Normal file
View File

@ -0,0 +1,124 @@
From: Olaf Hering <olaf@aepfle.de>
Date: Fri, 23 Oct 2020 11:30:41 +0200
Subject: libxc sr save iov
tools: save: preallocate iov array
Remove repeated allocation from migration loop. There will never be
more than MAX_BATCH_SIZE pages to process in a batch.
Allocate the space once.
Signed-off-by: Olaf Hering <olaf@aepfle.de>
---
tools/libs/guest/xg_sr_common.h | 1 +
tools/libs/guest/xg_sr_save.c | 34 ++++++++++++++++-----------------
2 files changed, 18 insertions(+), 17 deletions(-)
--- a/tools/libs/guest/xg_sr_common.h
+++ b/tools/libs/guest/xg_sr_common.h
@@ -247,6 +247,7 @@ struct xc_sr_context
xen_pfn_t *mfns;
xen_pfn_t *types;
int *errors;
+ struct iovec *iov;
unsigned int nr_batch_pfns;
unsigned long *deferred_pages;
unsigned long nr_deferred_pages;
--- a/tools/libs/guest/xg_sr_save.c
+++ b/tools/libs/guest/xg_sr_save.c
@@ -96,7 +96,7 @@ static int write_batch(struct xc_sr_cont
unsigned int nr_pfns = ctx->save.nr_batch_pfns;
void *page, *orig_page;
uint64_t *rec_pfns = NULL;
- struct iovec *iov = NULL; int iovcnt = 0;
+ int iovcnt = 0;
struct xc_sr_rec_page_data_header hdr = { 0 };
struct xc_sr_record rec = {
.type = REC_TYPE_PAGE_DATA,
@@ -108,10 +108,8 @@ static int write_batch(struct xc_sr_cont
guest_data = calloc(nr_pfns, sizeof(*guest_data));
/* Pointers to locally allocated pages. Need freeing. */
local_pages = calloc(nr_pfns, sizeof(*local_pages));
- /* iovec[] for writev(). */
- iov = malloc((nr_pfns + 4) * sizeof(*iov));
- if ( !guest_data || !local_pages || !iov )
+ if ( !guest_data || !local_pages )
{
ERROR("Unable to allocate arrays for a batch of %u pages",
nr_pfns);
@@ -221,17 +219,17 @@ static int write_batch(struct xc_sr_cont
for ( i = 0; i < nr_pfns; ++i )
rec_pfns[i] = ((uint64_t)(ctx->save.types[i]) << 32) | ctx->save.batch_pfns[i];
- iov[0].iov_base = &rec.type;
- iov[0].iov_len = sizeof(rec.type);
+ ctx->save.iov[0].iov_base = &rec.type;
+ ctx->save.iov[0].iov_len = sizeof(rec.type);
- iov[1].iov_base = &rec.length;
- iov[1].iov_len = sizeof(rec.length);
+ ctx->save.iov[1].iov_base = &rec.length;
+ ctx->save.iov[1].iov_len = sizeof(rec.length);
- iov[2].iov_base = &hdr;
- iov[2].iov_len = sizeof(hdr);
+ ctx->save.iov[2].iov_base = &hdr;
+ ctx->save.iov[2].iov_len = sizeof(hdr);
- iov[3].iov_base = rec_pfns;
- iov[3].iov_len = nr_pfns * sizeof(*rec_pfns);
+ ctx->save.iov[3].iov_base = rec_pfns;
+ ctx->save.iov[3].iov_len = nr_pfns * sizeof(*rec_pfns);
iovcnt = 4;
ctx->save.pages_sent += nr_pages;
@@ -243,15 +241,15 @@ static int write_batch(struct xc_sr_cont
{
if ( guest_data[i] )
{
- iov[iovcnt].iov_base = guest_data[i];
- iov[iovcnt].iov_len = PAGE_SIZE;
+ ctx->save.iov[iovcnt].iov_base = guest_data[i];
+ ctx->save.iov[iovcnt].iov_len = PAGE_SIZE;
iovcnt++;
--nr_pages;
}
}
}
- if ( writev_exact(ctx->fd, iov, iovcnt) )
+ if ( writev_exact(ctx->fd, ctx->save.iov, iovcnt) )
{
PERROR("Failed to write page data to stream");
goto err;
@@ -267,7 +265,6 @@ static int write_batch(struct xc_sr_cont
xenforeignmemory_unmap(xch->fmem, guest_mapping, nr_pages_mapped);
for ( i = 0; local_pages && i < nr_pfns; ++i )
free(local_pages[i]);
- free(iov);
free(local_pages);
free(guest_data);
@@ -845,10 +842,12 @@ static int setup(struct xc_sr_context *c
ctx->save.mfns = malloc(MAX_BATCH_SIZE * sizeof(*ctx->save.mfns));
ctx->save.types = malloc(MAX_BATCH_SIZE * sizeof(*ctx->save.types));
ctx->save.errors = malloc(MAX_BATCH_SIZE * sizeof(*ctx->save.errors));
+ ctx->save.iov = malloc((4 + MAX_BATCH_SIZE) * sizeof(*ctx->save.iov));
ctx->save.deferred_pages = bitmap_alloc(ctx->save.p2m_size);
if ( !ctx->save.batch_pfns || !ctx->save.mfns || !ctx->save.types ||
- !ctx->save.errors || !dirty_bitmap || !ctx->save.deferred_pages )
+ !ctx->save.errors || !ctx->save.iov || !dirty_bitmap ||
+ !ctx->save.deferred_pages )
{
ERROR("Unable to allocate memory for dirty bitmaps, batch pfns and"
" deferred pages");
@@ -879,6 +878,7 @@ static void cleanup(struct xc_sr_context
xc_hypercall_buffer_free_pages(xch, dirty_bitmap,
NRPAGES(bitmap_size(ctx->save.p2m_size)));
free(ctx->save.deferred_pages);
+ free(ctx->save.iov);
free(ctx->save.errors);
free(ctx->save.types);
free(ctx->save.mfns);

View File

@ -0,0 +1,218 @@
From: Olaf Hering <olaf@aepfle.de>
Date: Fri, 23 Oct 2020 12:47:56 +0200
Subject: libxc sr save local_pages
tools: save: preallocate local_pages array
Remove repeated allocation from migration loop. There will never be
more than MAX_BATCH_SIZE pages to process in a batch.
Allocate the space once.
Adjust the code to use the unmodified src page in case of HVM.
In case of PV the page may need to be normalised, use a private memory
area for this purpose.
Signed-off-by: Olaf Hering <olaf@aepfle.de>
---
tools/libs/guest/xg_sr_common.h | 22 ++++++++++---------
tools/libs/guest/xg_sr_save.c | 26 ++++------------------
tools/libs/guest/xg_sr_save_x86_hvm.c | 5 +++--
tools/libs/guest/xg_sr_save_x86_pv.c | 31 ++++++++++++++++++---------
4 files changed, 40 insertions(+), 44 deletions(-)
--- a/tools/libs/guest/xg_sr_common.h
+++ b/tools/libs/guest/xg_sr_common.h
@@ -33,16 +33,12 @@ struct xc_sr_save_ops
* Optionally transform the contents of a page from being specific to the
* sending environment, to being generic for the stream.
*
- * The page of data at the end of 'page' may be a read-only mapping of a
- * running guest; it must not be modified. If no transformation is
- * required, the callee should leave '*pages' untouched.
+ * The page of data '*src' may be a read-only mapping of a running guest;
+ * it must not be modified. If no transformation is required, the callee
+ * should leave '*src' untouched, and return it via '*ptr'.
*
- * If a transformation is required, the callee should allocate themselves
- * a local page using malloc() and return it via '*page'.
- *
- * The caller shall free() '*page' in all cases. In the case that the
- * callee encounters an error, it should *NOT* free() the memory it
- * allocated for '*page'.
+ * If a transformation is required, the callee should provide the
+ * transformed page in a private buffer and return it via '*ptr'.
*
* It is valid to fail with EAGAIN if the transformation is not able to be
* completed at this point. The page shall be retried later.
@@ -50,7 +46,7 @@ struct xc_sr_save_ops
* @returns 0 for success, -1 for failure, with errno appropriately set.
*/
int (*normalise_page)(struct xc_sr_context *ctx, xen_pfn_t type,
- void **page);
+ void *src, unsigned int idx, void **ptr);
/**
* Set up local environment to save a domain. (Typically querying
@@ -359,6 +355,12 @@ struct xc_sr_context
{
struct
{
+ /* Used by write_batch for modified pages. */
+ void *normalised_pages;
+ } save;
+
+ struct
+ {
/* State machine for the order of received records. */
bool seen_pv_info;
--- a/tools/libs/guest/xg_sr_save.c
+++ b/tools/libs/guest/xg_sr_save.c
@@ -89,11 +89,10 @@ static int write_batch(struct xc_sr_cont
{
xc_interface *xch = ctx->xch;
void *guest_mapping = NULL;
- void **local_pages = NULL;
int rc = -1;
unsigned int i, p, nr_pages = 0, nr_pages_mapped = 0;
unsigned int nr_pfns = ctx->save.nr_batch_pfns;
- void *page, *orig_page;
+ void *src;
int iovcnt = 0;
struct xc_sr_rec_page_data_header hdr = { 0 };
struct xc_sr_record rec = {
@@ -102,16 +101,6 @@ static int write_batch(struct xc_sr_cont
assert(nr_pfns != 0);
- /* Pointers to locally allocated pages. Need freeing. */
- local_pages = calloc(nr_pfns, sizeof(*local_pages));
-
- if ( !local_pages )
- {
- ERROR("Unable to allocate arrays for a batch of %u pages",
- nr_pfns);
- goto err;
- }
-
for ( i = 0; i < nr_pfns; ++i )
{
ctx->save.types[i] = ctx->save.mfns[i] = ctx->save.ops.pfn_to_gfn(ctx,
@@ -175,11 +164,9 @@ static int write_batch(struct xc_sr_cont
goto err;
}
- orig_page = page = guest_mapping + (p * PAGE_SIZE);
- rc = ctx->save.ops.normalise_page(ctx, ctx->save.types[i], &page);
-
- if ( orig_page != page )
- local_pages[i] = page;
+ src = guest_mapping + (p * PAGE_SIZE);
+ rc = ctx->save.ops.normalise_page(ctx, ctx->save.types[i], src, i,
+ &ctx->save.guest_data[i]);
if ( rc )
{
@@ -194,8 +181,6 @@ static int write_batch(struct xc_sr_cont
else
goto err;
}
- else
- ctx->save.guest_data[i] = page;
rc = -1;
++p;
@@ -256,9 +241,6 @@ static int write_batch(struct xc_sr_cont
err:
if ( guest_mapping )
xenforeignmemory_unmap(xch->fmem, guest_mapping, nr_pages_mapped);
- for ( i = 0; local_pages && i < nr_pfns; ++i )
- free(local_pages[i]);
- free(local_pages);
return rc;
}
--- a/tools/libs/guest/xg_sr_save_x86_hvm.c
+++ b/tools/libs/guest/xg_sr_save_x86_hvm.c
@@ -129,9 +129,10 @@ static xen_pfn_t x86_hvm_pfn_to_gfn(cons
return pfn;
}
-static int x86_hvm_normalise_page(struct xc_sr_context *ctx,
- xen_pfn_t type, void **page)
+static int x86_hvm_normalise_page(struct xc_sr_context *ctx, xen_pfn_t type,
+ void *src, unsigned int idx, void **ptr)
{
+ *ptr = src;
return 0;
}
--- a/tools/libs/guest/xg_sr_save_x86_pv.c
+++ b/tools/libs/guest/xg_sr_save_x86_pv.c
@@ -999,29 +999,31 @@ static xen_pfn_t x86_pv_pfn_to_gfn(const
* save_ops function. Performs pagetable normalisation on appropriate pages.
*/
static int x86_pv_normalise_page(struct xc_sr_context *ctx, xen_pfn_t type,
- void **page)
+ void *src, unsigned int idx, void **ptr)
{
xc_interface *xch = ctx->xch;
- void *local_page;
+ void *dst;
int rc;
type &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
if ( type < XEN_DOMCTL_PFINFO_L1TAB || type > XEN_DOMCTL_PFINFO_L4TAB )
+ {
+ *ptr = src;
return 0;
+ }
- local_page = malloc(PAGE_SIZE);
- if ( !local_page )
+ if ( idx >= MAX_BATCH_SIZE )
{
- ERROR("Unable to allocate scratch page");
- rc = -1;
- goto out;
+ ERROR("idx %u out of range", idx);
+ errno = ERANGE;
+ return -1;
}
- rc = normalise_pagetable(ctx, *page, local_page, type);
- *page = local_page;
+ dst = ctx->x86.pv.save.normalised_pages + (idx * PAGE_SIZE);
+ rc = normalise_pagetable(ctx, src, dst, type);
+ *ptr = dst;
- out:
return rc;
}
@@ -1031,8 +1033,16 @@ static int x86_pv_normalise_page(struct
*/
static int x86_pv_setup(struct xc_sr_context *ctx)
{
+ xc_interface *xch = ctx->xch;
int rc;
+ ctx->x86.pv.save.normalised_pages = malloc(MAX_BATCH_SIZE * PAGE_SIZE);
+ if ( !ctx->x86.pv.save.normalised_pages )
+ {
+ PERROR("Failed to allocate normalised_pages");
+ return -1;
+ }
+
rc = x86_pv_domain_info(ctx);
if ( rc )
return rc;
@@ -1118,6 +1128,7 @@ static int x86_pv_check_vm_state(struct
static int x86_pv_cleanup(struct xc_sr_context *ctx)
{
+ free(ctx->x86.pv.save.normalised_pages);
free(ctx->x86.pv.p2m_pfns);
if ( ctx->x86.pv.p2m )

132
libxc-sr-save-mfns.patch Normal file
View File

@ -0,0 +1,132 @@
From: Olaf Hering <olaf@aepfle.de>
Date: Fri, 23 Oct 2020 11:20:36 +0200
Subject: libxc sr save mfns
tools: save: preallocate mfns array
Remove repeated allocation from migration loop. There will never be
more than MAX_BATCH_SIZE pages to process in a batch, see add_to_batch.
Allocate the space once.
Signed-off-by: Olaf Hering <olaf@aepfle.de>
---
tools/libs/guest/xg_sr_common.h | 1 +
tools/libs/guest/xg_sr_save.c | 25 +++++++++++++------------
2 files changed, 14 insertions(+), 12 deletions(-)
--- a/tools/libs/guest/xg_sr_common.h
+++ b/tools/libs/guest/xg_sr_common.h
@@ -244,6 +244,7 @@ struct xc_sr_context
struct precopy_stats stats;
xen_pfn_t *batch_pfns;
+ xen_pfn_t *mfns;
unsigned int nr_batch_pfns;
unsigned long *deferred_pages;
unsigned long nr_deferred_pages;
--- a/tools/libs/guest/xg_sr_save.c
+++ b/tools/libs/guest/xg_sr_save.c
@@ -88,7 +88,7 @@ static int write_checkpoint_record(struc
static int write_batch(struct xc_sr_context *ctx)
{
xc_interface *xch = ctx->xch;
- xen_pfn_t *mfns = NULL, *types = NULL;
+ xen_pfn_t *types = NULL;
void *guest_mapping = NULL;
void **guest_data = NULL;
void **local_pages = NULL;
@@ -105,8 +105,6 @@ static int write_batch(struct xc_sr_cont
assert(nr_pfns != 0);
- /* Mfns of the batch pfns. */
- mfns = malloc(nr_pfns * sizeof(*mfns));
/* Types of the batch pfns. */
types = malloc(nr_pfns * sizeof(*types));
/* Errors from attempting to map the gfns. */
@@ -118,7 +116,7 @@ static int write_batch(struct xc_sr_cont
/* iovec[] for writev(). */
iov = malloc((nr_pfns + 4) * sizeof(*iov));
- if ( !mfns || !types || !errors || !guest_data || !local_pages || !iov )
+ if ( !types || !errors || !guest_data || !local_pages || !iov )
{
ERROR("Unable to allocate arrays for a batch of %u pages",
nr_pfns);
@@ -127,11 +125,11 @@ static int write_batch(struct xc_sr_cont
for ( i = 0; i < nr_pfns; ++i )
{
- types[i] = mfns[i] = ctx->save.ops.pfn_to_gfn(ctx,
+ types[i] = ctx->save.mfns[i] = ctx->save.ops.pfn_to_gfn(ctx,
ctx->save.batch_pfns[i]);
/* Likely a ballooned page. */
- if ( mfns[i] == INVALID_MFN )
+ if ( ctx->save.mfns[i] == INVALID_MFN )
{
set_bit(ctx->save.batch_pfns[i], ctx->save.deferred_pages);
++ctx->save.nr_deferred_pages;
@@ -150,20 +148,21 @@ static int write_batch(struct xc_sr_cont
{
if ( !is_known_page_type(types[i]) )
{
- ERROR("Unknown type %#"PRIpfn" for pfn %#"PRIpfn, types[i], mfns[i]);
+ ERROR("Unknown type %#"PRIpfn" for pfn %#"PRIpfn,
+ types[i], ctx->save.mfns[i]);
goto err;
}
if ( !page_type_has_stream_data(types[i]) )
continue;
- mfns[nr_pages++] = mfns[i];
+ ctx->save.mfns[nr_pages++] = ctx->save.mfns[i];
}
if ( nr_pages > 0 )
{
guest_mapping = xenforeignmemory_map(
- xch->fmem, ctx->domid, PROT_READ, nr_pages, mfns, errors);
+ xch->fmem, ctx->domid, PROT_READ, nr_pages, ctx->save.mfns, errors);
if ( !guest_mapping )
{
PERROR("Failed to map guest pages");
@@ -179,7 +178,7 @@ static int write_batch(struct xc_sr_cont
if ( errors[p] )
{
ERROR("Mapping of pfn %#"PRIpfn" (mfn %#"PRIpfn") failed %d",
- ctx->save.batch_pfns[i], mfns[p], errors[p]);
+ ctx->save.batch_pfns[i], ctx->save.mfns[p], errors[p]);
goto err;
}
@@ -277,7 +276,6 @@ static int write_batch(struct xc_sr_cont
free(guest_data);
free(errors);
free(types);
- free(mfns);
return rc;
}
@@ -850,9 +848,11 @@ static int setup(struct xc_sr_context *c
xch, dirty_bitmap, NRPAGES(bitmap_size(ctx->save.p2m_size)));
ctx->save.batch_pfns = malloc(MAX_BATCH_SIZE *
sizeof(*ctx->save.batch_pfns));
+ ctx->save.mfns = malloc(MAX_BATCH_SIZE * sizeof(*ctx->save.mfns));
ctx->save.deferred_pages = bitmap_alloc(ctx->save.p2m_size);
- if ( !ctx->save.batch_pfns || !dirty_bitmap || !ctx->save.deferred_pages )
+ if ( !ctx->save.batch_pfns || !ctx->save.mfns ||
+ !dirty_bitmap || !ctx->save.deferred_pages )
{
ERROR("Unable to allocate memory for dirty bitmaps, batch pfns and"
" deferred pages");
@@ -883,6 +883,7 @@ static void cleanup(struct xc_sr_context
xc_hypercall_buffer_free_pages(xch, dirty_bitmap,
NRPAGES(bitmap_size(ctx->save.p2m_size)));
free(ctx->save.deferred_pages);
+ free(ctx->save.mfns);
free(ctx->save.batch_pfns);
}

View File

@ -0,0 +1,110 @@
From: Olaf Hering <olaf@aepfle.de>
Date: Fri, 23 Oct 2020 11:34:00 +0200
Subject: libxc sr save rec_pfns
tools: save: preallocate rec_pfns array
Remove repeated allocation from migration loop. There will never be
more than MAX_BATCH_SIZE pages to process in a batch.
Allocate the space once.
Signed-off-by: Olaf Hering <olaf@aepfle.de>
---
tools/libs/guest/xg_sr_common.h | 1 +
tools/libs/guest/xg_sr_save.c | 28 +++++++++++-----------------
2 files changed, 12 insertions(+), 17 deletions(-)
--- a/tools/libs/guest/xg_sr_common.h
+++ b/tools/libs/guest/xg_sr_common.h
@@ -248,6 +248,7 @@ struct xc_sr_context
xen_pfn_t *types;
int *errors;
struct iovec *iov;
+ uint64_t *rec_pfns;
unsigned int nr_batch_pfns;
unsigned long *deferred_pages;
unsigned long nr_deferred_pages;
--- a/tools/libs/guest/xg_sr_save.c
+++ b/tools/libs/guest/xg_sr_save.c
@@ -95,7 +95,6 @@ static int write_batch(struct xc_sr_cont
unsigned int i, p, nr_pages = 0, nr_pages_mapped = 0;
unsigned int nr_pfns = ctx->save.nr_batch_pfns;
void *page, *orig_page;
- uint64_t *rec_pfns = NULL;
int iovcnt = 0;
struct xc_sr_rec_page_data_header hdr = { 0 };
struct xc_sr_record rec = {
@@ -202,22 +201,15 @@ static int write_batch(struct xc_sr_cont
}
}
- rec_pfns = malloc(nr_pfns * sizeof(*rec_pfns));
- if ( !rec_pfns )
- {
- ERROR("Unable to allocate %zu bytes of memory for page data pfn list",
- nr_pfns * sizeof(*rec_pfns));
- goto err;
- }
-
hdr.count = nr_pfns;
rec.length = sizeof(hdr);
- rec.length += nr_pfns * sizeof(*rec_pfns);
+ rec.length += nr_pfns * sizeof(*ctx->save.rec_pfns);
rec.length += nr_pages * PAGE_SIZE;
for ( i = 0; i < nr_pfns; ++i )
- rec_pfns[i] = ((uint64_t)(ctx->save.types[i]) << 32) | ctx->save.batch_pfns[i];
+ ctx->save.rec_pfns[i] = ((uint64_t)(ctx->save.types[i]) << 32) |
+ ctx->save.batch_pfns[i];
ctx->save.iov[0].iov_base = &rec.type;
ctx->save.iov[0].iov_len = sizeof(rec.type);
@@ -228,12 +220,13 @@ static int write_batch(struct xc_sr_cont
ctx->save.iov[2].iov_base = &hdr;
ctx->save.iov[2].iov_len = sizeof(hdr);
- ctx->save.iov[3].iov_base = rec_pfns;
- ctx->save.iov[3].iov_len = nr_pfns * sizeof(*rec_pfns);
+ ctx->save.iov[3].iov_base = ctx->save.rec_pfns;
+ ctx->save.iov[3].iov_len = nr_pfns * sizeof(*ctx->save.rec_pfns);
iovcnt = 4;
ctx->save.pages_sent += nr_pages;
- ctx->save.overhead_sent += sizeof(rec) + sizeof(hdr) + nr_pfns * sizeof(*rec_pfns);
+ ctx->save.overhead_sent += sizeof(rec) + sizeof(hdr) +
+ nr_pfns * sizeof(*ctx->save.rec_pfns);
if ( nr_pages )
{
@@ -260,7 +253,6 @@ static int write_batch(struct xc_sr_cont
rc = ctx->save.nr_batch_pfns = 0;
err:
- free(rec_pfns);
if ( guest_mapping )
xenforeignmemory_unmap(xch->fmem, guest_mapping, nr_pages_mapped);
for ( i = 0; local_pages && i < nr_pfns; ++i )
@@ -843,11 +835,12 @@ static int setup(struct xc_sr_context *c
ctx->save.types = malloc(MAX_BATCH_SIZE * sizeof(*ctx->save.types));
ctx->save.errors = malloc(MAX_BATCH_SIZE * sizeof(*ctx->save.errors));
ctx->save.iov = malloc((4 + MAX_BATCH_SIZE) * sizeof(*ctx->save.iov));
+ ctx->save.rec_pfns = malloc(MAX_BATCH_SIZE * sizeof(*ctx->save.rec_pfns));
ctx->save.deferred_pages = bitmap_alloc(ctx->save.p2m_size);
if ( !ctx->save.batch_pfns || !ctx->save.mfns || !ctx->save.types ||
- !ctx->save.errors || !ctx->save.iov || !dirty_bitmap ||
- !ctx->save.deferred_pages )
+ !ctx->save.errors || !ctx->save.iov || !ctx->save.rec_pfns ||
+ !dirty_bitmap || !ctx->save.deferred_pages )
{
ERROR("Unable to allocate memory for dirty bitmaps, batch pfns and"
" deferred pages");
@@ -878,6 +871,7 @@ static void cleanup(struct xc_sr_context
xc_hypercall_buffer_free_pages(xch, dirty_bitmap,
NRPAGES(bitmap_size(ctx->save.p2m_size)));
free(ctx->save.deferred_pages);
+ free(ctx->save.rec_pfns);
free(ctx->save.iov);
free(ctx->save.errors);
free(ctx->save.types);

View File

@ -0,0 +1,116 @@
From: Olaf Hering <olaf@aepfle.de>
Date: Fri, 23 Oct 2020 15:39:59 +0200
Subject: libxc sr save show_transfer_rate
tools: show migration transfer rate in send_dirty_pages
Show how fast domU pages are transferred in each iteration.
The relevant data is how fast the pfns travel, not so much how much
protocol overhead exists. So the reported MiB/sec is just for pfns.
Signed-off-by: Olaf Hering <olaf@aepfle.de>
v02:
- rearrange MiB_sec calculation (jgross)
---
tools/libs/guest/xg_sr_common.h | 2 ++
tools/libs/guest/xg_sr_save.c | 46 +++++++++++++++++++++++++++++++++
2 files changed, 48 insertions(+)
--- a/tools/libs/guest/xg_sr_common.h
+++ b/tools/libs/guest/xg_sr_common.h
@@ -238,6 +238,8 @@ struct xc_sr_context
bool debug;
unsigned long p2m_size;
+ size_t pages_sent;
+ size_t overhead_sent;
struct precopy_stats stats;
--- a/tools/libs/guest/xg_sr_save.c
+++ b/tools/libs/guest/xg_sr_save.c
@@ -1,5 +1,6 @@
#include <assert.h>
#include <arpa/inet.h>
+#include <time.h>
#include "xg_sr_common.h"
@@ -238,6 +239,8 @@ static int write_batch(struct xc_sr_cont
iov[3].iov_len = nr_pfns * sizeof(*rec_pfns);
iovcnt = 4;
+ ctx->save.pages_sent += nr_pages;
+ ctx->save.overhead_sent += sizeof(rec) + sizeof(hdr) + nr_pfns * sizeof(*rec_pfns);
if ( nr_pages )
{
@@ -356,6 +359,42 @@ static int suspend_domain(struct xc_sr_c
return 0;
}
+static void show_transfer_rate(struct xc_sr_context *ctx, struct timespec *start)
+{
+ xc_interface *xch = ctx->xch;
+ struct timespec end = {}, diff = {};
+ size_t ms, MiB_sec;
+
+ if (!ctx->save.pages_sent)
+ return;
+
+ if ( clock_gettime(CLOCK_MONOTONIC, &end) )
+ PERROR("clock_gettime");
+
+ if ( (end.tv_nsec - start->tv_nsec) < 0 )
+ {
+ diff.tv_sec = end.tv_sec - start->tv_sec - 1;
+ diff.tv_nsec = end.tv_nsec - start->tv_nsec + (1000U*1000U*1000U);
+ }
+ else
+ {
+ diff.tv_sec = end.tv_sec - start->tv_sec;
+ diff.tv_nsec = end.tv_nsec - start->tv_nsec;
+ }
+
+ ms = (diff.tv_nsec / (1000U*1000U));
+ ms += (diff.tv_sec * 1000U);
+ if (!ms)
+ ms = 1;
+
+ MiB_sec = (ctx->save.pages_sent * PAGE_SIZE * 1000U) / ms / (1024U*1024U);
+
+ errno = 0;
+ IPRINTF("%s: %zu bytes + %zu pages in %ld.%09ld sec, %zu MiB/sec", __func__,
+ ctx->save.overhead_sent, ctx->save.pages_sent,
+ diff.tv_sec, diff.tv_nsec, MiB_sec);
+}
+
/*
* Send a subset of pages in the guests p2m, according to the dirty bitmap.
* Used for each subsequent iteration of the live migration loop.
@@ -369,9 +408,15 @@ static int send_dirty_pages(struct xc_sr
xen_pfn_t p;
unsigned long written;
int rc;
+ struct timespec start = {};
DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
&ctx->save.dirty_bitmap_hbuf);
+ ctx->save.pages_sent = 0;
+ ctx->save.overhead_sent = 0;
+ if ( clock_gettime(CLOCK_MONOTONIC, &start) )
+ PERROR("clock_gettime");
+
for ( p = 0, written = 0; p < ctx->save.p2m_size; ++p )
{
if ( !test_bit(p, dirty_bitmap) )
@@ -395,6 +440,7 @@ static int send_dirty_pages(struct xc_sr
if ( written > entries )
DPRINTF("Bitmap contained more entries than expected...");
+ show_transfer_rate(ctx, &start);
xc_report_progress_step(xch, entries, entries);
return ctx->save.ops.check_vm_state(ctx);

154
libxc-sr-save-types.patch Normal file
View File

@ -0,0 +1,154 @@
From: Olaf Hering <olaf@aepfle.de>
Date: Fri, 23 Oct 2020 11:23:51 +0200
Subject: libxc sr save types
tools: save: preallocate types array
Remove repeated allocation from migration loop. There will never be
more than MAX_BATCH_SIZE pages to process in a batch.
Allocate the space once.
Signed-off-by: Olaf Hering <olaf@aepfle.de>
---
tools/libs/guest/xg_sr_common.h | 1 +
tools/libs/guest/xg_sr_save.c | 28 +++++++++++++---------------
2 files changed, 14 insertions(+), 15 deletions(-)
--- a/tools/libs/guest/xg_sr_common.h
+++ b/tools/libs/guest/xg_sr_common.h
@@ -245,6 +245,7 @@ struct xc_sr_context
xen_pfn_t *batch_pfns;
xen_pfn_t *mfns;
+ xen_pfn_t *types;
unsigned int nr_batch_pfns;
unsigned long *deferred_pages;
unsigned long nr_deferred_pages;
--- a/tools/libs/guest/xg_sr_save.c
+++ b/tools/libs/guest/xg_sr_save.c
@@ -88,7 +88,6 @@ static int write_checkpoint_record(struc
static int write_batch(struct xc_sr_context *ctx)
{
xc_interface *xch = ctx->xch;
- xen_pfn_t *types = NULL;
void *guest_mapping = NULL;
void **guest_data = NULL;
void **local_pages = NULL;
@@ -105,8 +104,6 @@ static int write_batch(struct xc_sr_cont
assert(nr_pfns != 0);
- /* Types of the batch pfns. */
- types = malloc(nr_pfns * sizeof(*types));
/* Errors from attempting to map the gfns. */
errors = malloc(nr_pfns * sizeof(*errors));
/* Pointers to page data to send. Mapped gfns or local allocations. */
@@ -116,7 +113,7 @@ static int write_batch(struct xc_sr_cont
/* iovec[] for writev(). */
iov = malloc((nr_pfns + 4) * sizeof(*iov));
- if ( !types || !errors || !guest_data || !local_pages || !iov )
+ if ( !errors || !guest_data || !local_pages || !iov )
{
ERROR("Unable to allocate arrays for a batch of %u pages",
nr_pfns);
@@ -125,7 +122,7 @@ static int write_batch(struct xc_sr_cont
for ( i = 0; i < nr_pfns; ++i )
{
- types[i] = ctx->save.mfns[i] = ctx->save.ops.pfn_to_gfn(ctx,
+ ctx->save.types[i] = ctx->save.mfns[i] = ctx->save.ops.pfn_to_gfn(ctx,
ctx->save.batch_pfns[i]);
/* Likely a ballooned page. */
@@ -136,7 +133,7 @@ static int write_batch(struct xc_sr_cont
}
}
- rc = xc_get_pfn_type_batch(xch, ctx->domid, nr_pfns, types);
+ rc = xc_get_pfn_type_batch(xch, ctx->domid, nr_pfns, ctx->save.types);
if ( rc )
{
PERROR("Failed to get types for pfn batch");
@@ -146,14 +143,14 @@ static int write_batch(struct xc_sr_cont
for ( i = 0; i < nr_pfns; ++i )
{
- if ( !is_known_page_type(types[i]) )
+ if ( !is_known_page_type(ctx->save.types[i]) )
{
ERROR("Unknown type %#"PRIpfn" for pfn %#"PRIpfn,
- types[i], ctx->save.mfns[i]);
+ ctx->save.types[i], ctx->save.mfns[i]);
goto err;
}
- if ( !page_type_has_stream_data(types[i]) )
+ if ( !page_type_has_stream_data(ctx->save.types[i]) )
continue;
ctx->save.mfns[nr_pages++] = ctx->save.mfns[i];
@@ -172,7 +169,7 @@ static int write_batch(struct xc_sr_cont
for ( i = 0, p = 0; i < nr_pfns; ++i )
{
- if ( !page_type_has_stream_data(types[i]) )
+ if ( !page_type_has_stream_data(ctx->save.types[i]) )
continue;
if ( errors[p] )
@@ -183,7 +180,7 @@ static int write_batch(struct xc_sr_cont
}
orig_page = page = guest_mapping + (p * PAGE_SIZE);
- rc = ctx->save.ops.normalise_page(ctx, types[i], &page);
+ rc = ctx->save.ops.normalise_page(ctx, ctx->save.types[i], &page);
if ( orig_page != page )
local_pages[i] = page;
@@ -194,7 +191,7 @@ static int write_batch(struct xc_sr_cont
{
set_bit(ctx->save.batch_pfns[i], ctx->save.deferred_pages);
++ctx->save.nr_deferred_pages;
- types[i] = XEN_DOMCTL_PFINFO_XTAB;
+ ctx->save.types[i] = XEN_DOMCTL_PFINFO_XTAB;
--nr_pages;
}
else
@@ -223,7 +220,7 @@ static int write_batch(struct xc_sr_cont
rec.length += nr_pages * PAGE_SIZE;
for ( i = 0; i < nr_pfns; ++i )
- rec_pfns[i] = ((uint64_t)(types[i]) << 32) | ctx->save.batch_pfns[i];
+ rec_pfns[i] = ((uint64_t)(ctx->save.types[i]) << 32) | ctx->save.batch_pfns[i];
iov[0].iov_base = &rec.type;
iov[0].iov_len = sizeof(rec.type);
@@ -275,7 +272,6 @@ static int write_batch(struct xc_sr_cont
free(local_pages);
free(guest_data);
free(errors);
- free(types);
return rc;
}
@@ -849,9 +845,10 @@ static int setup(struct xc_sr_context *c
ctx->save.batch_pfns = malloc(MAX_BATCH_SIZE *
sizeof(*ctx->save.batch_pfns));
ctx->save.mfns = malloc(MAX_BATCH_SIZE * sizeof(*ctx->save.mfns));
+ ctx->save.types = malloc(MAX_BATCH_SIZE * sizeof(*ctx->save.types));
ctx->save.deferred_pages = bitmap_alloc(ctx->save.p2m_size);
- if ( !ctx->save.batch_pfns || !ctx->save.mfns ||
+ if ( !ctx->save.batch_pfns || !ctx->save.mfns || !ctx->save.types ||
!dirty_bitmap || !ctx->save.deferred_pages )
{
ERROR("Unable to allocate memory for dirty bitmaps, batch pfns and"
@@ -883,6 +880,7 @@ static void cleanup(struct xc_sr_context
xc_hypercall_buffer_free_pages(xch, dirty_bitmap,
NRPAGES(bitmap_size(ctx->save.p2m_size)));
free(ctx->save.deferred_pages);
+ free(ctx->save.types);
free(ctx->save.mfns);
free(ctx->save.batch_pfns);
}

View File

@ -0,0 +1,263 @@
From: Olaf Hering <olaf@aepfle.de>
Date: Thu, 4 Feb 2021 20:33:53 +0100
Subject: libxc sr track migration time
Track live migration state unconditionally in logfiles to see how long a domU was suspended.
Signed-off-by: Olaf Hering <olaf@aepfle.de>
---
tools/include/xentoollog.h | 1 +
tools/libs/ctrl/xc_domain.c | 12 +++++--
tools/libs/ctrl/xc_private.h | 9 +++++
tools/libs/guest/xg_resume.c | 5 ++-
tools/libs/guest/xg_sr_common.c | 59 ++++++++++++++++++++++++++++++++
tools/libs/guest/xg_sr_common.h | 3 ++
tools/libs/guest/xg_sr_restore.c | 3 ++
tools/libs/guest/xg_sr_save.c | 6 +++-
tools/xl/xl.c | 2 ++
9 files changed, 96 insertions(+), 4 deletions(-)
--- a/tools/include/xentoollog.h
+++ b/tools/include/xentoollog.h
@@ -133,6 +133,7 @@ const char *xtl_level_to_string(xentooll
});
+#define XL_NO_SUSEINFO "XL_NO_SUSEINFO"
#endif /* XENTOOLLOG_H */
/*
--- a/tools/libs/ctrl/xc_domain.c
+++ b/tools/libs/ctrl/xc_domain.c
@@ -66,20 +66,28 @@ int xc_domain_cacheflush(xc_interface *x
int xc_domain_pause(xc_interface *xch,
uint32_t domid)
{
+ int ret;
DECLARE_DOMCTL;
domctl.cmd = XEN_DOMCTL_pausedomain;
domctl.domain = domid;
- return do_domctl(xch, &domctl);
+ ret = do_domctl(xch, &domctl);
+ if (getenv(XL_NO_SUSEINFO) == NULL)
+ SUSEINFO("domid %u: %s returned %d", domid, __func__, ret);
+ return ret;
}
int xc_domain_unpause(xc_interface *xch,
uint32_t domid)
{
+ int ret;
DECLARE_DOMCTL;
domctl.cmd = XEN_DOMCTL_unpausedomain;
domctl.domain = domid;
- return do_domctl(xch, &domctl);
+ ret = do_domctl(xch, &domctl);
+ if (getenv(XL_NO_SUSEINFO) == NULL)
+ SUSEINFO("domid %u: %s returned %d", domid, __func__, ret);
+ return ret;
}
--- a/tools/libs/ctrl/xc_private.h
+++ b/tools/libs/ctrl/xc_private.h
@@ -42,6 +42,15 @@
#include <xen-tools/common-macros.h>
+/*
+ * Using loglevel ERROR to make sure the intended informational messages appear
+ * in libvirts libxl-driver.log
+ */
+#define SUSEINFO(_m, _a...) do { int ERROR_errno = errno; \
+ xc_report(xch, xch->error_handler, XTL_ERROR, XC_ERROR_NONE, "SUSEINFO: " _m , ## _a ); \
+ errno = ERROR_errno; \
+ } while (0)
+
#if defined(HAVE_VALGRIND_MEMCHECK_H) && !defined(NDEBUG) && !defined(__MINIOS__)
/* Compile in Valgrind client requests? */
#include <valgrind/memcheck.h>
--- a/tools/libs/guest/xg_resume.c
+++ b/tools/libs/guest/xg_resume.c
@@ -259,7 +259,10 @@ out:
*/
int xc_domain_resume(xc_interface *xch, uint32_t domid, int fast)
{
- return (fast
+ int ret = (fast
? xc_domain_resume_cooperative(xch, domid)
: xc_domain_resume_any(xch, domid));
+ if (getenv(XL_NO_SUSEINFO) == NULL)
+ SUSEINFO("domid %u: %s%s returned %d", domid, __func__, fast ? " fast" : "", ret);
+ return ret;
}
--- a/tools/libs/guest/xg_sr_common.c
+++ b/tools/libs/guest/xg_sr_common.c
@@ -163,6 +163,65 @@ static void __attribute__((unused)) buil
BUILD_BUG_ON(sizeof(struct xc_sr_rec_hvm_params) != 8);
}
+/* Write a two-character hex representation of 'byte' to digits[].
+ Pre-condition: sizeof(digits) >= 2 */
+static void byte_to_hex(char *digits, const uint8_t byte)
+{
+ uint8_t nybbel = byte >> 4;
+
+ if ( nybbel > 9 )
+ digits[0] = 'a' + nybbel-10;
+ else
+ digits[0] = '0' + nybbel;
+
+ nybbel = byte & 0x0f;
+ if ( nybbel > 9 )
+ digits[1] = 'a' + nybbel-10;
+ else
+ digits[1] = '0' + nybbel;
+}
+
+/* Convert an array of 16 unsigned bytes to a DCE/OSF formatted UUID
+ string.
+
+ Pre-condition: sizeof(dest) >= 37 */
+void sr_uuid_to_string(char *dest, const uint8_t *uuid)
+{
+ int i = 0;
+ char *p = dest;
+
+ for (; i < 4; i++ )
+ {
+ byte_to_hex(p, uuid[i]);
+ p += 2;
+ }
+ *p++ = '-';
+ for (; i < 6; i++ )
+ {
+ byte_to_hex(p, uuid[i]);
+ p += 2;
+ }
+ *p++ = '-';
+ for (; i < 8; i++ )
+ {
+ byte_to_hex(p, uuid[i]);
+ p += 2;
+ }
+ *p++ = '-';
+ for (; i < 10; i++ )
+ {
+ byte_to_hex(p, uuid[i]);
+ p += 2;
+ }
+ *p++ = '-';
+ for (; i < 16; i++ )
+ {
+ byte_to_hex(p, uuid[i]);
+ p += 2;
+ }
+ *p = '\0';
+}
+
/*
* Expand the tracking structures as needed.
* To avoid realloc()ing too excessively, the size increased to the nearest
--- a/tools/libs/guest/xg_sr_common.h
+++ b/tools/libs/guest/xg_sr_common.h
@@ -294,6 +294,7 @@ struct xc_sr_context
xc_stream_type_t stream_type;
xc_domaininfo_t dominfo;
+ char uuid[16*2+4+1];
union /* Common save or restore data. */
{
@@ -505,6 +506,8 @@ extern struct xc_sr_save_ops save_ops_x8
extern struct xc_sr_restore_ops restore_ops_x86_pv;
extern struct xc_sr_restore_ops restore_ops_x86_hvm;
+extern void sr_uuid_to_string(char *dest, const uint8_t *uuid);
+
struct xc_sr_record
{
uint32_t type;
--- a/tools/libs/guest/xg_sr_restore.c
+++ b/tools/libs/guest/xg_sr_restore.c
@@ -871,6 +871,8 @@ static int restore(struct xc_sr_context
struct xc_sr_rhdr rhdr;
int rc, saved_rc = 0, saved_errno = 0;
+ SUSEINFO("domid %u: %s %s start", ctx->domid, ctx->uuid, __func__);
+ DPRINTF("domid %u: max_pages %lx tot_pages %lx p2m_size %lx", ctx->domid, ctx->restore.max_pages, ctx->restore.tot_pages, ctx->restore.p2m_size);
IPRINTF("Restoring domain");
rc = setup(ctx);
@@ -946,6 +948,7 @@ static int restore(struct xc_sr_context
PERROR("Restore failed");
done:
+ SUSEINFO("domid %u: %s done", ctx->domid, __func__);
cleanup(ctx);
if ( saved_rc )
@@ -1011,6 +1014,7 @@ int xc_domain_restore(xc_interface *xch,
io_fd, dom, hvm, stream_type);
ctx.domid = dom;
+ sr_uuid_to_string(ctx.uuid, ctx.dominfo.handle);
if ( read_headers(&ctx) )
return -1;
--- a/tools/libs/guest/xg_sr_save.c
+++ b/tools/libs/guest/xg_sr_save.c
@@ -353,7 +353,7 @@ static void show_transfer_rate(struct xc
MiB_sec = (ctx->save.pages_sent * PAGE_SIZE * 1000U) / ms / (1024U*1024U);
errno = 0;
- IPRINTF("%s: %zu bytes + %zu pages in %ld.%09ld sec, %zu MiB/sec", __func__,
+ SUSEINFO("domid %u: %zu bytes + %zu pages in %ld.%09ld sec, %zu MiB/sec", ctx->domid,
ctx->save.overhead_sent, ctx->save.pages_sent,
diff.tv_sec, diff.tv_nsec, MiB_sec);
}
@@ -875,13 +875,16 @@ static int save(struct xc_sr_context *ct
{
xc_interface *xch = ctx->xch;
int rc, saved_rc = 0, saved_errno = 0;
+ unsigned long tot_pages = ctx->dominfo.tot_pages;
+ SUSEINFO("domid %u: %s %s start, %lu pages allocated", ctx->domid, ctx->uuid, __func__, tot_pages);
IPRINTF("Saving domain %d, type %s",
ctx->domid, dhdr_type_to_str(guest_type));
rc = setup(ctx);
if ( rc )
goto err;
+ SUSEINFO("domid %u: p2m_size %lx", ctx->domid, ctx->save.p2m_size);
xc_report_progress_single(xch, "Start of stream");
@@ -995,6 +998,7 @@ static int save(struct xc_sr_context *ct
PERROR("Save failed");
done:
+ SUSEINFO("domid %u: %s done", ctx->domid, __func__);
cleanup(ctx);
if ( saved_rc )
@@ -1054,6 +1058,7 @@ int xc_domain_save(xc_interface *xch, in
io_fd, dom, flags, hvm);
ctx.domid = dom;
+ sr_uuid_to_string(ctx.uuid, ctx.dominfo.handle);
if ( hvm )
{
--- a/tools/xl/xl.c
+++ b/tools/xl/xl.c
@@ -424,6 +424,8 @@ int main(int argc, char **argv)
logger = xtl_createlogger_stdiostream(stderr, minmsglevel, xtl_flags);
if (!logger) exit(EXIT_FAILURE);
+ /* Provide context to libxl and libxc: no SUSEINFO() from xl */
+ setenv(XL_NO_SUSEINFO, "1", 0);
xl_ctx_alloc();
atexit(xl_ctx_free);

View File

@ -0,0 +1,197 @@
From: Olaf Hering <olaf@aepfle.de>
Date: Fri, 5 Feb 2021 20:16:02 +0100
Subject: libxc sr xg_sr_bitmap populated_pfns
tools: use xg_sr_bitmap for populated_pfns
Signed-off-by: Olaf Hering <olaf@aepfle.de>
---
tools/libs/guest/xg_sr_common.h | 20 ++++++-
tools/libs/guest/xg_sr_restore.c | 69 ------------------------
tools/libs/guest/xg_sr_restore_x86_hvm.c | 9 ++++
tools/libs/guest/xg_sr_restore_x86_pv.c | 7 +++
4 files changed, 34 insertions(+), 71 deletions(-)
--- a/tools/libs/guest/xg_sr_common.h
+++ b/tools/libs/guest/xg_sr_common.h
@@ -375,8 +375,7 @@ struct xc_sr_context
uint32_t xenstore_domid, console_domid;
/* Bitmap of currently populated PFNs during restore. */
- unsigned long *populated_pfns;
- xen_pfn_t max_populated_pfn;
+ struct sr_bitmap populated_pfns;
/* Sender has invoked verify mode on the stream. */
bool verify;
@@ -632,6 +631,23 @@ static inline bool page_type_has_stream_
}
}
+static inline bool pfn_is_populated(struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+ return sr_test_bit(pfn, &ctx->restore.populated_pfns);
+}
+
+static inline int pfn_set_populated(struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+ xc_interface *xch = ctx->xch;
+
+ if ( sr_set_bit(pfn, &ctx->restore.populated_pfns) == false )
+ {
+ PERROR("Failed to realloc populated_pfns bitmap");
+ errno = ENOMEM;
+ return -1;
+ }
+ return 0;
+}
#endif
/*
* Local variables:
--- a/tools/libs/guest/xg_sr_restore.c
+++ b/tools/libs/guest/xg_sr_restore.c
@@ -72,64 +72,6 @@ static int read_headers(struct xc_sr_con
}
/*
- * Is a pfn populated?
- */
-static bool pfn_is_populated(const struct xc_sr_context *ctx, xen_pfn_t pfn)
-{
- if ( pfn > ctx->restore.max_populated_pfn )
- return false;
- return test_bit(pfn, ctx->restore.populated_pfns);
-}
-
-/*
- * Set a pfn as populated, expanding the tracking structures if needed. To
- * avoid realloc()ing too excessively, the size increased to the nearest power
- * of two large enough to contain the required pfn.
- */
-static int pfn_set_populated(struct xc_sr_context *ctx, xen_pfn_t pfn)
-{
- xc_interface *xch = ctx->xch;
-
- if ( pfn > ctx->restore.max_populated_pfn )
- {
- xen_pfn_t new_max;
- size_t old_sz, new_sz;
- unsigned long *p;
-
- /* Round up to the nearest power of two larger than pfn, less 1. */
- new_max = pfn;
- new_max |= new_max >> 1;
- new_max |= new_max >> 2;
- new_max |= new_max >> 4;
- new_max |= new_max >> 8;
- new_max |= new_max >> 16;
-#ifdef __x86_64__
- new_max |= new_max >> 32;
-#endif
-
- old_sz = bitmap_size(ctx->restore.max_populated_pfn + 1);
- new_sz = bitmap_size(new_max + 1);
- p = realloc(ctx->restore.populated_pfns, new_sz);
- if ( !p )
- {
- ERROR("Failed to realloc populated bitmap");
- errno = ENOMEM;
- return -1;
- }
-
- memset((uint8_t *)p + old_sz, 0x00, new_sz - old_sz);
-
- ctx->restore.populated_pfns = p;
- ctx->restore.max_populated_pfn = new_max;
- }
-
- assert(!test_bit(pfn, ctx->restore.populated_pfns));
- set_bit(pfn, ctx->restore.populated_pfns);
-
- return 0;
-}
-
-/*
* Given a set of pfns, obtain memory from Xen to fill the physmap for the
* unpopulated subset. If types is NULL, no page type checking is performed
* and all unpopulated pfns are populated.
@@ -911,16 +853,6 @@ static int setup(struct xc_sr_context *c
if ( rc )
goto err;
- ctx->restore.max_populated_pfn = (32 * 1024 / 4) - 1;
- ctx->restore.populated_pfns = bitmap_alloc(
- ctx->restore.max_populated_pfn + 1);
- if ( !ctx->restore.populated_pfns )
- {
- ERROR("Unable to allocate memory for populated_pfns bitmap");
- rc = -1;
- goto err;
- }
-
ctx->restore.pfns = malloc(MAX_BATCH_SIZE * sizeof(*ctx->restore.pfns));
ctx->restore.types = malloc(MAX_BATCH_SIZE * sizeof(*ctx->restore.types));
ctx->restore.mfns = malloc(MAX_BATCH_SIZE * sizeof(*ctx->restore.mfns));
@@ -969,7 +901,6 @@ static void cleanup(struct xc_sr_context
xch, dirty_bitmap, NRPAGES(bitmap_size(ctx->restore.p2m_size)));
free(ctx->restore.buffered_records);
- free(ctx->restore.populated_pfns);
free(ctx->restore.pages);
free(ctx->restore.iov);
free(ctx->restore.guest_data);
--- a/tools/libs/guest/xg_sr_restore_x86_hvm.c
+++ b/tools/libs/guest/xg_sr_restore_x86_hvm.c
@@ -136,6 +136,7 @@ static int x86_hvm_localise_page(struct
static int x86_hvm_setup(struct xc_sr_context *ctx)
{
xc_interface *xch = ctx->xch;
+ unsigned long max_pfn, max_pages = ctx->dominfo.max_pages;
if ( ctx->restore.guest_type != DHDR_TYPE_X86_HVM )
{
@@ -161,6 +162,13 @@ static int x86_hvm_setup(struct xc_sr_co
}
#endif
+ max_pfn = max(ctx->restore.p2m_size, max_pages);
+ if ( !sr_bitmap_expand(&ctx->restore.populated_pfns, max_pfn) )
+ {
+ PERROR("Unable to allocate memory for populated_pfns bitmap");
+ return -1;
+ }
+
return 0;
}
@@ -241,6 +249,7 @@ static int x86_hvm_stream_complete(struc
static int x86_hvm_cleanup(struct xc_sr_context *ctx)
{
+ sr_bitmap_free(&ctx->restore.populated_pfns);
free(ctx->x86.hvm.restore.context.ptr);
free(ctx->x86.restore.cpuid.ptr);
--- a/tools/libs/guest/xg_sr_restore_x86_pv.c
+++ b/tools/libs/guest/xg_sr_restore_x86_pv.c
@@ -1060,6 +1060,12 @@ static int x86_pv_setup(struct xc_sr_con
if ( rc )
return rc;
+ if ( !sr_bitmap_expand(&ctx->restore.populated_pfns, 32 * 1024 / 4) )
+ {
+ PERROR("Unable to allocate memory for populated_pfns bitmap");
+ return -1;
+ }
+
ctx->x86.pv.restore.nr_vcpus = ctx->dominfo.max_vcpu_id + 1;
ctx->x86.pv.restore.vcpus = calloc(sizeof(struct xc_sr_x86_pv_restore_vcpu),
ctx->x86.pv.restore.nr_vcpus);
@@ -1153,6 +1159,7 @@ static int x86_pv_stream_complete(struct
*/
static int x86_pv_cleanup(struct xc_sr_context *ctx)
{
+ sr_bitmap_free(&ctx->restore.populated_pfns);
free(ctx->x86.pv.p2m);
free(ctx->x86.pv.p2m_pfns);

141
libxc-sr-xg_sr_bitmap.patch Normal file
View File

@ -0,0 +1,141 @@
From: Olaf Hering <olaf@aepfle.de>
Date: Fri, 5 Feb 2021 19:50:03 +0100
Subject: libxc sr xg_sr_bitmap
tools: add API for expandable bitmaps
Since the incoming migration stream lacks info about what the highest pfn
will be, some data structures cannot be allocated upfront.
Add an API for expandable bitmaps, loosely based on pfn_set_populated.
Signed-off-by: Olaf Hering <olaf@aepfle.de>
---
tools/libs/guest/xg_sr_common.c | 39 +++++++++++++++++++
tools/libs/guest/xg_sr_common.h | 67 +++++++++++++++++++++++++++++++++
2 files changed, 106 insertions(+)
--- a/tools/libs/guest/xg_sr_common.c
+++ b/tools/libs/guest/xg_sr_common.c
@@ -164,6 +164,45 @@ static void __attribute__((unused)) buil
}
/*
+ * Expand the tracking structures as needed.
+ * To avoid realloc()ing too excessively, the size increased to the nearest
+ * power of two large enough to contain the required number of bits.
+ */
+bool _sr_bitmap_expand(struct sr_bitmap *bm, unsigned long bits)
+{
+ size_t new_max;
+ size_t old_sz, new_sz;
+ void *p;
+
+ if (bits <= bm->bits)
+ return true;
+
+ /* Round up to the nearest power of two larger than bit, less 1. */
+ new_max = bits;
+ new_max |= new_max >> 1;
+ new_max |= new_max >> 2;
+ new_max |= new_max >> 4;
+ new_max |= new_max >> 8;
+ new_max |= new_max >> 16;
+ new_max |= sizeof(unsigned long) > 4 ? new_max >> 32 : 0;
+
+ /* Allocate units of unsigned long */
+ new_max = (new_max + BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
+
+ old_sz = bitmap_size(bm->bits);
+ new_sz = bitmap_size(new_max);
+ p = realloc(bm->p, new_sz);
+ if (!p)
+ return false;
+
+ memset(p + old_sz, 0, new_sz - old_sz);
+ bm->p = p;
+ bm->bits = new_max;
+
+ return true;
+}
+
+/*
* Local variables:
* mode: C
* c-file-style: "BSD"
--- a/tools/libs/guest/xg_sr_common.h
+++ b/tools/libs/guest/xg_sr_common.h
@@ -18,6 +18,73 @@ const char *rec_type_to_str(uint32_t typ
struct xc_sr_context;
struct xc_sr_record;
+struct sr_bitmap
+{
+ void *p;
+ unsigned long bits;
+};
+
+extern bool _sr_bitmap_expand(struct sr_bitmap *bm, unsigned long bits);
+
+static inline bool sr_bitmap_expand(struct sr_bitmap *bm, unsigned long bits)
+{
+ if (bits > bm->bits)
+ return _sr_bitmap_expand(bm, bits);
+ return true;
+}
+
+static inline void sr_bitmap_free(struct sr_bitmap *bm)
+{
+ free(bm->p);
+ bm->p = NULL;
+}
+
+static inline bool sr_set_bit(unsigned long bit, struct sr_bitmap *bm)
+{
+ if (sr_bitmap_expand(bm, bit + 1) == false)
+ return false;
+
+ set_bit(bit, bm->p);
+ return true;
+}
+
+static inline bool sr_test_bit(unsigned long bit, struct sr_bitmap *bm)
+{
+ if (bit + 1 > bm->bits)
+ return false;
+ return !!test_bit(bit, bm->p);
+}
+
+static inline void sr_clear_bit(unsigned long bit, struct sr_bitmap *bm)
+{
+ if (bit + 1 <= bm->bits)
+ clear_bit(bit, bm->p);
+}
+
+static inline bool sr_test_and_clear_bit(unsigned long bit, struct sr_bitmap *bm)
+{
+ if (bit + 1 > bm->bits)
+ return false;
+ return !!test_and_clear_bit(bit, bm->p);
+}
+
+/* No way to report potential allocation error, bitmap must be expanded prior usage */
+static inline bool sr_test_and_set_bit(unsigned long bit, struct sr_bitmap *bm)
+{
+ if (bit + 1 > bm->bits)
+ return false;
+ return !!test_and_set_bit(bit, bm->p);
+}
+
+static inline bool sr_set_long_bit(unsigned long base_bit, struct sr_bitmap *bm)
+{
+ if (sr_bitmap_expand(bm, base_bit + BITS_PER_LONG) == false)
+ return false;
+
+ set_bit_long(base_bit, bm->p);
+ return true;
+}
+
/**
* Save operations. To be implemented for each type of guest, for use by the
* common save algorithm.

View File

@ -0,0 +1,46 @@
From: Olaf Hering <olaf@aepfle.de>
Date: Thu, 29 Oct 2020 17:00:19 +0100
Subject: libxc sr xl migration debug
xl: fix description of migrate --debug
xl migrate --debug used to track every pfn in every batch of pages.
But these times are gone. The code in xc_domain_save is the consumer
of this knob, and it may now enable verification mode.
Signed-off-by: Olaf Hering <olaf@aepfle.de>
v03:
- adjust to describe what --debug would do when the code which
consumes this knob is fixed.
v02:
- the option has no effect anymore
---
docs/man/xl.1.pod.in | 4 +++-
tools/xl/xl_cmdtable.c | 2 +-
2 files changed, 4 insertions(+), 2 deletions(-)
--- a/docs/man/xl.1.pod.in
+++ b/docs/man/xl.1.pod.in
@@ -486,7 +486,9 @@ domain.
=item B<--debug>
-Display huge (!) amount of debug information during the migration process.
+This enables verification mode, which will transfer the entire domU memory
+once more to the receiving host to make sure the content is identical on
+both sides.
=item B<-p>
--- a/tools/xl/xl_cmdtable.c
+++ b/tools/xl/xl_cmdtable.c
@@ -173,7 +173,7 @@ const struct cmd_spec cmd_table[] = {
" migrate-receive [-d -e]\n"
"-e Do not wait in the background (on <host>) for the death\n"
" of the domain.\n"
- "--debug Print huge (!) amount of debug during the migration process.\n"
+ "--debug Enable verification mode.\n"
"-p Do not unpause domain after migrating it.\n"
"-D Preserve the domain id"
},

View File

@ -0,0 +1,318 @@
References: bsc#1120095
A domU with a large amount of disks may run into the hardcoded
LIBXL_HOTPLUG_TIMEOUT limit, which is 40 seconds. This happens if the
preparation for each disk takes an unexpectedly large amount of time. Then
the sum of all configured disks and the individual preparation time will
be larger than 40 seconds. The hotplug script which does the preparation
takes a lock before doing the actual preparation. Since the hotplug
scripts for each disk are spawned at nearly the same time, each one has
to wait for the lock. Due to this contention, the total execution time
of a script can easily exceed the timeout. In this case libxl will
terminate the script because it has to assume an error condition.
Example:
10 configured disks, each one takes 3 seconds within the critical
section. The total execution time will be 30 seconds, which is still
within the limit. With 5 additional configured disks, the total
execution time will be 45 seconds, which would trigger the timeout.
To handle such setup without a recompile of libxl, a special key/value
has to be created in xenstore prior to domain creation. This can be done
either manually, or at system startup.
If this systemd service file is placed in /etc/systemd/system/, and
activated, it will create the required entry in xenstore:
/etc/systemd/system # cat xen-LIBXL_HOTPLUG_TIMEOUT.service
[Unit]
Description=set global LIBXL_HOTPLUG_TIMEOUT
ConditionPathExists=/proc/xen/capabilities
Requires=xenstored.service
After=xenstored.service
Requires=xen-init-dom0.service
After=xen-init-dom0.service
Before=xencommons.service
[Service]
Type=oneshot
RemainAfterExit=true
ExecStartPre=/bin/grep -q control_d /proc/xen/capabilities
ExecStart=/usr/bin/xenstore-write /libxl/suse/per-device-LIBXL_HOTPLUG_TIMEOUT 5
[Install]
WantedBy=multi-user.target
/etc/systemd/system # systemctl enable xen-LIBXL_HOTPLUG_TIMEOUT.service
/etc/systemd/system # systemctl start xen-LIBXL_HOTPLUG_TIMEOUT.service
In this example the per-device value will be set to 5 seconds.
The change for libxl which handles this xenstore value will enable
additional logging if the key is found. That extra logging will show
the execution time of each script.
Index: xen-4.18.0-testing/tools/libs/light/libxl_aoutils.c
===================================================================
--- xen-4.18.0-testing.orig/tools/libs/light/libxl_aoutils.c
+++ xen-4.18.0-testing/tools/libs/light/libxl_aoutils.c
@@ -529,6 +529,8 @@ static void async_exec_timeout(libxl__eg
{
libxl__async_exec_state *aes = CONTAINER_OF(ev, *aes, time);
STATE_AO_GC(aes->ao);
+ char b[64];
+ libxl__suse_diff_timespec(&aes->start, b, sizeof(b));
if (!aes->rc)
aes->rc = rc;
@@ -536,7 +538,7 @@ static void async_exec_timeout(libxl__eg
libxl__ev_time_deregister(gc, &aes->time);
assert(libxl__ev_child_inuse(&aes->child));
- LOG(ERROR, "killing execution of %s because of timeout", aes->what);
+ LOG(ERROR, "killing execution of %s because of timeout%s", aes->what, b);
if (kill(aes->child.pid, SIGKILL)) {
LOGEV(ERROR, errno, "unable to kill %s [%ld]",
@@ -552,6 +554,10 @@ static void async_exec_done(libxl__egc *
{
libxl__async_exec_state *aes = CONTAINER_OF(child, *aes, child);
STATE_AO_GC(aes->ao);
+ char b[64];
+ libxl__suse_diff_timespec(&aes->start, b, sizeof(b));
+ if (b[0])
+ LOG(NOTICE, "finished execution of '%s'%s", aes->what, b);
libxl__ev_time_deregister(gc, &aes->time);
Index: xen-4.18.0-testing/tools/libs/light/libxl_create.c
===================================================================
--- xen-4.18.0-testing.orig/tools/libs/light/libxl_create.c
+++ xen-4.18.0-testing/tools/libs/light/libxl_create.c
@@ -1323,6 +1323,7 @@ static void initiate_domain_create(libxl
* build info around just to know if the domain has a device model or not.
*/
store_libxl_entry(gc, domid, &d_config->b_info);
+ libxl__suse_domain_set_hotplug_timeout(gc, domid, d_config->num_disks, d_config->num_nics);
for (i = 0; i < d_config->num_disks; i++) {
ret = libxl__disk_devtype.set_default(gc, domid, &d_config->disks[i],
Index: xen-4.18.0-testing/tools/libs/light/libxl_device.c
===================================================================
--- xen-4.18.0-testing.orig/tools/libs/light/libxl_device.c
+++ xen-4.18.0-testing/tools/libs/light/libxl_device.c
@@ -1278,7 +1278,7 @@ static void device_hotplug(libxl__egc *e
}
aes->ao = ao;
- aes->what = GCSPRINTF("%s %s", args[0], args[1]);
+ aes->what = GCSPRINTF("%s %s for %s", args[0], args[1], be_path);
aes->env = env;
aes->args = args;
aes->callback = device_hotplug_child_death_cb;
@@ -1287,6 +1287,15 @@ static void device_hotplug(libxl__egc *e
aes->stdfds[1] = 2;
aes->stdfds[2] = -1;
+ switch (aodev->dev->backend_kind) {
+ case LIBXL__DEVICE_KIND_VBD:
+ case LIBXL__DEVICE_KIND_VIF:
+ if (aodev->num_exec == 0)
+ libxl__suse_domain_get_hotplug_timeout(gc, aodev->dev->domid, aodev->dev->backend_kind, &aes->start, &aes->timeout_ms, be_path);
+ default:
+ break;
+ }
+
rc = libxl__async_exec_start(aes);
if (rc)
goto out;
Index: xen-4.18.0-testing/tools/libs/light/libxl_event.c
===================================================================
--- xen-4.18.0-testing.orig/tools/libs/light/libxl_event.c
+++ xen-4.18.0-testing/tools/libs/light/libxl_event.c
@@ -1032,27 +1032,29 @@ static void devstate_callback(libxl__egc
{
EGC_GC;
libxl__ev_devstate *ds = CONTAINER_OF(xsw, *ds, w);
+ char b[64];
+ libxl__suse_diff_timespec(&ds->w.start, b, sizeof(b));
if (rc) {
if (rc == ERROR_TIMEDOUT)
- LOG(DEBUG, "backend %s wanted state %d "" timed out", ds->w.path,
- ds->wanted);
+ LOG(DEBUG, "backend %s wanted state %d "" timed out%s", ds->w.path,
+ ds->wanted, b);
goto out;
}
if (!sstate) {
- LOG(DEBUG, "backend %s wanted state %d"" but it was removed",
- ds->w.path, ds->wanted);
+ LOG(DEBUG, "backend %s wanted state %d"" but it was removed%s",
+ ds->w.path, ds->wanted, b);
rc = ERROR_INVAL;
goto out;
}
int got = atoi(sstate);
if (got == ds->wanted) {
- LOG(DEBUG, "backend %s wanted state %d ok", ds->w.path, ds->wanted);
+ LOG(DEBUG, "backend %s wanted state %d ok%s", ds->w.path, ds->wanted, b);
rc = 0;
} else {
- LOG(DEBUG, "backend %s wanted state %d"" still waiting state %d",
- ds->w.path, ds->wanted, got);
+ LOG(DEBUG, "backend %s wanted state %d"" still waiting state %d%s",
+ ds->w.path, ds->wanted, got, b);
return;
}
@@ -1078,6 +1080,8 @@ int libxl__ev_devstate_wait(libxl__ao *a
ds->w.path = state_path;
ds->w.timeout_ms = milliseconds;
ds->w.callback = devstate_callback;
+ rc = clock_gettime(CLOCK_MONOTONIC, &ds->w.start);
+ if (rc) goto out;
rc = libxl__xswait_start(gc, &ds->w);
if (rc) goto out;
Index: xen-4.18.0-testing/tools/libs/light/libxl_internal.c
===================================================================
--- xen-4.18.0-testing.orig/tools/libs/light/libxl_internal.c
+++ xen-4.18.0-testing/tools/libs/light/libxl_internal.c
@@ -18,6 +18,97 @@
#include "libxl_internal.h"
#include "libxl_arch.h"
+#define LIBXL_SUSE_PATH_TIMEOUT "/libxl/suse/per-device-LIBXL_HOTPLUG_TIMEOUT"
+#define LIBXL_SUSE_PATH_DISK_TIMEOUT "suse/disks-LIBXL_HOTPLUG_TIMEOUT"
+#define LIBXL_SUSE_PATH_NIC_TIMEOUT "suse/nics-LIBXL_HOTPLUG_TIMEOUT"
+
+void libxl__suse_domain_set_hotplug_timeout(libxl__gc *gc, uint32_t domid, long d, long n)
+{
+ char *path;
+ char *val, *p;
+ long v;
+
+ val = libxl__xs_read(gc, XBT_NULL, LIBXL_SUSE_PATH_TIMEOUT);
+ if (!val)
+ return;
+
+ v = strtol(val, NULL, 0);
+ if (v <= 0)
+ return;
+
+ path = libxl__xs_libxl_path(gc, domid);
+ if (d > 0) {
+ p = GCSPRINTF("%s/" LIBXL_SUSE_PATH_DISK_TIMEOUT, path);
+ LOGD(NOTICE, domid, "Setting %s to %ld*%ld=%ld", p, d, v, d*v);
+ libxl__xs_printf(gc, XBT_NULL, p, "%ld", d*v);
+ }
+ if (n > 0) {
+ p = GCSPRINTF("%s/" LIBXL_SUSE_PATH_NIC_TIMEOUT, path);
+ LOGD(NOTICE, domid, "Setting %s to %ld*%ld=%ld", p, n, v, n*v);
+ libxl__xs_printf(gc, XBT_NULL, p, "%ld", n*v);
+ }
+}
+
+void libxl__suse_domain_get_hotplug_timeout(libxl__gc *gc, uint32_t domid, libxl__device_kind kind, struct timespec *ts, int *timeout_ms, const char *be_path)
+{
+ char *path;
+ char *val, *p;
+ long v = 0;
+
+ path = libxl__xs_libxl_path(gc, domid);
+ if (!path)
+ return;
+
+ switch (kind) {
+ case LIBXL__DEVICE_KIND_VBD:
+ p = GCSPRINTF("%s/" LIBXL_SUSE_PATH_DISK_TIMEOUT, path);
+ break;
+ case LIBXL__DEVICE_KIND_VIF:
+ p = GCSPRINTF("%s/" LIBXL_SUSE_PATH_NIC_TIMEOUT, path);
+ break;
+ default:
+ return;
+ }
+ errno = 0;
+ val = libxl__xs_read(gc, XBT_NULL, p);
+ if (val)
+ v = strtol(val, NULL, 0);
+ LOGED(DEBUG, domid, "Got from '%s' = %ld from %s for %s", val?:"", v, p, be_path);
+ if (!val || v <= 0)
+ return;
+
+ if (v > (INT_MAX/1000))
+ v = (INT_MAX/1000);
+ v *= 1000;
+ LOGD(NOTICE, domid, "Replacing timeout %d with %ld for %s", *timeout_ms, v, be_path);
+ *timeout_ms = v;
+ if (clock_gettime(CLOCK_MONOTONIC, ts) < 0) {
+ LOGED(ERROR, domid, "clock_gettime failed for %s", be_path);
+ ts->tv_sec = ts->tv_nsec = 0;
+ }
+
+}
+
+void libxl__suse_diff_timespec(const struct timespec *old, char *b, size_t s)
+{
+ struct timespec new, diff;
+
+ if (old->tv_sec == 0 && old->tv_nsec == 0) {
+ *b = '\0';
+ return;
+ }
+ if (clock_gettime(CLOCK_MONOTONIC, &new))
+ new = *old;
+ if ((new.tv_nsec - old->tv_nsec) < 0) {
+ diff.tv_sec = new.tv_sec - old->tv_sec - 1;
+ diff.tv_nsec = new.tv_nsec - old->tv_nsec + (1000*1000*1000);
+ } else {
+ diff.tv_sec = new.tv_sec - old->tv_sec;
+ diff.tv_nsec = new.tv_nsec - old->tv_nsec;
+ }
+ snprintf(b, s, " (%ld.%09lds)", (long)diff.tv_sec, diff.tv_nsec);
+}
+
void libxl__alloc_failed(libxl_ctx *ctx, const char *func,
size_t nmemb, size_t size) {
#define M "libxl: FATAL ERROR: memory allocation failure"
Index: xen-4.18.0-testing/tools/libs/light/libxl_internal.h
===================================================================
--- xen-4.18.0-testing.orig/tools/libs/light/libxl_internal.h
+++ xen-4.18.0-testing/tools/libs/light/libxl_internal.h
@@ -50,6 +50,7 @@
#include <sys/un.h>
#include <sys/file.h>
#include <sys/ioctl.h>
+#include <time.h>
#include <xenevtchn.h>
#include <xenstore.h>
@@ -1629,6 +1630,7 @@ struct libxl__xswait_state {
const char *what; /* for error msgs: noun phrase, what we're waiting for */
const char *path;
int timeout_ms; /* as for poll(2) */
+ struct timespec start;
libxl__xswait_callback *callback;
/* remaining fields are private to xswait */
libxl__ev_time time_ev;
@@ -2707,6 +2709,7 @@ struct libxl__async_exec_state {
char **args; /* execution arguments */
char **env; /* execution environment */
+ struct timespec start;
/* private */
libxl__ev_time time;
libxl__ev_child child;
@@ -4896,6 +4899,9 @@ _hidden int userlookup_helper_getpwuid(l
#endif
+_hidden void libxl__suse_domain_set_hotplug_timeout(libxl__gc *gc, uint32_t domid, long d, long n);
+_hidden void libxl__suse_domain_get_hotplug_timeout(libxl__gc *gc, uint32_t domid, libxl__device_kind kind, struct timespec *ts, int *timeout_ms, const char *be_path);
+_hidden void libxl__suse_diff_timespec(const struct timespec *old, char *b, size_t s);
/*
* Local variables:
* mode: C

View File

@ -0,0 +1,215 @@
https://bugzilla.novell.com/show_bug.cgi?id=879425
---
tools/libxl/libxl.c | 2 ++
tools/libxl/libxl.h | 12 ++++++++++++
tools/libxl/libxlu_disk.c | 2 ++
tools/libxl/libxlu_disk_i.h | 2 +-
tools/libxl/libxlu_disk_l.l | 1 +
5 files changed, 18 insertions(+), 1 deletion(-)
Index: xen-4.18.0-testing/docs/man/xl-disk-configuration.5.pod.in
===================================================================
--- xen-4.18.0-testing.orig/docs/man/xl-disk-configuration.5.pod.in
+++ xen-4.18.0-testing/docs/man/xl-disk-configuration.5.pod.in
@@ -337,6 +337,32 @@ No
discard
+=item B<suse-diskcache-disable-flush>
+
+=over 4
+
+=item Description
+
+Request that the qemu block driver does not automatically flush written data to the backend storage.
+
+=item Supported values
+
+absent, present
+
+=item Mandatory
+
+No
+
+=item Default value
+
+absent
+
+=back
+
+This enables the '-disk cache=unsafe' mode inside qemu.
+In this mode writes to the underlying block device are delayed.
+While using this option in production is dangerous, it improves performance during installation of a domU.
+
=back
An advisory setting for the backend driver, specifying whether to
Index: xen-4.18.0-testing/tools/include/libxl.h
===================================================================
--- xen-4.18.0-testing.orig/tools/include/libxl.h
+++ xen-4.18.0-testing/tools/include/libxl.h
@@ -584,6 +584,21 @@
*/
#define LIBXL_HAVE_CONSOLE_ADD_XENSTORE 1
/*
+ * The libxl_device_disk has no way to indicate that cache=unsafe is
+ * supposed to be used. Provide this knob without breaking the ABI.
+ * This is done by overloading struct libxl_device_disk->readwrite:
+ * readwrite == 0: disk is readonly, cache= does not matter
+ * readwrite == 1: disk is readwrite, backend driver may tweak cache=
+ * readwrite == MAGIC: disk is readwrite, backend driver should ignore
+ * flush requests from the frontend driver.
+ * Note: the macro with MAGIC is used by libvirt to decide if this patch is applied
+ */
+#define LIBXL_HAVE_LIBXL_DEVICE_DISK_DISABLE_FLUSH_MAGIC 0x00006000U
+#define LIBXL_HAVE_LIBXL_DEVICE_DISK_DISABLE_FLUSH_MASK 0xffff0fffU
+#define LIBXL_SUSE_IS_CACHE_UNSAFE(rw) (((rw) & ~LIBXL_HAVE_LIBXL_DEVICE_DISK_DISABLE_FLUSH_MASK) == LIBXL_HAVE_LIBXL_DEVICE_DISK_DISABLE_FLUSH_MAGIC)
+#define LIBXL_SUSE_SET_CACHE_UNSAFE(rw) (((rw) & LIBXL_HAVE_LIBXL_DEVICE_DISK_DISABLE_FLUSH_MASK) | LIBXL_HAVE_LIBXL_DEVICE_DISK_DISABLE_FLUSH_MAGIC)
+
+/*
* libxl ABI compatibility
*
* The only guarantee which libxl makes regarding ABI compatibility
Index: xen-4.18.0-testing/tools/libs/light/libxl_disk.c
===================================================================
--- xen-4.18.0-testing.orig/tools/libs/light/libxl_disk.c
+++ xen-4.18.0-testing/tools/libs/light/libxl_disk.c
@@ -422,6 +422,8 @@ static void device_disk_add(libxl__egc *
flexarray_append_pair(back, "discard-enable",
libxl_defbool_val(disk->discard_enable) ?
"1" : "0");
+ if (LIBXL_SUSE_IS_CACHE_UNSAFE(disk->readwrite))
+ flexarray_append_pair(back, "suse-diskcache-disable-flush", "1");
flexarray_append(back, "specification");
flexarray_append(back, libxl__device_disk_string_of_specification(disk->specification));
if (disk->specification == LIBXL_DISK_SPECIFICATION_VIRTIO) {
Index: xen-4.18.0-testing/tools/libs/light/libxl_dm.c
===================================================================
--- xen-4.18.0-testing.orig/tools/libs/light/libxl_dm.c
+++ xen-4.18.0-testing/tools/libs/light/libxl_dm.c
@@ -1019,14 +1019,27 @@ enum {
LIBXL__COLO_SECONDARY,
};
+static const char *qemu_cache_mode(const libxl_device_disk *disk)
+{
+ static const char cache_directsync[] = "directsync";
+ static const char cache_writeback[] = "writeback";
+ static const char cache_unsafe[] = "unsafe";
+
+ if (LIBXL_SUSE_IS_CACHE_UNSAFE(disk->readwrite))
+ return cache_unsafe;
+ if (disk->direct_io_safe)
+ return cache_directsync;
+ return cache_writeback;
+}
+
static char *qemu_disk_scsi_drive_string(libxl__gc *gc, const char *target_path,
int unit, const char *format,
const libxl_device_disk *disk,
int colo_mode, const char **id_ptr)
{
char *drive = NULL;
- char *common = GCSPRINTF("if=none,readonly=%s,cache=writeback",
- disk->readwrite ? "off" : "on");
+ char *common = GCSPRINTF("if=none,readonly=%s,cache=%s",
+ disk->readwrite ? "off" : "on", qemu_cache_mode(disk));
const char *exportname = disk->colo_export;
const char *active_disk = disk->active_disk;
const char *hidden_disk = disk->hidden_disk;
@@ -1085,8 +1098,8 @@ static char *qemu_disk_ide_drive_string(
switch (colo_mode) {
case LIBXL__COLO_NONE:
drive = GCSPRINTF
- ("file=%s,if=ide,index=%d,media=disk,format=%s,cache=writeback",
- target_path, unit, format);
+ ("file=%s,if=ide,index=%d,media=disk,format=%s,cache=%s",
+ target_path, unit, format, qemu_cache_mode(disk));
break;
case LIBXL__COLO_PRIMARY:
/*
@@ -1099,13 +1112,14 @@ static char *qemu_disk_ide_drive_string(
* vote-threshold=1
*/
drive = GCSPRINTF(
- "if=ide,index=%d,media=disk,cache=writeback,driver=quorum,"
+ "if=ide,index=%d,media=disk,cache=%s,driver=quorum,"
"id=%s,"
"children.0.file.filename=%s,"
"children.0.driver=%s,"
"read-pattern=fifo,"
"vote-threshold=1",
- unit, exportname, target_path, format);
+ unit, qemu_cache_mode(disk),
+ exportname, target_path, format);
break;
case LIBXL__COLO_SECONDARY:
/*
@@ -1119,7 +1133,7 @@ static char *qemu_disk_ide_drive_string(
* file.backing.backing=exportname,
*/
drive = GCSPRINTF(
- "if=ide,index=%d,id=top-colo,media=disk,cache=writeback,"
+ "if=ide,index=%d,id=top-colo,media=disk,cache=%s,"
"driver=replication,"
"mode=secondary,"
"top-id=top-colo,"
@@ -1128,7 +1142,8 @@ static char *qemu_disk_ide_drive_string(
"file.backing.driver=qcow2,"
"file.backing.file.filename=%s,"
"file.backing.backing=%s",
- unit, active_disk, hidden_disk, exportname);
+ unit, qemu_cache_mode(disk),
+ active_disk, hidden_disk, exportname);
break;
default:
abort();
@@ -1998,8 +2013,8 @@ static int libxl__build_device_model_arg
return ERROR_INVAL;
}
flexarray_vappend(dm_args, "-drive",
- GCSPRINTF("file=%s,if=none,id=ahcidisk-%d,format=%s,cache=writeback",
- target_path, disk, format),
+ GCSPRINTF("file=%s,if=none,id=ahcidisk-%d,format=%s,cache=%s",
+ target_path, disk, format, qemu_cache_mode(&disks[i])),
"-device", GCSPRINTF("ide-hd,bus=ahci0.%d,unit=0,drive=ahcidisk-%d",
disk, disk), NULL);
continue;
Index: xen-4.18.0-testing/tools/libs/util/libxlu_disk.c
===================================================================
--- xen-4.18.0-testing.orig/tools/libs/util/libxlu_disk.c
+++ xen-4.18.0-testing/tools/libs/util/libxlu_disk.c
@@ -78,6 +78,8 @@ int xlu_disk_parse(XLU_Config *cfg,
if (!disk->pdev_path || !strcmp(disk->pdev_path, ""))
disk->format = LIBXL_DISK_FORMAT_EMPTY;
}
+ if (disk->readwrite && dpc.suse_diskcache_disable_flush)
+ disk->readwrite = LIBXL_SUSE_SET_CACHE_UNSAFE(disk->readwrite);
if (!disk->vdev) {
xlu__disk_err(&dpc,0, "no vdev specified");
Index: xen-4.18.0-testing/tools/libs/util/libxlu_disk_i.h
===================================================================
--- xen-4.18.0-testing.orig/tools/libs/util/libxlu_disk_i.h
+++ xen-4.18.0-testing/tools/libs/util/libxlu_disk_i.h
@@ -10,7 +10,7 @@ typedef struct {
void *scanner;
YY_BUFFER_STATE buf;
libxl_device_disk *disk;
- int access_set, had_depr_prefix;
+ int access_set, suse_diskcache_disable_flush, had_depr_prefix;
const char *spec;
} DiskParseContext;
Index: xen-4.18.0-testing/tools/libs/util/libxlu_disk_l.l
===================================================================
--- xen-4.18.0-testing.orig/tools/libs/util/libxlu_disk_l.l
+++ xen-4.18.0-testing/tools/libs/util/libxlu_disk_l.l
@@ -216,6 +216,7 @@ colo-port=[^,]*,? { STRIP(','); setcolop
colo-export=[^,]*,? { STRIP(','); SAVESTRING("colo-export", colo_export, FROMEQUALS); }
active-disk=[^,]*,? { STRIP(','); SAVESTRING("active-disk", active_disk, FROMEQUALS); }
hidden-disk=[^,]*,? { STRIP(','); SAVESTRING("hidden-disk", hidden_disk, FROMEQUALS); }
+suse-diskcache-disable-flush,? { DPC->suse_diskcache_disable_flush = 1; }
trusted,? { libxl_defbool_set(&DPC->disk->trusted, true); }
untrusted,? { libxl_defbool_set(&DPC->disk->trusted, false); }

View File

@ -0,0 +1,51 @@
From fb0f946726ff8aaa15b76bc3ec3b18878851a447 Mon Sep 17 00:00:00 2001
From: Olaf Hering <olaf@aepfle.de>
Date: Fri, 27 Sep 2019 18:06:12 +0200
Subject: libxl: fix crash in helper_done due to uninitialized data
A crash in helper_done, called from libxl_domain_suspend, was reported,
triggered by 'virsh migrate --live xen+ssh://host':
#1 helper_done (...) at libxl_save_callout.c:371
helper_failed
helper_stop
libxl__save_helper_abort
#2 check_all_finished (..., rc=-3) at libxl_stream_write.c:671
stream_done
stream_complete
write_done
dc->callback == write_done
efd->func == datacopier_writable
#3 afterpoll_internal (...) at libxl_event.c:1269
This is triggered by a failed poll, the actual error was:
libxl_aoutils.c:328:datacopier_writable: unexpected poll event 0x1c on fd 37 (should be POLLOUT) writing libxc header during copy of save v2 stream
In this case revents in datacopier_writable is POLLHUP|POLLERR|POLLOUT,
which triggers datacopier_callback. In helper_done,
shs->completion_callback is still zero. libxl__xc_domain_save fills
dss.sws.shs. But that function is only called after stream_header_done.
Any error before that will leave dss partly uninitialized.
Fix this crash by checking if ->completion_callback is valid.
Signed-off-by: Olaf Hering <olaf@aepfle.de>
---
tools/libxl/libxl_save_callout.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
--- a/tools/libs/light/libxl_save_callout.c
+++ b/tools/libs/light/libxl_save_callout.c
@@ -364,8 +364,9 @@ static void helper_done(libxl__egc *egc,
assert(!libxl__save_helper_inuse(shs));
shs->egc = egc;
- shs->completion_callback(egc, shs->caller_state,
- shs->rc, shs->retval, shs->errnoval);
+ if (shs->completion_callback)
+ shs->completion_callback(egc, shs->caller_state,
+ shs->rc, shs->retval, shs->errnoval);
shs->egc = 0;
}

View File

@ -0,0 +1,23 @@
References: bsc#1167608
unbound limits for max_event_channels
1023 is too low for a three digit value of vcpus
it is difficult to make the value depend on the number of vcpus
adding devices at runtime also needs event channels
But, having an unbound value (of 128k) may have a negative effect on XSA-344.
Therefore, just let the built-in default depend on the number of vcpus.
Index: xen-4.17.0-testing/tools/libs/light/libxl_create.c
===================================================================
--- xen-4.17.0-testing.orig/tools/libs/light/libxl_create.c
+++ xen-4.17.0-testing/tools/libs/light/libxl_create.c
@@ -263,7 +263,7 @@ int libxl__domain_build_info_setdefault(
b_info->iomem[i].gfn = b_info->iomem[i].start;
if (!b_info->event_channels)
- b_info->event_channels = 1023;
+ b_info->event_channels = max(1023, 8 * b_info->max_vcpus + 511);
rc = libxl_get_physinfo(CTX, &info);
if (rc) {

19
logrotate.conf Normal file
View File

@ -0,0 +1,19 @@
/var/log/xen/xen-hotplug.log {
compress
missingok
notifempty
rotate 2
size 100k
copytruncate
}
/var/log/xen/xl-*.log /var/log/xen/qemu-dm-*.log /var/log/xen/console/*.log {
compress
missingok
notifempty
rotate 4
dateext
dateformat -%Y%m%d-%H%M
size 2M
copytruncate
}

View File

@ -0,0 +1,63 @@
Index: xen-4.18.0-testing/tools/python/xen/migration/legacy.py
===================================================================
--- xen-4.18.0-testing.orig/tools/python/xen/migration/legacy.py
+++ xen-4.18.0-testing/tools/python/xen/migration/legacy.py
@@ -1,3 +1,4 @@
+#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Index: xen-4.18.0-testing/tools/python/xen/migration/libxc.py
===================================================================
--- xen-4.18.0-testing.orig/tools/python/xen/migration/libxc.py
+++ xen-4.18.0-testing/tools/python/xen/migration/libxc.py
@@ -1,3 +1,4 @@
+#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Index: xen-4.18.0-testing/tools/python/xen/migration/libxl.py
===================================================================
--- xen-4.18.0-testing.orig/tools/python/xen/migration/libxl.py
+++ xen-4.18.0-testing/tools/python/xen/migration/libxl.py
@@ -1,3 +1,4 @@
+#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Index: xen-4.18.0-testing/tools/python/xen/migration/public.py
===================================================================
--- xen-4.18.0-testing.orig/tools/python/xen/migration/public.py
+++ xen-4.18.0-testing/tools/python/xen/migration/public.py
@@ -1,3 +1,4 @@
+#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Index: xen-4.18.0-testing/tools/python/xen/migration/tests.py
===================================================================
--- xen-4.18.0-testing.orig/tools/python/xen/migration/tests.py
+++ xen-4.18.0-testing/tools/python/xen/migration/tests.py
@@ -1,3 +1,4 @@
+#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Index: xen-4.18.0-testing/tools/python/xen/migration/verify.py
===================================================================
--- xen-4.18.0-testing.orig/tools/python/xen/migration/verify.py
+++ xen-4.18.0-testing/tools/python/xen/migration/verify.py
@@ -1,3 +1,4 @@
+#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Index: xen-4.18.0-testing/tools/python/xen/migration/xl.py
===================================================================
--- xen-4.18.0-testing.orig/tools/python/xen/migration/xl.py
+++ xen-4.18.0-testing/tools/python/xen/migration/xl.py
@@ -1,3 +1,4 @@
+#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""

BIN
mini-os.tar.bz2 (Stored with Git LFS) Normal file

Binary file not shown.

View File

@ -0,0 +1,63 @@
Index: xen-4.17.2-testing/tools/pygrub/src/pygrub
===================================================================
--- xen-4.17.2-testing.orig/tools/pygrub/src/pygrub
+++ xen-4.17.2-testing/tools/pygrub/src/pygrub
@@ -579,7 +579,7 @@ class Grub:
self.cf.filename = f
break
if self.__dict__.get('cf', None) is None:
- raise RuntimeError("couldn't find bootloader config file in the image provided.")
+ return
f = fs.open_file(self.cf.filename)
# limit read size to avoid pathological cases
buf = f.read(FS_READ_MAX)
@@ -754,6 +754,20 @@ def run_grub(file, entry, fs, cfg_args):
g = Grub(file, fs)
+ # If missing config or grub has no menu entries to select, look for
+ # vmlinuz-xen and initrd-xen in /boot
+ if g.__dict__.get('cf', None) is None or len(g.cf.images) == 0 or re.search(r"xen(-pae)?\.gz",g.cf.images[0].kernel[1]):
+ if not list_entries:
+ chosencfg = { "kernel": None, "ramdisk": None, "args": "" }
+ chosencfg = sniff_xen_kernel(fs, incfg)
+ if chosencfg["kernel"] and chosencfg["ramdisk"]:
+ chosencfg["args"] = cfg_args
+ return chosencfg
+ if g.__dict__.get('cf', None) is None:
+ raise RuntimeError("couldn't find bootloader config file in the image provided.")
+ else:
+ return
+
if list_entries:
for i in range(len(g.cf.images)):
img = g.cf.images[i]
@@ -840,6 +854,19 @@ def sniff_netware(fs, cfg):
return cfg
+def sniff_xen_kernel(fs, cfg):
+ if not cfg["kernel"]:
+ if fs.file_exists('/boot/vmlinuz-xen'):
+ cfg["kernel"] = '/boot/vmlinuz-xen'
+ elif fs.file_exists('/boot/vmlinuz-xenpae'):
+ cfg["kernel"] = '/boot/vmlinuz-xenpae'
+ if cfg["kernel"] and not cfg["ramdisk"]:
+ if fs.file_exists('/boot/initrd-xen'):
+ cfg["ramdisk"] = '/boot/initrd-xen'
+ elif fs.file_exists('/boot/initrd-xenpae'):
+ cfg["ramdisk"] = '/boot/initrd-xenpae'
+ return cfg
+
def format_sxp(kernel, ramdisk, args):
s = "linux (kernel %s)" % repr(kernel)
if ramdisk:
@@ -918,7 +945,7 @@ if __name__ == "__main__":
debug = False
not_really = False
output_format = "sxp"
- output_directory = "/var/run/xen/pygrub/"
+ output_directory = "/var/run/xen"
uid = None
# what was passed in

View File

@ -0,0 +1,59 @@
References: bsc#978413
The parsing code can't handle a single line menu entry.
For example: menuentry 'halt' { halt }
Force it to fall through where it will handle the closing brace.
Also change warning to debug to cut down on verbose output.
Index: xen-4.18.0-testing/tools/pygrub/src/GrubConf.py
===================================================================
--- xen-4.18.0-testing.orig/tools/pygrub/src/GrubConf.py
+++ xen-4.18.0-testing/tools/pygrub/src/GrubConf.py
@@ -150,7 +150,7 @@ class GrubImage(_GrubImage):
else:
logging.info("Ignored image directive %s" %(com,))
else:
- logging.warning("Unknown image directive %s" %(com,))
+ logging.debug("Unknown image directive %s" %(com,))
# now put the line in the list of lines
if replace is None:
@@ -309,7 +309,7 @@ class GrubConfigFile(_GrubConfigFile):
else:
logging.info("Ignored directive %s" %(com,))
else:
- logging.warning("Unknown directive %s" %(com,))
+ logging.debug("Unknown directive %s" %(com,))
if img:
self.add_image(GrubImage(title, img))
@@ -343,7 +343,7 @@ class Grub2Image(_GrubImage):
elif com.startswith('set:'):
pass
else:
- logging.warning("Unknown image directive %s" %(com,))
+ logging.debug("Unknown image directive %s" %(com,))
# now put the line in the list of lines
if replace is None:
@@ -408,7 +408,10 @@ class Grub2ConfigFile(_GrubConfigFile):
raise RuntimeError("syntax error: cannot nest menuentry (%d %s)" % (len(img),img))
img = []
title = title_match.group(1)
- continue
+ if not l.endswith('}'):
+ continue
+ # One line menuentry, Ex. menuentry 'halt' { halt }
+ l = '}'
if l.startswith("submenu"):
menu_level += 1
@@ -447,7 +450,7 @@ class Grub2ConfigFile(_GrubConfigFile):
elif com.startswith('set:'):
pass
else:
- logging.warning("Unknown directive %s" %(com,))
+ logging.debug("Unknown directive %s" %(com,))
if img is not None:
raise RuntimeError("syntax error: end of file with open menuentry(%d %s)" % (len(img),img))

View File

@ -0,0 +1,151 @@
From 5e1e18fde92bae1ae87f78d470e80b1ffc9350d1 Mon Sep 17 00:00:00 2001
From: Michal Kubecek <mkubecek@suse.cz>
Date: Wed, 26 Jul 2017 10:28:54 +0200
Subject: [PATCH] replace obsolete network configuration commands in scripts
Some scripts still use obsolete network configuration commands ifconfig and
brctl. Replace them with commands from the iproute2 package.
---
README | 3 +--
tools/hotplug/Linux/colo-proxy-setup | 14 ++++++--------
tools/hotplug/Linux/remus-netbuf-setup | 3 ++-
tools/hotplug/Linux/vif-bridge | 7 ++++---
tools/hotplug/Linux/vif-nat | 2 +-
tools/hotplug/Linux/vif-route | 6 ++++--
tools/hotplug/Linux/xen-network-common.sh | 6 ++----
.../i386-dm/qemu-ifup-Linux | 5 +++--
9 files changed, 26 insertions(+), 26 deletions(-)
Index: xen-4.17.0-testing/README
===================================================================
--- xen-4.17.0-testing.orig/README
+++ xen-4.17.0-testing/README
@@ -61,8 +61,7 @@ provided by your OS distributor:
* Development install of GLib v2.0 (e.g. libglib2.0-dev)
* Development install of Pixman (e.g. libpixman-1-dev)
* pkg-config
- * bridge-utils package (/sbin/brctl)
- * iproute package (/sbin/ip)
+ * iproute package (/sbin/ip, /sbin/bridge)
* GNU bison and GNU flex
* ACPI ASL compiler (iasl)
Index: xen-4.17.0-testing/tools/hotplug/Linux/remus-netbuf-setup
===================================================================
--- xen-4.17.0-testing.orig/tools/hotplug/Linux/remus-netbuf-setup
+++ xen-4.17.0-testing/tools/hotplug/Linux/remus-netbuf-setup
@@ -76,6 +76,7 @@
#specific setup code such as renaming.
dir=$(dirname "$0")
. "$dir/xen-hotplug-common.sh"
+. "$dir/xen-network-common.sh"
findCommand "$@"
@@ -139,8 +140,16 @@ check_ifb() {
setup_ifb() {
- for ifb in `ifconfig -a -s|egrep ^ifb|cut -d ' ' -f1`
+ if [ "$legacy_tools" ]; then
+ ifbs=`ifconfig -a -s|egrep ^ifb|cut -d ' ' -f1`
+ else
+ ifbs=$(ip --oneline link show type ifb | cut -d ' ' -f2)
+ fi
+ for ifb in $ifbs
do
+ if [ ! "$legacy_tools" ]; then
+ ifb="${ifb%:}"
+ fi
check_ifb "$ifb" || continue
REMUS_IFB="$ifb"
break
Index: xen-4.17.0-testing/tools/hotplug/Linux/vif-bridge
===================================================================
--- xen-4.17.0-testing.orig/tools/hotplug/Linux/vif-bridge
+++ xen-4.17.0-testing/tools/hotplug/Linux/vif-bridge
@@ -42,7 +42,8 @@ if [ -z "$bridge" ]; then
if which brctl >&/dev/null; then
bridge=$(brctl show | awk 'NR==2{print$1}')
else
- bridge=$(bridge link | cut -d" " -f7)
+ bridge=$(ip --oneline link show type bridge | awk '(NR == 1) { print $2; }')
+ bridge="${bridge%:}"
fi
if [ -z "$bridge" ]
then
Index: xen-4.17.0-testing/tools/hotplug/Linux/vif-nat
===================================================================
--- xen-4.17.0-testing.orig/tools/hotplug/Linux/vif-nat
+++ xen-4.17.0-testing/tools/hotplug/Linux/vif-nat
@@ -172,7 +172,11 @@ case "$command" in
;;
offline)
[ "$dhcp" != 'no' ] && dhcp_down
- do_without_error ifconfig "${dev}" down
+ if [ "$legacy_tools" ]; then
+ do_without_error ifconfig "${dev}" down
+ else
+ do_without_error ip link set "${dev}" down
+ fi
;;
esac
Index: xen-4.17.0-testing/tools/hotplug/Linux/vif-route
===================================================================
--- xen-4.17.0-testing.orig/tools/hotplug/Linux/vif-route
+++ xen-4.17.0-testing/tools/hotplug/Linux/vif-route
@@ -23,13 +23,23 @@ main_ip=$(dom0_ip)
case "${command}" in
add|online)
- ifconfig ${dev} ${main_ip} netmask 255.255.255.255 up
+ if [ "$legacy_tools" ]; then
+ ifconfig ${dev} ${main_ip} netmask 255.255.255.255 up
+ else
+ ip addr add "${main_ip}/32" dev "$dev"
+ fi
+ ip link set "$dev" up
echo 1 >/proc/sys/net/ipv4/conf/${dev}/proxy_arp
ipcmd='add'
cmdprefix=''
;;
remove|offline)
- do_without_error ifdown ${dev}
+ if [ "$legacy_tools" ]; then
+ do_without_error ifdown ${dev}
+ else
+ do_without_error ip addr flush dev "$dev"
+ do_without_error ip link set "$dev" down
+ fi
ipcmd='del'
cmdprefix='do_without_error'
;;
Index: xen-4.17.0-testing/tools/hotplug/Linux/xen-network-common.sh
===================================================================
--- xen-4.17.0-testing.orig/tools/hotplug/Linux/xen-network-common.sh
+++ xen-4.17.0-testing/tools/hotplug/Linux/xen-network-common.sh
@@ -15,6 +15,12 @@
#
+# Use brctl and ifconfig on older systems
+legacy_tools=
+if [ -f /sbin/brctl -a -f /sbin/ifconfig ]; then
+ legacy_tools="true"
+fi
+
# Gentoo doesn't have ifup/ifdown, so we define appropriate alternatives.
# Other platforms just use ifup / ifdown directly.
@@ -152,8 +158,10 @@ remove_from_bridge () {
log debug "removing $dev from bridge $bridge"
if which brctl >&/dev/null; then
do_without_error brctl delif ${bridge} ${dev}
+ do_without_error ifconfig "$dev" down
else
do_without_error ip link set ${dev} nomaster
+ do_without_error ip link set "$dev" down
fi
else
log debug "$dev not on bridge $bridge"

16
stdvga-cache.patch Normal file
View File

@ -0,0 +1,16 @@
Index: xen-4.18.0-testing/xen/arch/x86/hvm/stdvga.c
===================================================================
--- xen-4.18.0-testing.orig/xen/arch/x86/hvm/stdvga.c
+++ xen-4.18.0-testing/xen/arch/x86/hvm/stdvga.c
@@ -165,7 +165,10 @@ static int stdvga_outb(uint64_t addr, ui
/* When in standard vga mode, emulate here all writes to the vram buffer
* so we can immediately satisfy reads without waiting for qemu. */
- s->stdvga = (s->sr[7] == 0x00);
+ s->stdvga =
+ (s->sr[7] == 0x00) && /* standard vga mode */
+ (s->gr[6] == 0x05); /* misc graphics register w/ MemoryMapSelect=1
+ * 0xa0000-0xaffff (64k region), AlphaDis=1 */
if ( !prev_stdvga && s->stdvga )
{

BIN
stubdom.tar.bz2 (Stored with Git LFS) Normal file

Binary file not shown.

View File

@ -0,0 +1,20 @@
xendomains: remove libvirtd conflict
Conflicting with libvirtd is fine for upstream, where xl/libxl is king.
But down the SUSE stream, we promote libvirt and all the libvirt-based
tools. If a user installs libvirt on their SUSE Xen host, then libvirt
should be king and override xendomains.
bsc#1015348
Index: xen-4.8.0-testing/tools/hotplug/Linux/systemd/xendomains.service.in
===================================================================
--- xen-4.8.0-testing.orig/tools/hotplug/Linux/systemd/xendomains.service.in
+++ xen-4.8.0-testing/tools/hotplug/Linux/systemd/xendomains.service.in
@@ -5,7 +5,6 @@ After=proc-xen.mount xenstored.service x
After=network-online.target
After=remote-fs.target
ConditionPathExists=/proc/xen/capabilities
-Conflicts=libvirtd.service
[Service]
Type=oneshot

78
suspend_evtchn_lock.patch Normal file
View File

@ -0,0 +1,78 @@
Fix the problem that the suspend event channel lock file may become obsolete,
e.g. after a segmentation fault or other abnormal exit; once such an obsolete
lock file exists, it can affect later save operations.
This was discussed with upstream but, for some reason, not accepted.
http://xen.1045712.n5.nabble.com/Re-PATCH-improve-suspend-evtchn-lock-processing-td3395229.html
Signed-off-by: Chunyan Liu <cyliu@suse.com>
Index: xen-4.10.0-testing/tools/libs/guest/xg_suspend.c
===================================================================
--- xen-4.10.0-testing.orig/tools/libs/guest/xg_suspend.c
+++ xen-4.10.0-testing/tools/libs/guest/xg_suspend.c
@@ -20,6 +20,10 @@
#include "xc_private.h"
#include "xenguest.h"
+#include <signal.h>
+#ifdef __MINIOS__
+extern int kill (__pid_t __pid, int __sig);
+#endif
#define SUSPEND_LOCK_FILE XEN_RUN_DIR "/suspend-evtchn-%d.lock"
@@ -35,6 +39,37 @@
#define SUSPEND_FILE_BUFLEN (sizeof(SUSPEND_LOCK_FILE) + 10)
+/* Remove a stale suspend lock file left behind by an owner that exited
+ * abnormally (e.g. crashed), so the current process can take the lock. */
+static void clean_obsolete_lock(int domid)
+{
+    int fd, pid = 0;
+    ssize_t n;
+    char buf[128];
+    char suspend_file[256];
+
+    /* Same name as get_suspend_file(): SUSPEND_LOCK_FILE is the format. */
+    snprintf(suspend_file, sizeof(suspend_file), SUSPEND_LOCK_FILE, domid);
+    fd = open(suspend_file, O_RDWR);
+    if (fd < 0)
+        return;
+
+    n = read(fd, buf, sizeof(buf) - 1);
+    close(fd);
+    if (n <= 0)
+        return;
+
+    /* read() does not NUL-terminate; sscanf() needs a C string. */
+    buf[n] = '\0';
+    if (sscanf(buf, "%d", &pid) != 1 || pid <= 0)
+        return;
+
+    /* Signal 0 only probes for existence: a failure means the recorded
+     * owner is gone, so the lock file is obsolete and can be deleted. */
+    if ( kill(pid, 0) )
+        unlink(suspend_file);
+}
+
static void get_suspend_file(char buf[], uint32_t domid)
{
snprintf(buf, SUSPEND_FILE_BUFLEN, SUSPEND_LOCK_FILE, domid);
@@ -48,6 +83,7 @@ static int lock_suspend_event(xc_interfa
struct flock fl;
get_suspend_file(suspend_file, domid);
+ clean_obsolete_lock(domid);
*lockfd = -1;
@@ -97,6 +133,8 @@ static int lock_suspend_event(xc_interfa
if (fd >= 0)
close(fd);
+ unlink(suspend_file);
+
return -1;
}

9
sysconfig.pciback Normal file
View File

@ -0,0 +1,9 @@
## Path: System/Virtualization
## Type: string
## Default: ""
#
# Space delimited list of PCI devices to late bind to pciback
# Format: <driver>,<PCI ID>
#
#XEN_PCI_HIDE_LIST="e1000,0000:0b:00.0 e1000,0000:0b:00.1"
XEN_PCI_HIDE_LIST=""

View File

@ -0,0 +1,13 @@
Index: xen-4.15.0-testing/tools/hotplug/Linux/vif-bridge
===================================================================
--- xen-4.15.0-testing.orig/tools/hotplug/Linux/vif-bridge
+++ xen-4.15.0-testing/tools/hotplug/Linux/vif-bridge
@@ -87,7 +87,7 @@ case "$command" in
;;
esac
-handle_iptable
+#handle_iptable
call_hooks vif post

30
vif-bridge-tap-fix.patch Normal file
View File

@ -0,0 +1,30 @@
# HG changeset patch
# User Jim Fehlig <jfehlig@suse.com>
# Date 1319581952 21600
# Node ID 74da2a3a1db1476d627f42e4a99e9e720cc6774d
# Parent 6c583d35d76dda2236c81d9437ff9d57ab02c006
Prevent vif-bridge from adding user-created tap interfaces to a bridge
Exit vif-bridge script if there is no device info in xenstore, preventing
it from adding user-created taps to bridges.
Signed-off-by: Jim Fehlig <jfehlig@suse.com>
Index: xen-4.5.0-testing/tools/hotplug/Linux/vif-bridge
===================================================================
--- xen-4.5.0-testing.orig/tools/hotplug/Linux/vif-bridge
+++ xen-4.5.0-testing/tools/hotplug/Linux/vif-bridge
@@ -28,6 +28,13 @@
dir=$(dirname "$0")
. "$dir/vif-common.sh"
+mac=$(xenstore_read_default "$XENBUS_PATH/mac" "")
+if [ -z "$mac" ]
+then
+ log debug "No device details in $XENBUS_PATH, exiting."
+ exit 0
+fi
+
bridge=${bridge:-}
bridge=$(xenstore_read_default "$XENBUS_PATH/bridge" "$bridge")

25
vif-route.patch Normal file
View File

@ -0,0 +1,25 @@
References: bsc#985503
Index: xen-4.15.1-testing/tools/hotplug/Linux/vif-route
===================================================================
--- xen-4.15.1-testing.orig/tools/hotplug/Linux/vif-route
+++ xen-4.15.1-testing/tools/hotplug/Linux/vif-route
@@ -57,11 +57,13 @@ case "${type_if}" in
;;
esac
-# If we've been given a list of IP addresses, then add routes from dom0 to
-# the guest using those addresses.
-for addr in ${ip} ; do
- ${cmdprefix} ip route ${ipcmd} ${addr} dev ${dev} src ${main_ip} metric ${metric}
-done
+if [ "${ip}" ] && [ "${ipcmd}" ] ; then
+ # If we've been given a list of IP addresses, then add routes from dom0 to
+ # the guest using those addresses.
+ for addr in ${ip} ; do
+ ${cmdprefix} ip route ${ipcmd} ${addr} dev ${dev} src ${main_ip} metric ${metric}
+ done
+fi
handle_iptable

View File

@ -0,0 +1,16 @@
Change default IO-APIC ack mode for single IO-APIC systems to old-style.
--- a/xen/arch/x86/io_apic.c
+++ b/xen/arch/x86/io_apic.c
@@ -2074,7 +2074,10 @@ void __init setup_IO_APIC(void)
io_apic_irqs = ~PIC_IRQS;
printk("ENABLING IO-APIC IRQs\n");
- printk(" -> Using %s ACK method\n", ioapic_ack_new ? "new" : "old");
+ if (!directed_eoi_enabled && !ioapic_ack_forced) {
+ ioapic_ack_new = (nr_ioapics > 1);
+ printk(" -> Using %s ACK method\n", ioapic_ack_new ? "new" : "old");
+ }
if (ioapic_ack_new) {
ioapic_level_type.ack = irq_complete_move;

BIN
xen-4.18.0-testing-src.tar.bz2 (Stored with Git LFS) Normal file

Binary file not shown.

View File

@ -0,0 +1,13 @@
Index: xen-4.18.0-testing/xen/arch/Kconfig
===================================================================
--- xen-4.18.0-testing.orig/xen/arch/Kconfig
+++ xen-4.18.0-testing/xen/arch/Kconfig
@@ -7,7 +7,7 @@ config PHYS_ADDR_T_32
config NR_CPUS
int "Maximum number of CPUs"
range 1 4095
- default "256" if X86
+ default "1024" if X86
default "8" if ARM && RCAR3
default "4" if ARM && QEMU
default "4" if ARM && MPSOC

30
xen-destdir.patch Normal file
View File

@ -0,0 +1,30 @@
--- xen-4.18.0-testing/tools/xs-clients/Makefile.orig 2023-10-02 12:51:09.364766336 -0600
+++ xen-4.18.0-testing/tools/xs-clients/Makefile 2023-10-02 12:53:09.360769196 -0600
@@ -29,7 +29,7 @@ all: $(TARGETS)
clients: xenstore $(CLIENTS) xenstore-control
$(CLIENTS): xenstore
- ln -f xenstore $@
+ ln -sf xenstore $@
xenstore: xenstore_client.o
$(CC) $(LDFLAGS) $^ $(LDLIBS) -o $@ $(APPEND_LDFLAGS)
@@ -54,7 +54,7 @@ install: all
$(INSTALL_PROG) xenstore-control $(DESTDIR)$(bindir)
$(INSTALL_PROG) xenstore $(DESTDIR)$(bindir)
set -e ; for c in $(CLIENTS) ; do \
- ln -f $(DESTDIR)$(bindir)/xenstore $(DESTDIR)$(bindir)/$${c} ; \
+ ln -sf xenstore $(DESTDIR)$(bindir)/$${c} ; \
done
.PHONY: uninstall
--- xen-4.18.0-testing/tools/xenstored/Makefile.orig 2023-10-02 12:51:03.364766193 -0600
+++ xen-4.18.0-testing/tools/xenstored/Makefile 2023-10-02 12:54:09.472770628 -0600
@@ -37,6 +37,7 @@ TAGS:
install: all
$(INSTALL_DIR) $(DESTDIR)$(sbindir)
$(INSTALL_PROG) xenstored $(DESTDIR)$(sbindir)
+ $(INSTALL_DIR) $(DESTDIR)$(bindir)
.PHONY: uninstall
uninstall:

16
xen-dom0-modules.service Normal file
View File

@ -0,0 +1,16 @@
[Unit]
Description=Load dom0 backend drivers
ConditionPathExists=/proc/xen
Before=xenstored.service xen-watchdog.service
[Install]
WantedBy=multi-user.target
[Service]
Type=oneshot
RemainAfterExit=true
Environment=PATH=/usr/local/sbin:/usr/sbin:/sbin:/usr/local/bin:/usr/bin:/bin
# dummy to have always one valid line
ExecStart=-/usr/bin/env uname -a
# modules listed in /usr/lib/modules.d/xen.conf
# load them manually to avoid usage of systemd-modules-load.service

81
xen-supportconfig Normal file
View File

@ -0,0 +1,81 @@
#!/bin/bash
#############################################################
# Name: Supportconfig Plugin for Xen
# Description: Gathers important troubleshooting information
# about Xen and its tools
#############################################################

# TODO:
# - Anything needed for UEFI?
#

RCFILE="/usr/lib/supportconfig/resources/supportconfig.rc"
OF="output-xen.txt"

GRUB2_CONF_FILES="/etc/default/grub"
XEN_CONF_FILES="/etc/xen/xl.conf /etc/sysconfig/xencommons /etc/sysconfig/xendomains"
XEN_SERVICES="xencommons xendomains xen-watchdog"
PERSISTENT_VM_CONF_FILES=""
ACTIVE_VM_CONF_FILES=""
XEN_LOG_FILES=""

# The resource file provides the helpers used below (log_write, log_cmd,
# conf_files, log_files, rpm_verify).
if [ -s $RCFILE ]; then
    if ! source $RCFILE; then
        log_write $OF "ERROR: Initializing resource file: $RCFILE"
        exit 1
    fi
fi

# if no xen package we are done
rpm_verify $OF xen || exit 111

# if not a xen host (dom0) we are done
log_write $OF "#==[ Checking if booted Xen ]=================================#"
# The file content is quoted so that an empty capabilities file yields a
# well-formed comparison (and "No") instead of a test syntax error.
if [ ! -d /proc/xen ] || [ ! -e /proc/xen/capabilities ] || [ "$(cat /proc/xen/capabilities)" != "control_d" ]; then
    log_write $OF "No"
    log_write $OF "Skipped"
    exit 0
else
    log_write $OF "Yes"
fi

# basic system information:
log_cmd $OF "uname -r"
for service in $XEN_SERVICES; do
    log_cmd $OF "systemctl status $service"
    log_cmd $OF "systemctl is-enabled $service"
done
log_cmd $OF "lscpu"
log_cmd $OF "xl info --numa"
log_cmd $OF "xl list"
log_cmd $OF "xl pci-assignable-list"
log_cmd $OF "xenstore-ls"
log_cmd $OF "ps -ef | grep xen"

# dump grub2-related conf files
conf_files $OF "$GRUB2_CONF_FILES"

# dump Xen-related conf files
conf_files $OF "$XEN_CONF_FILES"

# detailed system info:
log_cmd $OF "xl list --long"
log_cmd $OF "xl dmesg"

# network-related info often useful for debugging
# Run systemctl directly and branch on its exit status; wrapping the command
# in [ ... ] made the test malformed, so the note could never be written.
if systemctl is-enabled NetworkManager.service > /dev/null 2>&1; then
    log_write $OF "NOTE: NetworkManager should not be enabled on a Xen host"
fi
log_cmd $OF "route -n"
log_cmd $OF "arp -v"
log_cmd $OF "ip link show type bridge"
log_cmd $OF "bridge link show"

# list contents of common config and image directories
log_cmd $OF "ls -alR /etc/xen/vm/"
log_cmd $OF "ls -alR /etc/xen/auto/"
log_cmd $OF "ls -alR /var/lib/xen/images/"

# dump VM-related conf files
test -d /etc/xen/vm && PERSISTENT_VM_CONF_FILES=$(find -L /etc/xen/vm/ -type f | sort)
conf_files $OF "$PERSISTENT_VM_CONF_FILES"
test -d /var/lib/xen && ACTIVE_VM_CONF_FILES=$(find -L /var/lib/xen/userdata* -type f | sort)
conf_files $OF "$ACTIVE_VM_CONF_FILES"

# dump log files
test -d /var/log/xen && XEN_LOG_FILES="$(find -L /var/log/xen/ -type f | grep 'log$' | sort)"
log_files $OF 0 "$XEN_LOG_FILES"

View File

@ -0,0 +1,58 @@
suse_vtsc_tolerance=<val>
Reference: bsc#1026236
To avoid emulation of vTSC after live migration or save/restore allow
different clock frequency up to the specified value. If the frequency
is within the allowed range TSC access by the domU will be performed
at native speed. Otherwise TSC access will be emulated. It is up to
the hostadmin to decide how much tolerance all running domUs can
actually handle. The default is zero tolerance.
--- a/xen/arch/x86/time.c
+++ b/xen/arch/x86/time.c
@@ -47,6 +47,9 @@
static char __initdata opt_clocksource[10];
string_param("clocksource", opt_clocksource);
+static unsigned int __ro_after_init opt_suse_vtsc_tolerance;
+integer_param("suse_vtsc_tolerance", opt_suse_vtsc_tolerance);
+
unsigned long __read_mostly cpu_khz; /* CPU clock frequency in kHz. */
DEFINE_SPINLOCK(rtc_lock);
unsigned long pit0_ticks;
@@ -2720,6 +2723,8 @@ int tsc_set_info(struct domain *d,
switch ( tsc_mode )
{
+ bool disable_vtsc;
+
case XEN_CPUID_TSC_MODE_DEFAULT:
case XEN_CPUID_TSC_MODE_ALWAYS_EMULATE:
d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
@@ -2733,8 +2738,25 @@ int tsc_set_info(struct domain *d,
* When a guest is created, gtsc_khz is passed in as zero, making
* d->arch.tsc_khz == cpu_khz. Thus no need to check incarnation.
*/
+ disable_vtsc = d->arch.tsc_khz == cpu_khz;
+
+ if ( tsc_mode == XEN_CPUID_TSC_MODE_DEFAULT && !disable_vtsc &&
+ opt_suse_vtsc_tolerance && is_hvm_domain(d) )
+ {
+ long khz_diff = ABS((long)cpu_khz - gtsc_khz);
+
+ disable_vtsc = khz_diff <= opt_suse_vtsc_tolerance;
+
+ printk(XENLOG_G_INFO "%pd: host has %lu kHz,"
+ " domU expects %u kHz,"
+ " difference of %ld is %s tolerance of %u\n",
+ d, cpu_khz, gtsc_khz, khz_diff,
+ disable_vtsc ? "within" : "outside",
+ opt_suse_vtsc_tolerance);
+ }
+
if ( tsc_mode == XEN_CPUID_TSC_MODE_DEFAULT && host_tsc_is_safe() &&
- (d->arch.tsc_khz == cpu_khz ||
+ (disable_vtsc ||
(is_hvm_domain(d) &&
hvm_get_tsc_scaling_ratio(d->arch.tsc_khz))) )
{

View File

@ -0,0 +1,31 @@
The result of $(wildcard *) is random.
Sort input files to reduce build-compare noise.
---
docs/Makefile | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
Index: xen-4.18.0-testing/docs/Makefile
===================================================================
--- xen-4.18.0-testing.orig/docs/Makefile
+++ xen-4.18.0-testing/docs/Makefile
@@ -192,7 +192,7 @@ uninstall: uninstall-man-pages uninstall
# Individual file build targets
html/index.html: $(DOC_HTML) $(CURDIR)/gen-html-index INDEX
- $(PERL) -w -- $(CURDIR)/gen-html-index -i INDEX html $(DOC_HTML)
+ $(PERL) -w -- $(CURDIR)/gen-html-index -i INDEX html $(sort $(DOC_HTML))
html/%.txt: %.txt
@$(INSTALL_DIR) $(@D)
@@ -207,8 +207,8 @@ html/hypercall/%/index.html: $(CURDIR)/x
$(INSTALL_DIR) $(@D)
$(PERL) -w $(CURDIR)/xen-headers -O $(@D) \
-T 'arch-$* - Xen public headers' \
- $(patsubst %,-X arch-%,$(filter-out $*,$(DOC_ARCHES))) \
- $(patsubst %,-X xen-%,$(filter-out $*,$(DOC_ARCHES))) \
+ $(sort $(patsubst %,-X arch-%,$(filter-out $*,$(DOC_ARCHES)))) \
+ $(sort $(patsubst %,-X xen-%,$(filter-out $*,$(DOC_ARCHES)))) \
$(EXTRA_EXCLUDE) \
$(XEN_ROOT)/xen include/public include/xen/errno.h

14579
xen.changes Normal file

File diff suppressed because it is too large Load Diff

123
xen.libxl.dmmd.patch Normal file
View File

@ -0,0 +1,123 @@
References: bsc#954872
---
tools/libxl/libxl.c | 4 ++++
tools/libxl/libxl_device.c | 3 ++-
tools/libxl/libxl_dm.c | 34 +++++++++++++++++++++++++++++-----
tools/libxl/libxlu_disk_l.l | 2 ++
4 files changed, 37 insertions(+), 6 deletions(-)
Index: xen-4.18.0-testing/tools/libs/light/libxl_disk.c
===================================================================
--- xen-4.18.0-testing.orig/tools/libs/light/libxl_disk.c
+++ xen-4.18.0-testing/tools/libs/light/libxl_disk.c
@@ -203,7 +203,7 @@ static int libxl__device_disk_setdefault
return rc;
}
-static int libxl__device_from_disk(libxl__gc *gc, uint32_t domid,
+int libxl__device_from_disk(libxl__gc *gc, uint32_t domid,
const libxl_device_disk *disk,
libxl__device *device)
{
@@ -372,6 +372,10 @@ static void device_disk_add(libxl__egc *
rc = ERROR_FAIL;
goto out;
case LIBXL_DISK_BACKEND_QDISK:
+ if (disk->script) {
+ script = libxl__abs_path(gc, disk->script, libxl__xen_script_dir_path());
+ flexarray_append_pair(back, "script", script);
+ }
flexarray_append(back, "params");
flexarray_append(back, GCSPRINTF("%s:%s",
libxl__device_disk_string_of_format(disk->format),
Index: xen-4.18.0-testing/tools/libs/light/libxl_device.c
===================================================================
--- xen-4.18.0-testing.orig/tools/libs/light/libxl_device.c
+++ xen-4.18.0-testing/tools/libs/light/libxl_device.c
@@ -333,7 +333,8 @@ static int disk_try_backend(disk_try_bac
return 0;
case LIBXL_DISK_BACKEND_QDISK:
- if (a->disk->script) goto bad_script;
+ LOG(DEBUG, "Disk vdev=%s, uses script=%s on %s backend", a->disk->vdev,
+ a->disk->script ? a->disk->script : "(none)", libxl_disk_backend_to_string(backend));
return backend;
case LIBXL_DISK_BACKEND_STANDALONE:
Index: xen-4.18.0-testing/tools/libs/light/libxl_dm.c
===================================================================
--- xen-4.18.0-testing.orig/tools/libs/light/libxl_dm.c
+++ xen-4.18.0-testing/tools/libs/light/libxl_dm.c
@@ -1197,6 +1197,30 @@ out:
return rc;
}
+static void libxl__suse_node_to_path(libxl__gc *gc, int domid, const libxl_device_disk *dp, const char **pdev_path)
+{
+ libxl_ctx *ctx = libxl__gc_owner(gc);
+ char *be_path, *node;
+ libxl__device device;
+ libxl_device_disk disk;
+ int rc;
+
+ disk = *dp;
+ rc = libxl__device_from_disk(gc, domid, &disk, &device);
+ if (rc) {
+ LIBXL__LOG(ctx, LIBXL__LOG_WARNING, "libxl__device_from_disk failed %d", rc);
+ return;
+ }
+ be_path = libxl__device_backend_path(gc, &device);
+
+ node = libxl__xs_read(gc, XBT_NULL, libxl__sprintf(gc, "%s/node", be_path));
+ if (!node)
+ return;
+
+ LIBXL__LOG(ctx, LIBXL__LOG_WARNING, "replacing '%s' with '%s' from %s/node, just for qemu-xen", *pdev_path, node, be_path);
+ *pdev_path = node;
+}
+
static int libxl__build_device_model_args_new(libxl__gc *gc,
const char *dm, int guest_domid,
const libxl_domain_config *guest_config,
@@ -1885,9 +1909,11 @@ static int libxl__build_device_model_arg
libxl__device_disk_dev_number(disks[i].vdev, &disk, &part);
const char *format;
char *drive;
- const char *target_path = NULL;
+ const char *target_path = disks[i].pdev_path;
int colo_mode;
+ libxl__suse_node_to_path(gc, guest_domid, disks + i, &target_path);
+
if (dev_number == -1) {
LOGD(WARN, guest_domid, "unable to determine"" disk number for %s",
disks[i].vdev);
Index: xen-4.18.0-testing/tools/libs/util/libxlu_disk_l.l
===================================================================
--- xen-4.18.0-testing.orig/tools/libs/util/libxlu_disk_l.l
+++ xen-4.18.0-testing/tools/libs/util/libxlu_disk_l.l
@@ -253,6 +253,8 @@ target=.* { STRIP(','); SAVESTRING("targ
free(newscript);
}
+dmmd:/.* { DPC->had_depr_prefix=1; DEPRECATE(0); }
+npiv:/.* { DPC->had_depr_prefix=1; DEPRECATE(0); }
tapdisk:/.* { DPC->had_depr_prefix=1; DEPRECATE(0); }
tap2?:/.* { DPC->had_depr_prefix=1; DEPRECATE(0); }
aio:/.* { DPC->had_depr_prefix=1; DEPRECATE(0); }
Index: xen-4.18.0-testing/tools/libs/light/libxl_internal.h
===================================================================
--- xen-4.18.0-testing.orig/tools/libs/light/libxl_internal.h
+++ xen-4.18.0-testing/tools/libs/light/libxl_internal.h
@@ -2073,6 +2073,10 @@ _hidden char *libxl__object_to_json(libx
_hidden int libxl__cpuid_legacy(libxl_ctx *ctx, uint32_t domid, bool retore,
libxl_domain_build_info *info);
+_hidden int libxl__device_from_disk(libxl__gc *gc, uint32_t domid,
+ const libxl_device_disk *disk,
+ libxl__device *device);
+
/* Calls poll() again - useful to check whether a signaled condition
* is still true. Cannot fail. Returns currently-true revents. */
_hidden short libxl__fd_poll_recheck(libxl__egc *egc, int fd, short events);

1245
xen.spec Normal file

File diff suppressed because it is too large Load Diff

102
xen.sysconfig-fillup.patch Normal file
View File

@ -0,0 +1,102 @@
Fix xencommons for fillup
The usage in xen.spec is like this:
%post tools
%{fillup_only -n xencommons xencommons}
After fresh install, modify the key=value pairs as required, then
upgrade the package, the sysconfig file is broken and changes are lost:
# rm /etc/sysconfig/xencommons
# zypper in --oldpackage xen-tools-4.12.4_02-3.30.1
# echo XENSTORETYPE=domain >> /etc/sysconfig/xencommons
# echo XENSTORE_DOMAIN_SIZE=123 >> /etc/sysconfig/xencommons
# zypper in --oldpackage xen-tools-4.12.4_04-3.33.1
# diff -u /var/adm/fillup-templates/sysconfig.xencommons /etc/sysconfig/xencommons
Basically fillup removed all comments, and also the two added key=value lines.
Prevent this by defining all keys, with empty values, so that consumers
of the values will continue to use the built-in defaults.
Index: xen-4.16.0-testing/tools/hotplug/Linux/init.d/sysconfig.xencommons.in
===================================================================
--- xen-4.16.0-testing.orig/tools/hotplug/Linux/init.d/sysconfig.xencommons.in
+++ xen-4.16.0-testing/tools/hotplug/Linux/init.d/sysconfig.xencommons.in
@@ -3,7 +3,9 @@
## Default: "none"
#
# Log xenconsoled messages (cf xl dmesg)
-#XENCONSOLED_TRACE=[none|guest|hv|all]
+# One of [none|guest|hv|all]
+#
+XENCONSOLED_TRACE=
## Type: string
## Default: daemon
@@ -16,10 +18,10 @@
#
# Changing this requires a reboot to take effect.
#
-#XENSTORETYPE=daemon
+XENSTORETYPE=
## Type: string
-## Default: xenstored
+## Default: @XENSTORED@
#
# Select xenstore implementation, this can be either
# of these below.
@@ -30,7 +32,7 @@
# * @sbindir@/xenstored
#
# Changing this requires a reboot to take effect.
-#XENSTORED=@XENSTORED@
+XENSTORED=
## Type: string
## Default: unlimited
@@ -57,9 +59,10 @@ XENSTORED_ARGS=
## Type: string
## Default: Not defined, tracing off
#
-# Log xenstored messages
+# Log xenstored messages if a non-empty value is assigned.
# Only evaluated if XENSTORETYPE is "daemon".
-#XENSTORED_TRACE=[yes|on|1]
+#
+XENSTORED_TRACE=
## Type: integer
## Default: 50
@@ -75,14 +78,14 @@ XENSTORED_ARGS=
#
# xenstore domain kernel.
# Only evaluated if XENSTORETYPE is "domain".
-#XENSTORE_DOMAIN_KERNEL=@LIBEXEC@/boot/xenstore-stubdom.gz
+XENSTORE_DOMAIN_KERNEL=
## Type: integer
## Default: 8
#
# xenstore domain memory size in MiB.
# Only evaluated if XENSTORETYPE is "domain".
-#XENSTORE_DOMAIN_SIZE=8
+XENSTORE_DOMAIN_SIZE=
## Type: string
## Default: not set, no autoballooning of xenstore domain
@@ -93,7 +96,7 @@ XENSTORED_ARGS=
# - combination of both in form of <val>:<frac> (e.g. 8:1/100), resulting
# value will be the higher of both specifications
# Only evaluated if XENSTORETYPE is "domain".
-#XENSTORE_MAX_DOMAIN_SIZE=
+XENSTORE_MAX_DOMAIN_SIZE=
## Type: string
## Default: ""
@@ -106,4 +109,4 @@ XENSTORE_DOMAIN_ARGS=
#QEMU_XEN=@qemu_xen_path@
# Dom0 UUID
-#XEN_DOM0_UUID=00000000-0000-0000-0000-000000000000
+XEN_DOM0_UUID=

137
xen2libvirt.py Normal file
View File

@ -0,0 +1,137 @@
#!/usr/bin/python3
#
# Copyright (C) 2014 SUSE LINUX Products GmbH, Nuernberg, Germany.
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library. If not, see
# <http://www.gnu.org/licenses/>.
#
# Authors:
# Jim Fehlig <jfehlig@suse.com>
#
# Read native Xen configuration format, convert to libvirt domXML, and
# import (virsh define <xml>) into libvirt.
import sys
import os
import argparse
import re
from xml.etree import ElementTree
try:
import libvirt
except ImportError:
print('Unable to import the libvirt module. Is libvirt-python installed?')
sys.exit(1)
# Command line interface: four optional switches plus the mandatory path.
parser = argparse.ArgumentParser(
    description='Import Xen domain configuration into libvirt',
)
parser.add_argument(
    '-c', '--convert-only',
    dest='convert_only',
    action='store_true',
    help='Convert Xen domain configuration into libvirt domXML, but do not import into libvirt',
)
parser.add_argument(
    '-r', '--recursive',
    action='store_true',
    help='Operate recursivelly on all Xen domain configuration rooted at path',
)
parser.add_argument(
    '-f', '--format',
    choices=['xm', 'sexpr'],
    default=None,
    help='Format of Xen domain configuration. Supported formats are xm and sexpr',
)
parser.add_argument(
    '-v', '--verbose',
    action='store_true',
    help='Print information about the import process',
)
parser.add_argument(
    'path',
    help='Path to Xen domain configuration',
)
def print_verbose(msg):
    """Print msg only when the module-level args has verbose set."""
    if not args.verbose:
        return
    print(msg)
def check_config(path, config):
    """Guess the format of a Xen domain configuration file.

    path is handed to file(1) to reject non-text files; config is the
    already-read content of that file.  Returns 'sexpr' or 'xm' depending on
    the first line that is neither empty nor a '#' comment, and 'unknown'
    for everything else (non-text files, XML, unrecognised content).
    """
    import shlex

    # Quote the path: it is interpolated into a shell command line, and an
    # unquoted name containing spaces or shell metacharacters would make
    # file(1) inspect the wrong thing (or run unintended commands).
    isbinary = os.system('file -b ' + shlex.quote(path) + ' | grep text > /dev/null')

    # A non-zero status means grep found no "text" in the file(1) output.
    if isbinary:
        print('Skipping %s (not a valid Xen configuration file)' % path)
        return 'unknown'

    for line in config.splitlines():
        if len(line) == 0 or line.startswith('#'):
            continue
        if line.startswith('<domain'):
            # XML is not a supported conversion format
            break
        if line.startswith('(domain'):
            print('Found sexpr formatted file %s' % path)
            return 'sexpr'
        if '=' in line:
            print('Found xm formatted file %s' % path)
            return 'xm'
        # Only the first significant line decides the format.
        break

    print('Skipping %s (not a valid Xen configuration file)' % path)
    return 'unknown'
def import_domain(conn, path, format=None, convert_only=False):
    """Convert one Xen configuration file to libvirt domXML and import it.

    conn is the libvirt connection used for the conversion
    (domainXMLFromNative) and for the import (defineXML).  path names the
    Xen configuration file.  format is 'xm' or 'sexpr'; when it is None the
    format is detected with check_config().  With convert_only set, the
    domXML is printed instead of being defined in libvirt.

    Files whose format is neither 'xm' nor 'sexpr' are skipped so that a
    recursive run can continue with the next file.  Exits the program with
    status 1 if defineXML() returns None.
    """
    # The context manager closes the file on every path, including the
    # early return for unsupported formats.
    with open(path, 'r') as f:
        config = f.read()
    print_verbose('Xen domain configuration read from %s:\n %s' % (path, config))
    if format is None:
        format = check_config(path, config)

    if format == 'sexpr':
        print_verbose('scrubbing domid from configuration')
        # Raw string: "\(" is not a valid escape in an ordinary literal.
        config = re.sub(r"\(domid [0-9]*\)", "", config)
        print_verbose('scrubbed sexpr:\n %s' % config)
        xml = conn.domainXMLFromNative('xen-sxpr', config, 0)
    elif format == 'xm':
        xml = conn.domainXMLFromNative('xen-xm', config, 0)
    else:
        # Return to continue on to next file (if recursive)
        return

    # domUloader is no longer available in SLES12, replace with pygrub
    tree = ElementTree.fromstring(xml)
    bl = tree.find('.//bootloader')
    if bl is not None and bl.text is not None and 'domUloader' in bl.text:
        bl.text = 'pygrub'
    # Serialise to str, not bytes, so that printing shows the XML itself
    # rather than a b'...' representation.
    xml = ElementTree.tostring(tree, encoding='unicode')
    print_verbose('Successfully converted Xen domain configuration to '
                  'libvirt domXML:\n %s' % xml)
    if convert_only:
        print(xml)
    else:
        print_verbose('Importing converted libvirt domXML into libvirt...')
        dom = conn.defineXML(xml)
        if dom is None:
            print('Failed to define domain from converted domXML')
            sys.exit(1)
        print_verbose('domXML successfully imported into libvirt')
# Parse the command line; print_verbose() reads the module-level args.
args = parser.parse_args()
path = args.path

# Connect to libvirt
conn = libvirt.open(None)
if conn is None:
    print('Failed to open connection to the hypervisor')
    sys.exit(1)

if not args.recursive:
    # Single file: convert (and import) just the given path.
    import_domain(conn, path, args.format, args.convert_only)
else:
    # Recursive mode: feed every file found below path to import_domain().
    try:
        for dirpath, _subdirs, filenames in os.walk(path):
            for filename in filenames:
                candidate = os.path.join(dirpath, filename)
                print_verbose('Processing file %s' % candidate)
                import_domain(conn, candidate, args.format, args.convert_only)
    except IOError:
        print('Failed to open/read path %s' % path)
        sys.exit(1)

395
xen_maskcalc.py Normal file
View File

@ -0,0 +1,395 @@
#!/usr/bin/python3
# Xen Mask Calculator - Calculate CPU masking information based on cpuid(1)
# Copyright (C) 2017 Armando Vega
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import argparse
import sys
import os
# NOTE(review): these look like the line prefixes of cpuid(1) output for
# leaves 0x1 and 0x7 (subleaf 0) -- confirm against the code that uses them.
EAX1_MATCH = '0x00000001 0x00:'
EAX7_MATCH = '0x00000007 0x00:'
# NOTE(review): presumably the expected length of such a line -- confirm.
EXP_LINELN = 76
# Feature name tables, indexed by bit number and populated through the
# fill_*() helpers: one libxl name and one libvirt name per register bit.
libxl_names_ecx1 = []
libxl_names_edx1 = []
libvirt_names_ecx1 = []
libvirt_names_edx1 = []
libxl_names_ebx7 = []
libxl_names_ecx7 = []
libvirt_names_ebx7 = []
libvirt_names_ecx7 = []
def fill_ecx1(bit, libxl, libvirt):
    """Record the libxl and libvirt names for one bit of the ecx1 tables.

    A bit that already has a libxl name is left untouched; the rejected
    pair is reported on stdout instead.
    """
    existing = libxl_names_ecx1[bit]
    if not existing:
        libxl_names_ecx1[bit] = libxl
        libvirt_names_ecx1[bit] = libvirt
        return
    details = (bit, existing, libvirt_names_ecx1[bit], libxl, libvirt)
    print("ecx bit %s already set: libxl %s libvirt %s. Ignoring %s/%s\n" % details)
def fill_edx1(bit, libxl, libvirt):
    """Record the libxl and libvirt names for one bit of the edx1 tables.

    A bit that already has a libxl name is left untouched; the rejected
    pair is reported on stdout instead.
    """
    existing = libxl_names_edx1[bit]
    if not existing:
        libxl_names_edx1[bit] = libxl
        libvirt_names_edx1[bit] = libvirt
        return
    details = (bit, existing, libvirt_names_edx1[bit], libxl, libvirt)
    print("edx bit %s already set: libxl %s libvirt %s. Ignoring %s/%s\n" % details)
def fill_ebx7(bit, libxl, libvirt):
    """Register the libxl and libvirt names of one CPUID leaf 7 EBX bit.

    The first registration of a bit wins: when the libxl table already
    holds a truthy name for `bit`, a diagnostic is printed and both
    tables are left untouched.
    """
    if libxl_names_ebx7[bit]:
        # The diagnostic names the register this function actually manages
        # (EBX); it previously said "edx", a leftover from copy and paste.
        print("ebx bit %s already set: libxl %s libvirt %s. Ignoring %s/%s\n" % (bit, libxl_names_ebx7[bit], libvirt_names_ebx7[bit], libxl, libvirt))
        return
    libxl_names_ebx7[bit] = libxl
    libvirt_names_ebx7[bit] = libvirt
def fill_ecx7(bit, libxl, libvirt):
    """Register the libxl and libvirt names of one CPUID leaf 7 ECX bit.

    The first registration of a bit wins: when the libxl table already
    holds a truthy name for `bit`, a diagnostic is printed and both
    tables are left untouched.
    """
    existing = libxl_names_ecx7[bit]
    if not existing:
        libxl_names_ecx7[bit] = libxl
        libvirt_names_ecx7[bit] = libvirt
    else:
        print("ecx bit %s already set: libxl %s libvirt %s. Ignoring %s/%s\n" % (bit, existing, libvirt_names_ecx7[bit], libxl, libvirt))
def fill_bit_names():
    """Size and populate the libxl/libvirt feature-name tables.

    Each of the eight module-level tables is first extended by 32 `None`
    placeholders (one per register bit).  The known feature names are then
    registered through fill_ecx1/fill_edx1/fill_ebx7/fill_ecx7; bits that
    are not listed below keep their `None` placeholder.
    """
    every_table = (
        libxl_names_ecx1, libxl_names_edx1, libxl_names_ebx7, libxl_names_ecx7,
        libvirt_names_ecx1, libvirt_names_edx1, libvirt_names_ebx7, libvirt_names_ecx7,
    )
    for table in every_table:
        table.extend([None] * 32)

    # Leaf 1, ECX: (bit, libxl name, libvirt name)
    leaf1_ecx = [
        (0, "sse3", "pni"),
        (1, "pclmulqdq", "pclmuldq"),
        (2, "dtes64", "dtes64"),
        (3, "monitor", "monitor"),
        (4, "dscpl", "ds_cpl"),
        (5, "vmx", "vmx"),
        (6, "smx", "smx"),
        (7, "est", "est"),
        (8, "tm2", "tm2"),
        (9, "ssse3", "ssse3"),
        (10, "cntxid", "cid"),
        (12, "fma", "fma"),
        (13, "cmpxchg16", "cx16"),
        (14, "xtpr", "xtpr"),
        (15, "pdcm", "pdcm"),
        (17, "pcid", "pcid"),
        (18, "dca", "dca"),
        (19, "sse4_1", "sse4.1"),
        (20, "sse4_2", "sse4.2"),
        (21, "x2apic", "x2apic"),
        (22, "movbe", "movbe"),
        (23, "popcnt", "popcnt"),
        (24, "tsc-deadline", "tsc-deadline"),
        (25, "aes", "aes"),
        (26, "xsave", "xsave"),
        (27, "osxsave", "osxsave"),
        (28, "avx", "avx"),
        (29, "f16c", "f16c"),
        (30, "rdrand", "rdrand"),
        (31, "hypervisor", "hypervisor"),
    ]
    for bit, xl_name, virt_name in leaf1_ecx:
        fill_ecx1(bit, xl_name, virt_name)

    # Leaf 1, EDX: (bit, libxl name, libvirt name)
    leaf1_edx = [
        (0, "fpu", "fpu"),
        (1, "vme", "vme"),
        (2, "de", "de"),
        (3, "pse", "pse"),
        (4, "tsc", "tsc"),
        (5, "msr", "msr"),
        (6, "pae", "pae"),
        (7, "mce", "mce"),
        (8, "cmpxchg8", "cx8"),
        (9, "apic", "apic"),
        (11, "sysenter", "sep"),
        (12, "mtrr", "mtrr"),
        (13, "pge", "pge"),
        (14, "mca", "mca"),
        (15, "cmov", "cmov"),
        (16, "pat", "pat"),
        (17, "pse36", "pse36"),
        (18, "psn", "pn"),
        (19, "clfsh", "clflush"),
        (21, "ds", "ds"),
        (22, "acpi", "acpi"),
        (23, "mmx", "mmx"),
        (24, "fxsr", "fxsr"),
        (25, "sse", "sse"),
        (26, "sse2", "sse2"),
        (27, "ss", "ss"),
        (28, "htt", "ht"),
        (29, "tm", "tm"),
        (30, "ia64", "ia64"),
        (31, "pbe", "pbe"),
    ]
    for bit, xl_name, virt_name in leaf1_edx:
        fill_edx1(bit, xl_name, virt_name)

    # Leaf 7, EBX: libxl and libvirt use the same spelling for every entry.
    leaf7_ebx = [
        (0, "fsgsbase"),
        (1, "tsc_adjust"),
        (3, "bmi1"),
        (4, "hle"),
        (5, "avx2"),
        (7, "smep"),
        (8, "bmi2"),
        (9, "erms"),
        (10, "invpcid"),
        (11, "rtm"),
        (12, "cmt"),
        (14, "mpx"),
        (16, "avx512f"),
        (17, "avx512dq"),
        (18, "rdseed"),
        (19, "adx"),
        (20, "smap"),
        (21, "avx512-ifma"),
        (23, "clflushopt"),
        (24, "clwb"),
        (26, "avx512pf"),
        (27, "avx512er"),
        (28, "avx512cd"),
        (29, "sha"),
        (30, "avx512bw"),
        (31, "avx512vl"),
    ]
    for bit, feature in leaf7_ebx:
        fill_ebx7(bit, feature, feature)

    # Leaf 7, ECX: libxl and libvirt use the same spelling for every entry.
    leaf7_ecx = [
        (0, "prefetchwt1"),
        (1, "avx512-vbmi"),
        (2, "umip"),
        (3, "pku"),
        (4, "ospke"),
        (6, "avx512-vbmi2"),
        (8, "gfni"),
        (9, "vaes"),
        (10, "vpclmulqdq"),
        (11, "avx512-vnni"),
        (12, "avx512-bitalg"),
        (14, "avx512-vpopcntdq"),
        (22, "rdpid"),
        (25, "cldemote"),
    ]
    for bit, feature in leaf7_ecx:
        fill_ecx7(bit, feature, feature)
def get_register_mask(regs):
    """Derive a 32-character mask from a list of register values.

    The returned string is ordered from bit 31 (first character) down to
    bit 0 (last character).  A position is 'x' when every value agrees on
    that bit (all set or all clear) and '0' when the values differ.
    """
    total = len(regs)
    chars = []
    # Walk from the most significant bit downwards so the string comes out
    # in its final order without a separate reversal step.
    for bit in range(31, -1, -1):
        set_count = sum(1 for value in regs if value & (1 << bit) > 0)
        chars.append('x' if set_count in (0, total) else '0')
    return ''.join(chars)
def print_xl_masking_config(nodes):
"""Print CPUID masking configuration derived from all nodes.

`nodes` maps a node name to a dict holding the integer register values
under the keys 'eax1_ecx', 'eax1_edx', 'eax7_ebx' and 'eax7_ecx'.

A mask is computed per register with get_register_mask().  The function
prints an xl `cpuid = [...]` block containing the raw masks, and, for the
bits that differ between nodes and have a known name, a
`cpuid = [ host,<name>=0,... ]` line plus libvirt `<feature>` elements.
"""
# A mask consisting only of 'x' means no bit of that register differs.
nomasking = 'x' * 32
# Names of the differing bits, in libxl and libvirt spelling respectively.
libxl = []
libvirt = []
eax1_ecx_regs = []
eax1_edx_regs = []
eax7_ebx_regs = []
eax7_ecx_regs = []
# Collect each register's value across all nodes.
for node in nodes:
eax1_ecx_regs.append(nodes[node]['eax1_ecx'])
eax1_edx_regs.append(nodes[node]['eax1_edx'])
eax7_ebx_regs.append(nodes[node]['eax7_ebx'])
eax7_ecx_regs.append(nodes[node]['eax7_ecx'])
# Get masks for the EAX1 and EAX7 registers
eax1_ecx_mask = get_register_mask(eax1_ecx_regs)
eax1_edx_mask = get_register_mask(eax1_edx_regs)
eax7_ebx_mask = get_register_mask(eax7_ebx_regs)
eax7_ecx_mask = get_register_mask(eax7_ecx_regs)
# Build the xl CPUID config
# The first register of each leaf (ecx for leaf 1, ebx for leaf 7) is always
# emitted; the second one is appended only when it actually masks something.
cpuid_config = 'cpuid = [\n "0x00000001:ecx=' + eax1_ecx_mask
if eax1_edx_mask != nomasking:
cpuid_config += ',edx=' + eax1_edx_mask
cpuid_config += '",\n'
cpuid_config += ' "0x00000007,0x00:ebx=' + eax7_ebx_mask
if eax7_ecx_mask != nomasking:
cpuid_config += ',ecx=' + eax7_ecx_mask
cpuid_config += '"\n'
cpuid_config += ']'
print(cpuid_config)
# Translate the masks into feature names.  The mask string is ordered from
# bit 31 down to bit 0, so bit `bitnum` lives at index len - 1 - bitnum.
# Only bits marked "0" (differing) that have a known name are recorded.
bitnum = len(eax1_ecx_mask)
while bitnum > 0:
bitnum -= 1
bitval = eax1_ecx_mask[len(eax1_ecx_mask) - 1 - bitnum]
if bitval == "0" and libxl_names_ecx1[bitnum]:
libxl.append(libxl_names_ecx1[bitnum] + "=0")
libvirt.append(libvirt_names_ecx1[bitnum])
# Same translation for leaf 1 EDX.
bitnum = len(eax1_edx_mask)
while bitnum > 0:
bitnum -= 1
bitval = eax1_edx_mask[len(eax1_edx_mask) - 1 - bitnum]
if bitval == "0" and libxl_names_edx1[bitnum]:
libxl.append(libxl_names_edx1[bitnum] + "=0")
libvirt.append(libvirt_names_edx1[bitnum])
# Same translation for leaf 7 EBX.
bitnum = len(eax7_ebx_mask)
while bitnum > 0:
bitnum -= 1
bitval = eax7_ebx_mask[len(eax7_ebx_mask) - 1 - bitnum]
if bitval == "0" and libxl_names_ebx7[bitnum]:
libxl.append(libxl_names_ebx7[bitnum] + "=0")
libvirt.append(libvirt_names_ebx7[bitnum])
# Same translation for leaf 7 ECX.
bitnum = len(eax7_ecx_mask)
while bitnum > 0:
bitnum -= 1
bitval = eax7_ecx_mask[len(eax7_ecx_mask) - 1 - bitnum]
if bitval == "0" and libxl_names_ecx7[bitnum]:
libxl.append(libxl_names_ecx7[bitnum] + "=0")
libvirt.append(libvirt_names_ecx7[bitnum])
# Name-based output: a "host" policy with each differing feature set to 0,
# followed by the libvirt XML listing the same features.
# NOTE(review): indentation is not visible here -- confirm whether the XML
# prints below are meant to be guarded by this check as well.
if len(libxl) > 0:
output = "cpuid = [ host"
for i in libxl:
output += "," + i
output += " ]"
print(output)
print("<domain>")
print(" <cpu>")
for i in libvirt:
print(" <feature policy='optional' name='%s' />" % i)
print(" </cpu>")
print("</domain>")
def print_verbose_masking_info(nodes):
    """Print, per register, every node's value in binary and the derived mask.

    `nodes` maps a node name to a dict holding the integer register values
    under the keys 'eax1_ecx', 'eax1_edx', 'eax7_ebx' and 'eax7_ecx'.  Each
    section lists the values as 32-digit binary numbers, a separator line
    and the mask computed by get_register_mask().
    """
    sections = (
        ('EAX1 ECX registers:', 'eax1_ecx'),
        ('EAX1 EDX registers:', 'eax1_edx'),
        ('EAX7,0 EBX registers:', 'eax7_ebx'),
        ('EAX7,0 ECX registers:', 'eax7_ecx'),
    )
    # Gather all register values before printing anything.
    collected = dict((key, []) for _, key in sections)
    for name in nodes:
        for _, key in sections:
            collected[key].append(nodes[name][key])

    print("")
    print('== Detailed mask derivation info ==')
    for title, key in sections:
        values = collected[key]
        print("")
        print(title)
        for value in values:
            print('{0:032b}'.format(value))
        print('================================')
        print(get_register_mask(values))
if __name__ == '__main__':
    # Command-line entry point: read two or more raw cpuid dumps, extract the
    # leaf 1 and leaf 7 feature registers from each, and print the masking
    # configuration (plus the derivation details with -v).
    epilog = """The individual 'node_files' are generated with 'cpuid -1r':
server1~$ cpuid -1r > node1
server2~$ cpuid -1r > node2
server3~$ cpuid -1r > node3
~$ {0} node1 node2 node3
Use 'zypper install cpuid' to install the cpuid.rpm.
Note: Run 'cpuid' with NATIVE boot instead of dom0 to get the complete cpid value.
Xen hides some bits from dom0!
""".format(sys.argv[0])
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description='A utility that calculates a XEN CPUID difference mask',
        epilog=epilog
    )
    parser.add_argument('node_files', nargs='*', help='Filenames of XEN node CPUID outputs')
    parser.add_argument('-v', '--verbose', action='store_true', help='Get detailed mask derivation information')
    args = parser.parse_args()
    # A difference mask is only meaningful with at least two inputs.
    if len(args.node_files) < 2:
        print('Need at least 2 files to do the comparison!')
        parser.print_help()
        sys.exit(1)
    fill_bit_names()
    nodes = dict()
    for node in args.node_files:
        if not os.path.isfile(node):
            print('File not found: ' + node)
            sys.exit(1)
        # Read the whole file up front; the context manager closes the handle
        # on every path, including the error exits further down.
        try:
            with open(node) as f:
                lines = [line.strip() for line in f]
        except IOError as e:
            print("I/O error({0}): {1}".format(e.errno, e.strerror))
            sys.exit(1)
        eax1 = ''
        eax7 = ''
        # try to match the lines containing interesting registers
        # EAX1 - Processor Info and Feature Bits
        # EAX7 - Extended features
        for line in lines:
            if line.startswith(EAX1_MATCH):
                eax1 = line
            elif line.startswith(EAX7_MATCH):
                eax7 = line
        # if we get garbled data we should probably just give up
        if len(eax1) < EXP_LINELN or len(eax7) < EXP_LINELN:
            print('ERROR: invalid data format in file : ' + node)
            sys.exit(1)
        # check if we can actually parse the strings into integers
        # Fields after the two-token prefix are eax=, ebx=, ecx=, edx=.
        # IndexError covers lines that are long enough but lack the expected
        # fields or the '=' separator; ValueError covers non-numeric values.
        try:
            eax1_fields = eax1.split()
            eax7_fields = eax7.split()
            eax1_ecx = int(eax1_fields[4].split('=')[1], 0)
            eax1_edx = int(eax1_fields[5].split('=')[1], 0)
            eax7_ebx = int(eax7_fields[3].split('=')[1], 0)
            eax7_ecx = int(eax7_fields[4].split('=')[1], 0)
        except (ValueError, IndexError):
            print('ERROR: invalid data format in file: ' + node)
            sys.exit(1)
        nodes[node] = {
            'eax1_ecx': eax1_ecx,
            'eax1_edx': eax1_edx,
            'eax7_ebx': eax7_ebx,
            'eax7_ecx': eax7_ecx,
        }
    print_xl_masking_config(nodes)
    if args.verbose:
        print_verbose_masking_info(nodes)

1
xenapiusers Normal file
View File

@ -0,0 +1 @@
root

36
xencommons.service Normal file
View File

@ -0,0 +1,36 @@
# Umbrella unit: requires the individual Xen dom0 services listed below and
# is ordered after them and before xendomains and libvirtd.
[Unit]
Description=xencommons
# Skip this unit when the Xen capabilities file does not exist.
ConditionPathExists=/proc/xen/capabilities
# Avoid errors from systemd-modules-load.service
Requires=xen-dom0-modules.service
After=xen-dom0-modules.service
# Pull in all upstream service files
Requires=proc-xen.mount
After=proc-xen.mount
Requires=xenstored.service
After=xenstored.service
Requires=xenconsoled.service
After=xenconsoled.service
Requires=xen-init-dom0.service
After=xen-init-dom0.service
Requires=xen-qemu-dom0-disk-backend.service
After=xen-qemu-dom0-disk-backend.service
# Make sure network (for bridge) and remote mounts (for xendomains) are available ...
After=network-online.target
After=remote-fs.target
# ... for libvirt and xendomains
Before=xendomains.service libvirtd.service
[Service]
# Run the commands once and keep the unit "active" after they have exited.
Type=oneshot
RemainAfterExit=true
# Fail the start unless the capabilities file contains "control_d"
# (presumably the marker for the control domain -- confirm).
ExecStartPre=/bin/grep -q control_d /proc/xen/capabilities
# NOTE(review): looks like a check that xenstored answers queries -- confirm.
ExecStart=/usr/bin/xenstore-ls -f
# Keep the previous boot log as xen-boot.prev.log, then save the current
# `xl dmesg` output as the new xen-boot.log.
ExecStartPost=/bin/sh -c 'mv -vf /var/log/xen/xen-boot.log /var/log/xen/xen-boot.prev.log ; /usr/sbin/xl dmesg > /var/log/xen/xen-boot.log'
[Install]
WantedBy=multi-user.target

View File

@ -0,0 +1,27 @@
Index: xen-4.18.0-testing/tools/console/client/main.c
===================================================================
--- xen-4.18.0-testing.orig/tools/console/client/main.c
+++ xen-4.18.0-testing/tools/console/client/main.c
@@ -101,6 +101,7 @@ static int get_pty_fd(struct xs_handle *
* Assumes there is already a watch set in the store for this path. */
{
struct timeval tv;
+ struct flock lock;
fd_set watch_fdset;
int xs_fd = xs_fileno(xs), pty_fd = -1;
time_t start, now;
@@ -124,6 +125,14 @@ static int get_pty_fd(struct xs_handle *
pty_fd = open(pty_path, O_RDWR | O_NOCTTY);
if (pty_fd == -1)
warn("Could not open tty `%s'", pty_path);
+ else {
+ memset(&lock, 0, sizeof(lock));
+ lock.l_type = F_WRLCK;
+ lock.l_whence = SEEK_SET;
+ if (fcntl(pty_fd, F_SETLK, &lock) != 0)
+ err(errno, "Could not lock tty '%s'",
+ pty_path);
+ }
}
free(pty_path);
}

View File

@ -0,0 +1,674 @@
GNU GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The GNU General Public License is a free, copyleft license for
software and other kinds of works.
The licenses for most software and other practical works are designed
to take away your freedom to share and change the works. By contrast,
the GNU General Public License is intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users. We, the Free Software Foundation, use the
GNU General Public License for most of our software; it applies also to
any other work released this way by its authors. You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.
To protect your rights, we need to prevent others from denying you
these rights or asking you to surrender the rights. Therefore, you have
certain responsibilities if you distribute copies of the software, or if
you modify it: responsibilities to respect the freedom of others.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must pass on to the recipients the same
freedoms that you received. You must make sure that they, too, receive
or can get the source code. And you must show them these terms so they
know their rights.
Developers that use the GNU GPL protect your rights with two steps:
(1) assert copyright on the software, and (2) offer you this License
giving you legal permission to copy, distribute and/or modify it.
For the developers' and authors' protection, the GPL clearly explains
that there is no warranty for this free software. For both users' and
authors' sake, the GPL requires that modified versions be marked as
changed, so that their problems will not be attributed erroneously to
authors of previous versions.
Some devices are designed to deny users access to install or run
modified versions of the software inside them, although the manufacturer
can do so. This is fundamentally incompatible with the aim of
protecting users' freedom to change the software. The systematic
pattern of such abuse occurs in the area of products for individuals to
use, which is precisely where it is most unacceptable. Therefore, we
have designed this version of the GPL to prohibit the practice for those
products. If such problems arise substantially in other domains, we
stand ready to extend this provision to those domains in future versions
of the GPL, as needed to protect the freedom of users.
Finally, every program is threatened constantly by software patents.
States should not allow patents to restrict development and use of
software on general-purpose computers, but in those that do, we wish to
avoid the special danger that patents applied to a free program could
make it effectively proprietary. To prevent this, the GPL assures that
patents cannot be used to render the program non-free.
The precise terms and conditions for copying, distribution and
modification follow.
TERMS AND CONDITIONS
0. Definitions.
"This License" refers to version 3 of the GNU General Public License.
"Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.
"The Program" refers to any copyrightable work licensed under this
License. Each licensee is addressed as "you". "Licensees" and
"recipients" may be individuals or organizations.
To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy. The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.
A "covered work" means either the unmodified Program or a work based
on the Program.
To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy. Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.
To "convey" a work means any kind of propagation that enables other
parties to make or receive copies. Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.
An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License. If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.
1. Source Code.
The "source code" for a work means the preferred form of the work
for making modifications to it. "Object code" means any non-source
form of a work.
A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.
The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form. A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.
The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities. However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work. For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.
The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.
The Corresponding Source for a work in source code form is that
same work.
2. Basic Permissions.
All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met. This License explicitly affirms your unlimited
permission to run the unmodified Program. The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work. This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.
You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force. You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright. Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.
Conveying under any other circumstances is permitted solely under
the conditions stated below. Sublicensing is not allowed; section 10
makes it unnecessary.
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.
When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.
4. Conveying Verbatim Copies.
You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.
You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.
5. Conveying Modified Source Versions.
You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:
a) The work must carry prominent notices stating that you modified
it, and giving a relevant date.
b) The work must carry prominent notices stating that it is
released under this License and any conditions added under section
7. This requirement modifies the requirement in section 4 to
"keep intact all notices".
c) You must license the entire work, as a whole, under this
License to anyone who comes into possession of a copy. This
License will therefore apply, along with any applicable section 7
additional terms, to the whole of the work, and all its parts,
regardless of how they are packaged. This License gives no
permission to license the work in any other way, but it does not
invalidate such permission if you have separately received it.
d) If the work has interactive user interfaces, each must display
Appropriate Legal Notices; however, if the Program has interactive
interfaces that do not display Appropriate Legal Notices, your
work need not make them do so.
A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit. Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.
6. Conveying Non-Source Forms.
You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:
a) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by the
Corresponding Source fixed on a durable physical medium
customarily used for software interchange.
b) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by a
written offer, valid for at least three years and valid for as
long as you offer spare parts or customer support for that product
model, to give anyone who possesses the object code either (1) a
copy of the Corresponding Source for all the software in the
product that is covered by this License, on a durable physical
medium customarily used for software interchange, for a price no
more than your reasonable cost of physically performing this
conveying of source, or (2) access to copy the
Corresponding Source from a network server at no charge.
c) Convey individual copies of the object code with a copy of the
written offer to provide the Corresponding Source. This
alternative is allowed only occasionally and noncommercially, and
only if you received the object code with such an offer, in accord
with subsection 6b.
d) Convey the object code by offering access from a designated
place (gratis or for a charge), and offer equivalent access to the
Corresponding Source in the same way through the same place at no
further charge. You need not require recipients to copy the
Corresponding Source along with the object code. If the place to
copy the object code is a network server, the Corresponding Source
may be on a different server (operated by you or a third party)
that supports equivalent copying facilities, provided you maintain
clear directions next to the object code saying where to find the
Corresponding Source. Regardless of what server hosts the
Corresponding Source, you remain obligated to ensure that it is
available for as long as needed to satisfy these requirements.
e) Convey the object code using peer-to-peer transmission, provided
you inform other peers where the object code and Corresponding
Source of the work are being offered to the general public at no
charge under subsection 6d.
A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.
A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling. In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage. For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product. A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.
"Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source. The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.
If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information. But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).
The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed. Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.
Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.
7. Additional Terms.
"Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law. If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.
When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it. (Additional permissions may be written to require their own
removal in certain cases when you modify the work.) You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.
Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:
a) Disclaiming warranty or limiting liability differently from the
terms of sections 15 and 16 of this License; or
b) Requiring preservation of specified reasonable legal notices or
author attributions in that material or in the Appropriate Legal
Notices displayed by works containing it; or
c) Prohibiting misrepresentation of the origin of that material, or
requiring that modified versions of such material be marked in
reasonable ways as different from the original version; or
d) Limiting the use for publicity purposes of names of licensors or
authors of the material; or
e) Declining to grant rights under trademark law for use of some
trade names, trademarks, or service marks; or
f) Requiring indemnification of licensors and authors of that
material by anyone who conveys the material (or modified versions of
it) with contractual assumptions of liability to the recipient, for
any liability that these contractual assumptions directly impose on
those licensors and authors.
All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10. If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term. If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.
If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.
Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.
8. Termination.
You may not propagate or modify a covered work except as expressly
provided under this License. Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).
However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.
Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.
Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License. If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.
9. Acceptance Not Required for Having Copies.
You are not required to accept this License in order to receive or
run a copy of the Program. Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance. However,
nothing other than this License grants you permission to propagate or
modify any covered work. These actions infringe copyright if you do
not accept this License. Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.
10. Automatic Licensing of Downstream Recipients.
Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License. You are not responsible
for enforcing compliance by third parties with this License.
An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations. If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.
You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License. For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.
11. Patents.
A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based. The
work thus licensed is called the contributor's "contributor version".
A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version. For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.
Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.
In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement). To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.
If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients. "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.
If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.
A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License. You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.
Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.
12. No Surrender of Others' Freedom.
If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all. For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.
13. Use with the GNU Affero General Public License.
Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU Affero General Public License into a single
combined work, and to convey the resulting work. The terms of this
License will continue to apply to the part which is the covered work,
but the special requirements of the GNU Affero General Public License,
section 13, concerning interaction through a network will apply to the
combination as such.
14. Revised Versions of this License.
The Free Software Foundation may publish revised and/or new versions of
the GNU General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the
Program specifies that a certain numbered version of the GNU General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation. If the Program does not specify a version number of the
GNU General Public License, you may choose any version ever published
by the Free Software Foundation.
If the Program specifies that a proxy can decide which future
versions of the GNU General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.
Later license versions may give you additional or different
permissions. However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.
15. Disclaimer of Warranty.
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. Limitation of Liability.
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.
17. Interpretation of Sections 15 and 16.
If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
{one line to give the program's name and a brief idea of what it does.}
Copyright (C) {year} {name of author}
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
If the program does terminal interaction, make it output a short
notice like this when it starts in an interactive mode:
{project} Copyright (C) {year} {fullname}
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License. Of course, your program's commands
might be different; for a GUI interface, you would use an "about box".
You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU GPL, see
<http://www.gnu.org/licenses/>.
The GNU General Public License does not permit incorporating your program
into proprietary programs. If your program is a subroutine library, you
may consider it more useful to permit linking proprietary applications with
the library. If this is what you want to do, use the GNU Lesser General
Public License instead of this License. But first, please read
<http://www.gnu.org/philosophy/why-not-lgpl.html>.

View File

@ -0,0 +1,28 @@
# xen-tools-xendomains-wait-disk
[xendomains.service](https://github.com/xen-project/xen/blob/RELEASE-4.13.0/tools/hotplug/Linux/systemd/xendomains.service.in) has problems
with disks that appear only later in the boot process (or even after booting is complete). This project creates a service that
loops over all disks that the domUs will use and waits for them to appear.
xendomains-wait-disk.service launches a script that reads both /etc/xen/auto/ configurations and /var/lib/xen/save/ dumps.
From those files, it extracts which disks are needed for all domU that will be started (respecting /etc/sysconfig/xendomains
settings). After that, it simply loops waiting for those disks to appear. There is a timeout (5 min) configured in
xendomains-wait-disk.service that prevents it from blocking the boot process forever.
There are two known cases where this project is useful:
## degraded mdadm RAID
mdadm RAID arrays are assembled by [udev rules](https://github.com/neilbrown/mdadm/blob/master/udev-md-raid-assembly.rules).
However, an array is only assembled when it is healthy. When a member is still missing, a [timer](https://github.com/neilbrown/mdadm/blob/master/systemd/mdadm-last-resort%40.timer) is started that will try to assemble the RAID anyway after 30s, even if degraded. This timer does not block xendomains from being started. So, if a domU depends on an MD RAID that is degraded (e.g. RAID 1 missing one disk), xendomains.service will be started before those 30s have passed and that domU will fail.
An alternative solution would be to add extra hard dependencies to xendomains.service for each required disk (Requires=xxx.device). However, this solution introduces another, bigger problem. Before, if a single RAID is degraded, only the domU that depends on it will fail. With Requires=xxx.device, xendomains will never start if
a RAID could not be assembled even after 30s (e.g. RAID5 with two missing disks).
With xendomains-wait-disk.service, xendomains.service will be blocked up to 5 min waiting for those MD RAID used by domUs. If it fails, xendomains.service
continues anyway.
## iSCSI disks
A domU that uses an iSCSI disk (mapped by the host OS) also fails to start during boot. open-iscsi.service returns before it connects to the remote target and rescans
iscsi disks. As in the mdadm RAID case, xendomains.service is started and any domU that depends on iSCSI disks will fail.

199
xendomains-wait-disks.sh Normal file
View File

@ -0,0 +1,199 @@
#!/bin/bash
#
# Waits for the disks needed by the xendomains guests to appear
#
# Print the configuration xl derives from a domain config file.
#   $1: path of the domain configuration file
# xl is invoked with --dryrun, so presumably no domain is actually created;
# whatever xl prints goes to stdout and its exit status is returned.
read_conf_from_file() {
    local cfg_file=$1
    ${sbindir}/xl create --quiet --dryrun --defconfig "$cfg_file"
}
# Reverse the byte order of a 32-bit value written as 8 hex digits.
#   $1: hex string, e.g. 04030201
# Prints the four bytes in reversed order, e.g. 01020304.
big2littleendian_32bit(){
    local word=$1
    local b0=${word:0:2} b1=${word:2:2} b2=${word:4:2} b3=${word:6:2}
    echo $b3$b2$b1$b0
}
# Read a run of bytes from a file and return it as a lowercase hex string.
#   $1: name of the variable that receives the hex string
#   $2: file to read from
#   $3: name of the variable holding the current byte offset; it is advanced
#       by $4 after the read
#   $4: number of bytes to read
# The hex dump is produced with xxd's default line width and the newlines are
# stripped afterwards: older xxd builds reject "-c" values above 256, and only
# the first line of a wrapped dump would otherwise be kept.
read_hex() {
    local out_var=$1 input=$2 pos_var=$3 length=$4
    local hex
    hex=$(dd bs=1 skip="${!pos_var}" count="$length" status=none <"$input" | xxd -p | tr -d '\n')
    # printf -v assigns through the variable *names* passed by the caller.
    printf -v "$pos_var" '%s' "$((${!pos_var} + length))"
    printf -v "$out_var" '%s' "$hex"
}
# Convert a hex string to its decimal value.
#   $1: hex digits without the 0x prefix
#   $2: "true" or "false" (run as a command); when true the value is first
#       passed through big2littleendian_32bit to swap its byte order
# Prints the decimal number on stdout.
hex2dec() {
    local digits=$1
    local swap_bytes=$2
    if $swap_bytes; then
        digits=$(big2littleendian_32bit $digits)
    fi
    echo $(( 0x$digits ))
}
# Extract the JSON domain configuration stored in an xl save image.
#   $1: path of the saved-domain file
# Prints the JSON on stdout. Returns 1 (after logging at $err) when the magic
# header, the byte-order marker or the mandatory flags are not the expected ones.
#
# Header fields, in the order they are read here:
#   32 bytes magic, 4 bytes byte-order marker, 4 bytes mandatory flags,
#   4 bytes optional flags, 4 bytes optional data length,
#   4 bytes config length, then the config itself.
read_conf_from_image(){
    local image=$1
    local pos=0
    # Kept local so the byte order of one image cannot leak into the next call.
    local little_endian
    local magic_header byte_order mandatory_flags optional_flags optional_data_len config_len config_json

    read_hex magic_header "$image" pos 32
    # "Xen saved domain, xl format\n \0 \r"
    if [ "$magic_header" != "58656e20736176656420646f6d61696e2c20786c20666f726d61740a2000200d" ]; then
        log $err "Unknown file format in $image. Wrong magic header: '0x$magic_header'"
        return 1
    fi

    read_hex byte_order "$image" pos 4
    case "$byte_order" in
        04030201) little_endian=true;;
        01020304) little_endian=false;;
        *) log $err "Unknown byte order 0x$byte_order in $image"; return 1;;
    esac

    #define XL_MANDATORY_FLAG_JSON (1U << 0) /* config data is in JSON format */
    #define XL_MANDATORY_FLAG_STREAMv2 (1U << 1) /* stream is v2 */
    read_hex mandatory_flags "$image" pos 4
    if [ "$(($(hex2dec $mandatory_flags $little_endian) & 0x3))" -ne 3 ]; then
        log $err "Unknown config format or stream version. Mandatory flags are 0x$mandatory_flags"
        return 1
    fi

    read_hex optional_flags "$image" pos 4
    read_hex optional_data_len "$image" pos 4
    optional_data_len=$(hex2dec $optional_data_len $little_endian)
    # Not used here, but the saved memory dump begins at $((pos+optional_data_len))

    read_hex config_len "$image" pos 4
    config_len=$(hex2dec $config_len $little_endian)
    # The config is a null terminated string: drop a trailing NUL byte (if any)
    # before decoding, since callers capture the output in a command
    # substitution, which cannot hold NUL bytes.
    read_hex config_json "$image" pos $config_len
    xxd -p -r <<<"${config_json%00}"
}
# Write a message to stderr unless its level is above the configured verbosity.
#   $1: numeric level of the message (lower means more severe)
#   remaining arguments: the message words, printed with echo
# Messages whose level is greater than $LOGLEVEL are dropped.
log() {
    local level=$1
    shift
    if ! [ "$level" -gt "$LOGLEVEL" ]; then
        echo "$@" >&2
    fi
}
# Numeric severities, used as "log $err ..." / "log $debug ...".
emerg=0; alert=1; crit=2; err=3
warning=4; notice=5; info=6; debug=7
LOGLEVEL=${LOGLEVEL:-4}
# SYSTEMD_LOG_LEVEL overrides LOGLEVEL. It may hold a number or one of the
# level names above; anything else is ignored so LOGLEVEL always stays numeric
# (a blind ${!SYSTEMD_LOG_LEVEL} would expand "7" as positional parameter 7).
case "$SYSTEMD_LOG_LEVEL" in
    "")
        ;;
    [0-7])
        LOGLEVEL=$SYSTEMD_LOG_LEVEL
        ;;
    emerg|alert|crit|err|warning|notice|info|debug)
        LOGLEVEL=${!SYSTEMD_LOG_LEVEL}
        ;;
    *)
        echo "Ignoring unsupported SYSTEMD_LOG_LEVEL '$SYSTEMD_LOG_LEVEL'" >&2
        ;;
esac
log $debug "Using loglevel $LOGLEVEL"
# Report the failing line whenever a command returns non-zero.
trap "log $err Error on \$LINENO: \$(caller)" ERR
log $debug "loading /etc/xen/scripts/hotplugpath.sh..."
. /etc/xen/scripts/hotplugpath.sh
#log $debug "testing for ${sbindir}/xl..."
#CMD=${sbindir}/xl
#if ! $CMD list &> /dev/null; then
# log $err "${sbindir}/xl list failed!"
# log $err "$($CMD list &>&1)"
# exit $?
#fi
#log $debug "${sbindir}/xl list OK!"
log $debug "loading /etc/sysconfig/xendomains..."
XENDOM_CONFIG=/etc/sysconfig/xendomains
if ! test -r "$XENDOM_CONFIG"; then
    echo "$XENDOM_CONFIG not existing" >&2
    exit 6
fi
. "$XENDOM_CONFIG"
# Parallel arrays, one entry per domain that is going to be started:
#   doms_conf    - configuration text of the domain
#   doms_restore - true when the entry comes from a saved image
#   doms_source  - file the configuration was read from
doms_conf=()
doms_restore=()
doms_source=()

# Saved domains are only considered when restoring is enabled.
log $debug "Reading saved domains..."
if [ "$XENDOMAINS_RESTORE" = "true" ] && [ -d "$XENDOMAINS_SAVE" ]; then
    for dom in "$XENDOMAINS_SAVE"/*; do
        log $debug "Trying $dom..."
        if ! [ -r "$dom" ] ; then
            log $debug "Not readable $dom..."
            continue
        fi
        log $debug "Reading conf from $dom..."
        if ! dom_conf=$(read_conf_from_image "$dom"); then
            log $err "Cannot read conf from $dom"
            continue
        fi
        log $debug "Adding $dom to the list"
        doms_conf+=("$dom_conf")
        doms_restore+=(true)
        doms_source+=("$dom")
    done
fi

log $debug "Reading auto domains..."
if [ -d "$XENDOMAINS_AUTO" ]; then
    for dom in "$XENDOMAINS_AUTO"/*; do
        log $debug "Trying $dom..."
        if ! [ -r "$dom" ] ; then
            log $debug "Not readable $dom..."
            continue
        fi
        log $debug "Reading conf from $dom..."
        if ! dom_conf=$(read_conf_from_file "$dom"); then
            log $err "Cannot read conf from $dom"
            continue
        fi
        log $debug "Adding $dom to the list"
        doms_conf+=("$dom_conf")
        doms_restore+=(false)
        doms_source+=("$dom")
    done
fi
# For every collected domain, start one background waiter per distinct disk
# path and block until all of them have seen their disk appear.
log $debug "We have ${#doms_conf[*]} to check"
# Disk paths that already have a waiter, keyed by the full path so that two
# different paths can never be mistaken for each other.
declare -A disks_seen=()
for i in ${!doms_conf[*]}; do
    log $debug "Doing dom $i..."
    dom_conf="${doms_conf[i]}"
    dom_restore="${doms_restore[i]}"
    dom_source="${doms_source[i]}"
    dom_name=$(sed -n 's/^.*(name \(.*\))$/\1/p;s/^.*"name": "\(.*\)",$/\1/p' <<<"$dom_conf")
    readarray -t required_disks <<<"$(sed -n -e '/^ "disks": \[/,/ \],/{ /"pdev_path":/ { s/.*"pdev_path": "//;s/".*//p } }' <<<"$dom_conf")"
    log $debug "dom $i is named $dom_name..."
    for disk in "${required_disks[@]}"; do
        # The here-string above yields a single empty entry when the domain
        # has no disks; waiting for "" would never finish.
        if [ -z "$disk" ]; then
            continue
        fi
        if [ -n "${disks_seen["$disk"]:-}" ]; then
            log $debug "$disk for $dom_name is already being checked"
            continue
        fi
        disks_seen["$disk"]=1
        log $debug "waiting for $disk for $dom_name"
        (
            # Poll once per second; after the first "still waiting" warning the
            # success message is raised to warning level as well.
            j=0 found_loglevel=$debug
            while true; do
                if [ -e "$disk" ]; then
                    log $found_loglevel "disk $disk found (after $j seconds)"
                    exit 0
                fi
                if [ "$(( j++ % 5))" -eq 0 ]; then
                    log $warning "still waiting for $disk for $dom_name..."
                    found_loglevel=$warning
                fi
                sleep 1
            done
        ) &
    done
done
# Block until every background waiter has exited.
wait
log $debug "Exiting normally"

View File

@ -0,0 +1,54 @@
References: fate#323663 - Run Xenstore in stubdomain
--- a/tools/hotplug/Linux/init.d/sysconfig.xencommons.in
+++ b/tools/hotplug/Linux/init.d/sysconfig.xencommons.in
@@ -8,7 +8,7 @@
XENCONSOLED_TRACE=
## Type: string
-## Default: daemon
+## Default: domain
#
# Select type of xentore service.
#
@@ -81,14 +81,14 @@ XENSTORED_TRACE=
XENSTORE_DOMAIN_KERNEL=
## Type: integer
-## Default: 8
+## Default: 32
#
# xenstore domain memory size in MiB.
# Only evaluated if XENSTORETYPE is "domain".
XENSTORE_DOMAIN_SIZE=
## Type: string
-## Default: not set, no autoballooning of xenstore domain
+## Default: 1/100
#
# Maximum xenstore domain memory size. Can be specified as:
# - plain integer value for max size in MiB
--- a/tools/hotplug/Linux/launch-xenstore.in
+++ b/tools/hotplug/Linux/launch-xenstore.in
@@ -48,7 +48,7 @@ test_xenstore && exit 0
test -f @CONFIG_DIR@/@CONFIG_LEAF_DIR@/xencommons && . @CONFIG_DIR@/@CONFIG_LEAF_DIR@/xencommons
-[ "$XENSTORETYPE" = "" ] && XENSTORETYPE=daemon
+[ "$XENSTORETYPE" = "" ] && XENSTORETYPE=domain
/bin/mkdir -p @XEN_RUN_DIR@
@@ -95,9 +95,10 @@ test -f @CONFIG_DIR@/@CONFIG_LEAF_DIR@/x
[ "$XENSTORETYPE" = "domain" ] && {
[ -z "$XENSTORE_DOMAIN_KERNEL" ] && XENSTORE_DOMAIN_KERNEL=@LIBEXEC@/boot/xenstore-stubdom.gz
XENSTORE_DOMAIN_ARGS="$XENSTORE_DOMAIN_ARGS --kernel $XENSTORE_DOMAIN_KERNEL"
- [ -z "$XENSTORE_DOMAIN_SIZE" ] && XENSTORE_DOMAIN_SIZE=8
+ [ -z "$XENSTORE_DOMAIN_SIZE" ] && XENSTORE_DOMAIN_SIZE=32
XENSTORE_DOMAIN_ARGS="$XENSTORE_DOMAIN_ARGS --memory $XENSTORE_DOMAIN_SIZE"
- [ -z "$XENSTORE_MAX_DOMAIN_SIZE" ] || XENSTORE_DOMAIN_ARGS="$XENSTORE_DOMAIN_ARGS --maxmem $XENSTORE_MAX_DOMAIN_SIZE"
+ [ -z "$XENSTORE_MAX_DOMAIN_SIZE" ] && XENSTORE_MAX_DOMAIN_SIZE="1/100"
+ XENSTORE_DOMAIN_ARGS="$XENSTORE_DOMAIN_ARGS --maxmem $XENSTORE_MAX_DOMAIN_SIZE"
echo -n Starting $XENSTORE_DOMAIN_KERNEL...
${LIBEXEC_BIN}/init-xenstore-domain $XENSTORE_DOMAIN_ARGS || exit 1

108
xenwatchdogd-restart.patch Normal file
View File

@ -0,0 +1,108 @@
References: bsc#1178736
Allow restart of xenwatchdogd in case it terminated unexpectedly.
Index: xen-4.14.0-testing/tools/misc/xenwatchdogd.c
===================================================================
--- xen-4.14.0-testing.orig/tools/misc/xenwatchdogd.c
+++ xen-4.14.0-testing/tools/misc/xenwatchdogd.c
@@ -9,12 +9,16 @@
#include <unistd.h>
#include <signal.h>
#include <stdio.h>
+#include <libgen.h>
+#include <syslog.h>
xc_interface *h;
int id = 0;
+static const char id_file[] = "/run/xenwatchdog_id.txt";
-void daemonize(void)
+static void daemonize(const char *str)
{
+ const char *err_str = "";
switch (fork()) {
case -1:
err(1, "fork");
@@ -23,7 +27,9 @@ void daemonize(void)
default:
exit(0);
}
- umask(0);
+#define err(x,s) do { err_str = (s); goto out; } while (0)
+ openlog(str, LOG_CONS, LOG_DAEMON);
+ umask(~(S_IRUSR|S_IWUSR));
if (setsid() < 0)
err(1, "setsid");
if (chdir("/") < 0)
@@ -34,6 +40,10 @@ void daemonize(void)
err(1, "reopen stdout");
if(freopen("/dev/null", "w", stderr) == NULL)
err(1, "reopen stderr");
+ return;
+out:
+ syslog(LOG_ERR, "%s: %m", err_str);
+ exit(1);
}
void catch_exit(int sig)
@@ -47,18 +57,21 @@ void catch_usr1(int sig)
{
if (id)
xc_watchdog(h, id, 0);
+ unlink(id_file);
exit(0);
}
int main(int argc, char **argv)
{
+ FILE *f;
int t, s;
int ret;
+ const char *err_str = "";
if (argc < 2)
errx(1, "usage: %s <timeout> <sleep>", argv[0]);
- daemonize();
+ daemonize(basename(argv[0]));
h = xc_interface_open(NULL, NULL, 0);
if (h == NULL)
@@ -86,9 +99,25 @@ int main(int argc, char **argv)
if (signal(SIGUSR1, &catch_usr1) == SIG_ERR)
err(1, "signal");
- id = xc_watchdog(h, 0, t);
- if (id <= 0)
- err(1, "xc_watchdog setup");
+ f = fopen(id_file, "r");
+ if (f) {
+ if (fscanf(f, "%d", &id) != 1)
+ id = -1;
+ if (id <= 0)
+ err(1, "xc_watchdog setup");
+ syslog(LOG_INFO, "reusing id %d", id);
+ fclose(f);
+ } else {
+ id = xc_watchdog(h, 0, t);
+ syslog(LOG_INFO, "obtained id %d", id);
+ if (id <= 0)
+ err(1, "xc_watchdog setup");
+ f = fopen(id_file, "w");
+ if (f) {
+ fprintf(f, "%d\n", id);
+ fclose(f);
+ }
+ }
for (;;) {
sleep(s);
@@ -96,4 +125,8 @@ int main(int argc, char **argv)
if (ret != 0)
err(1, "xc_watchdog");
}
+
+out:
+ syslog(LOG_ERR, "%s: %m", err_str);
+ exit(1);
}

View File

@ -0,0 +1,13 @@
Index: xen-4.14.0-testing/tools/examples/xl.conf
===================================================================
--- xen-4.14.0-testing.orig/tools/examples/xl.conf
+++ xen-4.14.0-testing/tools/examples/xl.conf
@@ -34,7 +34,7 @@
#vif.default.script="vif-bridge"
# default bridge device to use with vif-bridge hotplug scripts
-#vif.default.bridge="xenbr0"
+vif.default.bridge="br0"
# Reserve a claim of memory when launching a guest. This guarantees immediate
# feedback whether the guest can be launched due to memory exhaustion

View File

@ -0,0 +1,13 @@
Index: xen-4.14.0-testing/tools/examples/xl.conf
===================================================================
--- xen-4.14.0-testing.orig/tools/examples/xl.conf
+++ xen-4.14.0-testing/tools/examples/xl.conf
@@ -7,7 +7,7 @@
# Control whether dom0 is ballooned down when xen doesn't have enough
# free memory to create a domain. "auto" means only balloon if dom0
# starts with all the host's memory.
-#autoballoon="auto"
+autoballoon="off"
# full path of the lockfile used by xl during domain creation
#lockfile="/var/lock/xl"

180
xl-save-pc.patch Normal file
View File

@ -0,0 +1,180 @@
References: bug#1176189
Usage of xl save -p|-c will suspend the domU.
As a result the monitoring xl process will get a LIBXL_EVENT_TYPE_DOMAIN_SHUTDOWN/LIBXL_SHUTDOWN_REASON_SUSPEND event.
This will cause it to exit because it does not know the -p/-c flags were used to keep the domU active.
As a result the final shutdown will not destroy the domU.
Write a flag to xenstore to let the monitoring process know about the usage of -p/-c.
Remove the flag once the suspend is done.
Recognize the flag in the monitoring process.
Keep going if the flag is seen.
Watch again for @releaseDomain events.
Keep going if the event type and shutdown reason remain the same.
---
tools/xl/Makefile | 3 ++-
tools/xl/xl.h | 1 +
tools/xl/xl_saverestore.c | 15 ++++++++++++
tools/xl/xl_vmcontrol.c | 48 +++++++++++++++++++++++++++++++++++++++
4 files changed, 66 insertions(+), 1 deletion(-)
--- a/tools/xl/Makefile
+++ b/tools/xl/Makefile
@@ -26,6 +26,7 @@ XL_OBJS += xl_vmcontrol.o xl_saverestore
XL_OBJS += xl_vdispl.o xl_vsnd.o xl_vkb.o
$(XL_OBJS): CFLAGS += $(CFLAGS_libxentoollog)
+$(XL_OBJS): CFLAGS += $(CFLAGS_libxenstore)
$(XL_OBJS): CFLAGS += $(CFLAGS_XL)
$(XL_OBJS): CFLAGS += -include $(XEN_ROOT)/tools/config.h # libxl_json.h needs it.
@@ -33,7 +34,7 @@ $(XL_OBJS): CFLAGS += -include $(XEN_ROO
all: xl
xl: $(XL_OBJS)
- $(CC) $(LDFLAGS) -o $@ $(XL_OBJS) $(LDLIBS_libxenutil) $(LDLIBS_libxenlight) $(LDLIBS_libxentoollog) -lyajl $(APPEND_LDFLAGS)
+ $(CC) $(LDFLAGS) -o $@ $(XL_OBJS) $(LDLIBS_libxenutil) $(LDLIBS_libxenlight) $(LDLIBS_libxentoollog) $(LDLIBS_libxenstore) -lyajl $(APPEND_LDFLAGS)
.PHONY: install
install: all
--- a/tools/xl/xl.h
+++ b/tools/xl/xl.h
@@ -303,6 +303,7 @@ typedef enum {
DOMAIN_RESTART_SOFT_RESET, /* Soft reset should be performed */
} domain_restart_type;
+#define XL_SAVE_PAUSE_CHECKPOINT "suse-xl-save-pc"
extern void printf_info_sexp(int domid, libxl_domain_config *d_config, FILE *fh);
extern void apply_global_affinity_masks(libxl_domain_type type,
libxl_bitmap *vcpu_affinity_array,
--- a/tools/xl/xl_saverestore.c
+++ b/tools/xl/xl_saverestore.c
@@ -21,6 +21,7 @@
#include <time.h>
#include <unistd.h>
+#include <xenstore.h>
#include <libxl.h>
#include <libxl_utils.h>
#include <libxlutil.h>
@@ -127,6 +128,8 @@ static int save_domain(uint32_t domid, i
const char *filename, int checkpoint,
int leavepaused, const char *override_config_file)
{
+ struct xs_handle *xsh = NULL;
+ char path[80];
int fd;
uint8_t *config_data;
int config_len;
@@ -144,12 +147,24 @@ static int save_domain(uint32_t domid, i
fprintf(stderr, "Failed to open temp file %s for writing\n", filename);
exit(EXIT_FAILURE);
}
+ if (leavepaused || checkpoint)
+ {
+ snprintf(path, sizeof(path), "/libxl/%u/" XL_SAVE_PAUSE_CHECKPOINT, domid);
+ xsh = xs_open(0);
+ if (xsh)
+ xs_write(xsh, XBT_NULL, path, leavepaused ? "p" : "c", 1);
+ }
save_domain_core_writeconfig(fd, filename, config_data, config_len);
int rc = libxl_domain_suspend_suse(ctx, domid, fd, &props, NULL);
close(fd);
+ if (xsh) {
+ xs_rm(xsh, XBT_NULL, path);
+ xs_close(xsh);
+ }
+
if (rc < 0) {
fprintf(stderr, "Failed to save domain, resuming domain\n");
libxl_domain_resume(ctx, domid, 1, 0);
--- a/tools/xl/xl_vmcontrol.c
+++ b/tools/xl/xl_vmcontrol.c
@@ -22,6 +22,7 @@
#include <time.h>
#include <unistd.h>
+#include <xenstore.h>
#include <libxl.h>
#include <libxl_utils.h>
#include <libxlutil.h>
@@ -668,6 +669,10 @@ int create_domain(struct domain_create *
int migrate_fd = dom_info->migrate_fd;
bool config_in_json;
+ libxl_event_type type = 0;
+ uint8_t shutdown_reason = 0;
+ bool is_in_suspend = false;
+
int i;
int need_daemon = daemonize;
int ret, rc;
@@ -1034,6 +1039,24 @@ start:
ret = domain_wait_event(domid, &event);
if (ret) goto out;
+ if (is_in_suspend) {
+ if ( type == event->type && event->u.domain_shutdown.shutdown_reason == shutdown_reason) {
+ struct timespec req = { .tv_nsec = 123456789, };
+ libxl_evdisable_domain_death(ctx, deathw);
+ deathw = NULL;
+ ret = libxl_evenable_domain_death(ctx, domid, 0, &deathw);
+ if (ret) goto out;
+ libxl_event_free(ctx, event);
+ LOG("Domain %u still suspended", domid);
+ nanosleep(&req, NULL);
+ continue;
+ }
+ is_in_suspend = false;
+ LOG("Domain %u left suspend state", domid);
+ }
+ type = event->type;
+ shutdown_reason = event->u.domain_shutdown.shutdown_reason;
+
switch (event->type) {
case LIBXL_EVENT_TYPE_DOMAIN_SHUTDOWN:
@@ -1095,14 +1118,39 @@ start:
goto start;
case DOMAIN_RESTART_NONE:
+ {
+ struct xs_handle *xsh = xs_open(0);
+
+ if (xsh) {
+ char path[80];
+ unsigned int len = 0;
+ char *val;
+
+ snprintf(path, sizeof(path), "/libxl/%u/" XL_SAVE_PAUSE_CHECKPOINT, domid);
+ val = xs_read(xsh, XBT_NULL, path, &len);
+ xs_close(xsh);
+ LOG("Got %p '%s' from %s, len %u", val, val ?:"", path, len);
+ free(val);
+ if (val)
+ {
+ is_in_suspend = true;
+ libxl_evdisable_domain_death(ctx, deathw);
+ deathw = NULL;
+ ret = libxl_evenable_domain_death(ctx, domid, 0, &deathw);
+ if (ret) goto out;
+ break;
+ }
+ }
LOG("Done. Exiting now");
libxl_event_free(ctx, event);
ret = 0;
goto out;
+ }
default:
abort();
}
+ break;
case LIBXL_EVENT_TYPE_DOMAIN_DEATH:
LOG("Domain %u has been destroyed.", domid);