197 lines
7.6 KiB
Diff
197 lines
7.6 KiB
Diff
Index: nfs-utils-1.1.3/utils/mount/nfs.man
|
||
===================================================================
|
||
--- nfs-utils-1.1.3.orig/utils/mount/nfs.man
|
||
+++ nfs-utils-1.1.3/utils/mount/nfs.man
|
||
@@ -438,6 +438,8 @@ Specifying
|
||
.B proto=udp
|
||
forces all traffic types to use UDP.
|
||
.IP
|
||
+.B Before using NFS over UDP, please refer to the section WARNINGS below.
|
||
+.IP
|
||
If the
|
||
.B proto
|
||
mount option is not specified, the
|
||
@@ -452,6 +454,8 @@ The
|
||
option is an alternative to specifying
|
||
.BR proto=udp.
|
||
It is included for compatibility with other operating systems.
|
||
+.IP
|
||
+.B Before using NFS over UDP, please refer to the section WARNINGS below.
|
||
.TP 1.5i
|
||
.B tcp
|
||
The
|
||
@@ -799,7 +803,7 @@ dropped requests, but this can result in
|
||
and server load.
|
||
.P
|
||
However, UDP can be quite effective in specialized settings where
|
||
-the network’s MTU is large relative to NFS’s data transfer size (such
|
||
+the network's MTU is large relative to NFS's data transfer size (such
|
||
as network environments that enable jumbo Ethernet frames). In such
|
||
environments, trimming the
|
||
.B rsize
|
||
@@ -811,6 +815,8 @@ in a single frame) is advised. This r
|
||
the loss of a single MTU-sized network frame results in the loss of
|
||
an entire large read or write request.
|
||
.P
|
||
+Please see also the WARNINGS section below.
|
||
+.P
|
||
TCP is the default transport protocol used for all modern NFS
|
||
implementations. It performs well in almost every conceivable
|
||
network environment and provides excellent guarantees against data
|
||
@@ -1179,6 +1185,83 @@ of Access Control Lists that are semanti
|
||
NFS version 4 ACLs are not fully compatible with POSIX ACLs; as such,
|
||
some translation between the two is required
|
||
in an environment that mixes POSIX ACLs and NFS version 4.
|
||
+.SH WARNINGS
|
||
+Using NFS over UDP on high-speed links such as Gigabit Ethernet
|
||
+.BR "can cause silent data corruption" .
|
||
+.P
|
||
+The problem can be triggered at high loads, and is caused by problems in
|
||
+IP fragment reassembly. NFS reads and writes typically transmit UDP packets
|
||
+of 4 Kilobytes or more, which have to be broken up into several fragments
|
||
+in order to be sent over the Ethernet link, which limits packets to 1500
|
||
+bytes by default. This process happens at the IP network layer and is
|
||
+called fragmentation.
|
||
+.P
|
||
+In order to identify fragments that belong together, IP assigns a 16-bit
|
||
+.I IP ID
|
||
+value to each packet; fragments generated from the same UDP packet
|
||
+will have the same IP ID. The receiving system will collect these
|
||
+fragments and combine them to form the original UDP packet. This process
|
||
+is called reassembly. The default timeout for packet reassembly is
|
||
+30 seconds; if the network stack does not receive all fragments of
|
||
+a given packet within this interval, it assumes the missing fragment(s)
|
||
+got lost and discards those it already received.
|
||
+.P
|
||
+The problem this creates over high-speed links is that it is possible
|
||
+to send more than 65536 packets within 30 seconds. In fact, with
|
||
+heavy NFS traffic one can observe that the IP IDs repeat after about
|
||
+5 seconds.
|
||
+.P
|
||
+This has serious effects on reassembly: if one fragment gets lost,
|
||
+another fragment
|
||
+.I from a different packet
|
||
+but with the
|
||
+.I same IP ID
|
||
+will arrive within the 30 second timeout, and the network stack will
|
||
+combine these fragments to form a new packet. Most of the time, network
|
||
+layers above IP will detect this mismatched reassembly - in the case
|
||
+of UDP, the UDP checksum, which is a 16-bit checksum over the entire
|
||
+packet payload, will usually not match, and UDP will discard the
|
||
+bad packet.
|
||
+.P
|
||
+However, the UDP checksum is only 16 bits wide, so there is a chance of 1 in
|
||
+65536 that it will match even if the packet payload is completely
|
||
+random (which very often isn't the case). If the checksum does match,
|
||
+silent data corruption will occur.
|
||
+.P
|
||
+This risk should be taken seriously, at least on Gigabit
|
||
+Ethernet.
|
||
+Network speeds of 100Mbit/s should be considered less
|
||
+problematic, because with most traffic patterns IP ID wrap-around
|
||
+will take much longer than 30 seconds.
|
||
+.P
|
||
+It is therefore strongly recommended to use
|
||
+.BR "NFS over TCP where possible" ,
|
||
+since TCP segments data to fit the MTU and thereby avoids IP fragmentation.
|
||
+.P
|
||
+If you absolutely have to use NFS over UDP over Gigabit Ethernet,
|
||
+some steps can be taken to mitigate the problem and reduce the
|
||
+probability of corruption:
|
||
+.TP +1.5i
|
||
+.I Jumbo frames:
|
||
+Many Gigabit network cards are capable of transmitting
|
||
+frames bigger than the 1500 byte limit of traditional Ethernet, typically
|
||
+9000 bytes. Using jumbo frames of 9000 bytes will allow you to run NFS over
|
||
+UDP at a page size of 8K without fragmentation. Of course, this is
|
||
+only feasible if all involved stations support jumbo frames.
|
||
+.IP
|
||
+To enable a machine to send jumbo frames on cards that support it,
|
||
+it is sufficient to configure the interface for an MTU value of 9000.
|
||
+.TP +1.5i
|
||
+.I Lower reassembly timeout:
|
||
+By lowering this timeout below the time it takes the IP ID counter
|
||
+to wrap around, incorrect reassembly of fragments can be prevented
|
||
+as well. To do so, simply write the new timeout value (in seconds)
|
||
+to the file
|
||
+.BR /proc/sys/net/ipv4/ipfrag_time .
|
||
+.IP
|
||
+A value of 2 seconds will greatly reduce the probability of IP ID clashes on
|
||
+a single Gigabit link, while still allowing for a reasonable timeout
|
||
+when receiving fragmented traffic from distant peers.
|
||
.SH FILES
|
||
.TP 1.5i
|
||
.I /etc/fstab
|
||
Index: nfs-utils-1.1.3/utils/mount/nfsmount.c
|
||
===================================================================
|
||
--- nfs-utils-1.1.3.orig/utils/mount/nfsmount.c
|
||
+++ nfs-utils-1.1.3/utils/mount/nfsmount.c
|
||
@@ -263,6 +263,9 @@ parse_options(char *old_opts, struct nfs
|
||
if (!strcmp(opteq+1, "udp")) {
|
||
nfs_pmap->pm_prot = IPPROTO_UDP;
|
||
mnt_pmap->pm_prot = IPPROTO_UDP;
|
||
+ fprintf(stderr,
|
||
+ "Using NFS over UDP can cause data corruption.\n"
|
||
+ "Please refer to the WARNINGS section of the nfs(5) manual page.\n");
|
||
#if NFS_MOUNT_VERSION >= 2
|
||
data->flags &= ~NFS_MOUNT_TCP;
|
||
} else if (!strcmp(opteq+1, "tcp") &&
|
||
Index: nfs-utils-1.1.3/utils/mount/stropts.c
|
||
===================================================================
|
||
--- nfs-utils-1.1.3.orig/utils/mount/stropts.c
|
||
+++ nfs-utils-1.1.3/utils/mount/stropts.c
|
||
@@ -389,14 +389,28 @@ static struct mount_options *nfs_rewrite
|
||
po_remove_all(options, "proto");
|
||
}
|
||
if (strcmp(option, "udp") == 0) {
|
||
+ static int once = 0;
|
||
nfs_server.pmap.pm_prot = IPPROTO_UDP;
|
||
po_remove_all(options, "proto");
|
||
+ if (!once)
|
||
+ fprintf(stderr,
|
||
+ "Using NFS over UDP can cause data corruption.\n"
|
||
+ "Please refer to the WARNINGS section of the nfs(5) manual page.\n");
|
||
+ once = 1;
|
||
+
|
||
}
|
||
}
|
||
p = po_rightmost(options, "tcp", "udp");
|
||
switch (p) {
|
||
case PO_KEY2_RIGHTMOST:
|
||
nfs_server.pmap.pm_prot = IPPROTO_UDP;
|
||
+ {static int once = 0;
|
||
+ if (!once)
|
||
+ fprintf(stderr,
|
||
+ "Using NFS over UDP can cause data corruption.\n"
|
||
+ "Please refer to the WARNINGS section of the nfs(5) manual page.\n");
|
||
+ once = 1;
|
||
+ }
|
||
break;
|
||
case PO_KEY1_RIGHTMOST:
|
||
nfs_server.pmap.pm_prot = IPPROTO_TCP;
|
||
@@ -565,11 +579,24 @@ static int nfs_try_nfs23mount(struct nfs
|
||
static int nfs_try_nfs4mount(struct nfsmount_info *mi)
|
||
{
|
||
char **extra_opts = mi->extra_opts;
|
||
+ char *proto;
|
||
+ static int once = 0;
|
||
|
||
if (po_join(mi->options, extra_opts) == PO_FAILED) {
|
||
errno = EIO;
|
||
return 0;
|
||
}
|
||
+ if (po_contains(mi->options, "udp"))
|
||
+ proto = "udp";
|
||
+ else
|
||
+ proto = po_get(mi->options, "proto");
|
||
+ if (proto && strcmp(proto, "udp") == 0 && !once) {
|
||
+ fprintf(stderr,
|
||
+ "Using NFS over UDP can cause data corruption.\n"
|
||
+ "Please refer to the WARNINGS section of the nfs(5) manual page.\n");
|
||
+ once=1;
|
||
+ }
|
||
+
|
||
|
||
if (verbose)
|
||
printf(_("%s: text-based options: '%s'\n"),
|