Index: nfs-utils-1.1.3/utils/mount/nfs.man =================================================================== --- nfs-utils-1.1.3.orig/utils/mount/nfs.man +++ nfs-utils-1.1.3/utils/mount/nfs.man @@ -438,6 +438,8 @@ Specifying .B proto=udp forces all traffic types to use UDP. .IP +.B Before using NFS over UDP, please refer to the section WARNINGS below. +.IP If the .B proto mount option is not specified, the @@ -452,6 +454,8 @@ The option is an alternative to specifying .BR proto=udp. It is included for compatibility with other operating systems. +.IP +.B Before using NFS over UDP, please refer to the section WARNINGS below. .TP 1.5i .B tcp The @@ -799,7 +803,7 @@ dropped requests, but this can result in and server load. .P However, UDP can be quite effective in specialized settings where -the network’s MTU is large relative to NFS’s data transfer size (such +the network's MTU is large relative to NFS's data transfer size (such as network environments that enable jumbo Ethernet frames). In such environments, trimming the .B rsize @@ -811,6 +815,8 @@ in a single frame) is advised. This r the loss of a single MTU-sized network frame results in the loss of an entire large read or write request. .P +Please see also the WARNINGS section below. +.P TCP is the default transport protocol used for all modern NFS implementations. It performs well in almost every conceivable network environment and provides excellent guarantees against data @@ -1179,6 +1185,83 @@ of Access Control Lists that are semanti NFS version 4 ACLs are not fully compatible with POSIX ACLs; as such, some translation between the two is required in an environment that mixes POSIX ACLs and NFS version 4. +.SH WARNINGS +Using NFS over UDP on high-speed links such as Gigabit +.BR "can cause silent data corruption" . +.P +The problem can be triggered at high loads, and is caused by problems in +IP fragment reassembly. 
NFS reads and writes typically transmit UDP packets +of 4 Kilobytes or more, which have to be broken up into several fragments +in order to be sent over the Ethernet link, which limits packets to 1500 +bytes by default. This process happens at the IP network layer and is +called fragmentation. +.P +In order to identify fragments that belong together, IP assigns a 16-bit +.I IP ID +value to each packet; fragments generated from the same UDP packet +will have the same IP ID. The receiving system will collect these +fragments and combine them to form the original UDP packet. This process +is called reassembly. The default timeout for packet reassembly is +30 seconds; if the network stack does not receive all fragments of +a given packet within this interval, it assumes the missing fragment(s) +got lost and discards those it already received. +.P +The problem this creates over high-speed links is that it is possible +to send more than 65536 packets within 30 seconds. In fact, with +heavy NFS traffic one can observe that the IP IDs repeat after about +5 seconds. +.P +This has serious effects on reassembly: if one fragment gets lost, +another fragment +.I from a different packet +but with the +.I same IP ID +will arrive within the 30-second timeout, and the network stack will +combine these fragments to form a new packet. Most of the time, network +layers above IP will detect this mismatched reassembly - in the case +of UDP, the UDP checksum, which is a 16-bit checksum over the entire +packet payload, will usually not match, and UDP will discard the +bad packet. +.P +However, the UDP checksum is only 16 bits wide, so there is a chance of 1 in +65536 that it will match even if the packet payload is completely +random (which very often isn't the case). If the checksum does match, +silent data corruption will occur. +.P +This risk should be taken seriously, at least on Gigabit +Ethernet. 
+Network speeds of 100 Mbit/s should be considered less +problematic, because with most traffic patterns IP ID wrap-around +will take much longer than 30 seconds. +.P +It is therefore strongly recommended to use +.BR "NFS over TCP where possible" , +since TCP does not perform fragmentation. +.P +If you absolutely have to use NFS over UDP over Gigabit Ethernet, +some steps can be taken to mitigate the problem and reduce the +probability of corruption: +.TP +1.5i +.I Jumbo frames: +Many Gigabit network cards are capable of transmitting +frames bigger than the 1500-byte limit of traditional Ethernet, typically +9000 bytes. Using jumbo frames of 9000 bytes will allow you to run NFS over +UDP at a page size of 8K without fragmentation. Of course, this is +only feasible if all involved stations support jumbo frames. +.IP +To enable a machine to send jumbo frames on cards that support them, +it is sufficient to configure the interface for an MTU value of 9000. +.TP +1.5i +.I Lower reassembly timeout: +By lowering the reassembly timeout below the time it takes the IP ID counter +to wrap around, incorrect reassembly of fragments can be prevented +as well. To do so, simply write the new timeout value (in seconds) +to the file +.BR /proc/sys/net/ipv4/ipfrag_time . +.IP +A value of 2 seconds will greatly reduce the probability of IP ID clashes on +a single Gigabit link, while still allowing for a reasonable timeout +when receiving fragmented traffic from distant peers. 
.SH FILES .TP 1.5i .I /etc/fstab Index: nfs-utils-1.1.3/utils/mount/nfsmount.c =================================================================== --- nfs-utils-1.1.3.orig/utils/mount/nfsmount.c +++ nfs-utils-1.1.3/utils/mount/nfsmount.c @@ -263,6 +263,9 @@ parse_options(char *old_opts, struct nfs if (!strcmp(opteq+1, "udp")) { nfs_pmap->pm_prot = IPPROTO_UDP; mnt_pmap->pm_prot = IPPROTO_UDP; + fprintf(stderr, + "Using NFS over UDP can cause data corruption.\n" + "Please refer to the WARNINGS section of the nfs(5) manual page.\n"); #if NFS_MOUNT_VERSION >= 2 data->flags &= ~NFS_MOUNT_TCP; } else if (!strcmp(opteq+1, "tcp") && Index: nfs-utils-1.1.3/utils/mount/stropts.c =================================================================== --- nfs-utils-1.1.3.orig/utils/mount/stropts.c +++ nfs-utils-1.1.3/utils/mount/stropts.c @@ -389,14 +389,28 @@ static struct mount_options *nfs_rewrite po_remove_all(options, "proto"); } if (strcmp(option, "udp") == 0) { + static int once = 0; nfs_server.pmap.pm_prot = IPPROTO_UDP; po_remove_all(options, "proto"); + if (!once) + fprintf(stderr, + "Using NFS over UDP can cause data corruption.\n" + "Please refer to the WARNINGS section of the nfs(5) manual page.\n"); + once = 1; + } } p = po_rightmost(options, "tcp", "udp"); switch (p) { case PO_KEY2_RIGHTMOST: nfs_server.pmap.pm_prot = IPPROTO_UDP; + {static int once = 0; + if (!once) + fprintf(stderr, + "Using NFS over UDP can cause data corruption.\n" + "Please refer to the WARNINGS section of the nfs(5) manual page.\n"); + once = 1; + } break; case PO_KEY1_RIGHTMOST: nfs_server.pmap.pm_prot = IPPROTO_TCP; @@ -565,11 +579,24 @@ static int nfs_try_nfs23mount(struct nfs static int nfs_try_nfs4mount(struct nfsmount_info *mi) { char **extra_opts = mi->extra_opts; + char *proto; + static int once = 0; if (po_join(mi->options, extra_opts) == PO_FAILED) { errno = EIO; return 0; } + if (po_contains(mi->options, "udp")) + proto = "udp"; + else + proto = po_get(mi->options, "proto"); + if 
(proto && strcmp(proto, "udp") == 0 && !once) { + fprintf(stderr, + "Using NFS over UDP can cause data corruption.\n" + "Please refer to the WARNINGS section of the nfs(5) manual page.\n"); + once=1; + } + if (verbose) printf(_("%s: text-based options: '%s'\n"),