From 03056832957e60714a68d3e21fee9f6a28dc0f09 Mon Sep 17 00:00:00 2001 From: binh <> Date: Fri, 11 Feb 2005 04:09:07 +0000 Subject: [PATCH] More consolidation, hence the removal of several files. Binh. --- .../Linux-Networking/Bandwidth-Limiting.xml | 63 - .../docbook/Linux-Networking/Bridges.xml | 751 -- .../Linux-Networking/Compressed-TCP.xml | 236 - .../Linux-Networking/Connectivity-Devices.xml | 1147 +++ .../Firewalling-and-Masquerading.xml | 322 - .../Linux-Networking/IP-Accounting.xml | 134 - .../docbook/Linux-Networking/IP-Aliasing.xml | 377 - LDP/guide/docbook/Linux-Networking/ISDN.xml | 101 - .../docbook/Linux-Networking/Internet.xml | 21 - .../Linux-Networking/Load-Balancing.xml | 24 - .../docbook/Linux-Networking/Multicast.xml | 1465 ---- LDP/guide/docbook/Linux-Networking/NIS.xml | 1618 ----- .../Networking-Management.xml | 60 - .../docbook/Linux-Networking/Overview.xml | 22 + .../Protocols-and-Standards.xml | 1721 +++++ .../docbook/Linux-Networking/Redundancy.xml | 11 - .../Linux-Networking/Redundant-Networking.xml | 19 - .../docbook/Linux-Networking/Routing.xml | 6446 ----------------- .../docbook/Linux-Networking/Services.xml | 1051 +++ 19 files changed, 3941 insertions(+), 11648 deletions(-) delete mode 100644 LDP/guide/docbook/Linux-Networking/Bandwidth-Limiting.xml delete mode 100644 LDP/guide/docbook/Linux-Networking/Bridges.xml delete mode 100644 LDP/guide/docbook/Linux-Networking/Compressed-TCP.xml delete mode 100644 LDP/guide/docbook/Linux-Networking/Firewalling-and-Masquerading.xml delete mode 100644 LDP/guide/docbook/Linux-Networking/IP-Accounting.xml delete mode 100644 LDP/guide/docbook/Linux-Networking/IP-Aliasing.xml delete mode 100644 LDP/guide/docbook/Linux-Networking/ISDN.xml delete mode 100644 LDP/guide/docbook/Linux-Networking/Internet.xml delete mode 100644 LDP/guide/docbook/Linux-Networking/Load-Balancing.xml delete mode 100644 LDP/guide/docbook/Linux-Networking/Multicast.xml delete mode 100644 
LDP/guide/docbook/Linux-Networking/NIS.xml delete mode 100644 LDP/guide/docbook/Linux-Networking/Networking-Management.xml delete mode 100644 LDP/guide/docbook/Linux-Networking/Redundancy.xml delete mode 100644 LDP/guide/docbook/Linux-Networking/Redundant-Networking.xml delete mode 100644 LDP/guide/docbook/Linux-Networking/Routing.xml diff --git a/LDP/guide/docbook/Linux-Networking/Bandwidth-Limiting.xml b/LDP/guide/docbook/Linux-Networking/Bandwidth-Limiting.xml deleted file mode 100644 index d0c2fb9b..00000000 --- a/LDP/guide/docbook/Linux-Networking/Bandwidth-Limiting.xml +++ /dev/null @@ -1,63 +0,0 @@ - - -Bandwidth-Limiting - - -This section describes how to set up your Linux server to limit download -bandwidth or incoming traffic and how to use your internet link more -efficiently. It is meant to provide an easy solution for limiting -incoming traffic, thus preventing our LAN users from consuming all the -bandwidth of our internet link. This is useful when our internet link -is slow or our LAN users download tons of mp3s and the newest Linux -distro's *.iso files. - - -* Bandwidth Limiting HOWTO - -6. Miscellaneous - -6.1. 
Useful resources - -Squid Web Proxy Cache -[http://www.squid-cache.org] http://www.squid-cache.org - -Squid 2.4 Stable 1 Configuration manual -[http://www.visolve.com/squidman/Configuration%20Guide.html] http:// -www.visolve.com/squidman/Configuration%20Guide.html -[http://www.visolve.com/squidman/Delaypool%20parameters.htm] http:// -www.visolve.com/squidman/Delaypool%20parameters.htm - -Squid FAQ -[http://www.squid-cache.org/Doc/FAQ/FAQ-19.html#ss19.8] http:// -www.squid-cache.org/Doc/FAQ/FAQ-19.html#ss19.8 - -cbq-init script -[ftp://ftp.equinox.gu.net/pub/linux/cbq/] ftp://ftp.equinox.gu.net/pub/linux/ -cbq/ - -Linux 2.4 Advanced Routing HOWTO -[http://www.linuxdoc.org/HOWTO/Adv-Routing-HOWTO.html] http:// -www.linuxdoc.org/HOWTO/Adv-Routing-HOWTO.html - -Traffic control (in Polish) -[http://ceti.pl/~kravietz/cbq/] http://ceti.pl/~kravietz/cbq/ - -Securing and Optimizing Linux Red Hat Edition - A Hands on Guide -[http://www.linuxdoc.org/guides.html] http://www.linuxdoc.org/guides.html - -IPTraf -[http://cebu.mozcom.com/riker/iptraf/] http://cebu.mozcom.com/riker/iptraf/ - -IPCHAINS -[http://www.linuxdoc.org/HOWTO/IPCHAINS-HOWTO.html] http://www.linuxdoc.org/ -HOWTO/IPCHAINS-HOWTO.html - -Nylon socks proxy server -[http://mesh.eecs.umich.edu/projects/nylon/] http://mesh.eecs.umich.edu/ -projects/nylon/ - -Indonesian translation of this HOWTO by Rahmat Rafiudin mjl_id@yahoo.com -[http://raf.unisba.ac.id/resources/BandwidthLimitingHOWTO/index.html] http:// -raf.unisba.ac.id/resources/BandwidthLimitingHOWTO/index.html - - diff --git a/LDP/guide/docbook/Linux-Networking/Bridges.xml b/LDP/guide/docbook/Linux-Networking/Bridges.xml deleted file mode 100644 index e52f1106..00000000 --- a/LDP/guide/docbook/Linux-Networking/Bridges.xml +++ /dev/null @@ -1,751 +0,0 @@ - - -Bridges - - -This section describes how to setup an ethernet bridge. What is an ethernet -bridge? 
An ethernet bridge is a device that controls data packets within a -subnet in an attempt to cut down the amount of traffic. A bridge is usually -placed between two separate groups of computers that talk within themselves, -but not so much with the computers in the other group. A good example of this -is to consider a cluster of Macintoshes and a cluster of Unix machines. Both -of these groups of machines tend to be quite chatty amongst themselves, and -the traffic they produce on the network causes collisions for the other -machines that are trying to speak to one another. A bridge would be placed -between these groups of computers. The job of the bridge is then to examine -the destination of the data packets one at a time and decide whether or not -to pass the packets to the other side of the ethernet segment. The result is -a faster, quieter network with fewer collisions. - - - -Several bridges can work together to create even larger networks of Ethernets -using the IEEE 802.1D spanning tree algorithm. As this is a standard, Linux -bridges will interoperate properly with other third-party bridge products. -Additional packages allow filtering based on IP, IPX or MAC addresses. - - - -The section immediately below provides a quick guide on how to create -a bridge. - - -1. 
Setup - -  * Get Bridge Config: [ftp://ftp.tux.org/people/alan-cox/BRCFG.tgz] - BRCFG.tgz - -  * BRCFG may also be found at: [http://coledd.com/networking/bridge/] http:/ - /coledd.com/networking/bridge - -  * Enable multiple ethernet devices on your machine by adding this line to - your /etc/lilo.conf, and re-run lilo: - +---------------------------------------------------------------+ - |append = "ether=0,0,eth1" | - +---------------------------------------------------------------+ - - If you have three interfaces on your bridge, use this line instead: - +---------------------------------------------------------------+ - |append = "ether=0,0,eth1 ether=0,0,eth2" | - +---------------------------------------------------------------+ - - More interfaces can be found by adding more ether statements. By default - a stock Linux kernel probes for a single ethercard, and once one is found - the probe ceases. The above append statement tells the kernel to keep - probing for more ethernet devices after the first one is found. - Alternatively, the boot parameter can be used instead: - +---------------------------------------------------------------+ - |linux ether=0,0,eth1 | - +---------------------------------------------------------------+ - - Or, with 3 interfaces, use: - +---------------------------------------------------------------+ - |linux ether=0,0,eth1 ether=0,0,eth2 | - +---------------------------------------------------------------+ - -  * Recompile the kernel with BRIDGING enabled. - -  * A bridge should not have an IP address. It CAN, but a plain bridge - doesn't need one. To remove the IP address from your bridge, go to /etc/ - sysconfig/network-scripts/ (for a RedHat system) and copy ifcfg-lo0 to - ifcfg-eth0 & ifcfg-eth1. In these two new files, change the line - containing DEVICE=lo to DEVICE=eth0 and DEVICE=eth1. Since other - distributions may deviate from this, you may need to refer to additional - documentation. 
If there are more than 2 interfaces to this bridge, be - sure to make the corresponding configurations to those, as well. - -  * Reboot so you are running the new kernel with BRIDGING in it, and also to - make sure that an IP addresses are not bound to the network interfaces. - -  * Once the system is backed up, put the ethernet cards into promiscuous - mode, so they will look at every packet that passes by its interface: - - -This section provide a guide on how to create an ethernet bridge and -add a 'netfilter' system. - - - Setting up an ethernet bridge gives us the chance to integrate a sur­ - veying and/or regulating instance transparently into an existing net­ - work. This setup requires no changes to the logical network topology. - It is accomplished by plugging the ethernet bridge in the physical - network topology between the network itself and the routing instance - (that piece of hardware connected to the Internet). - - 1. Introduction - - Ethernet bridges connect two or more distinct ethernet segments - transparently. - An ethernet bridge distributes ethernet frames coming in on one port - to other ports associated to the bridge interface. This is - accomplished with brain: Whenever the bridge knows on which port the - MAC address to which the frame is to be delivered is located it - forwards this frame only to this only port instead of polluting all - ports together. - Ethernet interfaces can be added to an existing bridge interface and - become then (logical) ports of the bridge interface. - Putting a netfilter structure on top of a bridge interface renders the - bridge capable of servicing filtering mechanisms. This way, a - transparent filtering instance can be created. It even needs no IP - address assigned to work. Of course, you can assign an IP address to - the bridge interface for maintenance purposes ( certainly, with ssh - only ;-). - The advantage of this system is evident. 
Transparency relieves the - network administrator of the pain of restructuring the network - topology. And users may not notice the existence of the bridge, only - their connection being blocked. Also, users are not disturbed while - working (think of a company where network connection loss costs a lot). - The other common case is a client being connected to the global web - via a leased router. As the providers seldom grant administration - privileges on their leased hardware, the client cannot change the - interconnecting configuration. But, of course, the client has a - network running, and wants to spend as little as possible; he does not - want to reconfigure his entire network. And he does not need to if he - uses a bridging device. - - 2. Required software - - This software setup is needed on the ethernet bridge computer. - According to our ``Testing grounds''. - - 2.1. Featured Linux kernel - - As of kernel version 2.4.18 there's already support for the Ethernet - Bridge capability built-in. No patches needed so far. - But if we intend to use netfilter capabilities, because we want to run - iptables on our new Linux router/fw box, we still need to apply a - patch. Any patches needed can be found and downloaded on the - ``sourceforge Ethernet Bridge homepage''. - - root@bridge:~> cd /usr/src/ - root@bridge:~> wget -c http://bridge.sourceforge.net/devel/bridge-nf/bridge-nf-0.0.7-against-2.4.18.diff - root@bridge:~> cd /usr/src/linux/ - root@bridge:~> patch -p1 -i ../bridge-nf/bridge-nf-0.0.7-against-2.4.18.diff - - Supposing we want netfilter support on our bridge interface and we - have already patched the vanilla kernel, we may now activate some - necessary kernel configuration items. On how to build a private kernel - image see the CD-Net-Install-HOWTO, Toolbox . Oh, yeah, it's still in German - only. Hm, I have to fix this some time.. 
- - Nevertheless, we start by now: In - - Code maturity level options - - we activate - - [*] Prompt for development and/or incomplete code/drivers - - and in - - Loadable module support - - [*] Enable loadable module support - [*] Set version information on all module symbols - [*] Kernel module loader - - Ok, so far so good. Now, we go to - - Networking options - - and mark - - [*] Network packet filtering (replaces ipchains) - [*] Network packet filtering debugging - - Furthermore, in - - IP: Netfilter Configuration ---> - - we mark any item we need as module. Now the long awaited item: acti­ - vate - - 802.1d Ethernet Bridging - - as well as - - [*] netfilter (firewalling) support - - Note: - The above entry is available only if we successfully patched our - kernel! - - Finally, we just need a successful - - root@bridge:~> make dep clean bzImage modules modules_install - - cycle and we're done. Don't forget to edit /etc/lilo.conf and do - - root@bridge:~> lilo -t - root@bridge:~> lilo - root@bridge:~> reboot - - , though. - - Hint: - Perhaps we might mark our new kernel as the bridge kernel? We vi - the toplevel Makefile in our kernel sources and edit the head - line called EXTRAVERSION =. We may actually set it to, say - bridge? ;-) - After the modules_install we find the fresh modules in - /lib/modules/2.4.18bridge - - 2.2. Userspace tool: brctl - - Once our kernel has the capabilities needed to perform Ethernet Bridge - and netfilter actions, we prepare the user space tool brctl. brctl is - the configuration tool we use to ``set up'' anything to suit our - needs. - - We ``download the source tarball'', unpack it and change directory - into it. - - root@bridge:~> wget -c http://bridge.sourceforge.net/bridge-utils/bridge-utils-0.9.5.tar.gz - root@bridge:~> tar xvzf bridge-utils-0.9.5.tar.gz - root@bridge:~> cd bridge-utils-0.9.5 - - At this time, read the README and the files in the doc/ subdirectory. 
- Then do a simple make and copy the resulting brctl/brctl executable to - /sbin/. - - root@bridge:~> make - root@bridge:~> cp -vi brctl/brctl /sbin/ - - This is it. Go for ``Setup'' now. - - 3. Set Linux up to serve - - 3.1. Setting up the bridge - - We need Linux to know about the bridge. First tell it that we want one - virtual ethernet bridge interface: (this is to be executed on host - bridge, of course. See ``Testing grounds'') - - root@bridge:~> brctl addbr br0 - - Second, we do not need the STP (Spanning Tree Protocol). I.e. we do - only have one single router, so a loop is highly improbable. We may - then deactivate this feature. (Results in less polluted networking - environment, too): - - root@bridge:~> brctl stp br0 off - - After these preparations, we now do finally some effective commands. - We add our two (or even more) physical ethernet interfaces. That - means, we attach them to the just born logical (virtual) bridge inter­ - face br0. - - root@bridge:~> brctl addif br0 eth0 - root@bridge:~> brctl addif br0 eth1 - - Now, our two previously physical ethernet interfaces became a logical - bridge port each. Erm, ok, there were and will be the physical - devices. They are still there, go have a look ;-) But now they became - part of the logical bridge device and therefore need no IP configura­ - tion any longer. So release the IPs: - - root@bridge:~> ifconfig eth0 down - root@bridge:~> ifconfig eth1 down - root@bridge:~> ifconfig eth0 0.0.0.0 up - root@bridge:~> ifconfig eth1 0.0.0.0 up - - Great! We now have a box w/o any IP attached. So if you were configur­ - ing your future fw/router via TP, go for your local console now ;-)) - You have a serial console? Happy one :-) - - Optional: - We tell Linux the new (logical) interface and associate one - single IP with it: - - root@bridge:~> ifconfig br0 10.0.3.129 up - - And we're done. - Read the ``Important Note''! - - 3.2. 
Setting up the routing - - In case we are configuring a gateway we enable the forwarding in the - linux kernel. - - root@bridge:~> echo "1" > /proc/sys/net/ipv4/ip_forward - - Our box already has an IP assigned but no default route. We solve this - now: - - root@bridge:~> route add default gw 10.0.3.129 - - Finally, we should have a working net from, to and through the gate­ - way. - - 4. Test your new bridged environment! - - 4.1. Testing Grounds - - We imagine this scenario or similar: - /\ - Ethernet Ethernet ATM /-/ \ - --------- --------- --------- /-/ | - | Box |----------|Bridge |----------|Router |-----| Inter- \ - --------- --------- --------- \ net ---| - ^ ^ ^ ^ \ / - | | | | \---/ - eth0 eth0 eth1 if0 ^ - | | | | | - 10.0.3.2 none/10.0.3.1 195.137.15.7 anything else - \ / - \ / - ^ \-br0-/ - | ^ ^ - | ^ | | - | | | | - own own foreign hostile - - Our administrative power includes only machines marked with own, the - Router is completely off-limits and so is the Internet, of course. - That means, if we want to control the flying bits'n'bytes on the eth­ - ernet wire we can chose to integrate a common firewall or file in a - bridge. - Drawback of the standard way is you have to change the default gateway - route on every and any single host in your net. And this is really a - heavy weighting drawback, nobody wants to change more than 5 default - routes on 5 different hosts more than one time. Keep the time in mind, - this will consume, also! Not to forget, this is a error-prone way to - handle the more about security.. - The other way is clean, less time-consuming, more secure and less - error-prone. More secure in that we won't have the need to assign any - IP address. No IP, no danger. So far the theory, we hope, our stacks - are safe. (Although this hope should better not relied on..) The over­ - all advantage is, this bridge-setup is completely transparent, no IP, - MAC, .. changes at all. - So it's up to you to chose your preferred method. 
But we will handle - just the fancy one here ;-) - - 4.2. Ping it, Jim! - - We will configure the Box' eth0 as usual. The bridge's interfaces are - configured as described in ``Setup''. - - If we are to use forwarding we might perhaps do this one: ;-) - - root@bridge:~> echo "1" > /proc/sys/net/ipv4/ip_forward - - Optionally, we set up a default route: - - root@bridge:~> route add default gw 10.0.3.129 - - Then we set up some iptables rules on host bridge: - - root@bridge:~> iptables -P FORWARD DROP - root@bridge:~> iptables -F FORWARD - root@bridge:~> iptables -I FORWARD -j ACCEPT - root@bridge:~> iptables -I FORWARD -j LOG - root@bridge:~> iptables -I FORWARD -j DROP - root@bridge:~> iptables -A FORWARD -j DROP - root@bridge:~> iptables -x -v --line-numbers -L FORWARD - - The last line gives us the following output: - - Chain FORWARD (policy DROP 0 packets, 0 bytes) - num pkts bytes target prot opt in out source destination - 1 0 0 DROP all -- any any anywhere anywhere - 2 0 0 LOG all -- any any anywhere anywhere LOG level warning - 3 0 0 ACCEPT all -- any any anywhere anywhere - 4 0 0 DROP all -- any any anywhere anywhere - - The LOG target logs every packet via syslogd. Beware, this is intended - for testing purposes only, remove in production environment. Else you - end up either with filled logs and harddisk partitions by you yourself - or anyone else does this Denial of Service to you. You've been warned. - Test this ruleset now. Ping the router interface's IP (195.137.15.7) - on host box: - - root@box:~> ping -c 3 195.137.15.7 - PING router.provider.net (195.137.15.7) from 10.0.3.2 : 56(84) bytes of data. - --- router.provider.net ping statistics --- - 3 packets transmitted, 0 received, 100% loss, time 2020ms - ^C - root@box:~> - - By default, we DROP everything. No response, no logged packet. This - netfilter setup is designed to DROP all packets unless we delete the - rule that drops every packet (rule no. 
1 above) before the LOG target - matches: - - root@bridge:~> iptables -D FORWARD 1 - root@bridge:~> iptables -x -v --line-numbers -L FORWARD - - Now, the rules are: - - Chain FORWARD (policy DROP 0 packets, 0 bytes) - num pkts bytes target prot opt in out source destination - 2 0 0 LOG all -- any any anywhere anywhere LOG level warning - 3 0 0 ACCEPT all -- any any anywhere anywhere - 4 0 0 DROP all -- any any anywhere anywhere - - And any packet may pass through. Test it with a ping on host box: - - root@box:~> ping -c 3 195.137.15.7 - PING router.provider.net (195.137.15.7) from 10.0.3.2 : 56(84) bytes of data. - 64 bytes from router.provider.net (195.137.15.7): icmp_seq=1 ttl=255 time=0.103 ms - 64 bytes from router.provider.net (195.137.15.7): icmp_seq=2 ttl=255 time=0.082 ms - 64 bytes from router.provider.net (195.137.15.7): icmp_seq=3 ttl=255 time=0.083 ms - - --- router.provider.net ping statistics --- - 3 packets transmitted, 3 received, 0% loss, time 2002ms - rtt min/avg/max/mdev = 0.082/0.089/0.103/0.012 ms - root@box:~> - - Yippeah! The router is alive, up and running. (Well it has been all - day long.. ;-) - - Important Note: - After we have just fired up the bridge interface it takes - roughly 30 seconds until the bridge is fully operational. This - is due to the 30-second learning phase of the bridge interface. - During this phase, the bridge ports are learning what MAC - addresses exist on what port. The bridge author, Lennert, tells - us in his TODO file that the 30-second learning phase is subject - to some improvement in a timely manner some time. - During the learning phase, no packet will be forwarded and no ping will be - answered. Keep this in mind! - - 4.3. Actual configuration - - This section is intended to give you, dear reader, some hints about - how your system should look and feel after having processed this howto - successfully. - - 4.3.1. 
Interface configuration - - The output of your ifconfig command might look similar to this: - - root@bridge:~> ifconfig - br0 Link encap:Ethernet HWaddr 00:04:75:81:D2:1D - inet addr:10.0.3.129 Bcast:195.30.198.255 Mask:255.255.255.128 - UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1 - RX packets:826 errors:0 dropped:0 overruns:0 frame:0 - TX packets:737 errors:0 dropped:0 overruns:0 carrier:0 - collisions:0 txqueuelen:0 - RX bytes:161180 (157.4 Kb) TX bytes:66708 (65.1 Kb) - - eth0 Link encap:Ethernet HWaddr 00:04:75:81:ED:B7 - UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1 - RX packets:5729 errors:0 dropped:0 overruns:0 frame:0 - TX packets:3115 errors:0 dropped:0 overruns:0 carrier:656 - collisions:0 txqueuelen:100 - RX bytes:1922290 (1.8 Mb) TX bytes:298837 (291.8 Kb) - Interrupt:11 Base address:0xe400 - - eth1 Link encap:Ethernet HWaddr 00:04:75:81:D2:1D - UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1 - RX packets:0 errors:0 dropped:0 overruns:1 frame:0 - TX packets:243 errors:0 dropped:0 overruns:0 carrier:0 - collisions:0 txqueuelen:100 - RX bytes:342 (342.0 b) TX bytes:48379 (47.2 Kb) - Interrupt:7 Base address:0xe800 - - lo Link encap:Local Loopback - inet addr:127.0.0.1 Mask:255.0.0.0 - UP LOOPBACK RUNNING MTU:16436 Metric:1 - RX packets:1034 errors:0 dropped:0 overruns:0 frame:0 - TX packets:1034 errors:0 dropped:0 overruns:0 carrier:0 - collisions:0 txqueuelen:0 - RX bytes:82068 (80.1 Kb) TX bytes:82068 (80.1 Kb) - - 4.3.2. Routing configuration - - The output of your route command might look similar to this: - - root@bridge:~> route -n - Kernel IP routing table - Destination Gateway Genmask Flags Metric Ref Use Iface - 10.0.3.129 0.0.0.0 255.255.255.128 U 0 0 0 br0 - 0.0.0.0 10.0.3.129 0.0.0.0 UG 0 0 0 br0 - root@bridge:~> - - 4.3.3. Iptables configuration - - Please have a look at the ``Ping it, Jim!'' section. - - 4.4. 
Note - - Apparently, there must have been a bug in the br-nf code: - - From: Bart De Schuymer - Date: Sun, 1 Sep 2002 21:52:46 +0200 - To: Nils Radtke - Subject: Re: Ethernet-Brigde-netfilter-HOWTO - - Hello Nils, - - [...] - Also, network packet filtering debugging is generally a bad idea with the - br-nf patch. It can gives a lot of false warnings (about bugs) in the logs. - [...] - - Personally, I never had false positives in my log. Maybe, that bug has - been fixed. This mailed to Bart, he wrote: - - From: Bart De Schuymer - Date: Mon, 2 Sep 2002 18:30:25 +0200 - To: Nils Radtke - Subject: Re: Ethernet-Brigde-netfilter-HOWTO - - On Monday 02 September 2002 00:39, Nils Radtke wrote: - > Will the revision of the nf-debug code in br-nf be subject of improvement? - - I must admit I haven't been running any kernel with netfilter debugging - lately. It sure used to give false positives a few months ago (the bridge - mailing list has posts about that), I've been lacking time to see why and if - it is still the case. It's on my todo list. - [...] - - But (as of writing this 2002-09-19) I haven't found an official - announcement, this particular bug has been closed. So have a constant - look at this topic on the ``ethernet bridge mailinglist'' , if you are - interested in it's cure. - - 5. Links - - The Howto's author may be contacted via e-mail . - Howto Author's homepage . - - 5.1. Ethernet-Bridge - - - · Ethernet Bridge Mailinglist - - - · User space utilities, patches, etc.: Home of Linux kernel Ethernet - Bridge - - · Bridge-STP-HOWTO - - · Firewalling for Free, Shawn Grimes - <./additional_docs/Firewalling_for_Free.pdf> - - 5.2. 
Related Topics - - · Filtering on frame level, Ethernet-Bridging-Tables: - ebtables, sourceforge - ebtables, homepage at pandora.be - - ebtables, supported features - - ebtables, examples: basic - , - advanced - - ebtables, in-depth documentation - - ebtables, Hacking HOWTO - - - · IP mode, Linux Bridge extension: IP mode, LVS - - - · Linux in High-Availability environments: High-Availability Linux - - - · Linux Virtual Server: LVS - - -This section describes how to unite two separate ethernet LANs with an IP -tunnel between them. It is short and designed for impatient people or those -who lack time. - - -1. How does it work? - -You can transparently bridge traffic between 2 ethernet LANs to unite them, -if both of them are connected to Internet. - -There is no way to do a "real" bridge, you can only bridge third level -protocols, which linux knows how to route, but ethernet traffic with those -protocols will seem bridged. You can make 2 ethernet bridges, to bridge IP -and/or IPX traffic. You cannot transparently bridge any other third level -protocols between distinct LANs. You should read the rest of this document to -determine whether you can bridge any other protocol. ------------------------------------------------------------------------------ - -1.1. Bridging IP over ethernet traffic between 2 LANs. - -If you have: -+---------------------------------------------------------------------------+ -|PC1 (192.168.0.1 /24)--| | -|PC3 (192.168.0.3 /24)--| | -|PC5 (192.168.0.5 /24)--|--[ eth0 - bridge_1 - eth1 (195.0.0.1) ] | -| | -|PC253 (192.168.0.253/24)--| | -| | (192.168.0.2 /24) PC2 | -| | (192.168.0.4 /24) PC4 | -|[ (192.0.0.1) eth1 - bridge_2 - eth0 ] --| (192.168.0.6 /24) PC6 | -| | -| | (192.168.0.254/24) PC254 | -+---------------------------------------------------------------------------+ - -bridge_1 and bridge_2 are your Linux bridges and externally connected to the -Internet interface eth1. 
So 195.0.0.1 and 192.0.0.1 can be any valid Internet -addresses given to you by your ISP. - -So, you should: - - 1. Get two linux computers with kernels 2.2 or 2.4. Kernels should be - compiled with PPP and Advanced Router. You also need the iproute2 package - properly installed. Information on iproute2 can be found in - Configure.help of your kernel in the comments under Advanced Router. You - also need the following utilities: - -   + pppd (PPP daemon) - [ftp://cs.anu.edu.au/pub/software/ppp/] ftp:// - cs.anu.edu.au/pub/software/ppp/ - -   + PopTop (PPTP server) - [http://poptop.lineo.com/] http:// - poptop.lineo.com - -   + PPTP (Linux PPTP Client, by C.S. Ananian) - [http:// - www.pdos.lcs.mit.edu/~cananian/Projects/PPTP/] http:// - www.pdos.lcs.mit.edu/~cananian/Projects/PPTP/ - -   + tarpd (a trivial proxy arp daemon) - [http://www.cs.hut.fi/~tricky/ - utils/net/tarpd-1.6.tar.gz] htp://www.cs.hut.fi/~tricky/utils/net/ - tarpd-1.6.tar.gz - - - You can also find them on [http://www.freshmeat.net] http:// - www.freshmeat.net - - Please, keep in mind that you need special patches for pppd and the - kernel if you want to do MS Chap and MS Encryption (MPPE). Refer to the - PoPTop manual for instructions on how to get and install these patches. - - 2. Connect your routers to Internet, or establish any other communication - between them with the exception of IP. - - 3. Make a PPTP tunnel between them. There are example configurations in the - PoPToP (server) and pptp (client) manuals. - - 4. Now you should have two bridges and an IP tunnel between then, possibly - encrypted (refer to the PPP manual). Let's configure bridging. - - 5. 
Remember that the bridge is really a router, so we need to run the - following commands on our bridges (this assumes bridge_1 and bridge_2 are - IP addresses, assigned to each end of the PPTP tunnel between bridges): - - +---------------------------------------------------------------+ - | bridge_1$ip route add 192.168.0.2 via bridge_2 | - | bridge_1$ip route add 192.168.0.4 via bridge_2 | - | bridge_1$ip route add 192.168.0.6 via bridge_2 | - | | - | bridge_1$ip route add 192.168.0.254 via bridge_2 | - | bridge_1$ip route add 192.168.0.255 via bridge_2 | - | | - +---------------------------------------------------------------+ - - On the other side: - +---------------------------------------------------------------+ - | bridge_2$ip route add 192.168.0.1 via bridge_1 | - | bridge_2$ip route add 192.168.0.3 via bridge_1 | - | bridge_2$ip route add 192.168.0.5 via bridge_1 | - | | - | bridge_2$ip route add 192.168.0.253 via bridge_1 | - | | - +---------------------------------------------------------------+ - - This will tell each of bridges which hosts are on the other side. You can - do the same with the old-style route command. 
It will look like: - +---------------------------------------------------------------+ - | bridge_1$route add -host 192.168.0.2 gw bridge_2 | - | bridge_1$route add -host 192.168.0.4 gw bridge_2 | - | bridge_1$route add -host 192.168.0.6 gw bridge_2 | - | | - | bridge_1$route add -host 192.168.0.254 gw bridge_2 | - | bridge_1$route add -host 192.168.0.255 gw bridge_2 | - | | - +---------------------------------------------------------------+ - - On the other side: - +---------------------------------------------------------------+ - | bridge_2$route add -host 192.168.0.1 gw bridge_1 | - | bridge_2$route add -host 192.168.0.3 gw bridge_1 | - | bridge_2$route add -host 192.168.0.5 gw bridge_1 | - | | - | bridge_2$route add -host 192.168.0.253 gw bridge_1 | - | | - +---------------------------------------------------------------+ - - Please note once more that bridge_1 and bridge_2 are not IP addresses - given by your ISP, but IP addresses which you assigned to each end of the - PPTP tunnel. - - 6. Now you have two bridges and each of them knows where to find a - particular IP. But how do you tell those computers to send their traffic - for the remote network to the local bridge? You need tarpd. - - tarpd is a very simple daemon, which replies to arp requests for certain - IP addresses. You only need to run a tarpd on each bridge, and specify - the list of IP addresses found on the remote end. 
- - For example, for those two bridges you should run: - +---------------------------------------------------------------+ - | bridge_1$tarpd eth0 192.168.0.2 255.255.255.255 \ | - | 192.168.0.4 255.255.255.255 \ | - | | - | 192.168.0.254 255.255.255.255 | - | | - +---------------------------------------------------------------+ - - On the other side: - +---------------------------------------------------------------+ - | bridge_2$tarpd eth0 192.168.0.1 255.255.255.255 \ | - | 192.168.0.3 255.255.255.255 \ | - | | - | 192.168.0.253 255.255.255.255 | - | | - +---------------------------------------------------------------+ - - You specify 128 remote pairs (IP/mask. Mask should be 255.255.255.255 in - order not to confuse tarpd!) on each bridge. - - 7. Enjoy your bridges! - ------------------------------------------------------------------------------ -1.2. What about other protocols? - -Really, I can say nothing about other protocol routing. I never used them. -But I suppose if you are familiar with other protocols, it should not be too -difficult to bridge it this way. ------------------------------------------------------------------------------ - - Related HOWTOs: - - · Bridge+Firewall - - - · Bridge - - diff --git a/LDP/guide/docbook/Linux-Networking/Compressed-TCP.xml b/LDP/guide/docbook/Linux-Networking/Compressed-TCP.xml deleted file mode 100644 index 214887ae..00000000 --- a/LDP/guide/docbook/Linux-Networking/Compressed-TCP.xml +++ /dev/null @@ -1,236 +0,0 @@ - - -Compressed-TCP - - -In the past, we used to compress files in order to save disk space. -Today, disk space is cheap - but bandwidth is limited. By compressing -data streams such as TCP/IP-Sessions using SSH-like tools, you achieve -two goals: - - - 1) You save bandwidth/transfered volume (that is important if you have - to pay for traffic or if your network is loaded.). - 2) Speeding up low-bandwidth connections (Modem, GSM, ISDN). 
- - -This HowTo explains how to save both bandwith and connection time by -using tools like SSH1, SSH2, OpenSSH or LSH. - - -2. Compressing HTTP/FTP,... - - -My office is connected with a 64KBit ISDN line to the internet, so the -maximum transfer rate is about 7K/s. You can speed up the connection -by compressing it: when I download files, Netscape shows up a transfer -rate of up to 40K/s (Logfiles are compressable by factor 15). SSH is a -tool that is mainly designed to build up secure connections over -unsecured networks. Further more, SSH is able to compress connections -and to do port forwarding (like rinetd or redir). So it is the -appropriate tool to compress any simple TCP/IP connection. "Simple" -means, that only one TCP-connection is opened. An FTP-connections or -the connection between M$-Outlook and MS-Exchange are not simple as -several connections are established. SSH uses the LempleZiv (LZ77) -compression algorithm - so you will achieve the same high compression -rate as winzip/pkzip. In order to compress all HTTP-connections from -my intranet to the internet, I just have to execute one command on my -dial-in machine: - - - - -ssh -l -C -L8080::80 -f sleep -10000 - - - - - - = host that is located at my ISP. SSH-access is required. - = my login-ID on - = the web proxy of my ISP - - - - -My browser is configured to use localhost:8080 as proxy. My laptop -connects to the same socket. The connection is compressed and -forwarded to the real proxy by SSH. The infrastructure looks like: - - - - - 64KBit ISDN - My PC--------------------------------A PC (Unix/Linux/Win-NT) at my ISP - SSH-Client compressed SSH-Server, Port 22 - Port 8080 | - | | - | | - | | - |10MBit Ethernet |100MBit - |not compressed |not compressed - | | - | | - My second PC ISP's WWW-proxy - with Netscape,... Port 80 - (Laptop) - - - -3. Compressing Email - -3.1. Incoming Emails (POP3, IMAP4) - - -Most people fetch their email from the mailserver via POP3. 
POP3 is a -protocol with many disadvantages: - - - 1. POP3 transfers password in clear text. (There are SSL- - implementations of POP/IMAP and a challenge/response - authentication, defined in RFC-2095/2195). - - 2. POP3 causes much protocol overhead: first the client requests a - message than the server sends the message. After that the client - requests the transferred article to be deleted. The server confirms - the deletion. After that the server is ready for the next - transaction. So 4 transactions are needed for each email. - - 3. POP3 transfers the mails without compression although email is - highly compressible (factor=3.5). - - -You could compress POP3 by forwarding localhost:110 through a -compressed connection to your ISP's POP3-socket. After that you have -to tell your mail client to connect to localhost:110 in order to -download mail. That secures and speeds up the connection -- but the -download time still suffers from the POP3-inherent protocol overhead. - - - -It makes sense to substitute POP3 by a more efficient protocol. The -idea is to download the entire mailbox at once without generating -protocol overhead. Furthermore it makes sense to compress the -connections. The appropriate tool which offers both features is SCP. -You can download your mail-file like this: - - - - - scp -C -l loginId:/var/spool/mail/loginid /tmp/newmail - - - - -But there is a problem: what happens if a new email arrives at the -server during the download of your mailbox? The new mail would be -lost. Therefore it makes more sense to use the following commands: - - - - - ssh -l loginid mailserver -f mv /var/spool/mail/loginid - /tmp/loginid_fetchme - scp -C -l loginid:/tmp/my_new_mail /tmp/loginid_fetchme - - - - -A move (mv) is a elementary operation, so you won't get into truble if -you receive new mail during the execution of the commands. But if the -mail server directories /tmp/ and /var/spool/mail are not on the same -disc you might get problems. 
A solution is to create a lockfile on the -server before you execute the mv: touch /var/spool/mail/loginid.lock. -You should remove it, after that. A better solution is to move the -file loginid in the same directory: - - - - - ssh -l loginid mailserver -f mv /var/spool/mail/loginid - /var/spool/mail/loginid_fetchme - - - - -After that you can use formail instead of procmail in order to filter -/tmp/newmail into the right folder(s): - - - - -formail -s procmail < /tmp/newmail - - - -3.2. Outgoing Email (SMTP) - - -You send email over compresses and encrypted SSH-connections, in order -to: - - - · Save network traffic - · Secure the connection (This does not make sense, if the mail is - transported over untrusted networks, later.) - · Authenticate the sender. Many mail servers deny mail relaying in - order to prevent abuse. If you send an email over an SSH- - connection, the remote mail server (i.e. sendmail or MS-exchange) - thinks to be connected, locally. - - -If you have SSH-access on the mail server, you need the following -command: - - - - - ssh -C -l loginid mailserver -L2525:mailserver:25 - - - - -If you don't have SSH-access on the mail server but to a server that -is allowed to use your mail server as relay, the command is: - - - - - ssh -C -l loginid other_server -L2525:mailserver:25 - - - - -After that you can configure your mail client (or mail server: see -"smarthost") to send out mails to localhost port 2525. - - -4. Thoughts about performance. - - -Of course compression/encryption takes CPU time. It turned out that an -old Pentium-133 is able to encrypt and compress about 1GB/hour -- -that's quite a lot. If you compile SSH with the option "--with-none" -you can tell SSH to use no encryption. That saves a little -performance. Here is a comprison between several download methods -(during the test, a noncompressed 6MB-file was transfered from a -133MHz-Pentium-1 to a 233MHz Pentium2 laptop over a 10MBit ethernet -without other load). 
- - - - - +-------------------+--------+----------+-----------+----------------------+ - | | FTP |encrypted |compressed |compressed & encrypted| - +-------------------+--------+----------+-----------+----------------------+ - | Elapsed Time | 17.6s | 26s | 9s | 23s | - +-------------------+--------+----------+-----------+----------------------+ - | Throughput | 790K/s | 232K/s | 320K/s | 264K/s | - +-------------------+--------+----------+-----------+----------------------+ - |Compression Factor | 1 | 1 | 3.8 | 3.8 | - +-------------------+--------+----------+-----------+----------------------+ - - - - diff --git a/LDP/guide/docbook/Linux-Networking/Connectivity-Devices.xml b/LDP/guide/docbook/Linux-Networking/Connectivity-Devices.xml index d050484c..cccd1e0a 100644 --- a/LDP/guide/docbook/Linux-Networking/Connectivity-Devices.xml +++ b/LDP/guide/docbook/Linux-Networking/Connectivity-Devices.xml @@ -84,3 +84,1150 @@ e.g. AOL might be called a gateway to the Internet. + + + +Routing + +12.3. Packets and routers + +What the browser wants to do is send a command to the Web server on +www.tldp.org that looks like this: +GET /LDP/HOWTO/Fundamentals.html HTTP/1.0 + +Here's how that happens. The command is made into a packet, a block of bits +like a telegram that is wrapped with three important things; the source +address (the IP address of your machine), the destination address +(152.19.254.81), and a service number or port number (80, in this case) that +indicates that it's a World Wide Web request. + +Your machine then ships the packet down the wire (your connection to your +ISP, or local network) until it gets to a specialized machine called a +router. The router has a map of the Internet in its memory ?? not always a +complete one, but one that completely describes your network neighborhood and +knows how to get to the routers for other neighborhoods on the Internet. + +Your packet may pass through several routers on the way to its destination. +Routers are smart. 
They watch how long it takes for other routers to +acknowledge having received a packet. They also use that information to +direct traffic over fast links. They use it to notice when another router (or +a cable) has dropped off the network, and compensate if possible by finding +another route. + +There's an urban legend that the Internet was designed to survive nuclear +war. This is not true, but the Internet's design is extremely good at getting +reliable performance out of flaky hardware in an uncertain world. This is +directly due to the fact that its intelligence is distributed through +thousands of routers rather than concentrated in a few massive and vulnerable +switches (like the phone network). This means that failures tend to be well +localized and the network can route around them. + +Once your packet gets to its destination machine, that machine uses the +service number to feed the packet to the web server. The web server can tell +where to reply to by looking at the command packet's source IP address. When +the web server returns this document, it will be broken up into a number of +packets. The size of the packets will vary according to the transmission +media in the network and the type of service. +----------------------------------------------------------------------------- + + 8.1. Router + + The Linux kernel has built-in support for routing functions. A Linux + box can act either as an IP or IPX router for a fraction of the cost + of a commercial router. Recent kernels include special options for + machines acting primarily as routers: + + · Multicasting: Allows the Linux machine to act as a router for IP + packets that have several destination addresses. It is needed on + the MBONE, a high bandwidth network on top of the Internet which + carries audio and video broadcasts.
+ + · IP policy routing: Normally a router decides what to do with a + received packet based solely on the packet's final destination + address, but routing can also take into account the originating + address and the network device from which the packet reached it. + + There are some related projects which include one aiming at building a + complete, running Linux router on a floppy disk: Linux router project + + +Linux Advanced Routing & Traffic Control HOWTO + +Linksys Blue Box Router HOWTO + + + + + + 6.6. IP Firewall (for Linux-2.0) + + IP Firewall and Firewalling issues are covered in more depth in the + Firewall-HOWTO. IP Firewalling allows you to secure your machine + against unauthorized network access by filtering or allowing datagrams + from or to IP addresses that you nominate. There are three different + classes of rules, incoming filtering, outgoing filtering and + forwarding filtering. Incoming rules are applied to datagrams that are + received by a network device. Outgoing rules are applied to datagrams + that are to be transmitted by a network device. Forwarding rules are + applied to datagrams that are received and are not for this machine, + ie datagrams that would be routed. + + Kernel Compile Options: + + + Networking options ---> + [*] Network firewalls + .... + [*] IP: forwarding/gatewaying + .... + [*] IP: firewalling + [ ] IP: firewall packet logging + + + + Configuration of the IP firewall rules is performed using the ipfwadm + command. As I mentioned earlier, security is not something I am expert + at, so while I will present an example you can use, you should do your + own research and develop your own rules if security is important to + you. + + Probably the most common use of IP firewall is when you are using your + linux machine as a router and firewall gateway to protect your local + network from unauthorized access from outside your network. + + + The following configuration is based on a contribution from Arnt + Gulbrandsen, . 
+ + The example describes the configuration of the firewall rules on the + Linux firewall/router machine illustrated in this diagram: + + + + - - + \ | 172.16.37.0 + \ | /255.255.255.0 + \ --------- | + | 172.16.174.30 | Linux | | + NET =================| f/w |------| ..37.19 + | PPP | router| | -------- + / --------- |--| Mail | + / | | /DNS | + / | -------- + - - + + + + The following commands would normally be placed in an rc file so that + they were automatically started each time the system boots. For + maximum security they would be performed after the network interfaces + are configured, but before the interfaces are actually brought up to + prevent anyone gaining access while the firewall machine is rebooting. + + + + #!/bin/sh + + # Flush the 'Forwarding' rules table + # Change the default policy to 'accept' + # + /sbin/ipfwadm -F -f + /sbin/ipfwadm -F -p accept + # + # .. and for 'Incoming' + # + /sbin/ipfwadm -I -f + /sbin/ipfwadm -I -p accept + + # First off, seal off the PPP interface + # I'd love to use '-a deny' instead of '-a reject -y' but then it + # would be impossible to originate connections on that interface too. + # The -o causes all rejected datagrams to be logged. This trades + # disk space against knowledge of an attack of configuration error. 
+ # + /sbin/ipfwadm -I -a reject -y -o -P tcp -S 0/0 -D 172.16.174.30 + + # Throw away certain kinds of obviously forged packets right away: + # Nothing should come from multicast/anycast/broadcast addresses + # + /sbin/ipfwadm -F -a deny -o -S 224.0/3 -D 172.16.37.0/24 + # + # and nothing coming from the loopback network should ever be + # seen on a wire + # + /sbin/ipfwadm -F -a deny -o -S 127.0/8 -D 172.16.37.0/24 + + # accept incoming SMTP and DNS connections, but only + # to the Mail/Name Server + # + /sbin/ipfwadm -F -a accept -P tcp -S 0/0 -D 172.16.37.19 25 53 + # + # DNS uses UDP as well as TCP, so allow that too + # for questions to our name server + # + /sbin/ipfwadm -F -a accept -P udp -S 0/0 -D 172.16.37.19 53 + # + # but not "answers" coming to dangerous ports like NFS and + # Larry McVoy's NFS extension. If you run squid, add its port here. + # + /sbin/ipfwadm -F -a deny -o -P udp -S 0/0 53 \ + -D 172.16.37.0/24 2049 2050 + + # answers to other user ports are okay + # + /sbin/ipfwadm -F -a accept -P udp -S 0/0 53 \ + -D 172.16.37.0/24 53 1024:65535 + + # Reject incoming connections to identd + # We use 'reject' here so that the connecting host is told + # straight away not to bother continuing, otherwise we'd experience + # delays while ident timed out. + # + /sbin/ipfwadm -F -a reject -o -P tcp -S 0/0 -D 172.16.37.0/24 113 + + # Accept some common service connections from the 192.168.64 and + # 192.168.65 networks, they are friends that we trust. + # + /sbin/ipfwadm -F -a accept -P tcp -S 192.168.64.0/23 \ + -D 172.16.37.0/24 20:23 + + # accept and pass through anything originating inside + # + /sbin/ipfwadm -F -a accept -P tcp -S 172.16.37.0/24 -D 0/0 + + # deny most other incoming TCP connections and log them + # (append 1:1023 if you have problems with ftp not working) + # + /sbin/ipfwadm -F -a deny -o -y -P tcp -S 0/0 -D 172.16.37.0/24 + + # ... 
for UDP too + # + /sbin/ipfwadm -F -a deny -o -P udp -S 0/0 -D 172.16.37.0/24 + + + + Good firewall configurations are a little tricky. This example should + be a reasonable starting point for you. The ipfwadm manual page offers + some assistance in how to use the tool. If you intend to configure a + firewall, be sure to ask around and get as much advice as you can from sources + you consider reliable and get someone to test/sanity check your + configuration from the outside. + + 6.7. IP Firewall (for Linux-2.2) + + The new firewalling code is accessed via ``IP Firewall Chains''. See + the IP chains home page for more information. Among other things, + you'll now need to use ipchains instead of ipfwadm to configure your + filters. (From Documentation/Changes in the latest kernel sources). + + We are aware that this is a sorely out of date statement and we are + currently working on getting this section more current. You can expect + a newer version in August of 1999. + + + 8.7. Firewall + + A firewall is a device that protects a private network from the public + part (the internet as a whole). It is designed to control the flow of + packets based on the source, destination, port and packet type + information contained in each packet. + + Different firewall toolkits exist for Linux as well as built-in + support in the kernel. Other firewalls are TIS and SOCKS. These + firewall toolkits are very complete and combined with other tools + allow blocking/redirection of all kinds of traffic and protocols. + Different policies can be implemented via configuration files or GUI + programs. + + + · TIS home page + + · SOCKS + + · Firewall HOWTO + + + 8.8. Port forwarding + + An increasing number of web sites are becoming interactive by having + cgi-bins or Java applets that access some database or other service. + Since this access may pose a security problem, the machine containing + the database should not be directly connected to the Internet.
+ + Port Forwarding can provide an almost ideal solution to this access + problem. On the firewall, IP packets that come in to a specific port + number can be re-written and forwarded to the internal server + providing the actual service. The reply packets from the internal + server are re-written to make it appear that they came from the + firewall. + + Port forwarding information may be found here + + + 8.3. IP Masquerade + + IP Masquerade is a developing networking function in Linux. If a Linux + host is connected to the Internet with IP Masquerade enabled, then + computers connecting to it (either on the same LAN or connected with + modems) can reach the Internet as well, even though they have no + officially assigned IP addresses. This allows for reduction of costs, + since many people may be able to access the Internet using a single + modem connection as well as contributes to increased security (in some + way the machine is acting as a firewall, since unofficially assigned + addresses cannot be accessed outside of that network). + + IP masquerade related pages and documents: + + · http://ipmasq.home.ml.org/ + · http://www.indyramp.com/masq/links.pfhtml + · http://metalab.unc.edu/mdw/HOWTO/IP-Masquerade-HOWTO.html + +Firewalling-and-Masquerading + + + + +Masquerading Made Simple HOWTO + +----------------------------------------------------------------------------- + +Chapter 8. Miscellaneous + +8.1. Useful Resources + +  * [http://ipmasq.webhop.net/] IP Masquerade Resource page Will have all the + current information for setting up IP Masquerade on 2.0.x, 2.2.x, and + even old 1.2 kernels! + +  * [http://juanjox.kernelnotes.org] Juan Jose Ciarlante's WWW site who is + one of the current Linux IP Masquerade maintainers. A mirror can be fount + at [http://ipmasq.webhop.net/juanjox/] ipmasq.webhop.net/juanjox + +  * IP Masquerade mailing list Archives contains the recent messages sent to + the mailing lists. 
+ +  * David Ranch's Linux page including the TrinityOS Linux document and + current versions of the IP-MASQ-HOWTO.. Topics such as IP MASQ, strong + IPFWADM/IPCHAINS rulesets, PPP, Diald, Cablemodems, DNS, Sendmail, Samba, + NFS, Security, etc. are covered. + +  * The IP Masquerading Applications page: A comprehensive list of + applications that work or can be tuned to work through a Linux IP + masquerading server. + +  * For users setting up IP Masq on MkLinux, email Taro Fukunaga at [mailto: + tarozax@earthlink.net] tarozax@earthlink.net for a copy of his short + MkLinux version of this HOWTO. + +  * IP masquerade FAQ has some general information + +  * Paul Russel's [http://www.netfilter.org/ipchains/] http:// + www.netfilter.org/ipchains/ doc and its possibly older backup at Linux + IPCHAINS HOWTO. This HOWTO has lots of information for IPCHAINS usage, as + well as source and binaries for the ipchains tool. + +  * [http://www.xos.nl/linux/ipfwadm/] X/OS Ipfwadm page contains sources, + binaries, documentation, and other information about the ipfwadm package + +  * Check out the GreatCircle's Firewall mailing list for a great resource + about strong firewall rulesets. + +  * The LDP Network Administrator's Guide is a MUST for the beginner Linux + administrator trying to set up a network. + +  * The [http://www.tldp.org/HOWTO/Net-HOWTO/index.html] Linux NET HOWTO is + also another comprehensive document on how to setup and configure Linux + networking. + +  * Linux ISP Hookup HOWTO and [http://www.tldp.org/HOWTO/PPP-HOWTO/ + index.html] Linux PPP HOWTO gives you information on how to connect your + Linux host to the Internet + +  * Linux Ethernet-Howto is a good source of information about setting up a + LAN running over Ethernet. 
+ +  * Donald Becker's NIC drivers and Support Utils + +  * You may also be interested in [http://www.tldp.org/HOWTO/ + Firewall-HOWTO.html] Linux Firewalling and Proxy Server HOWTO + +  * Linux Kernel HOWTO will guide you through the kernel compilation process + +  * Other [http://www.tldp.org/HOWTO/HOWTO-INDEX/howtos.html] Linux HOWTOs + such as Kernel HOWTO + +  * Posting to the USENET newsgroup: [news:comp.os.linux.networking] + comp.os.linux.networking + +----------------------------------------------------------------------------- + +8.2. Linux IP Masquerade Resource + +The [http://ipmasq.webhop.net/] Linux IP Masquerade Resource is a website +dedicated to Linux IP Masquerade information also maintained by Ambrose Au. +It has the latest information related to IP Masquerade and may have +information that is not being included in the HOWTO. + +You may find the Linux IP Masquerade Resource at the following locations: + +  * [http://ipmasq.webhop.net/] http://ipmasq.webhop.net/, Primary Site, + redirected to [http://ipmasq.webhop.net/] http://ipmasq.webhop.net/ + +  * [http://ipmasq2.webhop.net/] http://ipmasq2.webhop.net/, Secondary Site, + redirected to [http://www.e-infomax.com/ipmasq/] http://www.e-infomax.com + /ipmasq + + + + + +Bridges + + +This section describes how to setup an ethernet bridge. What is an ethernet +bridge? An ethernet bridge is a device that controls data packets within a +subnet in an attempt to cut down the amount of traffic. A bridge is usually +placed between two separate groups of computers that talk within themselves, +but not so much with the computers in the other group. A good example of this +is to consider a cluster of Macintoshes and a cluster of Unix machines. Both +of these groups of machines tend to be quite chatty amongst themselves, and +the traffic they produce on the network causes collisions for the other +machines who are trying to speak to one another. A bridge would be placed +between these groups of computers. 
The job of the bridge is then to examine +the destination of the data packets one at a time and decide whether or not +to pass the packets to the other side of the ethernet segment. The result is +a faster, quieter network with less collisions. + + + +Several bridges can work together to create even larger networks of Ethernets +using the IEEE 802.1 spanning tree algorithm. As this is a standard, Linux +bridges will interoperate properly with other third party bridge products. +Additional packages allow filtering based on IP, IPX or MAC addresses. + + + +The section immediately below provides a quick guide as on how to create +a bridge. + + +1. Setup + +  * Get Bridge Config: [ftp://ftp.tux.org/people/alan-cox/BRCFG.tgz] + BRCFG.tgz + +  * BRCFG may also be found at: [http://coledd.com/networking/bridge/] http:/ + /coledd.com/networking/bridge + +  * Enable multiple ethernet devices on your machine by adding this line to + your /etc/lilo.conf, and re-run lilo: + +---------------------------------------------------------------+ + |append = "ether=0,0,eth1" | + +---------------------------------------------------------------+ + + If you have three interfaces on your bridge, use this line instead: + +---------------------------------------------------------------+ + |append = "ether=0,0,eth1 ether=0,0,eth2" | + +---------------------------------------------------------------+ + + More interfaces can be found by adding more ether statements. By default + a stock Linux kernel probes for a single ethercard, and once one is found + the probe ceases. The above append statement tells the kernel to keep + probing for more ethernet devices after the first one is found. 
+ Alternatively, the boot parameter can be used instead: + +---------------------------------------------------------------+ + |linux ether=0,0,eth1 | + +---------------------------------------------------------------+ + + Or, with 3 interfaces, use: + +---------------------------------------------------------------+ + |linux ether=0,0,eth1 ether=0,0,eth2 | + +---------------------------------------------------------------+ + +  * Recompile the kernel with BRIDGING enabled. + +  * A bridge should not have an IP address. It CAN, but a plain bridge + doesn't need one. To remove the IP address from your bridge, go to /etc/ + sysconfig/network-scripts/ (for a RedHat system) and copy ifcfg-lo0 to + ifcfg-eth0 & ifcfg-eth1. In these two new files, change the line + containing DEVICE=lo to DEVICE=eth0 and DEVICE=eth1. Since other + distributions may deviate from this, you may need to refer to additional + documentation. If there are more than 2 interfaces to this bridge, be + sure to make the corresponding configurations to those, as well. + +  * Reboot so you are running the new kernel with BRIDGING in it, and also to + make sure that an IP addresses are not bound to the network interfaces. + +  * Once the system is backed up, put the ethernet cards into promiscuous + mode, so they will look at every packet that passes by its interface: + + +This section provide a guide on how to create an ethernet bridge and +add a 'netfilter' system. + + + Setting up an ethernet bridge gives us the chance to integrate a sur­ + veying and/or regulating instance transparently into an existing net­ + work. This setup requires no changes to the logical network topology. + It is accomplished by plugging the ethernet bridge in the physical + network topology between the network itself and the routing instance + (that piece of hardware connected to the Internet). + + 1. Introduction + + Ethernet bridges connect two or more distinct ethernet segments + transparently. 
+ An ethernet bridge distributes ethernet frames coming in on one port + to other ports associated with the bridge interface. This is + done intelligently: whenever the bridge knows on which port the + MAC address to which the frame is to be delivered is located, it + forwards the frame only to that port instead of flooding all + ports. + Ethernet interfaces can be added to an existing bridge interface and + then become (logical) ports of the bridge interface. + Putting a netfilter structure on top of a bridge interface renders the + bridge capable of servicing filtering mechanisms. This way, a + transparent filtering instance can be created. It even needs no IP + address assigned to work. Of course, you can assign an IP address to + the bridge interface for maintenance purposes (certainly, with ssh + only ;-). + The advantage of this system is evident. Transparency relieves the + network administrator of the pain of restructuring the network + topology. And users may not notice the existence of the bridge, only + their connection being blocked. Also, users are not disturbed while + working (think of a company where network connection loss costs a lot). + The other common case is a client being connected to the global web + via a leased router. As providers seldom grant administration + privileges on their leased hardware, the client cannot change the + interconnecting configuration. But, of course, the client has a + network running and wants to spend as little as possible; he does not + want to reconfigure his entire network. And he does not need to if he + uses a bridging device. + + 2. Required software + + This software setup is needed on the ethernet bridge computer. + According to our ``Testing grounds''. + + 2.1. Featured Linux kernel + + As of kernel version 2.4.18 there's already support for the Ethernet + Bridge capability built-in. No patches needed so far.
+ But if we intend to use netfilter capabilities, because we want to run + iptables on our new Linux router/fw box, we still need to apply a + patch. Any patches needed can be found and downloaded on the + ``sourceforge Ethernet Bridge homepage''. + + root@bridge:~> cd /usr/src/ + root@bridge:~> wget -c http://bridge.sourceforge.net/devel/bridge-nf/bridge-nf-0.0.7-against-2.4.18.diff + root@bridge:~> cd /usr/src/linux/ + root@bridge:~> patch -p1 -i ../bridge-nf/bridge-nf-0.0.7-against-2.4.18.diff + + Supposedly we want netfilter support on our bridge interface and we + have already patched the vanillal kernel we may now activate some + necessary kernel configuration items. On how to build a private kernel + image see the CD-Net-Install-HOWTO, Toolbox . Oh, yeah, it's still in German + only. Hm, I have to fix this some time.. + + Nevertheless, we start by now: In + + Code maturity level options + + we activate + + [*] Prompt for development and/or incomplete code/drivers + + and in + + Loadable module support + + [*] Enable loadable module support + [*] Set version information on all module symbols + [*] Kernel module loader + + Ok, so far so good. Now, we go to + + Networking options + + and mark + + [*] Network packet filtering (replaces ipchains) + [*] Network packet filtering debugging + + Furthermore, in + + IP: Netfilter Configuration ---> + + we mark any item we need as module. Now the long awaited item: acti­ + vate + + 802.1d Ethernet Bridging + + as well as + + [*] netfilter (firewalling) support + + Note: + The above entry is available only if we successfully patched our + kernel! + + Finally, we just need a successful + + root@bridge:~> make dep clean bzImage modules modules_install + + cycle and we're done. Don't forget to edit /etc/lilo.conf and do + + root@bridge:~> lilo -t + root@bridge:~> lilo + root@bridge:~> reboot + + , though. + + Hint: + Perhaps we might mark our new kernel as the bridge kernel? 
We vi + the toplevel Makefile in our kernel sources and edit the head + line called EXTRAVERSION =. We may actually set it to, say + bridge? ;-) + After the modules_install we find the fresh modules in + /lib/modules/2.4.18bridge + + 2.2. Userspace tool: brctl + + Once our kernel has the capabilities needed to perform Ethernet Bridge + and netfilter actions, we prepare the user space tool brctl. brctl is + the configuration tool we use to ``set up'' anything to suit our + needs. + + We ``download the source tarball'', unpack it and change directory + into it. + + root@bridge:~> wget -c http://bridge.sourceforge.net/bridge-utils/bridge-utils-0.9.5.tar.gz + root@bridge:~> tar xvzf bridge-utils-0.9.5.tar.gz + root@bridge:~> cd bridge-utils-0.9.5 + + At this time, read the README and the files in the doc/ subdirectory. + Then do a simple make and copy the resulting brctl/brctl executable to + /sbin/. + + root@bridge:~> make + root@bridge:~> cp -vi brctl/brctl /sbin/ + + This is it. Go for ``Setup'' now. + + 3. Set Linux up to serve + + 3.1. Setting up the bridge + + We need Linux to know about the bridge. First tell it that we want one + virtual ethernet bridge interface: (this is to be executed on host + bridge, of course. See ``Testing grounds'') + + root@bridge:~> brctl addbr br0 + + Second, we do not need the STP (Spanning Tree Protocol). I.e. we do + only have one single router, so a loop is highly improbable. We may + then deactivate this feature. (Results in less polluted networking + environment, too): + + root@bridge:~> brctl stp br0 off + + After these preparations, we now do finally some effective commands. + We add our two (or even more) physical ethernet interfaces. That + means, we attach them to the just born logical (virtual) bridge inter­ + face br0. + + root@bridge:~> brctl addif br0 eth0 + root@bridge:~> brctl addif br0 eth1 + + Now, our two previously physical ethernet interfaces became a logical + bridge port each. 
Erm, ok, there were and will be the physical + devices. They are still there, go have a look ;-) But now they became + part of the logical bridge device and therefore need no IP configura­ + tion any longer. So release the IPs: + + root@bridge:~> ifconfig eth0 down + root@bridge:~> ifconfig eth1 down + root@bridge:~> ifconfig eth0 0.0.0.0 up + root@bridge:~> ifconfig eth1 0.0.0.0 up + + Great! We now have a box w/o any IP attached. So if you were configur­ + ing your future fw/router via TP, go for your local console now ;-)) + You have a serial console? Happy one :-) + + Optional: + We tell Linux the new (logical) interface and associate one + single IP with it: + + root@bridge:~> ifconfig br0 10.0.3.129 up + + And we're done. + Read the ``Important Note''! + + 3.2. Setting up the routing + + In case we are configuring a gateway we enable the forwarding in the + linux kernel. + + root@bridge:~> echo "1" > /proc/sys/net/ipv4/ip_forward + + Our box already has an IP assigned but no default route. We solve this + now: + + root@bridge:~> route add default gw 10.0.3.129 + + Finally, we should have a working net from, to and through the gate­ + way. + + 4. Test your new bridged environment! + + 4.1. Testing Grounds + + We imagine this scenario or similar: + /\ + Ethernet Ethernet ATM /-/ \ + --------- --------- --------- /-/ | + | Box |----------|Bridge |----------|Router |-----| Inter- \ + --------- --------- --------- \ net ---| + ^ ^ ^ ^ \ / + | | | | \---/ + eth0 eth0 eth1 if0 ^ + | | | | | + 10.0.3.2 none/10.0.3.1 195.137.15.7 anything else + \ / + \ / + ^ \-br0-/ + | ^ ^ + | ^ | | + | | | | + own own foreign hostile + + Our administrative power includes only machines marked with own, the + Router is completely off-limits and so is the Internet, of course. + That means, if we want to control the flying bits'n'bytes on the eth­ + ernet wire we can chose to integrate a common firewall or file in a + bridge. 
+ Drawback of the standard way is you have to change the default gateway + route on every and any single host in your net. And this is really a + heavy weighting drawback, nobody wants to change more than 5 default + routes on 5 different hosts more than one time. Keep the time in mind, + this will consume, also! Not to forget, this is an error-prone way to + handle the more about security.. + The other way is clean, less time-consuming, more secure and less + error-prone. More secure in that we won't have the need to assign any + IP address. No IP, no danger. So far the theory, we hope, our stacks + are safe. (Although this hope should better not be relied on..) The over­ + all advantage is, this bridge-setup is completely transparent, no IP, + MAC, .. changes at all. + So it's up to you to choose your preferred method. But we will handle + just the fancy one here ;-) + + 4.2. Ping it, Jim! + + We will configure the Box' eth0 as usual. The bridge's interfaces are + configured as described in ``Setup''.
+ + If we are to use forwarding we might perhaps do this one: ;-) + + root@bridge:~> echo "1" > /proc/sys/net/ipv4/ip_forward + + Optionally, we set up a default route: + + root@bridge:~> route add default gw 10.0.3.129 + + Then we set up some iptables rules on host bridge: + + root@bridge:~> iptables -P FORWARD DROP + root@bridge:~> iptables -F FORWARD + root@bridge:~> iptables -I FORWARD -j ACCEPT + root@bridge:~> iptables -I FORWARD -j LOG + root@bridge:~> iptables -I FORWARD -j DROP + root@bridge:~> iptables -A FORWARD -j DROP + root@bridge:~> iptables -x -v --line-numbers -L FORWARD + + The last line gives us the following output: + + Chain FORWARD (policy DROP 0 packets, 0 bytes) + num pkts bytes target prot opt in out source destination + 1 0 0 DROP all -- any any anywhere anywhere + 2 0 0 LOG all -- any any anywhere anywhere LOG level warning + 3 0 0 ACCEPT all -- any any anywhere anywhere + 4 0 0 DROP all -- any any anywhere anywhere + + The LOG target logs every packet via syslogd. Beware, this is intended + for testing purposes only, remove in production environment. Else you + end up either with filled logs and harddisk partitions by you yourself + or anyone else does this Denial of Service to you. You've been warned. + Test this ruleset now. Ping the router interface's IP (195.137.15.7) + on host box: + + root@box:~> ping -c 3 195.137.15.7 + PING router.provider.net (195.137.15.7) from 10.0.3.2 : 56(84) bytes of data. + --- router.provider.net ping statistics --- + 3 packets transmitted, 0 received, 100% loss, time 2020ms + ^C + root@box:~> + + By default, we DROP everything. No response, no logged packet. This + netfilter setup is designed to DROP all packets unless we delete the + rule that drops every packet (rule no. 
1 above) before the LOG target + matches: + + root@bridge:~> iptables -D FORWARD 1 + root@bridge:~> iptables -x -v --line-numbers -L FORWARD + + Now, the rules are: + + Chain FORWARD (policy DROP 0 packets, 0 bytes) + num pkts bytes target prot opt in out source destination + 2 0 0 LOG all -- any any anywhere anywhere LOG level warning + 3 0 0 ACCEPT all -- any any anywhere anywhere + 4 0 0 DROP all -- any any anywhere anywhere + + And any packet may pass through. Test it with a ping on host box: + + root@box:~> ping -c 3 195.137.15.7 + PING router.provider.net (195.137.15.7) from 10.0.3.2 : 56(84) bytes of data. + 64 bytes from router.provider.net (195.137.15.7): icmp_seq=1 ttl=255 time=0.103 ms + 64 bytes from router.provider.net (195.137.15.7): icmp_seq=2 ttl=255 time=0.082 ms + 64 bytes from router.provider.net (195.137.15.7): icmp_seq=3 ttl=255 time=0.083 ms + + --- router.provider.net ping statistics --- + 3 packets transmitted, 3 received, 0% loss, time 2002ms + rtt min/avg/max/mdev = 0.082/0.089/0.103/0.012 ms + root@box:~> + + Yippeah! The router is alive, up and running. (Well it has been all + day long.. ;-) + + Important Note: + When we just fired up the bridge interface it takes + roughly 30 seconds until the bridge is fully operational. This + is due to the 30-seconds-learning phase of the bridge interface. + During this phase, the bridge ports are learning what MAC + addresses exist on what port. The bridge author, Lennert, tells + us in his TODO file, the 30-seconds-learning phase is subjected + to some improvement in a timely manner some time. + During the learning phase, no packet will be forwarded. No ping will be + answered. Keep this in mind! + + 4.3. Actual configuration + + This section is intended to give you, dear reader, some hints about + how your system should look and feel after having processed this howto + successfully. + + 4.3.1.
Interface configuration + + The output of your ifconfig command might look similar to this: + + root@bridge:~> ifconfig + br0 Link encap:Ethernet HWaddr 00:04:75:81:D2:1D + inet addr:10.0.3.129 Bcast:195.30.198.255 Mask:255.255.255.128 + UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1 + RX packets:826 errors:0 dropped:0 overruns:0 frame:0 + TX packets:737 errors:0 dropped:0 overruns:0 carrier:0 + collisions:0 txqueuelen:0 + RX bytes:161180 (157.4 Kb) TX bytes:66708 (65.1 Kb) + + eth0 Link encap:Ethernet HWaddr 00:04:75:81:ED:B7 + UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1 + RX packets:5729 errors:0 dropped:0 overruns:0 frame:0 + TX packets:3115 errors:0 dropped:0 overruns:0 carrier:656 + collisions:0 txqueuelen:100 + RX bytes:1922290 (1.8 Mb) TX bytes:298837 (291.8 Kb) + Interrupt:11 Base address:0xe400 + + eth1 Link encap:Ethernet HWaddr 00:04:75:81:D2:1D + UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1 + RX packets:0 errors:0 dropped:0 overruns:1 frame:0 + TX packets:243 errors:0 dropped:0 overruns:0 carrier:0 + collisions:0 txqueuelen:100 + RX bytes:342 (342.0 b) TX bytes:48379 (47.2 Kb) + Interrupt:7 Base address:0xe800 + + lo Link encap:Local Loopback + inet addr:127.0.0.1 Mask:255.0.0.0 + UP LOOPBACK RUNNING MTU:16436 Metric:1 + RX packets:1034 errors:0 dropped:0 overruns:0 frame:0 + TX packets:1034 errors:0 dropped:0 overruns:0 carrier:0 + collisions:0 txqueuelen:0 + RX bytes:82068 (80.1 Kb) TX bytes:82068 (80.1 Kb) + + 4.3.2. Routing configuration + + The output of your route command might look similar to this: + + root@bridge:~> route -n + Kernel IP routing table + Destination Gateway Genmask Flags Metric Ref Use Iface + 10.0.3.129 0.0.0.0 255.255.255.128 U 0 0 0 br0 + 0.0.0.0 10.0.3.129 0.0.0.0 UG 0 0 0 br0 + root@bridge:~> + + 4.3.3. Iptables configuration + + Please have a look at the ``Ping it, Jim!'' section. + + 4.4. 
Note + + Apparently, there must have been a bug in the br-nf code: + + From: Bart De Schuymer + Date: Sun, 1 Sep 2002 21:52:46 +0200 + To: Nils Radtke + Subject: Re: Ethernet-Brigde-netfilter-HOWTO + + Hello Nils, + + [...] + Also, network packet filtering debugging is generally a bad idea with the + br-nf patch. It can gives a lot of false warnings (about bugs) in the logs. + [...] + + Personally, I never had false positives in my log. Maybe that bug has + been fixed. When I mailed this to Bart, he wrote: + + From: Bart De Schuymer + Date: Mon, 2 Sep 2002 18:30:25 +0200 + To: Nils Radtke + Subject: Re: Ethernet-Brigde-netfilter-HOWTO + + On Monday 02 September 2002 00:39, Nils Radtke wrote: + > Will the revision of the nf-debug code in br-nf be subject of improvement? + + I must admit I haven't been running any kernel with netfilter debugging + lately. It sure used to give false positives a few months ago (the bridge + mailing list has posts about that), I've been lacking time to see why and if + it is still the case. It's on my todo list. + [...] + + But (as of writing this 2002-09-19) I haven't found an official + announcement that this particular bug has been closed. So keep an + eye on this topic on the ``ethernet bridge mailinglist'' , if you are + interested in its cure. + + 5. Links + + The Howto's author may be contacted via e-mail . + Howto Author's homepage . + + 5.1. Ethernet-Bridge + + + · Ethernet Bridge Mailinglist + + + · User space utilities, patches, etc.: Home of Linux kernel Ethernet + Bridge + + · Bridge-STP-HOWTO + + · Firewalling for Free, Shawn Grimes + <./additional_docs/Firewalling_for_Free.pdf> + + 5.2.
Related Topics + + · Filtering on frame level, Ethernet-Bridging-Tables: + ebtables, sourceforge + ebtables, homepage at pandora.be + + ebtables, supported features + + ebtables, examples: basic + , + advanced + + ebtables, in-depth documentation + + ebtables, Hacking HOWTO + + + · IP mode, Linux Bridge extension: IP mode, LVS + + + · Linux in High-Availability environments: High-Availability Linux + + + · Linux Virtual Server: LVS + + +This section describes how to unite two separate ethernet LANs with an IP +tunnel between them. It is short and designed for impatient people or those +who lack time. + + +1. How does it work? + +You can transparently bridge traffic between 2 ethernet LANs to unite them, +if both of them are connected to Internet. + +There is no way to do a "real" bridge, you can only bridge third level +protocols, which linux knows how to route, but ethernet traffic with those +protocols will seem bridged. You can make 2 ethernet bridges, to bridge IP +and/or IPX traffic. You cannot transparently bridge any other third level +protocols between distinct LANs. You should read the rest of this document to +determine whether you can bridge any other protocol. +----------------------------------------------------------------------------- + +1.1. Bridging IP over ethernet traffic between 2 LANs. + +If you have: ++---------------------------------------------------------------------------+ +|PC1 (192.168.0.1 /24)--| | +|PC3 (192.168.0.3 /24)--| | +|PC5 (192.168.0.5 /24)--|--[ eth0 - bridge_1 - eth1 (195.0.0.1) ] | +| | +|PC253 (192.168.0.253/24)--| | +| | (192.168.0.2 /24) PC2 | +| | (192.168.0.4 /24) PC4 | +|[ (192.0.0.1) eth1 - bridge_2 - eth0 ] --| (192.168.0.6 /24) PC6 | +| | +| | (192.168.0.254/24) PC254 | ++---------------------------------------------------------------------------+ + +bridge_1 and bridge_2 are your Linux bridges and externally connected to the +Internet interface eth1. 
So 195.0.0.1 and 192.0.0.1 can be any valid Internet +addresses given to you by your ISP. + +So, you should: + + 1. Get two linux computers with kernels 2.2 or 2.4. Kernels should be + compiled with PPP and Advanced Router. You also need the iproute2 package + properly installed. Information on iproute2 can be found in + Configure.help of your kernel in the comments under Advanced Router. You + also need the following utilities: + +   + pppd (PPP daemon) - [ftp://cs.anu.edu.au/pub/software/ppp/] ftp:// + cs.anu.edu.au/pub/software/ppp/ + +   + PopTop (PPTP server) - [http://poptop.lineo.com/] http:// + poptop.lineo.com + +   + PPTP (Linux PPTP Client, by C.S. Ananian) - [http:// + www.pdos.lcs.mit.edu/~cananian/Projects/PPTP/] http:// + www.pdos.lcs.mit.edu/~cananian/Projects/PPTP/ + +   + tarpd (a trivial proxy arp daemon) - [http://www.cs.hut.fi/~tricky/ + utils/net/tarpd-1.6.tar.gz] http://www.cs.hut.fi/~tricky/utils/net/ + tarpd-1.6.tar.gz + + + You can also find them on [http://www.freshmeat.net] http:// + www.freshmeat.net + + Please, keep in mind that you need special patches for pppd and the + kernel if you want to do MS Chap and MS Encryption (MPPE). Refer to the + PoPTop manual for instructions on how to get and install these patches. + + 2. Connect your routers to the Internet, or establish any other communication + between them with the exception of IP. + + 3. Make a PPTP tunnel between them. There are example configurations in the + PoPToP (server) and pptp (client) manuals. + + 4. Now you should have two bridges and an IP tunnel between them, possibly + encrypted (refer to the PPP manual). Let's configure bridging. + + 5.
Remember that the bridge is really a router, so we need to run the + following commands on our bridges (this assumes bridge_1 and bridge_2 are + IP addresses, assigned to each end of the PPTP tunnel between bridges): + + +---------------------------------------------------------------+ + | bridge_1$ip route add 192.168.0.2 via bridge_2 | + | bridge_1$ip route add 192.168.0.4 via bridge_2 | + | bridge_1$ip route add 192.168.0.6 via bridge_2 | + | | + | bridge_1$ip route add 192.168.0.254 via bridge_2 | + | bridge_1$ip route add 192.168.0.255 via bridge_2 | + | | + +---------------------------------------------------------------+ + + On the other side: + +---------------------------------------------------------------+ + | bridge_2$ip route add 192.168.0.1 via bridge_1 | + | bridge_2$ip route add 192.168.0.3 via bridge_1 | + | bridge_2$ip route add 192.168.0.5 via bridge_1 | + | | + | bridge_2$ip route add 192.168.0.253 via bridge_1 | + | | + +---------------------------------------------------------------+ + + This will tell each of bridges which hosts are on the other side. You can + do the same with the old-style route command. 
It will look like: + +---------------------------------------------------------------+ + | bridge_1$route add -host 192.168.0.2 gw bridge_2 | + | bridge_1$route add -host 192.168.0.4 gw bridge_2 | + | bridge_1$route add -host 192.168.0.6 gw bridge_2 | + | | + | bridge_1$route add -host 192.168.0.254 gw bridge_2 | + | bridge_1$route add -host 192.168.0.255 gw bridge_2 | + | | + +---------------------------------------------------------------+ + + On the other side: + +---------------------------------------------------------------+ + | bridge_2$route add -host 192.168.0.1 gw bridge_1 | + | bridge_2$route add -host 192.168.0.3 gw bridge_1 | + | bridge_2$route add -host 192.168.0.5 gw bridge_1 | + | | + | bridge_2$route add -host 192.168.0.253 gw bridge_1 | + | | + +---------------------------------------------------------------+ + + Please note once more that bridge_1 and bridge_2 are not IP addresses + given by your ISP, but IP addresses which you assigned to each end of the + PPTP tunnel. + + 6. Now you have two bridges and each of them knows where to find a + particular IP. But how do you tell those computers to send their traffic + for the remote network to the local bridge? You need tarpd. + + tarpd is a very simple daemon, which replies to arp requests for certain + IP addresses. You only need to run a tarpd on each bridge, and specify + the list of IP addresses found on the remote end. 
+ + For example, for those two bridges you should run: + +---------------------------------------------------------------+ + | bridge_1$tarpd eth0 192.168.0.2 255.255.255.255 \ | + | 192.168.0.4 255.255.255.255 \ | + | | + | 192.168.0.254 255.255.255.255 | + | | + +---------------------------------------------------------------+ + + On the other side: + +---------------------------------------------------------------+ + | bridge_2$tarpd eth0 192.168.0.1 255.255.255.255 \ | + | 192.168.0.3 255.255.255.255 \ | + | | + | 192.168.0.253 255.255.255.255 | + | | + +---------------------------------------------------------------+ + + You specify 128 remote pairs (IP/mask. Mask should be 255.255.255.255 in + order not to confuse tarpd!) on each bridge. + + 7. Enjoy your bridges! + +----------------------------------------------------------------------------- +1.2. What about other protocols? + +Really, I can say nothing about other protocol routing. I never used them. +But I suppose if you are familiar with other protocols, it should not be too +difficult to bridge it this way. +----------------------------------------------------------------------------- + + Related HOWTOs: + + · Bridge+Firewall + + + · Bridge + + diff --git a/LDP/guide/docbook/Linux-Networking/Firewalling-and-Masquerading.xml b/LDP/guide/docbook/Linux-Networking/Firewalling-and-Masquerading.xml deleted file mode 100644 index ef7d89cc..00000000 --- a/LDP/guide/docbook/Linux-Networking/Firewalling-and-Masquerading.xml +++ /dev/null @@ -1,322 +0,0 @@ - - - 6.6. IP Firewall (for Linux-2.0) - - IP Firewall and Firewalling issues are covered in more depth in the - Firewall-HOWTO. IP Firewalling allows you to secure your machine - against unauthorized network access by filtering or allowing datagrams - from or to IP addresses that you nominate. There are three different - classes of rules, incoming filtering, outgoing filtering and - forwarding filtering. 
Incoming rules are applied to datagrams that are - received by a network device. Outgoing rules are applied to datagrams - that are to be transmitted by a network device. Forwarding rules are - applied to datagrams that are received and are not for this machine, - ie datagrams that would be routed. - - Kernel Compile Options: - - - Networking options ---> - [*] Network firewalls - .... - [*] IP: forwarding/gatewaying - .... - [*] IP: firewalling - [ ] IP: firewall packet logging - - - - Configuration of the IP firewall rules is performed using the ipfwadm - command. As I mentioned earlier, security is not something I am expert - at, so while I will present an example you can use, you should do your - own research and develop your own rules if security is important to - you. - - Probably the most common use of IP firewall is when you are using your - linux machine as a router and firewall gateway to protect your local - network from unauthorized access from outside your network. - - - The following configuration is based on a contribution from Arnt - Gulbrandsen, . - - The example describes the configuration of the firewall rules on the - Linux firewall/router machine illustrated in this diagram: - - - - - - - \ | 172.16.37.0 - \ | /255.255.255.0 - \ --------- | - | 172.16.174.30 | Linux | | - NET =================| f/w |------| ..37.19 - | PPP | router| | -------- - / --------- |--| Mail | - / | | /DNS | - / | -------- - - - - - - - The following commands would normally be placed in an rc file so that - they were automatically started each time the system boots. For - maximum security they would be performed after the network interfaces - are configured, but before the interfaces are actually brought up to - prevent anyone gaining access while the firewall machine is rebooting. - - - - #!/bin/sh - - # Flush the 'Forwarding' rules table - # Change the default policy to 'accept' - # - /sbin/ipfwadm -F -f - /sbin/ipfwadm -F -p accept - # - # .. 
and for 'Incoming' - # - /sbin/ipfwadm -I -f - /sbin/ipfwadm -I -p accept - - # First off, seal off the PPP interface - # I'd love to use '-a deny' instead of '-a reject -y' but then it - # would be impossible to originate connections on that interface too. - # The -o causes all rejected datagrams to be logged. This trades - # disk space against knowledge of an attack of configuration error. - # - /sbin/ipfwadm -I -a reject -y -o -P tcp -S 0/0 -D 172.16.174.30 - - # Throw away certain kinds of obviously forged packets right away: - # Nothing should come from multicast/anycast/broadcast addresses - # - /sbin/ipfwadm -F -a deny -o -S 224.0/3 -D 172.16.37.0/24 - # - # and nothing coming from the loopback network should ever be - # seen on a wire - # - /sbin/ipfwadm -F -a deny -o -S 127.0/8 -D 172.16.37.0/24 - - # accept incoming SMTP and DNS connections, but only - # to the Mail/Name Server - # - /sbin/ipfwadm -F -a accept -P tcp -S 0/0 -D 172.16.37.19 25 53 - # - # DNS uses UDP as well as TCP, so allow that too - # for questions to our name server - # - /sbin/ipfwadm -F -a accept -P udp -S 0/0 -D 172.16.37.19 53 - # - # but not "answers" coming to dangerous ports like NFS and - # Larry McVoy's NFS extension. If you run squid, add its port here. - # - /sbin/ipfwadm -F -a deny -o -P udp -S 0/0 53 \ - -D 172.16.37.0/24 2049 2050 - - # answers to other user ports are okay - # - /sbin/ipfwadm -F -a accept -P udp -S 0/0 53 \ - -D 172.16.37.0/24 53 1024:65535 - - # Reject incoming connections to identd - # We use 'reject' here so that the connecting host is told - # straight away not to bother continuing, otherwise we'd experience - # delays while ident timed out. - # - /sbin/ipfwadm -F -a reject -o -P tcp -S 0/0 -D 172.16.37.0/24 113 - - # Accept some common service connections from the 192.168.64 and - # 192.168.65 networks, they are friends that we trust. 
- # - /sbin/ipfwadm -F -a accept -P tcp -S 192.168.64.0/23 \ - -D 172.16.37.0/24 20:23 - - # accept and pass through anything originating inside - # - /sbin/ipfwadm -F -a accept -P tcp -S 172.16.37.0/24 -D 0/0 - - # deny most other incoming TCP connections and log them - # (append 1:1023 if you have problems with ftp not working) - # - /sbin/ipfwadm -F -a deny -o -y -P tcp -S 0/0 -D 172.16.37.0/24 - - # ... for UDP too - # - /sbin/ipfwadm -F -a deny -o -P udp -S 0/0 -D 172.16.37.0/24 - - - - Good firewall configurations are a little tricky. This example should - be a reasonable starting point for you. The ipfwadm manual page offers - some assistance in how to use the tool. If you intend to configure a - firewall, be sure to ask around and get as much advice from sources - you consider reliable and get someone to test/sanity check your - configuration from the outside. - - 6.7. IP Firewall (for Linux-2.2) - - The new firewalling code is accessed via ``IP Firewall Chains''. See - the IP chanins home page for more information. Among other things, - you'll now need to use ipchains instead of ipfwadm to configure your - filters. (From Documentation/Changes in the latest kernel sources). - - We are aware that this is a sorely out of date statement and we are - currently working on getting this section more current. You can expect - a newer version in August of 1999. - - - 8.7. Firewall - - A firewall is a device that protects a private network from the public - part (the internet as a whole). It is designed to control the flow of - packets based on the source, destination, port and packet type - information contained in each packet. - - Different firewall toolkits exist for Linux as well as built-in - support in the kernel. Other firewalls are TIS and SOCKS. These - firewall toolkits are very complete and combined with other tools - allow blocking/redirection of all kinds of traffic and protocols. 
- Different policies can be implemented via configuration files or GUI - programs. - - - · TIS home page - - · SOCKS - - · Firewall HOWTO - - - 8.8. Port forwarding - - An increasing number of web sites are becoming interactive by having - cgi-bins or Java applets that access some database or other service. - Since this access may pose a security problem, the machine containing - the database should not be directly connected to the Internet. - - Port Forwarding can provide an almost ideal solution to this access - problem. On the firewall, IP packets that come in to a specific port - number can be re-written and forwarded to the internal server - providing the actual service. The reply packets from the internal - server are re-written to make it appear that they came from the - firewall. - - Port forwarding information may be found here - - - 8.3. IP Masquerade - - IP Masquerade is a developing networking function in Linux. If a Linux - host is connected to the Internet with IP Masquerade enabled, then - computers connecting to it (either on the same LAN or connected with - modems) can reach the Internet as well, even though they have no - officially assigned IP addresses. This allows for reduction of costs, - since many people may be able to access the Internet using a single - modem connection as well as contributes to increased security (in some - way the machine is acting as a firewall, since unofficially assigned - addresses cannot be accessed outside of that network). - - IP masquerade related pages and documents: - - · http://ipmasq.home.ml.org/ - · http://www.indyramp.com/masq/links.pfhtml - · http://metalab.unc.edu/mdw/HOWTO/IP-Masquerade-HOWTO.html - -Firewalling-and-Masquerading - - - - -Masquerading Made Simple HOWTO - ------------------------------------------------------------------------------ - -Chapter 8. Miscellaneous - -8.1. 
Useful Resources - -  * [http://ipmasq.webhop.net/] IP Masquerade Resource page Will have all the - current information for setting up IP Masquerade on 2.0.x, 2.2.x, and - even old 1.2 kernels! - -  * [http://juanjox.kernelnotes.org] Juan Jose Ciarlante's WWW site who is - one of the current Linux IP Masquerade maintainers. A mirror can be fount - at [http://ipmasq.webhop.net/juanjox/] ipmasq.webhop.net/juanjox - -  * IP Masquerade mailing list Archives contains the recent messages sent to - the mailing lists. - -  * David Ranch's Linux page including the TrinityOS Linux document and - current versions of the IP-MASQ-HOWTO.. Topics such as IP MASQ, strong - IPFWADM/IPCHAINS rulesets, PPP, Diald, Cablemodems, DNS, Sendmail, Samba, - NFS, Security, etc. are covered. - -  * The IP Masquerading Applications page: A comprehensive list of - applications that work or can be tuned to work through a Linux IP - masquerading server. - -  * For users setting up IP Masq on MkLinux, email Taro Fukunaga at [mailto: - tarozax@earthlink.net] tarozax@earthlink.net for a copy of his short - MkLinux version of this HOWTO. - -  * IP masquerade FAQ has some general information - -  * Paul Russel's [http://www.netfilter.org/ipchains/] http:// - www.netfilter.org/ipchains/ doc and its possibly older backup at Linux - IPCHAINS HOWTO. This HOWTO has lots of information for IPCHAINS usage, as - well as source and binaries for the ipchains tool. - -  * [http://www.xos.nl/linux/ipfwadm/] X/OS Ipfwadm page contains sources, - binaries, documentation, and other information about the ipfwadm package - -  * Check out the GreatCircle's Firewall mailing list for a great resource - about strong firewall rulesets. - -  * The LDP Network Administrator's Guide is a MUST for the beginner Linux - administrator trying to set up a network. - -  * The [http://www.tldp.org/HOWTO/Net-HOWTO/index.html] Linux NET HOWTO is - also another comprehensive document on how to setup and configure Linux - networking. 
- -  * Linux ISP Hookup HOWTO and [http://www.tldp.org/HOWTO/PPP-HOWTO/ - index.html] Linux PPP HOWTO gives you information on how to connect your - Linux host to the Internet - -  * Linux Ethernet-Howto is a good source of information about setting up a - LAN running over Ethernet. - -  * Donald Becker's NIC drivers and Support Utils - -  * You may also be interested in [http://www.tldp.org/HOWTO/ - Firewall-HOWTO.html] Linux Firewalling and Proxy Server HOWTO - -  * Linux Kernel HOWTO will guide you through the kernel compilation process - -  * Other [http://www.tldp.org/HOWTO/HOWTO-INDEX/howtos.html] Linux HOWTOs - such as Kernel HOWTO - -  * Posting to the USENET newsgroup: [news:comp.os.linux.networking] - comp.os.linux.networking - ------------------------------------------------------------------------------ - -8.2. Linux IP Masquerade Resource - -The [http://ipmasq.webhop.net/] Linux IP Masquerade Resource is a website -dedicated to Linux IP Masquerade information also maintained by Ambrose Au. -It has the latest information related to IP Masquerade and may have -information that is not being included in the HOWTO. - -You may find the Linux IP Masquerade Resource at the following locations: - -  * [http://ipmasq.webhop.net/] http://ipmasq.webhop.net/, Primary Site, - redirected to [http://ipmasq.webhop.net/] http://ipmasq.webhop.net/ - -  * [http://ipmasq2.webhop.net/] http://ipmasq2.webhop.net/, Secondary Site, - redirected to [http://www.e-infomax.com/ipmasq/] http://www.e-infomax.com - /ipmasq - - diff --git a/LDP/guide/docbook/Linux-Networking/IP-Accounting.xml b/LDP/guide/docbook/Linux-Networking/IP-Accounting.xml deleted file mode 100644 index 0d98395b..00000000 --- a/LDP/guide/docbook/Linux-Networking/IP-Accounting.xml +++ /dev/null @@ -1,134 +0,0 @@ - - -IP-Accounting - - -This option of the Linux kernel keeps track of IP network traffic, -performs packet logging and produces some statistics. 
A series of -rules may be defined so when a packet matches a given pattern, some -action is performed: a counter is increased, it is accepted/rejected, -etc. - - - -6.3. IP Accounting (for Linux-2.0) -The IP accounting features of the Linux kernel allow you to collect -and analyze some network usage data. The data collected comprises the -number of packets and the number of bytes accumulated since the -figures were last reset. You may specify a variety of rules to -categorize the figures to suit whatever purpose you may have. This -option has been removed in kernel 2.1.102, because the old ipfwadm- -based firewalling was replaced by ``ipfwchains''. - - - - - Kernel Compile Options: - - Networking options ---> - [*] IP: accounting - - - - -After you have compiled and installed the kernel you need to use the -ipfwadm command to configure IP accounting. There are many different -ways of breaking down the accounting information that you might -choose. I've picked a simple example of what might be useful to use, -you should read the ipfwadm man page for more information. -Scenario: You have a ethernet network that is linked to the internet -via a PPP link. On the ethernet you have a machine that offers a -number of services and that you are interested in knowing how much -traffic is generated by each of ftp and world wide web traffic, as -well as total tcp and udp traffic. 
- - - -You might use a command set that looks like the following, which is -shown as a shell script: - - - - - #!/bin/sh - # - # Flush the accounting rules - ipfwadm -A -f - # - # Set shortcuts - localnet=44.136.8.96/29 - any=0/0 - # Add rules for local ethernet segment - ipfwadm -A in -a -P tcp -D $localnet ftp-data - ipfwadm -A out -a -P tcp -S $localnet ftp-data - ipfwadm -A in -a -P tcp -D $localnet www - ipfwadm -A out -a -P tcp -S $localnet www - ipfwadm -A in -a -P tcp -D $localnet - ipfwadm -A out -a -P tcp -S $localnet - ipfwadm -A in -a -P udp -D $localnet - ipfwadm -A out -a -P udp -S $localnet - # - # Rules for default - ipfwadm -A in -a -P tcp -D $any ftp-data - ipfwadm -A out -a -P tcp -S $any ftp-data - ipfwadm -A in -a -P tcp -D $any www - ipfwadm -A out -a -P tcp -S $any www - ipfwadm -A in -a -P tcp -D $any - ipfwadm -A out -a -P tcp -S $any - ipfwadm -A in -a -P udp -D $any - ipfwadm -A out -a -P udp -S $any - # - # List the rules - ipfwadm -A -l -n - # - - - - -The names ``ftp-data'' and ``www'' refer to lines in /etc/services. -The last command lists each of the Accounting rules and displays the -collected totals. - - - -An important point to note when analyzing IP accounting is that totals -for all rules that match will be incremented so that to obtain -differential figures you need to perform appropriate maths. For -example if I wanted to know how much data was not ftp nor www I would -substract the individual totals from the rule that matches all ports. 
- - - - - root# ipfwadm -A -l -n - IP accounting rules - pkts bytes dir prot source destination ports - 0 0 in tcp 0.0.0.0/0 44.136.8.96/29 * -> 20 - 0 0 out tcp 44.136.8.96/29 0.0.0.0/0 20 -> * - 10 1166 in tcp 0.0.0.0/0 44.136.8.96/29 * -> 80 - 10 572 out tcp 44.136.8.96/29 0.0.0.0/0 80 -> * - 252 10943 in tcp 0.0.0.0/0 44.136.8.96/29 * -> * - 231 18831 out tcp 44.136.8.96/29 0.0.0.0/0 * -> * - 0 0 in udp 0.0.0.0/0 44.136.8.96/29 * -> * - 0 0 out udp 44.136.8.96/29 0.0.0.0/0 * -> * - 0 0 in tcp 0.0.0.0/0 0.0.0.0/0 * -> 20 - 0 0 out tcp 0.0.0.0/0 0.0.0.0/0 20 -> * - 10 1166 in tcp 0.0.0.0/0 0.0.0.0/0 * -> 80 - 10 572 out tcp 0.0.0.0/0 0.0.0.0/0 80 -> * - 253 10983 in tcp 0.0.0.0/0 0.0.0.0/0 * -> * - 231 18831 out tcp 0.0.0.0/0 0.0.0.0/0 * -> * - 0 0 in udp 0.0.0.0/0 0.0.0.0/0 * -> * - 0 0 out udp 0.0.0.0/0 0.0.0.0/0 * -> * - - - - -6.4. IP Accounting (for Linux-2.2) - -The new accounting code is accessed via ``IP Firewall Chains''. See -the IP chains home page for more information. Among other things, -you'll now need to use ipchains instead of ipfwadm to configure your -filters. (From Documentation/Changes in the latest kernel sources). - - - diff --git a/LDP/guide/docbook/Linux-Networking/IP-Aliasing.xml b/LDP/guide/docbook/Linux-Networking/IP-Aliasing.xml deleted file mode 100644 index 7595053b..00000000 --- a/LDP/guide/docbook/Linux-Networking/IP-Aliasing.xml +++ /dev/null @@ -1,377 +0,0 @@ - - -IP-Aliasing - - -This is a cookbook recipe on how to set up and run IP aliasing on a Linux box -and how to set up the machine to receive e-mail on the aliased IP addresses. - - - -This feature of the Linux kernel provides the possibility of setting -multiple network addresses on the same low-level network device driver -(e.g two IP addresses in one Ethernet card). It is typically used for -services that act differently based on the address they listen on -(e.g. "multihosting" or "virtual domains" or "virtual hosting -services". 
- - - -There are some applications where being able to configure multiple IP -addresses to a single network device is useful. Internet Service -Providers often use this facility to provide a `customized' to their -World Wide Web and ftp offerings for their customers. You can refer to -the ``IP-Alias mini-HOWTO'' for more information than you find here. - - - -Quickstart: - - - -After compiling and installing your kernel with IP_Alias support -configuration is very simple. The aliases are added to virtual network -devices associated with the actual network device. A simple naming -convention applies to these devices being :, -e.g. eth0:0, ppp0:10 etc. Note that the the ifname:number device can -only be configured after the main interface has been set up. - - - -For example, assume you have an ethernet network that supports two -different IP subnetworks simultaneously and you wish your machine to -have direct access to both, you could use something like: - - - - - root# ifconfig eth0 192.168.1.1 netmask 255.255.255.0 up - root# route add -net 192.168.1.0 netmask 255.255.255.0 eth0 - root# ifconfig eth0:0 192.168.10.1 netmask 255.255.255.0 up - root# route add -net 192.168.10.0 netmask 255.255.255.0 eth0:0 - - - ------------------------------------------------------------------------------ - -1. My Setup - - - -  * IP Alias is standard in kernels 2.0.x and 2.2.x, and available as a - compile-time option in 2.4.x (IP Alias has been deprecated in 2.4.x and - replaced by a more powerful firewalling mechanism.) -  * IP Alias compiled as a loadable module. You would have indicated in the - "make config" command to make your kernel, that you want the IP Masq to - be compiled as a (M)odule. Check the Modules HOW-TO (if that exists) or - check the info in /usr/src/linux/Documentation/modules.txt. -  * I have to support 2 additional IPs over and above the IP already - allocated to me. 
-  * A D-Link DE620 pocket adapter (not important, works with any Linux - supported network adapter). - - - - - Kernel Compile Options: - - Networking options ---> - .... - [*] Network aliasing - .... - <*> IP: aliasing support - - - - ------------------------------------------------------------------------------ - - -2. Commands - - - -1. Load the IP Alias module (you can skip this step if you compiled the -module into the kernel): - - - - - /sbin/insmod /lib/modules/`uname -r`/ipv4/ip_alias.o - - - - -2. Setup the loopback, eth0, and all the IP addresses beginning with the -main IP address for the eth0 interface: - - - - - /sbin/ifconfig lo 127.0.0.1 - /sbin/ifconfig eth0 up - /sbin/ifconfig eth0 172.16.3.1 - /sbin/ifconfig eth0:0 172.16.3.10 - /sbin/ifconfig eth0:1 172.16.3.100 - - - - -172.16.3.1 is the main IP address, while .10 and .100 are the aliases. -The magic is the eth0:x where x=0,1,2,...n for the different IP -addresses. The main IP address does not need to be aliased. - - - -3. Setup the routes. First route the loopback, then the net, and finally, -the various IP addresses starting with the default (originally allocated) -one: - - - - - /sbin/route add -net 127.0.0.0 - /sbin/route add -net 172.16.3.0 dev eth0 - /sbin/route add -host 172.16.3.1 dev eth0 - /sbin/route add -host 172.16.3.10 dev eth0:0 - /sbin/route add -host 172.16.3.100 dev eth0:1 - /sbin/route add default gw 172.16.3.200 - - - - -That's it. - - - -In the example IP address above, I am using the Private IP addresses (RFC -1918) for illustrative purposes. Substitute them with your own official or -private IP addresses. - - - -The example shows only 3 IP addresses. The max is defined to be 256 in /usr/ -include/linux/net_alias.h. 256 IP addresses on ONE card is a lot :-)! 
- - - -Here's what my /sbin/ifconfig looks like: - - - - -lo Link encap:Local Loopback - inet addr:127.0.0.1 Bcast:127.255.255.255 Mask:255.0.0.0 - UP BROADCAST LOOPBACK RUNNING MTU:3584 Metric:1 - RX packets:5088 errors:0 dropped:0 overruns:0 - TX packets:5088 errors:0 dropped:0 overruns:0 - -eth0 Link encap:10Mbps Ethernet HWaddr 00:8E:B8:83:19:20 - inet addr:172.16.3.1 Bcast:172.16.3.255 Mask:255.255.255.0 - UP BROADCAST RUNNING PROMISC MULTICAST MTU:1500 Metric:1 - RX packets:334036 errors:0 dropped:0 overruns:0 - TX packets:11605 errors:0 dropped:0 overruns:0 - Interrupt:7 Base address:0x378 - -eth0:0 Link encap:10Mbps Ethernet HWaddr 00:8E:B8:83:19:20 - inet addr:172.16.3.10 Bcast:172.16.3.255 Mask:255.255.255.0 - UP BROADCAST RUNNING MTU:1500 Metric:1 - RX packets:0 errors:0 dropped:0 overruns:0 - TX packets:0 errors:0 dropped:0 overruns:0 - -eth0:1 Link encap:10Mbps Ethernet HWaddr 00:8E:B8:83:19:20 - inet addr:172.16.3.100 Bcast:172.16.3.255 Mask:255.255.255.0 - UP BROADCAST RUNNING MTU:1500 Metric:1 - RX packets:1 errors:0 dropped:0 overruns:0 - TX packets:0 errors:0 dropped:0 overruns:0 - - - - -And /proc/net/aliases: - - - - -device family address -eth0:0 2 172.16.3.10 -eth0:1 2 172.16.3.100 - - - - -And /proc/net/alias_types: - - - - -type name n_attach -2 ip 2 - - - - -Of course, the stuff in /proc/net was created by the ifconfig command and not -by hand! - ------------------------------------------------------------------------------ - - -3. Troubleshooting: Questions and Answers - - - -3.1. Question: How can I keep the settings through a reboot? - - - -Answer: Whether you are using BSD-style or SysV-style (Redhat?? for example) -init, you can always include it in /etc/rc.d/rc.local. Here's what I have on -my SysV init system (Redhat?? 3.0.3 and 4.0): - - - -My /etc/rc.d/rc.local: (edited to show the relevant portions) - - - - -#setting up IP alias interfaces -echo "Setting 172.16.3.1, 172.16.3.10, 172.16.3.100 IP Aliases ..." 
-/sbin/ifconfig lo 127.0.0.1 -/sbin/ifconfig eth0 up -/sbin/ifconfig eth0 172.16.3.1 -/sbin/ifconfig eth0:0 172.16.3.10 -/sbin/ifconfig eth0:1 172.16.3.100 -#setting up the routes -echo "Setting IP routes ..." -/sbin/route add -net 127.0.0.0 -/sbin/route add -net 172.16.3.0 dev eth0 -/sbin/route add -host 172.16.3.1 eth0 -/sbin/route add -host 172.16.3.10 eth0:0 -/sbin/route add -host 172.16.3.100 eth0:1 -/sbin/route add default gw 172.16.3.200 -# - - ------------------------------------------------------------------------------ - - -3.2. Question: How do I set up the IP aliased machine to receive e-mail on -the various aliased IP addresses (on a machine using sendmail)? - - - -Answer: Create (if it doesn't already exist) a file called, /etc/ -mynames.cw,for example. The file does not have to be this exact name nor in -the /etc directory. - - - -In that file, place the official domain names of the aliased IP addresses. If -these aliased IP addresses do not have a domain name, then you can place the -IP address itself. - - - -The /etc/mynames.cw might look like this: - - - - -# /etc/mynames.cw - include all aliases for your machine here; # is a comment -domain.one.net -domain.two.com -domain.three.org -4.5.6.7 - - - - -In your sendmail.cf file, where it defines a file class macro Fw, add the -following: - - - - -################## -# local info # -################## - -# file containing names of hosts for which we receive email -Fw/etc/mynames.cw - -That should do it. Test out the new setting by invoking sendmail in test -mode. The following is an example: -ganymede$ /usr/lib/sendmail -bt -ADDRESS TEST MODE (ruleset 3 NOT automatically invoked) -Enter < ruleset> < address> -> 0 me@4.5.6.7 -rewrite: ruleset 0 input: me @ 4 . 5 . 6 . 7 -rewrite: ruleset 98 input: me @ 4 . 5 . 6 . 7 -rewrite: ruleset 98 returns: me @ 4 . 5 . 6 . 7 -rewrite: ruleset 97 input: me @ 4 . 5 . 6 . 7 -rewrite: ruleset 3 input: me @ 4 . 5 . 6 . 7 -rewrite: ruleset 96 input: me < @ 4 . 5 . 6 . 
7 > -rewrite: ruleset 96 returns: me < @ 4 . 5 . 6 . 7 . > -rewrite: ruleset 3 returns: me < @ 4 . 5 . 6 . 7 . > -rewrite: ruleset 0 input: me < @ 4 . 5 . 6 . 7 . > -rewrite: ruleset 98 input: me < @ 4 . 5 . 6 . 7 . > -rewrite: ruleset 98 returns: me < @ 4 . 5 . 6 . 7 . > -rewrite: ruleset 0 returns: $# local $: me -rewrite: ruleset 97 returns: $# local $: me -rewrite: ruleset 0 returns: $# local $: me -> 0 me@4.5.6.8 -rewrite: ruleset 0 input: me @ 4 . 5 . 6 . 8 -rewrite: ruleset 98 input: me @ 4 . 5 . 6 . 8 -rewrite: ruleset 98 returns: me @ 4 . 5 . 6 . 8 -rewrite: ruleset 97 input: me @ 4 . 5 . 6 . 8 -rewrite: ruleset 3 input: me @ 4 . 5 . 6 . 8 -rewrite: ruleset 96 input: me < @ 4 . 5 . 6 . 8 > -rewrite: ruleset 96 returns: me < @ 4 . 5 . 6 . 8 > -rewrite: ruleset 3 returns: me < @ 4 . 5 . 6 . 8 > -rewrite: ruleset 0 input: me < @ 4 . 5 . 6 . 8 > -rewrite: ruleset 98 input: me < @ 4 . 5 . 6 . 8 > -rewrite: ruleset 98 returns: me < @ 4 . 5 . 6 . 8 > -rewrite: ruleset 95 input: < > me < @ 4 . 5 . 6 . 8 > -rewrite: ruleset 95 returns: me < @ 4 . 5 . 6 . 8 > -rewrite: ruleset 0 returns: $# smtp $@ 4 . 5 . 6 . 8 $: me < @ 4 . 5 . 6 . 8 > -rewrite: ruleset 97 returns: $# smtp $@ 4 . 5 . 6 . 8 $: me < @ 4 . 5 . 6 . 8 > -rewrite: ruleset 0 returns: $# smtp $@ 4 . 5 . 6 . 8 $: me < @ 4 . 5 . 6 . 8 > -> - - - - -Notice when I tested me@4.5.6.7, it delivered the mail to the local machine, -while me@4.5.6.8 was handed off to the smtp mailer. That is the correct -response. - - - -3.3. Question: How do I delete an alias? - - - -Answer: To delete an alias you simply add a `-' to the end of its name and -refer to it and is as simple as: - - - - - root# ifconfig eth0:0- 0 - - - - -All routes associated with that alias will also be deleted -automatically. - - - - - -You are all set now. 
- - - diff --git a/LDP/guide/docbook/Linux-Networking/ISDN.xml b/LDP/guide/docbook/Linux-Networking/ISDN.xml deleted file mode 100644 index 1b658f76..00000000 --- a/LDP/guide/docbook/Linux-Networking/ISDN.xml +++ /dev/null @@ -1,101 +0,0 @@ - - -ISDN - - -The Integrated Services Digital Network (ISDN) is a series of -standards that specify a general purpose switched digital data -network. An ISDN `call' creates a synchronous point to point data -service to the destination. ISDN is generally delivered on a high -speed link that is broken down into a number of discrete channels. -There are two different types of channels, the `B Channels' which will -actually carry the user data and a single channel called the `D -channel' which is used to send control information to the ISDN -exchange to establish calls and other functions. In Australia for -example, ISDN may be delivered on a 2Mbps link that is broken into 30 -discrete 64kbps B channels with one 64kbps D channel. Any number of -channels may be used at a time and in any combination. You could for -example establish 30 separate calls to 30 different destinations at -64kbps each, or you could establish 15 calls to 15 different -destinations at 128kbps each (two channels used per call), or just a -small number of calls and leave the rest idle. A channel may be used -for either incoming or outgoing calls. The original intention of ISDN -was to allow Telecommunications companies to provide a single data -service which could deliver either telephone (via digitised voice) or -data services to your home or business without requiring you to make -any special configuration changes. - - - -There are a few different ways to connect your computer to an ISDN -service. One way is to use a device called a `Terminal Adaptor' which -plugs into the Network Terminating Unit that your telecommunications -carrier will have installed when you got your ISDN service and -presents a number of serial interfaces.
One of those interfaces is -used to enter commands to establish calls and configuration and the -others are actually connected to the network devices that will use the -data circuits when they are established. Linux will work in this sort -of configuration without modification, you just treat the port on the -Terminal Adaptor like you would treat any other serial device. -Another way, which is the way the kernel ISDN support is designed for -allows you to install an ISDN card into your Linux machine and then -has your Linux software handle the protocols and make the calls -itself. - - - -The Linux kernel has built-in ISDN capabilities. Isdn4linux controls -ISDN PC cards and can emulate a modem with the Hayes command set ("AT" -commands). The possibilities range from simply using a terminal -program to connections via HDLC (using included devices) to full -connection to the Internet with PPP to audio applications. - -· FAQ for isdn4linux: http://www.isdn4linux.de/faq/ - - - - - Kernel Compile Options: - - ISDN subsystem ---> - <*> ISDN support - [ ] Support synchronous PPP - [ ] Support audio via ISDN - < > ICN 2B and 4B support - < > PCBIT-D support - < > Teles/NICCY1016PC/Creatix support - - - - -The Linux implementation of ISDN supports a number of different types -of internal ISDN cards. These are those listed in the kernel -configuration options: - - - · ICN 2B and 4B - · Octal PCBIT-D - · Teles ISDN-cards and compatibles - - -Some of these cards require software to be downloaded to them to make -them operational. There is a separate utility to do this with. - - - -Full details on how to configure the Linux ISDN support are available -from the /usr/src/linux/Documentation/isdn/ directory and an FAQ -dedicated to isdn4linux is available at www.lrz-muenchen.de. (You can -click on the English flag to get an English version). - - - -A note about PPP. The PPP suite of protocols will operate over either -asynchronous or synchronous serial lines.
The commonly distributed PPP -daemon for Linux `pppd' supports only asynchronous mode. If you wish -to run the PPP protocols over your ISDN service you need a specially -modified version. Details of where to find it are available in the -documentation referred to above. - - - diff --git a/LDP/guide/docbook/Linux-Networking/Internet.xml b/LDP/guide/docbook/Linux-Networking/Internet.xml deleted file mode 100644 index 507e1e5b..00000000 --- a/LDP/guide/docbook/Linux-Networking/Internet.xml +++ /dev/null @@ -1,21 +0,0 @@ - - -Internet - - -Internet is not described as a network of any single kind. It can be construed -as a large set of heterogeneous networks that support a certain set of protocols -(TCP/IP) and provide certain common services. A good way to learn about the -Internet is to use the Internet! - - - -Linux is a great platform to act as an Intranet / Internet server. The -term Intranet refers to the application of Internet technologies -inside an organisation mainly for the purpose of distributing and -making available information inside the company. Internet and Intranet -services offered by Linux include mail, news, WWW servers and many -more that will be outlined further on in this document. - - - diff --git a/LDP/guide/docbook/Linux-Networking/Load-Balancing.xml b/LDP/guide/docbook/Linux-Networking/Load-Balancing.xml deleted file mode 100644 index c1dbc13b..00000000 --- a/LDP/guide/docbook/Linux-Networking/Load-Balancing.xml +++ /dev/null @@ -1,24 +0,0 @@ - - -Load-Balancing - - -Demand for load balancing usually arises in database/web access when -many clients make simultaneous requests to a server. It would be -desirable to have multiple identical servers and redirect requests to -the least loaded server. This can be achieved through Network Address -Translation techniques (NAT) of which IP masquerading is a subset.
-Network administrators can replace a single server providing Web -services - or any other application - with a logical pool of servers -sharing a common IP address. Incoming connections are directed to a -particular server using one load-balancing algorithm. The virtual -server rewrites incoming and outgoing packets to give clients the -appearance that only one server exists. - - - -Linux IP-NAT information may be found here - - - diff --git a/LDP/guide/docbook/Linux-Networking/Multicast.xml b/LDP/guide/docbook/Linux-Networking/Multicast.xml deleted file mode 100644 index 7543a8c2..00000000 --- a/LDP/guide/docbook/Linux-Networking/Multicast.xml +++ /dev/null @@ -1,1465 +0,0 @@ - - -Multicasting - - - This HOWTO tries to cover most aspects related to multicast over - TCP/IP networks. So, a lot of information within it is not Linux-spe- - cific (just in case you don't use GNU/Linux... yet). Multicast is cur- - rently an active area of research and, at the time of writing, many of - the "standards" are merely drafts. Keep it in mind while reading the - lines that follow. - - 1. Introduction. - - I'll try to give here the most wide range, up to date and accurate - information related to multicasting over TCP/IP networks that I can. - Any feedback is very welcome. If you find any mistakes in this - document, have any comments about its contents or an update or - addition, please send them to me at the address listed at the top of - this howto. - 1.1. What is Multicast. - - Multicast is... a need. Well, at least in some scenarios. If you have - information (a lot of information, usually) that should be transmitted - to various (but usually not all) hosts over an internet, then - Multicast is the answer. One common situation in which it is used is - when distributing real time audio and video to the set of hosts which - have joined a distributed conference. 
- - Multicast is much like radio or TV in the sense that only those who - have tuned their receivers (by selecting a particular frequency they - are interested in) receive the information. That is: you hear the - channel you are interested in, but not the others. - - 1.2. The problem with Unicast. - - Unicast is anything that is neither broadcast nor multicast. All right, - the definition is not very bright... When you send a packet and there - is only one sender process -yours- and one recipient process (the one - you are sending the packet to), then this is unicast. TCP is, by its - own nature, unicast oriented. UDP supports a lot more paradigms, but - if you are sending UDP packets and there is only one process supposed - to receive them, this is unicast too. - - For years unicast transmissions proved to be enough for the Internet. - It was not until 1993 that the first implementation of multicast saw - the light in the 4.4 BSD release. It seems nobody needed it until - then. Which were those new problems that multicast addressed? - - Needless to say that the Internet has changed a lot since the "early - days". Particularly, the appearance of the Web strongly transformed - the situation: people didn't just want connections to remote hosts, - mail and FTP. First they wanted to see the pictures people placed in - their home pages, but later they also wanted to see and hear those - people. - - With today's technology it is possible to afford the "cost" of making - a unicast connection with everyone who wants to see your web page. - However, if you are to send audio and video, which needs a huge amount - of bandwidth compared with web applications, you have -you had, until - multicast came onto the scene- two options: to establish a separate - unicast connection with each of the recipients, or to use broadcast.
- The first solution is not affordable: if we said that a single - connection sending audio/video consumes a huge bandwidth, imagine - having to establish hundreds or, may be, thousands of those - connections. Both the sending computer and your network would - collapse. - - Broadcast seems to be a solution, but it's not certainly the solution. - If you want all the hosts in your LAN to attend the conference, you - may use broadcast. Packets will be sent only once and every host will - receive them as they are sent to the broadcast address. The problem is - that perhaps only a few of the hosts and not all are interested in - those packets. Furthermore: perhaps some hosts are really interested - in your conference, but they are outside of your LAN, a few routers - away. And you know that broadcast works fine inside a LAN, but - problems arise when you want broadcast packets to be routed across - different LANs. - - The best solution seems to be one in which you send packets to a - certain special address (a certain frequency in radio/TV - transmissions). Then, all hosts which have decided to join the - conference will be aware of packets with that destination address, - read them when they traverse the network, and pass them to the IP - layer to be demultiplexed. This is similar to broadcasting in that you - send only one broadcast packet and all the hosts in the network - recognize and read it; it differs, however, in that not all multicast - packets are read and processed, but only those that were previously - registered in the kernel as being "of interest". - - Those special packets are routed at kernel level like any packet - because they are IP packets. The only difference might reside in the - routing algorithm which tells the kernel where to route or not to - route them. - - - - 2. Multicast Explained. - - 2.1. Multicast addresses. 
- - As you probably know, the range of IP addresses is divided into - "classes" based on the high order bits of a 32 bits IP address: - - - ______________________________________________________________________ - Bit --> 0 31 Address Range: - +-+----------------------------+ - |0| Class A Address | 0.0.0.0 - 127.255.255.255 - +-+----------------------------+ - +-+-+--------------------------+ - |1 0| Class B Address | 128.0.0.0 - 191.255.255.255 - +-+-+--------------------------+ - +-+-+-+------------------------+ - |1 1 0| Class C Address | 192.0.0.0 - 223.255.255.255 - +-+-+-+------------------------+ - +-+-+-+-+----------------------+ - |1 1 1 0| MULTICAST Address | 224.0.0.0 - 239.255.255.255 - +-+-+-+-+----------------------+ - +-+-+-+-+-+--------------------+ - |1 1 1 1 0| Reserved | 240.0.0.0 - 247.255.255.255 - +-+-+-+-+-+--------------------+ - ______________________________________________________________________ - - - - The one which concerns us is the "Class D Address". Every IP datagram - whose destination address starts with "1110" is an IP Multicast - datagram. - - The remaining 28 bits identify the multicast "group" the datagram is - sent to. Following with the previous analogy, you have to tune your - radio to hear a program that is transmitted at some specific - frequency, in the same way you have to "tune" your kernel to receive - packets sent to an specific multicast group. When you do that, it's - said that the host has joined that group in the interface you - specified. More on this later. - - There are some special multicast groups, say "well known multicast - groups", you should not use in your particular applications due the - special purpose they are destined to: - - - o 224.0.0.1 is the all-hosts group. If you ping that group, all - multicast capable hosts on the network should answer, as every - multicast capable host must join that group at start-up on all it's - multicast capable interfaces. - - - o 224.0.0.2 is the all-routers group. 
All multicast routers must join - that group on all its multicast capable interfaces. - - o 224.0.0.4 is the all DVMRP routers, 224.0.0.5 the all OSPF routers, - 224.0.0.13 the all PIM routers, etc. - - All these special multicast groups are regularly published in the - "Assigned Numbers" RFC. - - In any case, range 224.0.0.0 through 224.0.0.255 is reserved for local - purposes (such as administrative and maintenance tasks) and datagrams - destined to them are never forwarded by multicast routers. Similarly, - the range 239.0.0.0 to 239.255.255.255 has been reserved for - "administrative scoping" (see section 2.3.1 for information on - administrative scoping). - - - - 2.2. Levels of conformance. - - Hosts can be in three different levels of conformance with the - Multicast specification, according to the requirements they meet. - - Level 0 is the "no support for IP Multicasting" level. Lots of hosts - and routers in the Internet are in this state, as multicast support is - not mandatory in IPv4 (it is, however, in IPv6). Not too much - explanation is needed here: hosts in this level can neither send nor - receive multicast packets. They must ignore the ones sent by other - multicast capable hosts. - - Level 1 is the "support for sending but not receiving multicast IP - datagrams" level. Thus, note that it is not necessary to join a - multicast group to be able to send datagrams to it. Very few additions - are needed in the IP module to make a "Level 0" host "Level - 1-compliant", as shown in section 2.3. - - Level 2 is the "full support for IP multicasting" level. Level 2 hosts - must be able to both send and receive multicast traffic. They must - know the way to join and leave multicast groups and to propagate this - information to multicast routers. Thus, they must include an Internet - Group Management Protocol (IGMP) implementation in their TCP/IP stack. - - - - 2.3. Sending Multicast Datagrams.
- - By now, it should be obvious that multicast traffic is handled at the - transport layer with UDP, as TCP provides point-to-point connections, - which are not feasible for multicast traffic. (Heavy research is taking place - to define and implement new multicast-oriented transport protocols. - See section ``Multicast Transport Protocols'' for details). - - In principle, an application just needs to open a UDP socket and fill - with a class D multicast address the destination address where it - wants to send data to. However, there are some operations that a - sending process must be able to control. - - - - 2.3.1. TTL. - - The TTL (Time To Live) field in the IP header has a double - significance in multicast. As always, it controls the lifetime of the - datagram to avoid it being looped forever due to routing errors. - Routers decrement the TTL of every datagram as it traverses from one - network to another and when its value reaches 0 the packet is dropped. - The TTL in IPv4 multicasting has also the meaning of "threshold". Its - use becomes evident with an example: suppose you set a long, bandwidth - consuming, video conference between all the hosts belonging to your - department. You want that huge amount of traffic to remain in your - LAN. Perhaps your department is big enough to have various LANs. In - that case you want those hosts belonging to each of your LANs to - attend the conference, but in no case do you want to collapse the entire - Internet with your multicast traffic. There is a need to limit how - "long" multicast traffic will expand across routers. That's what the - TTL is used for. Routers have a TTL threshold assigned to each of their - interfaces, and only datagrams with a TTL greater than the interface's - threshold are forwarded. Note that when a datagram traverses a router - with a certain threshold assigned, the datagram's TTL is not - decremented by the value of the threshold. Only a comparison is made.
- (As before, the TTL is decremented by 1 each time a datagram passes - across a router). - - A list of TTL thresholds and their associated scope follows: - - - ______________________________________________________________________ - TTL Scope - ---------------------------------------------------------------------- - 0 Restricted to the same host. Won't be output by any interface. - 1 Restricted to the same subnet. Won't be forwarded by a router. - <32 Restricted to the same site, organization or department. - <64 Restricted to the same region. - <128 Restricted to the same continent. - <255 Unrestricted in scope. Global. - ______________________________________________________________________ - - - - Nobody knows what "site" or "region" mean exactly. It is up to the - administrators to decide what this limits apply to. - - The TTL-trick is not always flexible enough for all needs, specially - when dealing with overlapping regions or trying to establish - geographic, topologic and bandwidth limits simultaneously. To solve - this problems, administratively scoped IPv4 multicast regions were - established in 1994. (see D. Meyer's "Administratively Scoped IP - Multicast" Internet draft). It does scoping based on multicast - addresses rather than on TTLs. The range 239.0.0.0 to 239.255.255.255 - is reserved for this administrative scoping. - - - - 2.3.2. Loopback. - - When the sending host is Level 2 conformant and is also a member of - the group datagrams are being sent to, a copy is looped back by - default. This does not mean that the interface card reads its own - transmission, recognizes it as belonging to a group the interface - belongs to, and reads it from the network. On the contrary, is the IP - layer which, by default, recognizes the to-be-sent datagram and copies - and queues it on the IP input queue before sending it. - - This feature is desirable in some cases, but not in others. So the - sending process can turn it on and off at wish. - - 2.3.3. 
Interface selection. - - Hosts attached to more than one network should provide a way for - applications to decide which network interface will be used to output - the transmissions. If not specified, the kernel chooses a default one - based on system administrator's configuration. - - 2.4. Receiving Multicast Datagrams. - - 2.4.1. Joining a Multicast Group. - - Broadcast is (in comparison) easier to implement than multicast. It - doesn't require processes to give the kernel some rules regarding what - to do with broadcast packets. The kernel just knows what to do: read - and deliver all of them to the proper applications. - - With multicast, however, it is necessary to advise the kernel which - multicast groups we are interested in. That is, we have to ask the - kernel to "join" those multicast groups. Depending on the underlying - hardware, multicast datagrams are filtered by the hardware or by the - IP layer (and, in some cases, by both). Only those with a destination - group previously registered via a join are accepted. - - Essentially, when we join a group we are telling the kernel: "OK. I - know that, by default, you ignore multicast datagrams, but remember - that I am interested in this multicast group. So, do read and deliver - (to any process interested in them, not only to me) any datagram that - you see in this network interface with this multicast group in its - destination field". - - Some considerations: first, note that you don't just join a group. - You join a group on a particular network interface. Of course, it is - possible to join the same group on more than one interface. If you - don't specify a concrete interface, then the kernel will choose it - based on its routing tables when datagrams are to be sent. It is also - possible that more than one process joins the same multicast group on - the same interface. They will all receive the datagrams sent to that - group via that interface. 
- - As said before, any multicast-capable hosts join the all-hosts group - at start-up, so "pinging" 224.0.0.1 returns all hosts in the network - that have multicast enabled. - - Finally, consider that for a process to receive multicast datagrams it - has to ask the kernel to join the group and bind the port those - datagrams were being sent to. The UDP layer uses both the destination - address and port to demultiplex the packets and decide which socket(s) - to deliver them to. - - 2.4.2. Leaving a Multicast Group. - - When a process is no longer interested in a multicast group, it - informs the kernel that it wants to leave that group. It is important - to understand that this doesn't mean that the kernel will no longer - accept multicast datagrams destined to that multicast group. It will - still do so if there are more processes who issued a "multicast join" - petition for that group and are still interested. In that case the - host remains a member of the group, until all the processes decide to - leave the group. - - Even more: if you leave the group, but remain bound to the port you - were receiving the multicast traffic on, and there are more processes - that joined the group, you will still receive the multicast - transmissions. - - The idea is that joining a multicast group only tells the IP and data - link layer (which in some cases explicitly tells the hardware) to - accept multicast datagrams destined to that group. It is not a per- - process membership, but a per-host membership. - - - - 2.4.3. Mapping of IP Multicast Addresses to Ethernet/FDDI addresses. - - Both Ethernet and FDDI frames have a 48 bit destination address field. - In order to avoid a kind of multicast ARP to map multicast IP - addresses to ethernet/FDDI ones, the IANA reserved a range of - addresses for multicast: every ethernet/FDDI frame with its - destination in the range 01-00-5e-00-00-00 to 01-00-5e-ff-ff-ff (hex) - contains data for a multicast group.
The prefix 01-00-5e identifies - the frame as multicast, the next bit is always 0 and so only 23 bits - are left to the multicast address. As IP multicast groups are 28 bits - long, the mapping can not be one-to-one. Only the 23 least significant - bits of the IP multicast group are placed in the frame. The remaining - 5 high-order bits are ignored, resulting in 32 different multicast - groups being mapped to the same ethernet/FDDI address. This means that - the ethernet layer acts as an imperfect filter, and the IP layer will - have to decide whether to accept the datagrams the data-link layer - passed to it. The IP layer acts as a definitive perfect filter. - - Full details on IP Multicasting over FDDI are given in RFC 1390: - "Transmission of IP and ARP over FDDI Networks". For more information - on mapping IP Multicast addresses to ethernet ones, you may consult - draft-ietf-mboned-intro-multicast-03.txt: "Introduction to IP - Multicast Routing". - - If you are interested in IP Multicasting over Token-Ring Local Area - Networks, see RFC 1469 for details. - - - - 3. Kernel requirements and configuration. - - Linux is, of course (you doubted it?), full Level-2 Multicast- - Compliant. It meets all requirements to send, receive and act as a - router (mrouter) for multicast datagrams. - - If you want just to send and receive, you must say yes to "IP: - multicasting" when configuring your kernel. If you also want your - Linux box to act as a multicast router (mrouter) you also need to - enable multicast routing in the kernel by selecting "IP: - forwarding/gatewaying", "IP: multicast routing" and "IP: tunneling", - the latter because new versions of mrouted rely on IP tunneling to - send multicast datagrams encapsulated into unicast ones. This is - necessary when establishing tunnels between multicast hosts separated - by unicast-only networks and routers.
(The mrouted is a daemon that - implements the multicast routing algorithm -the routing policy- and - instructs the kernel on how to route multicast datagrams). - - Some kernel versions label multicast routing as "EXPERIMENTAL", so you - should enable "Prompt for development and/or incomplete code/drivers" - in the "Code maturity level options" section. - - If, when running the mrouted, traffic generated in the same network - your Linux box is connected to is correctly forwarded to the other - network, but you can't see the other's network traffic on your local - network, check whether you are receiving ICMP protocol error messages. - Almost certainly you forgot to turn on IP tunneling in your Linux router. - It's a kind of stupid error when you know it but, believe me, it's - quite time-consuming when you don't, and there is no apparent reason - that explains what is going wrong. A sniffer proves to be quite useful - in these situations! - - (You can see more on multicast routing on section ``Routing Policies - and Forwarding Techniques''; mrouted and tunnels are also explained in - sections ``The MBone'' and ``Multicast applications''). - - Once you have compiled and installed your new kernel, you should - provide a default route for multicast traffic. The goal is to add a - route to the network 224.0.0.0. - - The problem most people seem to face in this stage of the - configuration is with the value of the mask to supply. If you have - read Terry Dawson's excellent NET-3-HOWTO, it should not be difficult - to guess the correct value, though. As explained there, the netmask is - a 32 bit number filled with all-1s in the network part of your IP - address, and with all-0s in the host part. Recall from section 2.1 - that a class D multicast address has no network/host sections. Instead - it has a 28-bit group identifier and a 4-bit class D identifier. Well, - these 4 bits are the network part and the remaining 28 the host part.
- So the netmask needed is 11110000000000000000000000000000 or, easier - to read: 240.0.0.0. Then, the full command should be: - - - route add 224.0.0.0 netmask 240.0.0.0 dev eth0 - - - - Depending on how old your route program is, you might need to add the - -net flag after the add. - - Here we supposed that eth0 was multicast-capable and that, when not - otherwise specified, we wanted multicast traffic to be output there. - If this is not your case, change the dev parameter as appropriate. - - The /proc filesystem proves here to be useful once again: you can - check /proc/net/igmp to see the groups your host is currently - subscribed to. - - - - 4. The MBone. - - Using a new technology usually carries some advantages and - disadvantages. The advantages of multicast are -I think- clear. The - main disadvantage is that hundreds of hosts and, specially, routers - don't support it yet. As a consequence, people who started working on - multicast, bought new equipment, modified their operating systems, and - built multicast islands in their local places. Then they discovered - that it was difficult to communicate with people doing similar things - because if only one of the routers between them didn't support - multicast there was nothing to do... - - The solution was clear: they decided to build a virtual multicast - network in the top of the Internet. That is: sites with multicast - routers between them could communicate directly. But sites joined - across unicast routers would send their island's multicast traffic - encapsulated in unicast packets to other multicast islands. Routers in - the middle would not have problems, as they would be dealing with - unicast traffic. Finally, in the receiving site, traffic would be de- - encapsulated, and sent to the island in the original multicast way. - Two ends converting from multicast to unicast, and then again to - multicast define what is called a multicast tunnel. 
- - The MBone or Multicast Backbone is that virtual multicast network - based on multicast islands connected by multicast tunnels. - - Several activities take place in the MBone daily, but it deserves to - be remarked the profusion of tele-conferences with real time audio and - video taking place across the whole Internet. As an example, it was - recently transmitted (live) the talk Linus Torvalds gave to the - Silicon Valley Linux Users Group. - - For more information on the MBone, see: - - - - - - 5. Multicast applications. - - Most people dealing with multicast, sooner or later decide to connect - to the MBone, and then they usually need an mrouted. You'll also need - it if you don't have a multicast-capable router and you want multicast - traffic generated in one of your subnets to be "heard" on another. - mrouted does circumvent the problem of sending multicast traffic - across unicast routers -it encapsulates multicast datagrams into - unicast ones (IP into IP)- but this is not the only feature it - provides. Most important, it instructs the kernel on how to route (or - not-to-route) multicast datagrams based on their source and - destination. So, even having a multicast capable router, mrouted can - be used to tell it what to do with the datagrams (note I said what, - and not how; mrouted says "forward this to the network connected to - that interface", but actual forwarding is performed by the kernel). - This distinction between actual-forwarding and the algorithm that - decides who and how to forward is very useful as it allows to write - forwarding code only once and place it into the kernel. Forwarding - algorithms and policies are then implemented in user space daemons, so - it is very easy to change from one policy to another without the need - of kernel re-compilation. - - You can get a version of mrouted ported to Linux from: - - . This site is mirrored - all across the world. Be sure to read the - file to choose the one - nearest you.
- - Next, we'll focus specially on multicast applications written to - connect to the MBone, which have been ported to Linux. The list is - picked up from Michael Esler's "Linux Multicast Information" page - . I recommend you that - page for lots of information and resources on multicast and Linux. - - - Audio Conferencing - - o NeVoT - Network Voice Terminal - - o RAT - UCL Robust-Audio Tool - - o vat - LBL visual audio tool - - Video Conferencing - - o ivs - Inria video conferencing system - - - o nv - Network video tool - o nv w/ Meteor - Release of nv w/ support for the Matrox Meteor (UVa) - - - o vic - LBL video conferencing tool - - o vic w/ Meteor - Release of vic w/ support for the Matrox Meteor - (UVa) - - - Other Utilities - - o mmphone Multimedia phone service - - - o wb - LBL shared white board - - o webcast - Reliable multicast application for linking Mosaic - browsers - - - Session Tools - - I placed session tools later because I think they deserve some - explanation. When a conference takes places, several multicast groups - and ports are assigned to each service you want for your conference - (audio, video, shared white-boards, etc...) Announces of the - conferences that will take place, along with information on multicast - groups, ports and programs that will be used (vic, vat, ...) are - periodically multicasted to the MBone. Session tools "hear" this - information and present you in an easy way which conferences are - taking (or will take) place, so you can decide which interest you. - Also, they facilitate the task of joining a session. Instead of - launching each program that will be used and telling which multicast - group/port to join, you usually just need to click and the session - tool launches the proper programs suppling them all information needed - to join the conference. Session tools usually let you announce your - own conferences on the MBone. 
- - - o gwTTS - University of Virginia tele-tutoring system - - - o isc - Integrated session controller - - - o mmcc - Multimedia conference control - - - o sd - LBL session directory tool - - - o sd-snoop - Tenet Group session directory snoop utility - - - o sdr - UCL's next generation session directory - - - - - 6. Multicast programming. - - Multicast programming... or writing your own multicast applications. - - Several extensions to the programming API are needed in order to - support multicast. All of them are handled via two system calls: - setsockopt() (used to pass information to the kernel) and getsockopt() - (to retrieve information regarding multicast behavior). This does not - mean that 2 new system calls were added to support multicast. The pair - setsockopt()/getsockopt() has been there for years. Since 4.2 BSD at - least. The addition consists of a new set of options (multicast - options) that are passed to these system calls, that the kernel must - understand. - - The following are the setsockopt()/getsockopt() function prototypes: - - - int getsockopt(int s, int level, int optname, void* optval, int* optlen); - - int setsockopt(int s, int level, int optname, const void* optval, int optlen); - - - - The first parameter, s, is the socket the system call applies to. For - multicasting, it must be a socket of the family AF_INET and its type - may be either SOCK_DGRAM or SOCK_RAW. The most common use is with - SOCK_DGRAM sockets, but if you plan to write a routing daemon or - modify some existing one, you will probably need to use SOCK_RAW ones. - - The second one, level, identifies the layer that is to handle the - option, message or query, whatever you want to call it. So, SOL_SOCKET - is for the socket layer, IPPROTO_IP for the IP layer, etc... For - multicast programming, level will always be IPPROTO_IP. - - optname identifies the option we are setting/getting. Its value - (either supplied by the program or returned by the kernel) is optval.
- The optnames involved in multicast programming are the following: - - ______________________________________________________________________ - setsockopt() getsockopt() - IP_MULTICAST_LOOP yes yes - IP_MULTICAST_TTL yes yes - IP_MULTICAST_IF yes yes - IP_ADD_MEMBERSHIP yes no - IP_DROP_MEMBERSHIP yes no - ______________________________________________________________________ - - optlen carries the size of the data structure optval points to. Note - that in getsockopt() it is a value-result rather than a value: the - kernel writes the value of optname in the buffer pointed by optval and - informs us of that value's size via optlen. - - Both setsockopt() and getsockopt() return 0 on success and -1 on - error. - - 6.1. IP_MULTICAST_LOOP. - - You have to decide, as the application writer, whether you want the - data you send to be looped back to your host or not. If you plan to - have more than one process or user "listening", loopback must be - enabled. On the other hand, if you are sending the images your video - camera is producing, you probably don't want loopback, even if you - want to see yourself on the screen. In that latter case, your - application will probably receive the images from a device attached to - the computer and send them to the socket. As the application already - "has" that data, it is improbable it wants to receive it again on the - socket. Loopback is by default enabled. - - Regard that optval is a pointer. You can't write: - - setsockopt(socket, IPPROTO_IP, IP_MULTICAST_LOOP, 0, 1); - - to disable loopback. Instead write: - - u_char loop; - setsockopt(socket, IPPROTO_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); - - and set loop to 1 to enable loopback or 0 to disable it. - - To know whether a socket is currently looping-back or not use - something like: - - u_char loop; - int size; - - getsockopt(socket, IPPROTO_IP, IP_MULTICAST_LOOP, &loop, &size) - - 6.2. IP_MULTICAST_TTL. 
- - If not otherwise specified, multicast datagrams are sent with a - default value of 1, to prevent them from being forwarded beyond the local - network. To change the TTL to the value you desire (from 0 to 255), - put that value into a variable (here I name it "ttl") and write - somewhere in your program: - - u_char ttl; - setsockopt(socket, IPPROTO_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); - - The behavior with getsockopt() is similar to the one seen on - IP_MULTICAST_LOOP. - - 6.3. IP_MULTICAST_IF. - - Usually, the system administrator specifies the default interface - multicast datagrams should be sent from. The programmer can override - this and choose a concrete outgoing interface for a given socket with - this option. - - struct in_addr interface_addr; - setsockopt (socket, IPPROTO_IP, IP_MULTICAST_IF, &interface_addr, sizeof(interface_addr)); - - From now on, all multicast traffic generated in this socket will be - output from the interface chosen. To revert to the original behavior - and let the kernel choose the outgoing interface based on the system - administrator's configuration, it is enough to call setsockopt() with - this same option and INADDR_ANY in the interface field. - - In determining or selecting outgoing interfaces, the following ioctls - might be useful: SIOCGIFADDR (to get an interface's address), - SIOCGIFCONF (to get the list of all the interfaces) and SIOCGIFFLAGS - (to get an interface's flags and, thus, determine whether the - interface is multicast capable or not -the IFF_MULTICAST flag-). - - If the host has more than one interface and the IP_MULTICAST_IF option - is not set, multicast transmissions are sent from the default - interface, although the remaining interfaces might be used for - multicast forwarding if the host is acting as a multicast router. - - 6.4. IP_ADD_MEMBERSHIP. - - Recall that you need to tell the kernel which multicast groups you are - interested in.
If no process is interested in a group, packets - destined to it that arrive to the host are discarded. In order to - inform the kernel of your interests and, thus, become a member of that - group, you should first fill an ip_mreq structure which is passed later - to the kernel in the optval field of the setsockopt() system call. - - The ip_mreq structure (taken from /usr/include/linux/in.h) has the - following members: - - - struct ip_mreq - { - struct in_addr imr_multiaddr; /* IP multicast address of group */ - struct in_addr imr_interface; /* local IP address of interface */ - }; - - - - (Note: the "physical" definition of the structure is in the file above - specified. Nonetheless, you should not include <linux/in.h> if you - want your code to be portable. Instead, include <netinet/in.h> which, - in turn, includes <linux/in.h> itself). - - The first member, imr_multiaddr, holds the group address you want to - join. Remember that memberships are also associated with interfaces, - not just groups. This is the reason you have to provide a value for - the second member: imr_interface. This way, if you are in a multihomed - host, you can join the same group in several interfaces. You can - always fill this last member with the wildcard address (INADDR_ANY) - and then the kernel will deal with the task of choosing the interface. - - With this structure filled (say you defined it as: struct ip_mreq - mreq;) you just have to call setsockopt() this way: - - - setsockopt (socket, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)); - - - - Notice that you can join several groups to the same socket, not just - one. The limit to this is IP_MAX_MEMBERSHIPS and, as of version - 2.0.33, it has the value of 20. - 6.5. IP_DROP_MEMBERSHIP. - - The process is quite similar to joining a group: - - - struct ip_mreq mreq; - setsockopt (socket, IPPROTO_IP, IP_DROP_MEMBERSHIP, &mreq, sizeof(mreq)); - - - - where mreq is the same structure with the same data used when joining - the group.
If the imr_interface member is filled with INADDR_ANY, the - first matching group is dropped. - - If you have joined a lot of groups to the same socket, you don't need - to drop memberships in all of them in order to terminate. When you - close a socket, all memberships associated with it are dropped by the - kernel. The same occurs if the process that opened the socket is - killed. - - Finally, keep in mind that a process dropping membership for a group - does not imply that the host will stop receiving datagrams for that - group. If another socket joined that group in that same interface - previously to this IP_DROP_MEMBERSHIP, the host will keep being a - member of that group. - - Both ADD_MEMBERSHIP and DROP_MEMBERSHIP are nonblocking operations. - They should return immediately indicating either success or failure. - - - - 7. The internals. - - This section's aim is to provide some information, not needed to reach - a basic understanding on how multicast works nor to be able to write - multicast programs, but which is very interesting, gives some insight - on the underlying multicast protocols and implementations, and may be - useful to avoid common errors and misunderstandings. - - - - 7.1. IGMP. - - When talking about IP_ADD_MEMBERSHIP and IP_DROP_MEMBERSHIP, we said - that the information provided by this "commands" was used by the - kernel to choose which multicast datagrams accept or discard. This is - true, but it is not all the truth. Such a simplification would imply - that multicast datagrams for all multicast groups around the world - would be received by our host, and then it would check the memberships - issued by processes running on it to decide whether to pass the - traffic to them or to throw it out. As you can imagine, this is a - complete bandwidth waste. 
- - What actually happens is that hosts instruct their routers telling - them which multicast groups they are interested in; then, those - routers tell their up-stream routers they want to receive that - traffic, and so on. Algorithms employed for making the decision of - when to ask for a group's traffic or saying that it is not desired - anymore, vary a lot. There's something, however, that never changes: - how this information is transmitted. IGMP is used for that. It stands - for Internet Group Management Protocol. It is a new protocol, similar - in many aspects to ICMP, with a protocol number of 2, whose messages - are carried in IP datagrams, and which all level 2-compliant hosts are - required to implement. - As said before, it is used both by hosts giving membership information - to their routers, and by routers to communicate between themselves. In - the following I'll cover only the hosts-routers relationships, mainly - because I was unable to find information describing router to router - communication other than the mrouted source code (rfc 1075 describing - the Distance Vector Multicast Routing Protocol is now obsoleted, and - mrouted implements a modified DVMRP not yet documented). - - IGMP version 0 is specified in RFC-988 which is now obsoleted. Almost - no one uses version 0 now. - - IGMP version 1 is described in RFC-1112 and, although it is updated by - RFC-2236 (IGMP version 2) it is in wide use still. The Linux kernel - implements the full IGMP version 1 and parts of version 2 - requirements, but not all. - - Now I'll try to give an informal description of the protocol. You can - check RFC-2236 for an in-proof formal description, with lots of state - diagrams and time-out boundaries.
- - All IGMP messages have the following structure: - - ______________________________________________________________________ - 0 1 2 3 - 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | Type | Max Resp Time | Checksum | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | Group Address | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - ______________________________________________________________________ - - - - IGMP version 1 (hereinafter IGMPv1) labels the "Max Resp Time" as - "Unused", zeroes it when sent, and ignores it when received. Also, it - breaks the "Type" field in two 4-bit wide fields: "Version" and - "Type". As IGMPv1 identifies a "Membership Query" message as 0x11 - (version 1, type 1) and IGMPv2 as 0x11 too, the 8 bits have the same - effective interpretation. - - I think it is more instructive to give first the IGMPv1 description - and next point out the IGMPv2 additions, as they are mainly that, - additions. - - For the following discussions it is important to remember that - multicast routers receive all IP multicast datagrams. - - - - 7.1.1. IGMP version 1. - - Routers periodically send IGMP Host Membership Queries to the all- - hosts group (224.0.0.1) with a TTL of 1 (once every minute or two). - All multicast-capable hosts hear them, but don't answer immediately to - avoid an IGMP Host Membership Report storm. Instead, they start a - random delay timer for each group they belong to on the interface they - received the query. - - Sooner or later, the timer expires in one of the hosts, and it sends - an IGMP Host Membership Report (also with TTL 1) to the multicast - address of the group being reported. As it is sent to the group, all - hosts that joined the group -and which are currently waiting for their - own timer to expire- receive it, too. Then, they stop their timers and - don't generate any other report.
Just one is generated -by the host - that chose the smaller timeout-, and that is enough for the router. It - only needs to know that there are members for that group in the - subnet, not how many nor which. - - When no reports are received for a given group after a certain number - of queries, the router assumes that no members are left, and thus it - doesn't have to forward traffic for that group on that subnet. Note - that in IGMPv1 there are no "Leave Group messages". - - When a host joins a new group, the kernel sends a report for that - group, so that the respective process needs not to wait a minute or - two until a new membership query is received. As you can see this IGMP - packet is generated by the kernel as a response to the - IP_ADD_MEMBERSHIP command, seen in section ``IP_ADD_MEMBERSHIP''. - Note the emphasis in the adjective "new": if a process issues an - IP_ADD_MEMBERSHIP command for a group the host is already a member of, - no IGMP packets are sent as we must already be receiving traffic for - that group; instead, a counter for that group's use is incremented. - IP_DROP_MEMBERSHIP generates no datagrams in IGMPv1. - - Host Membership Queries are identified by Type 0x11, and Host - Membership Reports by Type 0x12. - - No reports are sent for the all-hosts group. Membership in this group - is permanent. - - - - 7.1.2. IGMP version 2. - - One important addition to the above is the inclusion of a Leave Group - message (Type 0x17). The reason is to reduce the bandwidth waste - between the time the last host in the subnet drops membership and the - time the router times-out for its queries and decides there are no - more members present for that group (leave latency). 
Leave Group - messages should be addressed to the all-routers group (224.0.0.2) - rather than to the group being left, as that information is of no use - for other members (kernel versions up to 2.0.33 send them to the - group; although it does no harm to the hosts, it's a waste of time as - they have to process them, but don't gain useful information). There - are certain subtle details regarding when and when-not to send Leave - Messages; if interested, see the RFC. - - When an IGMPv2 router receives a Leave Message for a group, it sends - Group-Specific Queries to the group being left. This is another - addition. IGMPv1 has no group-specific queries. All queries are sent - to the all-hosts group. The Type in the IGMP header does not change - (0x11, as before), but the "Group Address" is filled with the address - of the multicast group being left. - - The "Max Resp Time" field, which was set to 0 in transmission and - ignored on reception in IGMPv1, is meaningful only in "Membership - Query" messages. It gives the maximum time allowed before sending a - report in units of 1/10 second. It is used as a tune mechanism. - - IGMPv2 adds another message type: 0x16. It is a "Version 2 Membership - Report" sent by IGMPv2 hosts if they detect an IGMPv2 router is - present (an IGMPv2 host knows an IGMPv1 router is present when it - receives a query with the "Max Response" field set to 0). - - When more than one router claims to act as querier, IGMPv2 provides a - mechanism to avoid "discussions": the router with the lowest IP - address is designated to be the querier. The other routers keep timeouts. If - the router with the lowest IP address crashes or is shutdown, the decision - of who will be the querier is taken again after the timers expire. - - - 7.2. Kernel corner. - - This sub-section gives some start-points to study the multicast - implementation of the Linux kernel. It does not explain that - implementation. It just says where to find things.
- - The study was carried over version 2.0.32, so it could be a bit - outdated by the time you read it (network code seems to have changed A - LOT in 2.1.x releases, for instance). - - Multicast code in the Linux kernel is always surrounded by #ifdef - CONFIG_IP_MULTICAST / #endif pairs, so that you can include/ exclude - it from your kernel based on your needs (this inclusion/exclusion is - done at compile time, as you probably know if reading that section... - #ifdefs are handled by the preprocessor. The decision is made based - in what you selected when doing either a make config, make menuconfig - or make xconfig). - - You might want multicast features, but if your Linux box is not going - to act as a multicast router you will probably not want multicast - router features included in your new kernel. For this you have the - multicast routing code surrounded by #ifdef CONFIG_IP_MROUTE / #endif - pairs. - - Kernel sources are usually placed in /usr/src/linux. However, the - place may change so, both for accuracy and brevity, I will refer to - the root directory of the kernel sources as just LINUX. Then, - something like LINUX/net/ipv4/udp.c should be the same as - /usr/src/linux/net/ipv4/udp.c if you unpacked the kernel sources in - the /usr/src/linux directory. - - All multicast interfaces with user programs shown in the section - devoted to multicast programming were driven across the setsockopt()/ - getsockopt() system calls. Both of them are implemented by means of - functions that make some tests to verify the parameters passed to them - and which, in turn, call another function that makes some additional - tests, demultiplexes the call based on the level parameter to either - system call, and then calls another function which... 
(if interested - in all this jumps, you can follow them in LINUX/net/socket.c - (functions sys_socketcall() and sys_setsockopt(), - LINUX/net/ipv4/af_inet.c (function inet_setsockopt()) and - LINUX/net/ipv4/ip_sockglue.c (function ip_setsockopt()) ). - - The one which interests us is LINUX/net/ipv4/ip_sockglue.c. Here we - find ip_setsockopt() and ip_getsockopt() which are mainly a switch - (after some error checking) verifying each possible value for optname. - Along with unicast options, all multicast ones seen here are handled: - IP_MULTICAST_TTL, IP_MULTICAST_LOOP, IP_MULTICAST_IF, - IP_ADD_MEMBERSHIP and IP_DROP_MEMBERSHIP. Previously to the switch, a - test is made to determine whether the options are multicast router - specific, and if so, they are routed to the ip_mroute_setsockopt() and - ip_mroute_getsockopt() functions (file LINUX/net/ipv4/ipmr.c). - - In LINUX/net/ipv4/af_inet.c we can see the default values we talked - about in previous sections (loopback enabled, TTL=1) provided when the - socket is created (taken from function inet_create() in this file): - - - - ______________________________________________________________________ - - #ifdef CONFIG_IP_MULTICAST - sk->ip_mc_loop=1; - sk->ip_mc_ttl=1; - *sk->ip_mc_name=0; - sk->ip_mc_list=NULL; - #endif - ______________________________________________________________________ - - - - Also, the assertion of "closing a socket makes the kernel drop all - memberships this socket had" is corroborated by: - - ______________________________________________________________________ - #ifdef CONFIG_IP_MULTICAST - /* Applications forget to leave groups before exiting */ - ip_mc_drop_socket(sk); - #endif - ______________________________________________________________________ - - - taken from inet_release(), on the same file as before. - - Device independent operations for the Link Layer are kept in - LINUX/net/core/dev_mcast.c. 
- - Two important functions are still missing: the processing of input and - output multicast datagrams. As any other datagrams, incoming datagrams - are passed from the device drivers to the ip_rcv() function - (LINUX/net/ipv4/ip_input.c). In this function is where the perfect - filtering is applied to multicast packets that crossed the devices - layer (recall that lower layers only perform best-effort filtering and - is IP who 100% knows whether we are interested in that multicast group - or not). If the host is acting as a multicast router, this function - decides too whether the datagram should be forwarded and calls - ipmr_forward() appropriately. (ipmr_forward() is implemented in - LINUX/net/ipv4/ipmr.c). - - Code in charge of out-putting packets is kept in - LINUX/net/ipv4/ip_output.c. Here is where the IP_MULTICAST_LOOP - option takes effect, as it is checked to see whether to loop back the - packets or not (function ip_queue_xmit()). Also the TTL of the - outgoing packet is selected based on whether it is a multicast or - unicast one. In the former case, the argument passed to the - IP_MULTICAST_TTL option is used (function (ip_build_xmit()). - - While working with mrouted (a program which gives the kernel - information about how to route multicast datagrams), we detected that - all multicast packets originated on the local network were properly - routed..., except the ones from the Linux box that was acting as the - multicast router!! ip_input.c was working OK, but it seemed - ip_output.c wasn't. Reading the source code for the output functions, - we found that outgoing datagrams were not being passed to - ipmr_forward(), the function that had to decide whether they should be - routed or not. The packets were outputed to the local network but, as - network cards are usually unable to read their own transmissions, - those datagrams were never routed. We added the necessary code to the - ip_build_xmit() function and everything was OK again. 
(Having the - sources for your kernel is not a luxury or pedantry; it's a need!) - - ipmr_forward() has been mentioned a couple of times. It is an - important function as it solves one important misunderstanding that - appears to be widely expanded. When routing multicast traffic, it is - not mrouted who makes the copies and sends them to the proper - recipients. mrouted receives all multicast traffic and, based on that - information, computes the multicast routing tables and tells the - kernel how to route: "datagrams for this group coming from that - interface should be forwarded to those interfaces". This information - is passed to the kernel by calls to setsockopt() on a raw socket - opened by the mrouted daemon (the protocol specified when the raw - socket was created must be IPPROTO_IGMP). This options are handled in - the ip_mroute_setsockopt() function from LINUX/net/ipv4/ipmr.c. The - first option (would be better to call them commands rather than - options) issued on that socket must be MRT_INIT. All other commands - are ignored (returning -EACCES) if MRT_INIT is not issued first. Only - one instance of mrouted can be running at the same time in the same - host. To keep track of this, when the first MRT_INIT is received, an - important variable, struct sock* mroute_socket, is pointed to the - socket MRT_INIT was received on. If mroute_socket is not null when - attending an MRT_INIT this means another mrouted is already running - and -EADDRINUSE is returned. All resting commands (MRT_DONE, - MRT_ADD_VIF, MRT_DEL_VIF, MRT_ADD_MFC, MRT_DEL_MFC and MRT_ASSERT) - return -EACCES if they come from a socket different than - mroute_socket. - - As routed multicast datagrams can be received/sent across either - physical interfaces or tunnels, a common abstraction for both was - devised: VIFs, Virtual InterFaces. 
mrouted passes vif structures to - the kernel, indicating physical or tunnel interfaces to add to its - routing tables, and multicast forwarding entries saying where to - forward datagrams. - - VIFs are added with MRT_ADD_VIF and deleted with MRT_DEL_VIF. Both - pass a struct vifctl to the kernel (defined in - /usr/include/linux/mroute.h) with the following information: - - ______________________________________________________________________ - struct vifctl { - vifi_t vifc_vifi; /* Index of VIF */ - unsigned char vifc_flags; /* VIFF_ flags */ - unsigned char vifc_threshold; /* ttl limit */ - unsigned int vifc_rate_limit; /* Rate limiter values (NI) */ - struct in_addr vifc_lcl_addr; /* Our address */ - struct in_addr vifc_rmt_addr; /* IPIP tunnel addr */ - }; - ______________________________________________________________________ - - - - With this information a vif_device structure is built: - - ______________________________________________________________________ - struct vif_device - { - struct device *dev; /* Device we are using */ - struct route *rt_cache; /* Tunnel route cache */ - unsigned long bytes_in,bytes_out; - unsigned long pkt_in,pkt_out; /* Statistics */ - unsigned long rate_limit; /* Traffic shaping (NI) */ - unsigned char threshold; /* TTL threshold */ - unsigned short flags; /* Control flags */ - unsigned long local,remote; /* Addresses(remote for tunnels)*/ - }; - ______________________________________________________________________ - - - - Note the dev entry in the structure. The device structure is defined - in /usr/include/linux/netdevice.h file. 
It is a big structure, but the - field that interests us is: - ______________________________________________________________________ - struct ip_mc_list* ip_mc_list; /* IP multicast filter chain */ - ______________________________________________________________________ - - - - The ip_mc_list structure -defined in /usr/include/linux/igmp.h- is as - follows: - - ______________________________________________________________________ - struct ip_mc_list - { - struct device *interface; - unsigned long multiaddr; - struct ip_mc_list *next; - struct timer_list timer; - short tm_running; - short reporter; - int users; - }; - ______________________________________________________________________ - - - - So, the ip_mc_list member from the dev structure is a pointer to a - linked list of ip_mc_list structures, each containing an entry for - each multicast group the network interface is a member of. Here again - we see membership is associated to interfaces. - LINUX/net/ipv4/ip_input.c traverses this linked list to decide whether - the received datagram is destined to any group the interface that - received the datagram belongs to: - - ______________________________________________________________________ - #ifdef CONFIG_IP_MULTICAST - if(!(dev->flags&IFF_ALLMULTI) && brd==IS_MULTICAST - && iph->daddr!=IGMP_ALL_HOSTS - && !(dev->flags&IFF_LOOPBACK)) - { - /* - * Check it is for one of our groups - */ - struct ip_mc_list *ip_mc=dev->ip_mc_list; - do - { - if(ip_mc==NULL) - { - kfree_skb(skb, FREE_WRITE); - return 0; - } - if(ip_mc->multiaddr==iph->daddr) - break; - ip_mc=ip_mc->next; - } - while(1); - } - #endif - ______________________________________________________________________ - - - - The users field in the ip_mc_list structure is used to implement what - was said in section ``IGMP version 1'': if a process joins a group and - the interface is already a member of that group (ie, another process - joined that same group in that same interface before) only the count - of 
members (users) is incremented. No IGMP messages are sent, as you - can see in the following code (taken from ip_mc_inc_group(), called by - ip_mc_join_group(), both in LINUX/net/ipv4/igmp.c): - - ______________________________________________________________________ - for(i=dev->ip_mc_list;i!=NULL;i=i->next) - { - if(i->multiaddr==addr) - { - i->users++; - return; - } - } - ______________________________________________________________________ - - - - When dropping memberships, the counter is decremented and additional - operations are performed only when the count reaches 0 - (ip_mc_dec_group()). - - MRT_ADD_MFC and MRT_DEL_MFC set or delete forwarding entries in the - multicast routing tables. Both pass a struct mfcctl to the kernel - (also defined in /usr/include/linux/mroute.h) with this information: - - ______________________________________________________________________ - struct mfcctl - { - struct in_addr mfcc_origin; /* Origin of mcast */ - struct in_addr mfcc_mcastgrp; /* Group in question */ - vifi_t mfcc_parent; /* Where it arrived */ - unsigned char mfcc_ttls[MAXVIFS]; /* Where it is going */ - }; - ______________________________________________________________________ - - - - With all this information in hand, ipmr_forward() "walks" across the - VIFs, and if a matching is found it duplicates the datagram and calls - ipmr_queue_xmit() which, in turn, uses the output device specified by - the routing table and the proper destination address if the packet is - to be sent across a tunnel (ie, the unicast destination address of the - other end of the tunnel). - - Function ip_rt_event() (not directly related to output, but which is - in ip_output.c too) receives events related to a network device, like - the device going up. This function assures that then the device joins - the ALL-HOSTS multicast group. - - IGMP functions are implemented in LINUX/net/ipv4/igmp.c. 
Important - information for those functions appears in /usr/include/linux/igmp.h - and /usr/include/linux/mroute.h.
- This is the usual practice, as it is impossible to do it with TCP. - However, intense research is taking place since a couple of years in - order to develop some new multicast transport protocols. - - Several of these protocols have been implemented and are being tested. - A good lesson from them is that it seems no multicast transport - protocol is general and good enough for all types of multicast - applications. - - If transport protocols are complex and difficult to tune, imagine - dealing with delays (in multimedia conferences), data loss, ordering, - retransmissions, flow and congestion control, group management, etc, - when the receiver is not one, but perhaps hundreds or thousands of - sparse hosts. Here scalability is an issue, and new techniches are - implemented, such as not giving acknowledges for every packet received - but, instead, send negative acknowledges (NACKs) for data not - received. RFC 1458 gives the proposed requirements for multicast - protocols. - - Giving descriptions of those multicast protocols is out of the scope - of this section. 
Instead, I'll give you the names of some of them and - point you to some sources of information: Real-Time Transport Protocol - (RTP) is concerned with multi-partite multimedia conferences, Scalable - Reliable Multicast (SRM) is used by the wb (the distributed White- - Board tool, see section ``Multicast applications''), Uniform Reliable - Group Communication Protocol (URGC) enforces reliable and ordered - transactions based in a centralized control, Muse was developed as an - application specific protocol: to multicast news articles over the - MBone, the Multicast File Transfer Protocol (MFTP) is quite - descriptive by itself and people "join" to file transmission - (previously announced) much in the same way they would join a - conference, Log-Based Receiver-reliable Multicast (LBRM) is a curious - protocol that keeps track of all packets sent in a logging server that - tells the sender whether it has to retransmit the data or can drop it - safely as all receivers got it. One protocol with a funny name - -especially for a multicast protocol- is STORM (STructure-Oriented - Resilient Multicast). Lots and lots of multicast protocols can be - found searching the Web, along with some interesting papers proposing - new activities for multicast (for instance, www page distribution - using multicast). - - A good page providing comparisons between reliable multicast protocols - is - - . - - A very good and up-to-date site, with lots of interesting links - (Internet drafts, RFCs, papers, links to other sites) is: - - . - - is also a good source of - information on the subject. - - Katia Obraczka's "Multicast Transport Protocols: A Survey and - Taxonomy" article gives short descriptions for each protocol and tries - to classify them according to different features. You can read it in - the IEEE Communications magazine, January 1998, vol. 36, No. 1. - - - - 10. References. - - 10.1. RFCs. - - - o RFC 1112 "Host Extensions for IP Multicasting". Steve Deering. - August 1989. 
- - o RFC 2236 "Internet Group Management Protocol, version 2". W. - Fenner. November 1997. - - o RFC 1458 "Requirements for Multicast Protocols". Braudes, R and - Zabele, S. May 1993. - - o RFC 1469 "IP Multicast over Token-Ring Local Area Networks". T. - Pusateri. June 1993. - - o RFC 1390 "Transmission of IP and ARP over FDDI Networks". D. Katz. - January 1993. - - o RFC 1583 "OSPF Version 2". John Moy. March 1994. - - o RFC 1584 "Multicast Extensions to OSPF". John Moy. March 1994. - - o RFC 1585 "MOSPF: Analysis and Experience". John Moy. March 1994. - - o RFC 1812 "Requirements for IP version 4 Routers". Fred Baker, - Editor. June 1995 - - o RFC 2117 "Protocol Independent Multicast-Sparse Mode (PIM-SM): - Protocol Specification". D. Estrin, D. Farinacci, A. Helmy, D. - Thaler; S. Deering, M. Handley, V. Jacobson, C. Liu, P. Sharma, and - L. Wei. July 1997. - - o RFC 2189 "Core Based Trees (CBT version 2) Multicast Routing". A. - Ballardie. September 1997. - - o RFC 2201 "Core Based Trees (CBT) Multicast Routing Architecture". - A. Ballardie. September 1997. - - - - 10.2. Internet Drafts. - - - o "Introduction to IP Multicast Routing". draft-ietf-mboned-intro- - multicast- 03.txt. T. Maufer, C. Semeria. July 1997. - - o "Administratively Scoped IP Multicast". draft-ietf-mboned-admin-ip- - space-03.txt. D. Meyer. June 10, 1997. - - 10.3. Web pages. - - - o Linux Multicast Homepage. - - - o Linux Multicast FAQ. - - o Multicast and MBONE on Linux. - - - o Christian Daudt's MBONE-Linux Page. - - - o Reliable Multicast Links - - - o Multicast Transport Protocols - - 10.4. Books. - - o "TCP/IP Illustrated: Volume 1 The Protocols". Stevens, W. Richard. - Addison Wesley Publishing Company, Reading MA, 1994 - - o "TCP/IP Illustrated: Volume 2, The Implementation". Wright, Gary - and W. Richard Stevens. Addison Wesley Publishing Company, Reading - MA, 1995 - - o "UNIX Network Programming Volume 1. Networking APIs: Sockets and - XTI". Stevens, W. Richard. 
Second Edition, Prentice Hall, Inc. - 1998. - - o "Internetworking with TCP/IP Volume 1 Principles, Protocols, and - Architecture". Comer, Douglas E. Second Edition, Prentice Hall, - Inc. Englewood Cliffs, New Jersey, 1991 - - diff --git a/LDP/guide/docbook/Linux-Networking/NIS.xml b/LDP/guide/docbook/Linux-Networking/NIS.xml deleted file mode 100644 index 5783391c..00000000 --- a/LDP/guide/docbook/Linux-Networking/NIS.xml +++ /dev/null @@ -1,1618 +0,0 @@ - - -NIS - - -The Network Information Service (NIS) provides a simple network lookup -service consisting of databases and processes. Its purpose is to -provide information that has to be known throughout the network to all -machines on the network. For example, it enables an administrator to -allow users access to any machine in a network running NIS without a -password entry existing on each machine; only the main database needs -to be maintained. This section describes how to configure Linux as -NIS(YP) or NIS+ client and how to install an NIS(YP) server. -Don't forget to read Section 5. - - ------------------------------------------------------------------------------ -2.2. Some General Information - - -The next four lines are quoted from the Sun(tm) System & Network -Administration Manual: - - - - -+---------------------------------------------------------------------------+ -| "NIS was formerly known as Sun Yellow Pages (YP) but | -| the name Yellow Pages(tm) is a registered trademark | -| in the United Kingdom of British Telecom plc and may | -| not be used without permission." | -+---------------------------------------------------------------------------+ - - - - -NIS stands for Network Information Service. Its purpose is to provide -information, that has to be known throughout the network, to all machines on -the network. 
Information likely to be distributed by NIS is: - - - -  * login names/passwords/home directories (/etc/passwd) -  * group information (/etc/group) - - - -If, for example, your password entry is recorded in the NIS passwd database, -you will be able to login on all machines on the network which have the NIS -client programs running. - - - -Sun is a trademark of Sun Microsystems, Inc. licensed to SunSoft, Inc. - ------------------------------------------------------------------------------ - -3. NIS, NYS or NIS+ ? - -3.1. libc 4/5 with traditional NIS or NYS ? - - -The choice between "traditional NIS" or the NIS code in the NYS library is a -choice between laziness and maturity vs. flexibility and love of adventure. -The "traditional NIS" code is in the standard C library and has been around -longer and sometimes suffers from its age and slight inflexibility. -The NIS code in the NYS library requires you to recompile the libc library to -include the NYS code into it (or maybe you can get a precompiled version of -libc from someone who has already done it). -Another difference is that the traditional NIS code has some support for NIS -Netgroups, which the NYS code doesn't. On the other hand the NYS code allows -you to handle Shadow Passwords in a transparent way. The "traditonal NIS" -code doesn't support Shadow Passwords over NIS. - - ------------------------------------------------------------------------------ - -3.2. glibc 2 and NIS/NIS+ - - -Forgot all this if you use the new GNU C Library 2.x (aka libc6). It has real -NSS (name switch service) support, which makes it very flexible, and contains -support for the following NIS/NIS+ maps: aliases, ethers, group, hosts, -netgroups, networks, protocols, publickey, passwd, rpc, services and shadow. -The GNU C Library has no problems with shadow passwords over NIS. - - ------------------------------------------------------------------------------ - -3.3. NIS or NIS+ ? 
- - -The choice between NIS and NIS+ is easy - use NIS+ only if you have severe -security needs. NIS+ is much more problematic to administer (it's pretty easy -to handle on the client side, but the server side is horrible). Another -problem is that the support for NIS+ under Linux contains a lot of bugs and -that the development has stopped. - - ------------------------------------------------------------------------------ - -4. How it works - -4.1. How NIS works - - -Within a network there must be at least one machine acting as a NIS server. -You can have multiple NIS servers, each serving different NIS "domains" - or -you can have cooperating NIS servers, where one is the master NIS server, and -all the other are so-called slave NIS servers (for a certain NIS "domain", -that is!) - or you can have a mix of them... - - - -Slave servers only have copies of the NIS databases and receive these copies -from the master NIS server whenever changes are made to the master's -databases. Depending on the number of machines in your network and the -reliability of your network, you might decide to install one or more slave -servers. Whenever a NIS server goes down or is too slow in responding to -requests, a NIS client connected to that server will try to find one that is -up or faster. - - - -NIS databases are in so-called DBM format, derived from ASCII databases. For -example, the files /etc/passwd and /etc/group can be directly converted to -DBM format using ASCII-to-DBM translation software (makedbm, included with -the server software). The master NIS server should have both, the ASCII -databases and the DBM databases. - - - -Slave servers will be notified of any change to the NIS maps, (via the yppush -program), and automatically retrieve the necessary changes in order to -synchronize their databases. NIS clients do not need to do this since they -always talk to the NIS server to read the information stored in it's DBM -databases. 
- - - -Old ypbind versions do a broadcast to find a running NIS server. This is -insecure, due the fact that anyone may install a NIS server and answer the -broadcast queries. Newer Versions of ypbind (ypbind-3.3 or ypbind-mt) are -able to get the server from a configuration file - thus no need to broadcast. - - ------------------------------------------------------------------------------ - -4.2. How NIS+ works - - -NIS+ is a new version of the network information nameservice from Sun. The -biggest difference between NIS and NIS+ is that NIS+ has support for data -encryption and authentication over secure RPC. - - - -The naming model of NIS+ is based upon a tree structure. Each node in the -tree corresponds to an NIS+ object, from which we have six types: directory, -entry, group, link, table and private. - - - -The NIS+ directory that forms the root of the NIS+ namespace is called the -root directory. There are two special NIS+ directories: org_dir and -groups_dir. The org_dir directory consists of all administration tables, such -as passwd, hosts, and mail_aliases. The groups_dir directory consists of NIS+ -group objects which are used for access control. The collection of org_dir, -groups_dir and their parent directory is referred to as an NIS+ domain. - - ------------------------------------------------------------------------------ - -5. The RPC Portmapper - - -To run any of the software mentioned below you will need to run the program / -sbin/portmap. Some Linux distributions already have the code in the /sbin/ -init.d/ or /etc/rc.d/ files to start up this daemon. All you have to do is to -activate it and reboot your Linux machine. Read your Linux Distribution -Documentation how to do this. - - - -The RPC portmapper (portmap(8)) is a server that converts RPC program numbers -into TCP/IP (or UDP/IP) protocol port numbers. 
It must be running in order to -make RPC calls (which is what the NIS/NIS+ client software does) to RPC -servers (like a NIS or NIS+ server) on that machine. When an RPC server is -started, it will tell portmap what port number it is listening to, and what -RPC program numbers it is prepared to serve. When a client wishes to make an -RPC call to a given program number, it will first contact portmap on the -server machine to determine the port number where RPC packets should be sent. - - - -Since RPC servers could be started by inetd(8), portmap should be running -before inetd is started. - - - -For secure RPC, the portmapper needs the Time service. Make sure, that the -Time service is enabled in /etc/inetd.conf on all hosts: - - - - -+---------------------------------------------------------------------------+ -|# | -|# Time service is used for clock syncronization. | -|# | -|time stream tcp nowait root internal | -|time dgram udp wait root internal | -+---------------------------------------------------------------------------+ - - - -IMPORTANT: Don't forget to restart inetd after changes on its configuration -file ! - - ------------------------------------------------------------------------------ - -6. What do you need to set up NIS? - -6.1. Determine whether you are a Server, Slave or Client. - - -To answer this question you have to consider two cases: - - - - 1. Your machine is going to be part of a network with existing NIS servers - 2. You do not have any NIS servers in the network yet - - - -In the first case, you only need the client programs (ypbind, ypwhich, ypcat, -yppoll, ypmatch). The most important program is ypbind. This program must be -running at all times, which means, it should always appear in the list of -processes. It is a daemon process and needs to be started from the system's -startup file (eg. /etc/init.d/nis, /sbin/init.d/ypclient, /etc/rc.d/init.d/ -ypbind, /etc/rc.local). 
As soon as ypbind is running your system has become a -NIS client. - - - -In the second case, if you don't have NIS servers, then you will also need a -NIS server program (usually called ypserv). Section 9 describes how to set up -a NIS server on your Linux machine using the ypserv daemon. - - ------------------------------------------------------------------------------ - -6.2. The Software - - -The system library "/usr/lib/libc.a" (version 4.4.2 and better) or the shared -library "/lib/libc.so.x" contain all necessary system calls to succesfully -compile the NIS client and server software. For the GNU C Library 2 (glibc -2.x), you also need /lib/libnsl.so.1. - - - -Some people reported that NIS only works with "/usr/lib/libc.a" version -4.5.21 and better so if you want to play it safe don't use older libc's. The -NIS client software can be obtained from: - - - - -+----------------------------------------------------------------------------------+ -| Site Directory File Name | -| | -| ftp.kernel.org /pub/linux/utils/net/NIS yp-tools-2.8.tar.gz | -| ftp.kernel.org /pub/linux/utils/net/NIS ypbind-mt-1.13.tar.gz | -| ftp.kernel.org /pub/linux/utils/net/NIS ypbind-3.3.tar.gz | -| ftp.kernel.org /pub/linux/utils/net/NIS ypbind-3.3-glibc5.diff.gz| -+----------------------------------------------------------------------------------+ - - - - -Once you obtained the software, please follow the instructions which come -with the software. yp-clients 2.2 are for use with libc4 and libc5 until -5.4.20. libc 5.4.21 and glibc 2.x needs yp-tools 1.4.1 or later. The new -yp-tools 2.4 should work with every Linux libc. Since there was a bug in the -NIS code, you shouldn't use libc 5.4.21-5.4.35. Use libc 5.4.36 or later -instead, or the most YP programs will not work. ypbind 3.3 will work with all -libraries, too. If you use gcc 2.8.x or greater, egcs or glibc 2.x, you -should add the ypbind-3.3-glibc5.diff patch to ypbind 3.3. 
If possible you -should avoid the use of ypbind 3.3 for security reasons. ypbind-mt is a new, -multithreaded daemon. It needs a Linux 2.2 kernel and glibc 2.1 or later. - - ------------------------------------------------------------------------------ - -7. Setting Up the NIS Client - -7.1. The ypbind daemon - - -After you have succesfully compiled the software you are now ready to install -it. A suitable place for the ypbind daemon is the directory /usr/sbin. Some -people may tell you that you don't need ypbind on a system with NYS. This is -wrong. ypwhich and ypcat need it always. - - - -You must do this as root of course. The other binaries (ypwhich, ypcat, -yppasswd, yppoll, ypmatch) should go in a directory accessible by all users, -normally /usr/bin. - - - -Newer ypbind versions have a configuration file called /etc/yp.conf. You can -hardcode a NIS server there - for more info see the manual page for ypbind -(8). You also need this file for NYS. An example: - - - - -+---------------------------------------------------------------------------+ -|ypserver 10.10.0.1 | -|ypserver 10.0.100.8 | -|ypserver 10.3.1.1 | -+---------------------------------------------------------------------------+ - - - - -If the system can resolve the hostnames without NIS, you may use the name, -otherwise you have to use the IP address. ypbind 3.3 has a bug and will only -use the last entry (ypserver 10.3.1.1 in the example). All other entries are -ignored. ypbind-mt handle this correct and uses that one, which answerd at -first. - - - -It might be a good idea to test ypbind before incorporating it in the startup -files. To test ypbind do the following: - - -  * Make sure you have your YP-domain name set. 
If it is not set then issue - the command: - - +---------------------------------------------------------------+ - | /bin/domainname nis.domain | - +---------------------------------------------------------------+ - - where nis.domain should be some string _NOT_ normally associated with the - DNS-domain name of your machine! The reason for this is that it makes it - a little harder for external crackers to retreive the password database - from your NIS servers. If you don't know what the NIS domain name is on - your network, ask your system/network administrator. - -  * Start up "/sbin/portmap" if it is not already running. - -  * Create the directory /var/yp if it does not exist. - -  * Start up /usr/sbin/ypbind - -  * Use the command rpcinfo -p localhost to check if ypbind was able to - register its service with the portmapper. The output should look like: - +---------------------------------------------------------------+ - | program vers proto port | - | 100000 2 tcp 111 portmapper | - | 100000 2 udp 111 portmapper | - | 100007 2 udp 637 ypbind | - | 100007 2 tcp 639 ypbind | - +---------------------------------------------------------------+ - or - +---------------------------------------------------------------+ - | program vers proto port | - | 100000 2 tcp 111 portmapper | - | 100000 2 udp 111 portmapper | - | 100007 2 udp 758 ypbind | - | 100007 1 udp 758 ypbind | - | 100007 2 tcp 761 ypbind | - | 100007 1 tcp 761 ypbind | - +---------------------------------------------------------------+ - Depending on the ypbind version you are using. - -  * You may also run rpcinfo -u localhost ypbind. 
This command should produce - something like: - +---------------------------------------------------------------+ - | program 100007 version 2 ready and waiting | - +---------------------------------------------------------------+ - or - +---------------------------------------------------------------+ - | program 100007 version 1 ready and waiting | - | program 100007 version 2 ready and waiting | - +---------------------------------------------------------------+ - The output depends on the ypbind version you have installed. Important is - only the "version 2" message. - -At this point you should be able to use NIS client programs like ypcat, -etc... For example, ypcat passwd.byname will give you the entire NIS password -database. - -IMPORTANT: If you skipped the test procedure then make sure you have set the -domain name, and created the directory - -+---------------------------------------------------------------------------+ -| /var/yp | -+---------------------------------------------------------------------------+ - - -This directory MUST exist for ypbind to start up succesfully. - - - -To check if the domainname is set correct, use the /bin/ypdomainname from -yp-tools 2.2. It uses the yp_get_default_domain() function which is more -restrict. It doesn't allow for example the "(none)" domainname, which is the -default under Linux and makes a lot of problems. - - - -If the test worked you may now want to change your startupd files so that -ypbind will be started at boot time and your system will act as a NIS client. -Make sure that the domainname will be set before you start ypbind. - - - -Well, that's it. Reboot the machine and watch the boot messages to see if -ypbind is actually started. - - ------------------------------------------------------------------------------ - -7.2. Setting up a NIS Client using Traditional NIS - - -For host lookups you must set (or add) "nis" to the lookup order line in your -/etc/host.conf file. 
Please read the manpage "resolv+.8" for more details. - - - -Add the following line to /etc/passwd on your NIS clients: - - - - -+---------------------------------------------------------------------------+ -|+:::::: | -+---------------------------------------------------------------------------+ - - - - -You can also use the + and - characters to include/exclude or change users. -If you want to exclude the user guest just add -guest to your /etc/passwd -file. You want to use a different shell (e.g. ksh) for the user "linux"? No -problem, just add "+linux::::::/bin/ksh" (without the quotes) to your /etc/ -passwd. Fields that you don't want to change have to be left empty. You could -also use Netgroups for user control. - - - -For example, to allow login-access only to miquels, dth and ed, and all -members of the sysadmin netgroup, but to have the account data of all other -users available use: - - - - -+---------------------------------------------------------------------------+ -| +miquels::::::: | -| +ed::::::: | -| +dth::::::: | -| +@sysadmins::::::: | -| -ftp | -| +:*::::::/etc/NoShell | -+---------------------------------------------------------------------------+ - - - - -Note that in Linux you can also override the password field, as we did in -this example. We also remove the login "ftp", so it isn't known any longer, -and anonymous ftp will not work. - - - -The netgroup would look like - - - - -+---------------------------------------------------------------------------+ -|sysadmins (-,software,) (-,kukuk,) | -+---------------------------------------------------------------------------+ - - - - -IMPORTANT: The netgroup feature is implemented starting from libc 4.5.26. If -you have a version of libc earlier than 4.5.26, every user in the NIS -password database can access your linux machine if you run "ypbind" ! - - ------------------------------------------------------------------------------ - -7.3. 
Setting up a NIS Client using NYS - - -All that is required is that the NIS configuration file (/etc/yp.conf) points -to the correct server(s) for its information. Also, the Name Services Switch -configuration file (/etc/nsswitch.conf) must be correctly set up. - - - -You should install ypbind. It isn't needed by the libc, but the NIS(YP) tools -need it. - - - -If you wish to use the include/exclude user feature (+/-guest/+@admins), you -have to use "passwd: compat" and "group: compat" in nsswitch.conf. Note that -there is no "shadow: compat"! You have to use "shadow: files nis" in this -case. - - - -The NYS sources are part of the libc 5 sources. When you run configure, say the -first time "NO" to the "Values correct" question, then say "YES" to "Build a -NYS libc from nys". - - ------------------------------------------------------------------------------ - -7.4. Setting up a NIS Client using glibc 2.x - - -The glibc uses "traditional NIS", so you need to start ypbind. The Name -Services Switch configuration file (/etc/nsswitch.conf) must be correctly set -up. If you use the compat mode for passwd, shadow or group, you have to add -the "+" at the end of these files and you can use the include/exclude user -feature. The configuration is exactly the same as under Solaris 2.x. - - ------------------------------------------------------------------------------ - -7.5. The nsswitch.conf File - - -The Network Services switch file /etc/nsswitch.conf determines the order of -lookups performed when a certain piece of information is requested, just like -the /etc/host.conf file which determines the way host lookups are performed.
-For example, the line - - - - -+---------------------------------------------------------------------------+ -| hosts: files nis dns | -+---------------------------------------------------------------------------+ - - - - -specifies that host lookup functions should first look in the local /etc/ -hosts file, followed by a NIS lookup and finally through the domain name -service (/etc/resolv.conf and named), at which point if no match is found an -error is returned. This file must be readable for every user! You can find -more information in the man-page nsswitch.5 or nsswitch.conf.5. - - - -A good /etc/nsswitch.conf file for NIS is: - - - - -+---------------------------------------------------------------------------+ -|# | -|# /etc/nsswitch.conf | -|# | -|# An example Name Service Switch config file. This file should be | -|# sorted with the most-used services at the beginning. | -|# | -|# The entry '[NOTFOUND=return]' means that the search for an | -|# entry should stop if the search in the previous entry turned | -|# up nothing. Note that if the search failed due to some other reason | -|# (like no NIS server responding) then the search continues with the | -|# next entry. 
| -|# | -|# Legal entries are: | -|# | -|# nisplus Use NIS+ (NIS version 3) | -|# nis Use NIS (NIS version 2), also called YP | -|# dns Use DNS (Domain Name Service) | -|# files Use the local files | -|# db Use the /var/db databases | -|# [NOTFOUND=return] Stop searching if not found so far | -|# | -| | -|passwd: compat | -|group: compat | -|# For libc5, you must use shadow: files nis | -|shadow: compat | -| | -|passwd_compat: nis | -|group_compat: nis | -|shadow_compat: nis | -| | -|hosts: nis files dns | -| | -|services: nis [NOTFOUND=return] files | -|networks: nis [NOTFOUND=return] files | -|protocols: nis [NOTFOUND=return] files | -|rpc: nis [NOTFOUND=return] files | -|ethers: nis [NOTFOUND=return] files | -|netmasks: nis [NOTFOUND=return] files | -|netgroup: nis | -|bootparams: nis [NOTFOUND=return] files | -|publickey: nis [NOTFOUND=return] files | -|automount: files | -|aliases: nis [NOTFOUND=return] files | -+---------------------------------------------------------------------------+ - - - - -passwd_compat, group_compat and shadow_compat are only supported by glibc -2.x. If there are no shadow rules in /etc/nsswitch.conf, glibc will use the -passwd rule for lookups. There are some more lookup module for glibc like -hesoid. For more information, read the glibc documentation. - - ------------------------------------------------------------------------------ - -7.6. Shadow Passwords with NIS - - -Shadow passwords over NIS are always a bad idea. You loose the security, -which shadow gives you, and it is supported by only some few Linux C -Libraries. A good way to avoid shadow passwords over NIS is, to put only the -local system users in /etc/shadow. Remove the NIS user entries from the -shadow database, and put the password back in passwd. So you can use shadow -for the root login, and normal passwd for NIS user. This has the advantage -that it will work with every NIS client. 
- - ------------------------------------------------------------------------------ - -7.6.1. Linux - - -The only Linux libc which supports shadow passwords over NIS, is the GNU C -Library 2.x. Linux libc5 has no support for it. Linux libc5 compiled with NYS -enabled has some code for it. But this code is badly broken in some cases and -doesn't work with all correct shadow entries. - - ------------------------------------------------------------------------------ - -7.6.2. Solaris - - -Solaris does not support shadow passwords over NIS. - - ------------------------------------------------------------------------------ - -7.6.3. PAM - - -Linux-PAM 0.75 and newer does support Shadow passwords over NIS if you use the -pam_unix.so Module or if you install the extra pam_unix2.so Module. Old -systems using pam_pwdb/libpwdb (for example Red Hat Linux 5.x) need to change -the /etc/pam.d/* entries. All pam_pwdb rules should be replaced by a -pam_unix_* module. - - - -An example /etc/pam.d/login file looks like: - - - - -+----------------------------------------------------------------------------------+ -|#%PAM-1.0 | -|auth requisite pam_unix2.so nullok #set_secrpc | -|auth required pam_securetty.so | -|auth required pam_nologin.so | -|auth required pam_env.so | -|auth required pam_mail.so | -|account required pam_unix2.so | -|password required pam_pwcheck.so nullok | -|password required pam_unix2.so nullok use_first_pass use_authtok | -|session required pam_unix2.so none # debug or trace | -|session required pam_limits.so | -+----------------------------------------------------------------------------------+ - - - ------------------------------------------------------------------------------ - -8. What do you need to set up NIS+ ? - -8.1. The Software - - -The Linux NIS+ client code was developed for the GNU C library 2.
There is -also a port for Linux libc5, since most commercial Applications were linked -against this library in the past, and you cannot recompile them for using -glibc. There are problems with libc5 and NIS+: static programs cannot be -linked with it, and programs compiled with this library will not work with -other libc5 versions. -As base System you need a glibc based Distribution like Debian, Red Hat Linux -or SuSE Linux. If you have a Linux Distribution, which does not have glibc -2.1.1 or later, you need to update to a newer version. - - - -The NIS+ client software can be obtained from: - - - - -+---------------------------------------------------------------------------------+ -| Site Directory File Name | -| | -| ftp.gnu.org /pub/gnu/glibc glibc-2.3.2.tar.gz, | -| glibc-linuxthreads-2.3.2.tar.gz | -| ftp.kernel.org /pub/linux/utils/net/NIS+ nis-utils-1.4.1.tar.gz | -+---------------------------------------------------------------------------------+ - - - - -You should also have a look at [http://www.linux-nis.org/nisplus/] http:// -www.linux-nis.org/nisplus/ for more information and the latest sources. - - ------------------------------------------------------------------------------ - -8.2. Setting up a NIS+ client - - -IMPORTANT: For setting up a NIS+ client read your Solaris NIS+ docs what to -do on the server side! This document only describes what to do on the client -side! - - - -After installing the new libc and nis-tools, create the credentials for the -new client on the NIS+ server. Make sure portmap is running. Then check if -your Linux PC has the same time as the NIS+ Server. For secure RPC, you have -only a small window of about 3 minutes, in which the credentials are valid. -A good idea is to run xntpd on every host. After this, run - - - - -+---------------------------------------------------------------------------+ -|domainname nisplus.domain.
| -|nisinit -c -H | -+---------------------------------------------------------------------------+ - - - - -to initialize the cold start file. Read the nisinit man page for more -options. Make sure that the domainname will always be set after a reboot. If -you don't know what the NIS+ domain name is on your network, ask your system/ -network administrator. - - - -Now you should change your /etc/nsswitch.conf file. Make sure that the only -service after publickey is nisplus ("publickey: nisplus"), and nothing else! - - - -Then start keyserv and make sure, that it will always be started as first -daemon after portmap at boot time. Run - - - - -+---------------------------------------------------------------------------+ -|keylogin -r | -+---------------------------------------------------------------------------+ - - - - -to store the root secretkey on your system. (I hope you have added the -publickey for the new host on the NIS+ Server?). - - - -niscat passwd.org_dir should now show you all entries in the passwd database. - - ------------------------------------------------------------------------------ - -8.3. NIS+, keylogin, login and PAM - - -When the user logs in, he need to set his secretkey to keyserv. This is done -by calling "keylogin". The login from the shadow package will do this for the -user, if it was compiled against glibc 2.1. For a PAM aware login, you have -to change the /etc/pam.d/login file to use pam_unix2, not pwdb, which doesn't -support NIS+. 
An example: - - - - -+---------------------------------------------------------------------------+ -|#%PAM-1.0 | -|auth required /lib/security/pam_securetty.so | -|auth required /lib/security/pam_unix2.so set_secrpc | -|auth required /lib/security/pam_nologin.so | -|account required /lib/security/pam_unix2.so | -|password required /lib/security/pam_unix2.so | -|session required /lib/security/pam_unix2.so | -+---------------------------------------------------------------------------+ - - - ------------------------------------------------------------------------------ - -8.4. The nsswitch.conf File - - -The Network Services switch file /etc/nsswitch.conf determines the order of -lookups performed when a certain piece of information is requested, just like -the /etc/host.conf file which determines the way host lookups are performed. -For example, the line - - - - -+---------------------------------------------------------------------------+ -| hosts: files nisplus dns | -+---------------------------------------------------------------------------+ - - - - -specifies that host lookup functions should first look in the local /etc/ -hosts file, followed by a NIS+ lookup and finally through the domain name -service (/etc/resolv.conf and named), at which point if no match is found an -error is returned. - - - -A good /etc/nsswitch.conf file for NIS+ is: - - - - -+---------------------------------------------------------------------------+ -|# | -|# /etc/nsswitch.conf | -|# | -|# An example Name Service Switch config file. This file should be | -|# sorted with the most-used services at the beginning. | -|# | -|# The entry '[NOTFOUND=return]' means that the search for an | -|# entry should stop if the search in the previous entry turned | -|# up nothing. Note that if the search failed due to some other reason | -|# (like no NIS server responding) then the search continues with the | -|# next entry. 
| -|# | -|# Legal entries are: | -|# | -|# nisplus Use NIS+ (NIS version 3) | -|# nis Use NIS (NIS version 2), also called YP | -|# dns Use DNS (Domain Name Service) | -|# files Use the local files | -|# db Use the /var/db databases | -|# [NOTFOUND=return] Stop searching if not found so far | -|# | -| | -|passwd: compat | -|group: compat | -|shadow: compat | -| | -|passwd_compat: nisplus | -|group_compat: nisplus | -|shadow_compat: nisplus | -| | -|hosts: nisplus files dns | -| | -|services: nisplus [NOTFOUND=return] files | -|networks: nisplus [NOTFOUND=return] files | -|protocols: nisplus [NOTFOUND=return] files | -|rpc: nisplus [NOTFOUND=return] files | -|ethers: nisplus [NOTFOUND=return] files | -|netmasks: nisplus [NOTFOUND=return] files | -|netgroup: nisplus | -|bootparams: nisplus [NOTFOUND=return] files | -|publickey: nisplus | -|automount: files | -|aliases: nisplus [NOTFOUND=return] files | -+---------------------------------------------------------------------------+ - - - ------------------------------------------------------------------------------ - -9. Setting up a NIS Server - -9.1. The Server Program ypserv - - -This document only describes how to set up the "ypserv" NIS server. - - - -The NIS server software can be found on: - - - - -+---------------------------------------------------------------------------+ -| Site Directory File Name | -| | -| ftp.kernel.org /pub/linux/utils/net/NIS ypserv-2.9.tar.gz | -| ftp.kernel.org /pub/linux/utils/net/NIS ypserv-2.9.tar.bz2 | -+---------------------------------------------------------------------------+ - - - - -You could also look at [http://www.linux-nis.org/nis/] http:// -www.linux-nis.org/nis/ for more information. - - - -The server setup is the same for both traditional NIS and NYS. - - - -Compile the software to generate the ypserv and makedbm programs. ypserv-2.x -only supports the securenets file for access restrictions. 
- - - -If you run your server as master, determine what files you require to be -available via NIS and then add or remove the appropriate entries to the "all" -rule in /var/yp/Makefile. You always should look at the Makefile and edit the -Options at the beginning of the file. - - - -There was one big change between ypserv 1.1 and ypserv 1.2. Since version -1.2, the file handles are cached. This means you have to call makedbm always -with the -c option if you create new maps. Make sure, you are using the new / -var/yp/Makefile from ypserv 1.2 or later, or add the -c flag to makedbm in -the Makefile. If you don't do that, ypserv will continue to use the old maps, -and not the updated one. - - - -Now edit /var/yp/securenets and /etc/ypserv.conf. For more information, read -the ypserv(8) and ypserv.conf(5) manual pages. - - - -Make sure the portmapper (portmap(8)) is running, and start the server ypserv -. The command - - - - -+---------------------------------------------------------------------------+ -| % rpcinfo -u localhost ypserv | -+---------------------------------------------------------------------------+ - - - - -should output something like - - - - -+---------------------------------------------------------------------------+ -| program 100004 version 1 ready and waiting | -| program 100004 version 2 ready and waiting | -+---------------------------------------------------------------------------+ - - - - -The "version 1" line could be missing, depending on the ypserv version and -configuration you are using. It is only necessary if you have old SunOS 4.x -clients. - - - -Now generate the NIS (YP) database. On the master, run - - - - -+---------------------------------------------------------------------------+ -| % /usr/lib/yp/ypinit -m | -+---------------------------------------------------------------------------+ - - - - -On a slave make sure that ypwhich -m works. 
This means, that your slave must -be configured as NIS client before you could run - - - - -+---------------------------------------------------------------------------+ -| % /usr/lib/yp/ypinit -s masterhost | -+---------------------------------------------------------------------------+ - - - -to install the host as NIS slave. - -That's it, your server is up and running. - -If you have bigger problems, you could start ypserv and ypbind in debug mode -on different xterms. The debug output should show you what goes wrong. - -If you need to update a map, run make in the /var/yp directory on the NIS -master. This will update a map if the source file is newer, and push the -files to the slave servers. Please don't use ypinit for updating a map. - -You might want to edit root's crontab *on the slave* server and add the -following lines: - - - -+---------------------------------------------------------------------------+ -| 20 * * * * /usr/lib/yp/ypxfr_1perhour | -| 40 6 * * * /usr/lib/yp/ypxfr_1perday | -| 55 6,18 * * * /usr/lib/yp/ypxfr_2perday | -+---------------------------------------------------------------------------+ - - - -This will ensure that most NIS maps are kept up-to-date, even if an update is -missed because the slave was down at the time the update was done on the -master. - -You can add a slave at every time later. At first, make sure that the new -slave server has permissions to contact the NIS master. Then run - - - -+---------------------------------------------------------------------------+ -| % /usr/lib/yp/ypinit -s masterhost | -+---------------------------------------------------------------------------+ - - - -on the new slave. On the master server, add the new slave server name to /var -/yp/ypservers and run make in /var/yp to update the map. 
- -If you want to restrict access for users to your NIS server, you'll have to -setup the NIS server as a client as well by running ypbind and adding the -plus-entries to /etc/passwd _halfway_ the password file. The library -functions will ignore all normal entries after the first NIS entry, and will -get the rest of the info through NIS. This way the NIS access rules are -maintained. An example: - - - -+-------------------------------------------------------------------------------+ -| root:x:0:0:root:/root:/bin/bash | -| daemon:*:1:1:daemon:/usr/sbin: | -| bin:*:2:2:bin:/bin: | -| sys:*:3:3:sys:/dev: | -| sync:*:4:100:sync:/bin:/bin/sync | -| games:*:5:100:games:/usr/games: | -| man:*:6:100:man:/var/catman: | -| lp:*:7:7:lp:/var/spool/lpd: | -| mail:*:8:8:mail:/var/spool/mail: | -| news:*:9:9:news:/var/spool/news: | -| uucp:*:10:50:uucp:/var/spool/uucp: | -| nobody:*:65534:65534:noone at all,,,,:/dev/null: | -| +miquels:::::: | -| +:*:::::/etc/NoShell | -| [ All normal users AFTER this line! ] | -| tester:*:299:10:Just a test account:/tmp: | -| miquels:1234567890123:101:10:Miquel van Smoorenburg:/home/miquels:/bin/zsh| -+-------------------------------------------------------------------------------+ - - - -Thus the user "tester" will exist, but have a shell of /etc/NoShell. miquels -will have normal access. - -Alternatively, you could edit the /var/yp/Makefile file and set NIS to use -another source password file. On large systems the NIS password and group -files are usually stored in /etc/yp/. If you do this the normal tools to -administrate the password file such as passwd, chfn, adduser will not work -anymore and you need special homemade tools for this. - -However, yppasswd, ypchsh and ypchfn will work of course. ------------------------------------------------------------------------------ - -9.2. The Server Program yps - -To set up the "yps" NIS server please refer to the previous paragraph. 
The -"yps" server setup is similar, _but_ not exactly the same so beware if you -try to apply the "ypserv" instructions to "yps"! "yps" is not supported by -any author, and contains some security leaks. You really shouldn't use it ! - -The "yps" NIS server software can be found on: - - - -+---------------------------------------------------------------------------+ -| Site Directory File Name | -| | -| ftp.lysator.liu.se /pub/NYS/servers yps-0.21.tar.gz | -| ftp.kernel.org /pub/linux/utils/net/NIS yps-0.21.tar.gz | -+---------------------------------------------------------------------------+ - - - ------------------------------------------------------------------------------ - -9.3. The Program rpc.ypxfrd - -rpc.ypxfrd is used to speed up the transfer of very large NIS maps from a -NIS master to NIS slave servers. If a NIS slave server receives a message -that there is a new map, it will start ypxfr for transferring the new map. -ypxfr will read the contents of a map from the master server using the yp_all -() function. This process can take several minutes when there are very large -maps which have to be stored by the database library. - -The rpc.ypxfrd server speeds up the transfer process by allowing NIS slave -servers to simply copy the master server's map files rather than building -their own from scratch. rpc.ypxfrd uses an RPC-based file transfer protocol, -so that there is no need for building a new map. - -rpc.ypxfrd can be started by inetd. But since it starts very slowly, it should -be started with ypserv. You need to start rpc.ypxfrd only on the NIS master -server. ------------------------------------------------------------------------------ - -9.4. The Program rpc.yppasswdd - -Whenever users change their passwords, the NIS password database and probably -other NIS databases, which depend on the NIS password database, should be -updated.
The program "rpc.yppasswdd" is a server that handles password -changes and makes sure that the NIS information will be updated accordingly. -rpc.yppasswdd is now integrated in ypserv. You don't need the older, separate -yppasswd-0.9.tar.gz or yppasswd-0.10.tar.gz, and you shouldn't use them any -longer. - -You need to start rpc.yppasswdd only on the NIS master server. By default, -users are not allowed to change their full name or the login shell. You can -allow this with the -e chfn or -e chsh option. - -If your passwd and shadow files are in a directory other than /etc, you -need to add the -D option. For example, if you have put all source files in / -etc/yp and wish to allow the user to change his shell, you need to start -rpc.yppasswdd with the following parameters: - - - -+---------------------------------------------------------------------------+ -| rpc.yppasswdd -D /etc/yp -e chsh | -+---------------------------------------------------------------------------+ - - - -or - - - -+---------------------------------------------------------------------------+ -| rpc.yppasswdd -s /etc/yp/shadow -p /etc/yp/passwd -e chsh | -+---------------------------------------------------------------------------+ - - - -There is nothing more to do. You just need to make sure, that rpc.yppasswdd -uses the same files as /var/yp/Makefile. Errors will be logged using syslog. ------------------------------------------------------------------------------ - -10. Verifying the NIS/NYS Installation - -If everything is fine (as it should be), you should be able to verify your -installation with a few simple commands. Assuming, for example, your passwd -file is being supplied by NIS, the command - - - -+---------------------------------------------------------------------------+ -| % ypcat passwd | -+---------------------------------------------------------------------------+ - - - -should give you the contents of your NIS passwd file.
The command - - - -+---------------------------------------------------------------------------+ -| % ypmatch userid passwd | -+---------------------------------------------------------------------------+ - - - -(where userid is the login name of an arbitrary user) should give you the -user's entry in the NIS passwd file. The "ypcat" and "ypmatch" programs -should be included with your distribution of traditional NIS or NYS. - -If a user cannot log in, run the following program on the client: - - -+---------------------------------------------------------------------------+ -|#include <stdio.h> | -|#include <pwd.h> | -|#include <sys/types.h> | -| | -|int | -|main(int argc, char *argv[]) | -|{ | -| struct passwd *pwd; | -| | -| if(argc != 2) | -| { | -| fprintf(stderr,"Usage: getwpnam username\n"); | -| exit(1); | -| } | -| | -| pwd=getpwnam(argv[1]); | -| | -| if(pwd != NULL) | -| { | -| printf("name.....: [%s]\n",pwd->pw_name); | -| printf("password.: [%s]\n",pwd->pw_passwd); | -| printf("user id..: [%d]\n", pwd->pw_uid); | -| printf("group id.: [%d]\n",pwd->pw_gid); | -| printf("gecos....: [%s]\n",pwd->pw_gecos); | -| printf("directory: [%s]\n",pwd->pw_dir); | -| printf("shell....: [%s]\n",pwd->pw_shell); | -| } | -| else | -| fprintf(stderr,"User \"%s\" not found!\n",argv[1]); | -| | -| exit(0); | -|} | -+---------------------------------------------------------------------------+ - - - -Running this program with the username as parameter will print all the -information the getpwnam function gives back for this user. This should show -you which entry is incorrect. The most common problem is, that the password -field is overwritten with a "*". - -GNU C Library 2.1 (glibc 2.1) comes with a tool called getent. Use this -program instead of the above on such a system.
You could try: - - -+---------------------------------------------------------------------------+ -| getent passwd | -+---------------------------------------------------------------------------+ - - -or - - -+---------------------------------------------------------------------------+ -| getent passwd login | -+---------------------------------------------------------------------------+ - - - ------------------------------------------------------------------------------ - -11. Creating and Updating NIS maps - -11.1. Creating new NIS maps - -The initial NIS maps will be created by running - - -+---------------------------------------------------------------------------+ -| % /usr/lib/yp/ypinit -m | -+---------------------------------------------------------------------------+ - - - -This is done when setting up the NIS master server for the first time. For -more information about this, read Section 9. If you wish to add new maps to -your server or remove old one, you need to edit the /var/yp/Makefile and -change the all: rule. Add or remove the name of the rule, which generates the -map. - -If you delete a map, you also have to remove the corresponding files. - -After this change, you only need to run - - -+---------------------------------------------------------------------------+ -| % make -C /var/yp | -+---------------------------------------------------------------------------+ - - - -and the maps should be created. ------------------------------------------------------------------------------ - -11.2. Updating NIS maps - -If you modify the sources for the NIS maps (for example if you create a new -user by adding the account to the passwd file), you need to regenerate the -NIS maps. 
This is done by a simple - - -+---------------------------------------------------------------------------+ -| % make -C /var/yp | -+---------------------------------------------------------------------------+ - - - -This command will check which sources have changed, recreate the maps and -tell ypserv that the maps have changed. ------------------------------------------------------------------------------ - -11.3. Length of Map entries - -The length of one entry is limited by the NIS protocol to 1024 characters. -You can't just increase this value and recompile the system. Every system -that uses NIS v2 expects key and data values to be no more than 1024 bytes in -size; if you suddenly make YPMAXRECORD larger on your client and server, you -will break interoperability with all other systems on your network that use -NIS. To make it work right, you'd have to go to every vendor that supports -NIS and get them to all make the change at the same time. Chances are you -won't be able to do this. - -With glibc 2.1 and newer this limit was removed from the glibc NIS -implementation. So it is possible under Linux to use longer entries, but only -if you have no other NIS clients or servers in your network. - -To allow the creation of NIS maps with a longer entry, you need to add the ---no-limit-check option to the makedbm call in /var/yp/Makefile. - -The result should look like: - - -+-------------------------------------------------------------------------------------+ -|DBLOAD = $(YPBINDIR)/makedbm -c -m `$(YPBINDIR)/yphelper --hostname` --no-limit-check| -+-------------------------------------------------------------------------------------+ - - - -WARNING: This breaks the NIS protocol and even if Linux supports it, not all -Applications running under Linux work with this change! - -There is another way of solving this problem for /etc/group entries.
This -idea is from Ken Cameron: - - -+---------------------------------------------------------------------------+ -|1. Break the entry into more than one line and name each group | -| slightly differently. | -| | -|2. keep the GID the same for all. | -| | -|3. have the first entry with the right group name and the GID. | -| I don't put any user names in this one. | -| | -|What happens is that going by user name you pick up the GID when the code | -|reads it. Then going the other way it stops after the first match of GID | -|and takes that name. It's ugly but works! | -+---------------------------------------------------------------------------+ - - ------------------------------------------------------------------------------ - -12. Surviving a Reboot - -Once you have NIS correctly configured on the server and client, you do need -to be sure that the configuration will survive a reboot. - -There are two separate issues to check: the existence of an init script and -the correct storage of the NIS domain name. ------------------------------------------------------------------------------ - -12.1. NIS Init Script - -In your version of Linux, you need to check your directory of init scripts, -typically /etc/init.d, /etc/rc.d/init.d or /sbin/init.d to be sure there is a -startup script there for NIS. Usually this file is called ypbind or ypclient. ------------------------------------------------------------------------------ - -12.2. NIS Domain Name - -Perhaps the greatest issue that some people have with NIS is ensuring that -the NIS domain name is available after a reboot. According to Solaris 2.x, -the NIS domain name should be entered as a single line in: - - -+---------------------------------------------------------------------------+ -| /etc/defaultdomain | -+---------------------------------------------------------------------------+ - - - -However, most Linux distributions do not seem to use this file.
------------------------------------------------------------------------------ - -12.3. Distribution-specific Issues - -At this time, the following information is known about how various Linux -distributions handle the storage of the NIS domainname. ------------------------------------------------------------------------------ - -12.3.1. Caldera 2.x - -Caldera uses the file /etc/nis.conf which has the same format as the normal / -etc/yp.conf. ------------------------------------------------------------------------------ - -12.3.2. Debian - -Debian appears to follow Sun's usage of /etc/defaultdomain. ------------------------------------------------------------------------------ - -12.3.3. Red Hat Linux 6.x, 7.x, 8.x and 9 - -Create or modify the variable NISDOMAIN in the file /etc/sysconfig/network. ------------------------------------------------------------------------------ - -12.3.4. SuSE Linux 6.x and 7.x - -Modify the variable YP_DOMAINNAME in /etc/rc.config and then run the command -SuSEconfig. ------------------------------------------------------------------------------ - -12.3.5. SuSE Linux 8.x and later - -Since version 8.0 SuSE Linux also follow Sun's usage of /etc/defaultdomain. ------------------------------------------------------------------------------ - -13. Changing passwords with rpasswd - -The standard way to change a NIS password is to call yppasswd, on some -systems this is only an alias for passwd. This commands uses the yppasswd -protocol and needs a running rpc.yppasswdd process on the NIS master server. -The protocol has the disadvantage, that the old password will be send in -clear text over the network. This is not so problematic, if the password -change was successfull. In this case, the old password is replaced with the -new one. But if the password change fails, an attacker can use the clear -password to login as this user. 
Even more worse: If the system administrator -changes the NIS password for another user, the root password of the NIS -master server is transfered in clear text over the network. And this one will -not be changed. - -One solution is to not use yppasswd for changing the password. Instead, a -good alternative is the rpasswd command from the pwdutils package. - - - -+-----------------------------------------------------------------------------+ -| Site Directory File Name | -| | -| ftp.kernel.org /pub/linux/utils/net/NIS pwdutils-2.3.tar.gz | -| ftp.suse.com /pub/people/kukuk/pam/pam_pwcheck pam_pwcheck-2.2.tar.bz2 | -| ftp.suse.com /pub/people/kukuk/pam/pam_unix2 pam_unix2-1.16.tar.bz2 | -+-----------------------------------------------------------------------------+ - - - -rpasswd changes passwords for user accounts on a remote server over a secure -SSL connection. A normal user may only change the password for their own -account, if the user knows the password of the administrator account (in the -moment this is the root password on the server), he may change the password -for any account if he calls rpasswd with the -a option. ------------------------------------------------------------------------------ - -13.1. Server Configuration - -For the server you need at first certificate, the default filename for this -is /etc/rpasswdd.pem. The file can be created with the following command: - - - -+----------------------------------------------------------------------------------------+ -|openssl req -new -x509 -nodes -days 730 -out /etc/rpasswdd.pem -keyout /etc/rpasswdd.pem| -+----------------------------------------------------------------------------------------+ - - - -A PAM configuration file for rpasswdd is needed, too. 
If the NIS accounts are -stored in /etc/passwd, the following is a good starting point for a working -configuration: - - - -+---------------------------------------------------------------------------+ -|#%PAM-1.0 | -|auth required pam_unix2.so | -|account required pam_unix2.so | -|password required pam_pwcheck.so | -|password required pam_unix2.so use_first_pass use_authtok | -|password required pam_make.so /var/yp | -|session required pam_unix2.so | -+---------------------------------------------------------------------------+ - - - -If sources for the NIS password maps are stored in another location (for -example in /etc/yp), the nisdir option of pam_unix2 can be used to find the -source files in another place: - - - -+----------------------------------------------------------------------------------+ -|#%PAM-1.0 | -|auth required pam_unix2.so | -|account required pam_unix2.so | -|password required pam_pwcheck.so nisdir=/etc/yp | -|password required pam_unix2.so nisdir=/etc/yp use_first_pass use_authtok | -|password required pam_make.so /var/yp | -|session required pam_unix2.so | -+----------------------------------------------------------------------------------+ - - - -Now start the rpasswdd daemon on the NIS master server. - -Since the password change is done with PAM modules, rpasswdd is also able to -allow password changes for NIS+, LDAP or other services supported by a PAM -module. - ------------------------------------------------------------------------------ - -13.2. Client Configuration - -On every client only the configuration file /etc/rpasswd.conf which contains -the name of the server is neded. 
If the server does not run on the default -port, the correct port can alse be mentioned here: - - - -+---------------------------------------------------------------------------+ -|# rpasswdd runs on master.example.com | -|server master.example.com | -|# Port 774 is the default port | -|port 774 | -+---------------------------------------------------------------------------+ - - - ------------------------------------------------------------------------------ - -14. Common Problems and Troubleshooting NIS - -Here are some common problems reported by various users: - - 1. The libraries for 4.5.19 are broken. NIS won't work with it. - - 2. If you upgrade the libraries from 4.5.19 to 4.5.24 then the su command - breaks. You need to get the su command from the slackware 1.2.0 - distribution. Incidentally that's where you can get the updated - libraries. - - 3. When a NIS server goes down and comes up again ypbind starts complaining - with messages like: - - - - +---------------------------------------------------------------+ - | yp_match: clnt_call: | - | RPC: Unable to receive; errno = Connection refused | - +---------------------------------------------------------------+ - - - - and logins are refused for those who are registered in the NIS database. - Try to login as root and kill ypbind and start it up again. An update to - ypbind 3.3 or higher should also help. - - 4. After upgrading the libc to a version greater then 5.4.20, the YP tools - will not work any longer. You need yp-tools 1.2 or later for libc >= - 5.4.21 and glibc 2.x. For earlier libc version you need yp-clients 2.2. - yp-tools 2.x should work for all libraries. - - 5. In libc 5.4.21 - 5.4.35 yp_maplist is broken, you need 5.4.36 or later, - or some YP programs like ypwhich will segfault. - - 6. libc 5 with traditional NIS doesn't support shadow passwords over NIS. - You need libc5 + NYS or glibc 2.x. - - 7. ypcat shadow doesn't show the shadow map. 
This is correct, the name of - the shadow map is shadow.byname, not shadow. - - 8. Solaris doesn't use always privileged ports. So don't use password - mangling if you have a Solaris client. - ------------------------------------------------------------------------------ - -15. Frequently Asked Questions - -Most of your questions should be answered by now. If there are still -questions unanswered you might want to post a message to - - - -+---------------------------------------------------------------------------+ -| comp.os.linux.networking | -+---------------------------------------------------------------------------+ - - - - diff --git a/LDP/guide/docbook/Linux-Networking/Networking-Management.xml b/LDP/guide/docbook/Linux-Networking/Networking-Management.xml deleted file mode 100644 index 6c663b39..00000000 --- a/LDP/guide/docbook/Linux-Networking/Networking-Management.xml +++ /dev/null @@ -1,60 +0,0 @@ - - -Network-Management - - -There is an impressive number of tools focused on network management -and remote administration under Linux. Some interesting remote administration -projects are linuxconf and webmin: - - - -· Webmin -· Linuxconf - - - -Other tools include network traffic analysis tools, network security -tools, monitoring tools, configuration tools, etc. An archive of many -of these tools may be found at Metalab - - - -9.2. SNMP - - -The Simple Network Management Protocol is a protocol for Internet -network management services. It allows for remote monitoring and -configuration of routers, bridges, network cards, switches, etc... -There is a large amount of libraries, clients, daemons and SNMP based -monitoring programs available for Linux. A good page dealing with SNMP -and Linux software may be found at : http://linas.org/linux/NMS.html - - -10. Enterprise Linux Networking - - -In certain situations it is necessary for the networking -infrastructure to have proper mechanisms to guarantee network -availability nearly 100% of the time. 
Some related techniques are -described in the following sections. Most of the following material -can be found at the excellent Linas website: -http://linas.org/linux/index.html and in the Linux High-Availability -HOWTO - - -10.1. High Availability - - -Redundancy is used to prevent the overall IT system from having single -points of failure. A server with only one network card or a single -SCSI disk has two single points of failure. The objective is to mask -unplanned outages from users in a manner that lets users continue to -work quickly. High availability software is a set of scripts and tools -that automatically monitor and detect failures, taking the appropriate -steps to restore normal operation and to notifying system -administrators. - - - diff --git a/LDP/guide/docbook/Linux-Networking/Overview.xml b/LDP/guide/docbook/Linux-Networking/Overview.xml index 4a1305e3..776dd6cc 100644 --- a/LDP/guide/docbook/Linux-Networking/Overview.xml +++ b/LDP/guide/docbook/Linux-Networking/Overview.xml @@ -2836,3 +2836,25 @@ its clear that this system is best suited to smaller networks. + + + +Internet + + +Internet is not described as a network of any single kind. It can be construed +as a large set of heterogenous networks that support a certain set of protocols +(TCP/IP) and provide certain common services. A good way to learn about the +Internet is to use the Internet! + + + +Linux is a great platform to act as an Intranet / Internet server. The +term Intranet refers to the application of Internet technologies +inside an organisation mainly for the purpose of distributing and +making available information inside the company. Internet and Intranet +services offered by Linux include mail, news, WWW servers and many +more that will be outlined further on in this document. 
+ + + diff --git a/LDP/guide/docbook/Linux-Networking/Protocols-and-Standards.xml b/LDP/guide/docbook/Linux-Networking/Protocols-and-Standards.xml index abd5ede1..a7fb945b 100644 --- a/LDP/guide/docbook/Linux-Networking/Protocols-and-Standards.xml +++ b/LDP/guide/docbook/Linux-Networking/Protocols-and-Standards.xml @@ -3392,3 +3392,1724 @@ Wavelan device names are `eth0', `eth1', etc. + + + +ISDN + + +The Integrated Services Digital Network (ISDN) is a series of +standards that specify a general purpose switched digital data +network. An ISDN `call' creates a synchronous point to point data +service to the destination. ISDN is generally delivered on a high +speed link that is broken down into a number of discrete channels. +There are two different types of channels, the `B Channels' which will +actually carry the user data and a single channel called the `D +channel' which is used to send control information to the ISDN +exchange to establish calls and other functions. In Australia for +example, ISDN may be delivered on a 2Mbps link that is broken into 30 +discrete 64kbps B channels with one 64kbps D channel. Any number of +channels may be used at a time and in any combination. You could for +example establish 30 separate calls to 30 different destinations at +64kbps each, or you could establish 15 calls to 15 different +destinations at 128kbps each (two channels used per call), or just a +small number of calls and leave the rest idle. A channel may be used +for either incoming or outgoing calls. The original intention of ISDN +was to allow Telecommunications companies to provide a single data +service which could deliver either telephone (via digitised voice) or +data services to your home or business without requiring you to make +any special configuration changes. + + + +There are a few different ways to connect your computer to an ISDN +service. 
One way is to use a device called a `Terminal Adaptor' which +plugs into the Network Terminating Unit that your telecommunications +carrier will have installed when you got your ISDN service and +presents a number of serial interfaces. One of those interfaces is +used to enter commands to establish calls and configuration and the +others are actually connected to the network devices that will use the +data circuits when they are established. Linux will work in this sort +of configuration without modification; you just treat the port on the +Terminal Adaptor like you would treat any other serial device. +Another way, which is the way the kernel ISDN support is designed for, +allows you to install an ISDN card into your Linux machine and then +has your Linux software handle the protocols and make the calls +itself. + + + +The Linux kernel has built-in ISDN capabilities. Isdn4linux controls +ISDN PC cards and can emulate a modem with the Hayes command set ("AT" +commands). The possibilities range from simply using a terminal +program to connections via HDLC (using included devices) to full +connection to the Internet with PPP to audio applications. + +· FAQ for isdn4linux: http://www.isdn4linux.de/faq/ + + + + + Kernel Compile Options: + + ISDN subsystem ---> + <*> ISDN support + [ ] Support synchronous PPP + [ ] Support audio via ISDN + < > ICN 2B and 4B support + < > PCBIT-D support + < > Teles/NICCY1016PC/Creatix support + + + + +The Linux implementation of ISDN supports a number of different types +of internal ISDN cards. These are those listed in the kernel +configuration options: + + + · ICN 2B and 4B + · Octal PCBIT-D + · Teles ISDN-cards and compatibles + + +Some of these cards require software to be downloaded to them to make +them operational. There is a separate utility to do this with. 
+ + + +Full details on how to configure the Linux ISDN support are available +from the /usr/src/linux/Documentation/isdn/ directory and an FAQ +dedicated to isdn4linux is available at www.lrz-muenchen.de. (You can +click on the English flag to get an English version). + + + +A note about PPP. The PPP suite of protocols will operate over either +asynchronous or synchronous serial lines. The commonly distributed PPP +daemon for Linux `pppd' supports only asynchronous mode. If you wish +to run the PPP protocols over your ISDN service you need a specially +modified version. Details of where to find it are available in the +documentation referred to above. + + + + + + +NIS + + +The Network Information Service (NIS) provides a simple network lookup +service consisting of databases and processes. Its purpose is to +provide information that has to be known throughout the network to all +machines on the network. For example, it enables an administrator to +allow users access to any machine in a network running NIS without a +password entry existing on each machine; only the main database needs +to be maintained. This section describes how to configure Linux as an +NIS(YP) or NIS+ client and how to install an NIS(YP) server. +Don't forget to read Section 5. + + +----------------------------------------------------------------------------- +2.2. Some General Information + + +The next four lines are quoted from the Sun(tm) System & Network +Administration Manual: + + + + ++---------------------------------------------------------------------------+ +| "NIS was formerly known as Sun Yellow Pages (YP) but | +| the name Yellow Pages(tm) is a registered trademark | +| in the United Kingdom of British Telecom plc and may | +| not be used without permission." | ++---------------------------------------------------------------------------+ + + + + +NIS stands for Network Information Service. 
Its purpose is to provide +information that has to be known throughout the network to all machines on +the network. Information likely to be distributed by NIS is: + + + +  * login names/passwords/home directories (/etc/passwd) +  * group information (/etc/group) + + + +If, for example, your password entry is recorded in the NIS passwd database, +you will be able to log in on all machines on the network which have the NIS +client programs running. + + + +Sun is a trademark of Sun Microsystems, Inc. licensed to SunSoft, Inc. + +----------------------------------------------------------------------------- + +3. NIS, NYS or NIS+ ? + +3.1. libc 4/5 with traditional NIS or NYS ? + + +The choice between "traditional NIS" and the NIS code in the NYS library is a +choice between laziness and maturity vs. flexibility and love of adventure. +The "traditional NIS" code is in the standard C library and has been around +longer and sometimes suffers from its age and slight inflexibility. +The NIS code in the NYS library requires you to recompile the libc library to +include the NYS code into it (or maybe you can get a precompiled version of +libc from someone who has already done it). +Another difference is that the traditional NIS code has some support for NIS +Netgroups, which the NYS code doesn't. On the other hand the NYS code allows +you to handle Shadow Passwords in a transparent way. The "traditional NIS" +code doesn't support Shadow Passwords over NIS. + + +----------------------------------------------------------------------------- + +3.2. glibc 2 and NIS/NIS+ + + +Forget all this if you use the new GNU C Library 2.x (aka libc6). It has real +NSS (name switch service) support, which makes it very flexible, and contains +support for the following NIS/NIS+ maps: aliases, ethers, group, hosts, +netgroups, networks, protocols, publickey, passwd, rpc, services and shadow. +The GNU C Library has no problems with shadow passwords over NIS. 
+ + +----------------------------------------------------------------------------- + +3.3. NIS or NIS+ ? + + +The choice between NIS and NIS+ is easy - use NIS+ only if you have severe +security needs. NIS+ is much more problematic to administer (it's pretty easy +to handle on the client side, but the server side is horrible). Another +problem is that the support for NIS+ under Linux contains a lot of bugs and +that the development has stopped. + + +----------------------------------------------------------------------------- + +4. How it works + +4.1. How NIS works + + +Within a network there must be at least one machine acting as a NIS server. +You can have multiple NIS servers, each serving different NIS "domains" - or +you can have cooperating NIS servers, where one is the master NIS server, and +all the other are so-called slave NIS servers (for a certain NIS "domain", +that is!) - or you can have a mix of them... + + + +Slave servers only have copies of the NIS databases and receive these copies +from the master NIS server whenever changes are made to the master's +databases. Depending on the number of machines in your network and the +reliability of your network, you might decide to install one or more slave +servers. Whenever a NIS server goes down or is too slow in responding to +requests, a NIS client connected to that server will try to find one that is +up or faster. + + + +NIS databases are in so-called DBM format, derived from ASCII databases. For +example, the files /etc/passwd and /etc/group can be directly converted to +DBM format using ASCII-to-DBM translation software (makedbm, included with +the server software). The master NIS server should have both, the ASCII +databases and the DBM databases. + + + +Slave servers will be notified of any change to the NIS maps, (via the yppush +program), and automatically retrieve the necessary changes in order to +synchronize their databases. 
NIS clients do not need to do this since they +always talk to the NIS server to read the information stored in its DBM +databases. + + + +Old ypbind versions do a broadcast to find a running NIS server. This is +insecure, due to the fact that anyone may install a NIS server and answer the +broadcast queries. Newer versions of ypbind (ypbind-3.3 or ypbind-mt) are +able to get the server from a configuration file - thus no need to broadcast. + + +----------------------------------------------------------------------------- + +4.2. How NIS+ works + + +NIS+ is a new version of the network information nameservice from Sun. The +biggest difference between NIS and NIS+ is that NIS+ has support for data +encryption and authentication over secure RPC. + + + +The naming model of NIS+ is based upon a tree structure. Each node in the +tree corresponds to an NIS+ object, of which there are six types: directory, +entry, group, link, table and private. + + + +The NIS+ directory that forms the root of the NIS+ namespace is called the +root directory. There are two special NIS+ directories: org_dir and +groups_dir. The org_dir directory consists of all administration tables, such +as passwd, hosts, and mail_aliases. The groups_dir directory consists of NIS+ +group objects which are used for access control. The collection of org_dir, +groups_dir and their parent directory is referred to as an NIS+ domain. + + +----------------------------------------------------------------------------- + +5. The RPC Portmapper + + +To run any of the software mentioned below you will need to run the program / +sbin/portmap. Some Linux distributions already have the code in the /sbin/ +init.d/ or /etc/rc.d/ files to start up this daemon. All you have to do is to +activate it and reboot your Linux machine. Read your Linux Distribution +Documentation on how to do this. + + + +The RPC portmapper (portmap(8)) is a server that converts RPC program numbers +into TCP/IP (or UDP/IP) protocol port numbers. 
It must be running in order to +make RPC calls (which is what the NIS/NIS+ client software does) to RPC +servers (like a NIS or NIS+ server) on that machine. When an RPC server is +started, it will tell portmap what port number it is listening to, and what +RPC program numbers it is prepared to serve. When a client wishes to make an +RPC call to a given program number, it will first contact portmap on the +server machine to determine the port number where RPC packets should be sent. + + + +Since RPC servers could be started by inetd(8), portmap should be running +before inetd is started. + + + +For secure RPC, the portmapper needs the Time service. Make sure, that the +Time service is enabled in /etc/inetd.conf on all hosts: + + + + ++---------------------------------------------------------------------------+ +|# | +|# Time service is used for clock syncronization. | +|# | +|time stream tcp nowait root internal | +|time dgram udp wait root internal | ++---------------------------------------------------------------------------+ + + + +IMPORTANT: Don't forget to restart inetd after changes on its configuration +file ! + + +----------------------------------------------------------------------------- + +6. What do you need to set up NIS? + +6.1. Determine whether you are a Server, Slave or Client. + + +To answer this question you have to consider two cases: + + + + 1. Your machine is going to be part of a network with existing NIS servers + 2. You do not have any NIS servers in the network yet + + + +In the first case, you only need the client programs (ypbind, ypwhich, ypcat, +yppoll, ypmatch). The most important program is ypbind. This program must be +running at all times, which means, it should always appear in the list of +processes. It is a daemon process and needs to be started from the system's +startup file (eg. /etc/init.d/nis, /sbin/init.d/ypclient, /etc/rc.d/init.d/ +ypbind, /etc/rc.local). 
As soon as ypbind is running your system has become a +NIS client. + + + +In the second case, if you don't have NIS servers, then you will also need a +NIS server program (usually called ypserv). Section 9 describes how to set up +a NIS server on your Linux machine using the ypserv daemon. + + +----------------------------------------------------------------------------- + +6.2. The Software + + +The system library "/usr/lib/libc.a" (version 4.4.2 and better) or the shared +library "/lib/libc.so.x" contain all necessary system calls to successfully +compile the NIS client and server software. For the GNU C Library 2 (glibc +2.x), you also need /lib/libnsl.so.1. + + + +Some people reported that NIS only works with "/usr/lib/libc.a" version +4.5.21 and better so if you want to play it safe don't use older libc's. The +NIS client software can be obtained from: + + + + ++----------------------------------------------------------------------------------+ +| Site Directory File Name | +| | +| ftp.kernel.org /pub/linux/utils/net/NIS yp-tools-2.8.tar.gz | +| ftp.kernel.org /pub/linux/utils/net/NIS ypbind-mt-1.13.tar.gz | +| ftp.kernel.org /pub/linux/utils/net/NIS ypbind-3.3.tar.gz | +| ftp.kernel.org /pub/linux/utils/net/NIS ypbind-3.3-glibc5.diff.gz| ++----------------------------------------------------------------------------------+ + + + + +Once you have obtained the software, please follow the instructions which come +with the software. yp-clients 2.2 are for use with libc4 and libc5 until +5.4.20. libc 5.4.21 and glibc 2.x need yp-tools 1.4.1 or later. The new +yp-tools 2.4 should work with every Linux libc. Since there was a bug in the +NIS code, you shouldn't use libc 5.4.21-5.4.35. Use libc 5.4.36 or later +instead, or most YP programs will not work. ypbind 3.3 will work with all +libraries, too. If you use gcc 2.8.x or greater, egcs or glibc 2.x, you +should apply the ypbind-3.3-glibc5.diff patch to ypbind 3.3. 
If possible you +should avoid the use of ypbind 3.3 for security reasons. ypbind-mt is a new, +multithreaded daemon. It needs a Linux 2.2 kernel and glibc 2.1 or later. + + +----------------------------------------------------------------------------- + +7. Setting Up the NIS Client + +7.1. The ypbind daemon + + +After you have successfully compiled the software you are now ready to install +it. A suitable place for the ypbind daemon is the directory /usr/sbin. Some +people may tell you that you don't need ypbind on a system with NYS. This is +wrong. ypwhich and ypcat always need it. + + + +You must do this as root of course. The other binaries (ypwhich, ypcat, +yppasswd, yppoll, ypmatch) should go in a directory accessible by all users, +normally /usr/bin. + + + +Newer ypbind versions have a configuration file called /etc/yp.conf. You can +hardcode a NIS server there - for more info see the manual page for ypbind +(8). You also need this file for NYS. An example: + + + + ++---------------------------------------------------------------------------+ +|ypserver 10.10.0.1 | +|ypserver 10.0.100.8 | +|ypserver 10.3.1.1 | ++---------------------------------------------------------------------------+ + + + + +If the system can resolve the hostnames without NIS, you may use the name, +otherwise you have to use the IP address. ypbind 3.3 has a bug and will only +use the last entry (ypserver 10.3.1.1 in the example). All other entries are +ignored. ypbind-mt handles this correctly and uses the one that answered +first. + + + +It might be a good idea to test ypbind before incorporating it in the startup +files. To test ypbind do the following: + + +  * Make sure you have your YP-domain name set. 
If it is not set then issue + the command: + + +---------------------------------------------------------------+ + | /bin/domainname nis.domain | + +---------------------------------------------------------------+ + + where nis.domain should be some string _NOT_ normally associated with the + DNS-domain name of your machine! The reason for this is that it makes it + a little harder for external crackers to retrieve the password database + from your NIS servers. If you don't know what the NIS domain name is on + your network, ask your system/network administrator. + +  * Start up "/sbin/portmap" if it is not already running. + +  * Create the directory /var/yp if it does not exist. + +  * Start up /usr/sbin/ypbind + +  * Use the command rpcinfo -p localhost to check if ypbind was able to + register its service with the portmapper. The output should look like: + +---------------------------------------------------------------+ + | program vers proto port | + | 100000 2 tcp 111 portmapper | + | 100000 2 udp 111 portmapper | + | 100007 2 udp 637 ypbind | + | 100007 2 tcp 639 ypbind | + +---------------------------------------------------------------+ + or + +---------------------------------------------------------------+ + | program vers proto port | + | 100000 2 tcp 111 portmapper | + | 100000 2 udp 111 portmapper | + | 100007 2 udp 758 ypbind | + | 100007 1 udp 758 ypbind | + | 100007 2 tcp 761 ypbind | + | 100007 1 tcp 761 ypbind | + +---------------------------------------------------------------+ + Depending on the ypbind version you are using. + +  * You may also run rpcinfo -u localhost ypbind. 
This command should produce + something like: + +---------------------------------------------------------------+ + | program 100007 version 2 ready and waiting | + +---------------------------------------------------------------+ + or + +---------------------------------------------------------------+ + | program 100007 version 1 ready and waiting | + | program 100007 version 2 ready and waiting | + +---------------------------------------------------------------+ + The output depends on the ypbind version you have installed. Only the + "version 2" message is important. + +At this point you should be able to use NIS client programs like ypcat, +etc... For example, ypcat passwd.byname will give you the entire NIS password +database. + +IMPORTANT: If you skipped the test procedure then make sure you have set the +domain name, and created the directory + ++---------------------------------------------------------------------------+ +| /var/yp | ++---------------------------------------------------------------------------+ + + +This directory MUST exist for ypbind to start up successfully. + + + +To check if the domainname is set correctly, use the /bin/ypdomainname from +yp-tools 2.2. It uses the yp_get_default_domain() function which is +stricter. It doesn't allow, for example, the "(none)" domainname, which is the +default under Linux and causes a lot of problems. + + + +If the test worked you may now want to change your startup files so that +ypbind will be started at boot time and your system will act as a NIS client. +Make sure that the domainname will be set before you start ypbind. + + + +Well, that's it. Reboot the machine and watch the boot messages to see if +ypbind is actually started. + + +----------------------------------------------------------------------------- + +7.2. Setting up a NIS Client using Traditional NIS + + +For host lookups you must set (or add) "nis" to the lookup order line in your +/etc/host.conf file. 
Please read the manpage "resolv+.8" for more details. + + + +Add the following line to /etc/passwd on your NIS clients: + + + + ++---------------------------------------------------------------------------+ +|+:::::: | ++---------------------------------------------------------------------------+ + + + + +You can also use the + and - characters to include/exclude or change users. +If you want to exclude the user guest just add -guest to your /etc/passwd +file. You want to use a different shell (e.g. ksh) for the user "linux"? No +problem, just add "+linux::::::/bin/ksh" (without the quotes) to your /etc/ +passwd. Fields that you don't want to change have to be left empty. You could +also use Netgroups for user control. + + + +For example, to allow login-access only to miquels, dth and ed, and all +members of the sysadmin netgroup, but to have the account data of all other +users available use: + + + + ++---------------------------------------------------------------------------+ +| +miquels::::::: | +| +ed::::::: | +| +dth::::::: | +| +@sysadmins::::::: | +| -ftp | +| +:*::::::/etc/NoShell | ++---------------------------------------------------------------------------+ + + + + +Note that in Linux you can also override the password field, as we did in +this example. We also remove the login "ftp", so it isn't known any longer, +and anonymous ftp will not work. + + + +The netgroup would look like + + + + ++---------------------------------------------------------------------------+ +|sysadmins (-,software,) (-,kukuk,) | ++---------------------------------------------------------------------------+ + + + + +IMPORTANT: The netgroup feature is implemented starting from libc 4.5.26. If +you have a version of libc earlier than 4.5.26, every user in the NIS +password database can access your linux machine if you run "ypbind" ! + + +----------------------------------------------------------------------------- + +7.3. 
Setting up a NIS Client using NYS + + +All that is required is that the NIS configuration file (/etc/yp.conf) points +to the correct server(s) for its information. Also, the Name Services Switch +configuration file (/etc/nsswitch.conf) must be correctly set up. + + + +You should install ypbind. It isn't needed by the libc, but the NIS(YP) tools +need it. + + + +If you wish to use the include/exclude user feature (+/-guest/+@admins), you +have to use "passwd: compat" and "group: compat" in nsswitch.conf. Note that +there is no "shadow: compat"! You have to use "shadow: files nis" in this +case. + + + +The NYS sources are part of the libc 5 sources. When you run configure, answer +"NO" the first time to the "Values correct" question, then answer "YES" to "Build a +NYS libc from nys". + + +----------------------------------------------------------------------------- + +7.4. Setting up a NIS Client using glibc 2.x + + +The glibc uses "traditional NIS", so you need to start ypbind. The Name +Services Switch configuration file (/etc/nsswitch.conf) must be correctly set +up. If you use the compat mode for passwd, shadow or group, you have to add +the "+" at the end of these files and you can use the include/exclude user +feature. The configuration is exactly the same as under Solaris 2.x. + + +----------------------------------------------------------------------------- + +7.5. The nsswitch.conf File + + +The Name Service Switch file /etc/nsswitch.conf determines the order of +lookups performed when a certain piece of information is requested, just like +the /etc/host.conf file which determines the way host lookups are performed.
+For example, the line + + + + ++---------------------------------------------------------------------------+ +| hosts: files nis dns | ++---------------------------------------------------------------------------+ + + + + +specifies that host lookup functions should first look in the local /etc/ +hosts file, followed by a NIS lookup and finally through the domain name +service (/etc/resolv.conf and named), at which point if no match is found an +error is returned. This file must be readable for every user! You can find +more information in the man-page nsswitch.5 or nsswitch.conf.5. + + + +A good /etc/nsswitch.conf file for NIS is: + + + + ++---------------------------------------------------------------------------+ +|# | +|# /etc/nsswitch.conf | +|# | +|# An example Name Service Switch config file. This file should be | +|# sorted with the most-used services at the beginning. | +|# | +|# The entry '[NOTFOUND=return]' means that the search for an | +|# entry should stop if the search in the previous entry turned | +|# up nothing. Note that if the search failed due to some other reason | +|# (like no NIS server responding) then the search continues with the | +|# next entry. 
| +|# | +|# Legal entries are: | +|# | +|# nisplus Use NIS+ (NIS version 3) | +|# nis Use NIS (NIS version 2), also called YP | +|# dns Use DNS (Domain Name Service) | +|# files Use the local files | +|# db Use the /var/db databases | +|# [NOTFOUND=return] Stop searching if not found so far | +|# | +| | +|passwd: compat | +|group: compat | +|# For libc5, you must use shadow: files nis | +|shadow: compat | +| | +|passwd_compat: nis | +|group_compat: nis | +|shadow_compat: nis | +| | +|hosts: nis files dns | +| | +|services: nis [NOTFOUND=return] files | +|networks: nis [NOTFOUND=return] files | +|protocols: nis [NOTFOUND=return] files | +|rpc: nis [NOTFOUND=return] files | +|ethers: nis [NOTFOUND=return] files | +|netmasks: nis [NOTFOUND=return] files | +|netgroup: nis | +|bootparams: nis [NOTFOUND=return] files | +|publickey: nis [NOTFOUND=return] files | +|automount: files | +|aliases: nis [NOTFOUND=return] files | ++---------------------------------------------------------------------------+ + + + + +passwd_compat, group_compat and shadow_compat are only supported by glibc +2.x. If there are no shadow rules in /etc/nsswitch.conf, glibc will use the +passwd rule for lookups. There are some more lookup module for glibc like +hesoid. For more information, read the glibc documentation. + + +----------------------------------------------------------------------------- + +7.6. Shadow Passwords with NIS + + +Shadow passwords over NIS are always a bad idea. You loose the security, +which shadow gives you, and it is supported by only some few Linux C +Libraries. A good way to avoid shadow passwords over NIS is, to put only the +local system users in /etc/shadow. Remove the NIS user entries from the +shadow database, and put the password back in passwd. So you can use shadow +for the root login, and normal passwd for NIS user. This has the advantage +that it will work with every NIS client. 
+ + +----------------------------------------------------------------------------- + +7.6.1. Linux + + +The only Linux libc which supports shadow passwords over NIS, is the GNU C +Library 2.x. Linux libc5 has no support for it. Linux libc5 compiled with NYS +enabled has some code for it. But this code is badly broken in some cases and +doesn't work with all correct shadow entries. + + +----------------------------------------------------------------------------- + +7.6.2. Solaris + + +Solaris does not support shadow passwords over NIS. + + +----------------------------------------------------------------------------- + +7.6.3. PAM + + +Linux-PAM 0.75 and newer support Shadow passwords over NIS if you use the +pam_unix.so Module or if you install the extra pam_unix2.so Module. Old +systems using pam_pwdb/libpwdb (for example Red Hat Linux 5.x) need to change +the /etc/pam.d/* entries. All pam_pwdb rules should be replaced by a +pam_unix_* module. + + + +An example /etc/pam.d/login file looks like: + + + + ++----------------------------------------------------------------------------------+ +|#%PAM-1.0 | +|auth requisite pam_unix2.so nullok #set_secrpc | +|auth required pam_securetty.so | +|auth required pam_nologin.so | +|auth required pam_env.so | +|auth required pam_mail.so | +|account required pam_unix2.so | +|password required pam_pwcheck.so nullok | +|password required pam_unix2.so nullok use_first_pass use_authtok | +|session required pam_unix2.so none # debug or trace | +|session required pam_limits.so | ++----------------------------------------------------------------------------------+ + + + +----------------------------------------------------------------------------- + +8. What do you need to set up NIS+ ? + +8.1. The Software + + +The Linux NIS+ client code was developed for the GNU C library 2.
There is +also a port for Linux libc5, since most commercial Applications were linked +against this library in the past, and you cannot recompile them to use +glibc. There are problems with libc5 and NIS+: static programs cannot be +linked with it, and programs compiled with this library will not work with +other libc5 versions. +As base System you need a glibc based Distribution like Debian, Red Hat Linux +or SuSE Linux. If you have a Linux Distribution, which does not have glibc +2.1.1 or later, you need to update to a newer version. + + + +The NIS+ client software can be obtained from: + + + + ++---------------------------------------------------------------------------------+ +| Site Directory File Name | +| | +| ftp.gnu.org /pub/gnu/glibc glibc-2.3.2.tar.gz, | +| glibc-linuxthreads-2.3.2.tar.gz | +| ftp.kernel.org /pub/linux/utils/net/NIS+ nis-utils-1.4.1.tar.gz | ++---------------------------------------------------------------------------------+ + + + + +You should also have a look at [http://www.linux-nis.org/nisplus/] http:// +www.linux-nis.org/nisplus/ for more information and the latest sources. + + +----------------------------------------------------------------------------- + +8.2. Setting up a NIS+ client + + +IMPORTANT: For setting up a NIS+ client read your Solaris NIS+ docs what to +do on the server side! This document only describes what to do on the client +side! + + + +After installing the new libc and nis-utils, create the credentials for the +new client on the NIS+ server. Make sure portmap is running. Then check if +your Linux PC has the same time as the NIS+ Server. For secure RPC, you have +only a small window of about 3 minutes, in which the credentials are valid. +A good idea is to run xntpd on every host. After this, run + + + + ++---------------------------------------------------------------------------+ +|domainname nisplus.domain.
| +|nisinit -c -H | ++---------------------------------------------------------------------------+ + + + + +to initialize the cold start file. Read the nisinit man page for more +options. Make sure that the domainname will always be set after a reboot. If +you don't know what the NIS+ domain name is on your network, ask your system/ +network administrator. + + + +Now you should change your /etc/nsswitch.conf file. Make sure that the only +service after publickey is nisplus ("publickey: nisplus"), and nothing else! + + + +Then start keyserv and make sure that it will always be started as the first +daemon after portmap at boot time. Run + + + + ++---------------------------------------------------------------------------+ +|keylogin -r | ++---------------------------------------------------------------------------+ + + + + +to store the root secretkey on your system. (I hope you have added the +publickey for the new host on the NIS+ Server?). + + + +niscat passwd.org_dir should now show you all entries in the passwd database. + + +----------------------------------------------------------------------------- + +8.3. NIS+, keylogin, login and PAM + + +When the user logs in, he needs to pass his secretkey to keyserv. This is done +by calling "keylogin". The login from the shadow package will do this for the +user, if it was compiled against glibc 2.1. For a PAM aware login, you have +to change the /etc/pam.d/login file to use pam_unix2, not pwdb, which doesn't +support NIS+.
An example: + + + + ++---------------------------------------------------------------------------+ +|#%PAM-1.0 | +|auth required /lib/security/pam_securetty.so | +|auth required /lib/security/pam_unix2.so set_secrpc | +|auth required /lib/security/pam_nologin.so | +|account required /lib/security/pam_unix2.so | +|password required /lib/security/pam_unix2.so | +|session required /lib/security/pam_unix2.so | ++---------------------------------------------------------------------------+ + + + +----------------------------------------------------------------------------- + +8.4. The nsswitch.conf File + + +The Network Services switch file /etc/nsswitch.conf determines the order of +lookups performed when a certain piece of information is requested, just like +the /etc/host.conf file which determines the way host lookups are performed. +For example, the line + + + + ++---------------------------------------------------------------------------+ +| hosts: files nisplus dns | ++---------------------------------------------------------------------------+ + + + + +specifies that host lookup functions should first look in the local /etc/ +hosts file, followed by a NIS+ lookup and finally through the domain name +service (/etc/resolv.conf and named), at which point if no match is found an +error is returned. + + + +A good /etc/nsswitch.conf file for NIS+ is: + + + + ++---------------------------------------------------------------------------+ +|# | +|# /etc/nsswitch.conf | +|# | +|# An example Name Service Switch config file. This file should be | +|# sorted with the most-used services at the beginning. | +|# | +|# The entry '[NOTFOUND=return]' means that the search for an | +|# entry should stop if the search in the previous entry turned | +|# up nothing. Note that if the search failed due to some other reason | +|# (like no NIS server responding) then the search continues with the | +|# next entry. 
| +|# | +|# Legal entries are: | +|# | +|# nisplus Use NIS+ (NIS version 3) | +|# nis Use NIS (NIS version 2), also called YP | +|# dns Use DNS (Domain Name Service) | +|# files Use the local files | +|# db Use the /var/db databases | +|# [NOTFOUND=return] Stop searching if not found so far | +|# | +| | +|passwd: compat | +|group: compat | +|shadow: compat | +| | +|passwd_compat: nisplus | +|group_compat: nisplus | +|shadow_compat: nisplus | +| | +|hosts: nisplus files dns | +| | +|services: nisplus [NOTFOUND=return] files | +|networks: nisplus [NOTFOUND=return] files | +|protocols: nisplus [NOTFOUND=return] files | +|rpc: nisplus [NOTFOUND=return] files | +|ethers: nisplus [NOTFOUND=return] files | +|netmasks: nisplus [NOTFOUND=return] files | +|netgroup: nisplus | +|bootparams: nisplus [NOTFOUND=return] files | +|publickey: nisplus | +|automount: files | +|aliases: nisplus [NOTFOUND=return] files | ++---------------------------------------------------------------------------+ + + + +----------------------------------------------------------------------------- + +9. Setting up a NIS Server + +9.1. The Server Program ypserv + + +This document only describes how to set up the "ypserv" NIS server. + + + +The NIS server software can be found on: + + + + ++---------------------------------------------------------------------------+ +| Site Directory File Name | +| | +| ftp.kernel.org /pub/linux/utils/net/NIS ypserv-2.9.tar.gz | +| ftp.kernel.org /pub/linux/utils/net/NIS ypserv-2.9.tar.bz2 | ++---------------------------------------------------------------------------+ + + + + +You could also look at [http://www.linux-nis.org/nis/] http:// +www.linux-nis.org/nis/ for more information. + + + +The server setup is the same for both traditional NIS and NYS. + + + +Compile the software to generate the ypserv and makedbm programs. ypserv-2.x +only supports the securenets file for access restrictions. 
+ + + +If you run your server as master, determine what files you require to be +available via NIS and then add or remove the appropriate entries to the "all" +rule in /var/yp/Makefile. You always should look at the Makefile and edit the +Options at the beginning of the file. + + + +There was one big change between ypserv 1.1 and ypserv 1.2. Since version +1.2, the file handles are cached. This means you have to call makedbm always +with the -c option if you create new maps. Make sure, you are using the new / +var/yp/Makefile from ypserv 1.2 or later, or add the -c flag to makedbm in +the Makefile. If you don't do that, ypserv will continue to use the old maps, +and not the updated one. + + + +Now edit /var/yp/securenets and /etc/ypserv.conf. For more information, read +the ypserv(8) and ypserv.conf(5) manual pages. + + + +Make sure the portmapper (portmap(8)) is running, and start the server ypserv +. The command + + + + ++---------------------------------------------------------------------------+ +| % rpcinfo -u localhost ypserv | ++---------------------------------------------------------------------------+ + + + + +should output something like + + + + ++---------------------------------------------------------------------------+ +| program 100004 version 1 ready and waiting | +| program 100004 version 2 ready and waiting | ++---------------------------------------------------------------------------+ + + + + +The "version 1" line could be missing, depending on the ypserv version and +configuration you are using. It is only necessary if you have old SunOS 4.x +clients. + + + +Now generate the NIS (YP) database. On the master, run + + + + ++---------------------------------------------------------------------------+ +| % /usr/lib/yp/ypinit -m | ++---------------------------------------------------------------------------+ + + + + +On a slave make sure that ypwhich -m works. 
This means that your slave must +be configured as a NIS client before you can run + + + + ++---------------------------------------------------------------------------+ +| % /usr/lib/yp/ypinit -s masterhost | ++---------------------------------------------------------------------------+ + + + +to install the host as NIS slave. + +That's it, your server is up and running. + +If you have bigger problems, you could start ypserv and ypbind in debug mode +on different xterms. The debug output should show you what goes wrong. + +If you need to update a map, run make in the /var/yp directory on the NIS +master. This will update a map if the source file is newer, and push the +files to the slave servers. Please don't use ypinit for updating a map. + +You might want to edit root's crontab *on the slave* server and add the +following lines: + + + ++---------------------------------------------------------------------------+ +| 20 * * * * /usr/lib/yp/ypxfr_1perhour | +| 40 6 * * * /usr/lib/yp/ypxfr_1perday | +| 55 6,18 * * * /usr/lib/yp/ypxfr_2perday | ++---------------------------------------------------------------------------+ + + + +This will ensure that most NIS maps are kept up-to-date, even if an update is +missed because the slave was down at the time the update was done on the +master. + +You can add a slave at any later time. First, make sure that the new +slave server has permissions to contact the NIS master. Then run + + + ++---------------------------------------------------------------------------+ +| % /usr/lib/yp/ypinit -s masterhost | ++---------------------------------------------------------------------------+ + + + +on the new slave. On the master server, add the new slave server name to /var +/yp/ypservers and run make in /var/yp to update the map.
+ +If you want to restrict access for users to your NIS server, you'll have to +setup the NIS server as a client as well by running ypbind and adding the +plus-entries to /etc/passwd _halfway_ the password file. The library +functions will ignore all normal entries after the first NIS entry, and will +get the rest of the info through NIS. This way the NIS access rules are +maintained. An example: + + + ++-------------------------------------------------------------------------------+ +| root:x:0:0:root:/root:/bin/bash | +| daemon:*:1:1:daemon:/usr/sbin: | +| bin:*:2:2:bin:/bin: | +| sys:*:3:3:sys:/dev: | +| sync:*:4:100:sync:/bin:/bin/sync | +| games:*:5:100:games:/usr/games: | +| man:*:6:100:man:/var/catman: | +| lp:*:7:7:lp:/var/spool/lpd: | +| mail:*:8:8:mail:/var/spool/mail: | +| news:*:9:9:news:/var/spool/news: | +| uucp:*:10:50:uucp:/var/spool/uucp: | +| nobody:*:65534:65534:noone at all,,,,:/dev/null: | +| +miquels:::::: | +| +:*:::::/etc/NoShell | +| [ All normal users AFTER this line! ] | +| tester:*:299:10:Just a test account:/tmp: | +| miquels:1234567890123:101:10:Miquel van Smoorenburg:/home/miquels:/bin/zsh| ++-------------------------------------------------------------------------------+ + + + +Thus the user "tester" will exist, but have a shell of /etc/NoShell. miquels +will have normal access. + +Alternatively, you could edit the /var/yp/Makefile file and set NIS to use +another source password file. On large systems the NIS password and group +files are usually stored in /etc/yp/. If you do this the normal tools to +administrate the password file such as passwd, chfn, adduser will not work +anymore and you need special homemade tools for this. + +However, yppasswd, ypchsh and ypchfn will work of course. +----------------------------------------------------------------------------- + +9.2. The Server Program yps + +To set up the "yps" NIS server please refer to the previous paragraph. 
The +"yps" server setup is similar, _but_ not exactly the same so beware if you +try to apply the "ypserv" instructions to "yps"! "yps" is not supported by +any author, and contains some security holes. You really shouldn't use it ! + +The "yps" NIS server software can be found on: + + + ++---------------------------------------------------------------------------+ +| Site Directory File Name | +| | +| ftp.lysator.liu.se /pub/NYS/servers yps-0.21.tar.gz | +| ftp.kernel.org /pub/linux/utils/net/NIS yps-0.21.tar.gz | ++---------------------------------------------------------------------------+ + + + +----------------------------------------------------------------------------- + +9.3. The Program rpc.ypxfrd + +rpc.ypxfrd is used to speed up the transfer of very large NIS maps from a +NIS master to NIS slave servers. If a NIS slave server receives a message +that there is a new map, it will start ypxfr to transfer the new map. +ypxfr will read the contents of a map from the master server using the yp_all +() function. This process can take several minutes when there are very large +maps which have to be stored by the database library. + +The rpc.ypxfrd server speeds up the transfer process by allowing NIS slave +servers to simply copy the master server's map files rather than building +their own from scratch. rpc.ypxfrd uses an RPC-based file transfer protocol, +so that there is no need for building a new map. + +rpc.ypxfrd can be started by inetd. But since it starts very slowly, it should +be started with ypserv. You need to start rpc.ypxfrd only on the NIS master +server. +----------------------------------------------------------------------------- + +9.4. The Program rpc.yppasswdd + +Whenever users change their passwords, the NIS password database and probably +other NIS databases, which depend on the NIS password database, should be +updated.
The program "rpc.yppasswdd" is a server that handles password +changes and makes sure that the NIS information will be updated accordingly. +rpc.yppasswdd is now integrated in ypserv. You don't need the older, separate +yppasswd-0.9.tar.gz or yppasswd-0.10.tar.gz, and you shouldn't use them any +longer. + +You need to start rpc.yppasswdd only on the NIS master server. By default, +users are not allowed to change their full name or the login shell. You can +allow this with the -e chfn or -e chsh option. + +If your passwd and shadow files are in a directory other than /etc, you +need to add the -D option. For example, if you have put all source files in / +etc/yp and wish to allow the user to change his shell, you need to start +rpc.yppasswdd with the following parameters: + + + ++---------------------------------------------------------------------------+ +| rpc.yppasswdd -D /etc/yp -e chsh | ++---------------------------------------------------------------------------+ + + + +or + + + ++---------------------------------------------------------------------------+ +| rpc.yppasswdd -s /etc/yp/shadow -p /etc/yp/passwd -e chsh | ++---------------------------------------------------------------------------+ + + + +There is nothing more to do. You just need to make sure that rpc.yppasswdd +uses the same files as /var/yp/Makefile. Errors will be logged using syslog. +----------------------------------------------------------------------------- + +10. Verifying the NIS/NYS Installation + +If everything is fine (as it should be), you should be able to verify your +installation with a few simple commands. Assuming, for example, your passwd +file is being supplied by NIS, the command + + + ++---------------------------------------------------------------------------+ +| % ypcat passwd | ++---------------------------------------------------------------------------+ + + + +should give you the contents of your NIS passwd file.
The command + + + ++---------------------------------------------------------------------------+ +| % ypmatch userid passwd | ++---------------------------------------------------------------------------+ + + + +(where userid is the login name of an arbitrary user) should give you the +user's entry in the NIS passwd file. The "ypcat" and "ypmatch" programs +should be included with your distribution of traditional NIS or NYS. + +If a user cannot log in, run the following program on the client: + + ++---------------------------------------------------------------------------+ +|#include <stdio.h> | +|#include <pwd.h> | +|#include <sys/types.h> | +| | +|int | +|main(int argc, char *argv[]) | +|{ | +| struct passwd *pwd; | +| | +| if(argc != 2) | +| { | +| fprintf(stderr,"Usage: getwpnam username\n"); | +| exit(1); | +| } | +| | +| pwd=getpwnam(argv[1]); | +| | +| if(pwd != NULL) | +| { | +| printf("name.....: [%s]\n",pwd->pw_name); | +| printf("password.: [%s]\n",pwd->pw_passwd); | +| printf("user id..: [%d]\n", pwd->pw_uid); | +| printf("group id.: [%d]\n",pwd->pw_gid); | +| printf("gecos....: [%s]\n",pwd->pw_gecos); | +| printf("directory: [%s]\n",pwd->pw_dir); | +| printf("shell....: [%s]\n",pwd->pw_shell); | +| } | +| else | +| fprintf(stderr,"User \"%s\" not found!\n",argv[1]); | +| | +| exit(0); | +|} | ++---------------------------------------------------------------------------+ + + + +Running this program with the username as parameter will print all the +information the getpwnam function gives back for this user. This should show +you which entry is incorrect. The most common problem is that the password +field is overwritten with a "*". + +GNU C Library 2.1 (glibc 2.1) comes with a tool called getent. Use this +program instead of the above on such a system.
You could try: + + ++---------------------------------------------------------------------------+ +| getent passwd | ++---------------------------------------------------------------------------+ + + +or + + ++---------------------------------------------------------------------------+ +| getent passwd login | ++---------------------------------------------------------------------------+ + + + +----------------------------------------------------------------------------- + +11. Creating and Updating NIS maps + +11.1. Creating new NIS maps + +The initial NIS maps will be created by running + + ++---------------------------------------------------------------------------+ +| % /usr/lib/yp/ypinit -m | ++---------------------------------------------------------------------------+ + + + +This is done when setting up the NIS master server for the first time. For +more information about this, read Section 9. If you wish to add new maps to +your server or remove old one, you need to edit the /var/yp/Makefile and +change the all: rule. Add or remove the name of the rule, which generates the +map. + +If you delete a map, you also have to remove the corresponding files. + +After this change, you only need to run + + ++---------------------------------------------------------------------------+ +| % make -C /var/yp | ++---------------------------------------------------------------------------+ + + + +and the maps should be created. +----------------------------------------------------------------------------- + +11.2. Updating NIS maps + +If you modify the sources for the NIS maps (for example if you create a new +user by adding the account to the passwd file), you need to regenerate the +NIS maps. 
This is done by a simple + + ++---------------------------------------------------------------------------+ +| % make -C /var/yp | ++---------------------------------------------------------------------------+ + + + +This command will check which sources have changed, recreate the affected maps and +tell ypserv that the maps have changed. +----------------------------------------------------------------------------- + +11.3. Length of Map entries + +The length of one entry is limited by the NIS protocol to 1024 characters. +You can't just increase this value and recompile the system. Every system +that uses NIS v2 expects key and data values to be no more than 1024 bytes in +size; if you suddenly make YPMAXRECORD larger on your client and server, you +will break interoperability with all other systems on your network that use +NIS. To make it work right, you'd have to go to every vendor that supports +NIS and get them to all make the change at the same time. Chances are you +won't be able to do this. + +With glibc 2.1 and newer this limit was removed from the glibc NIS +implementation. So it is possible under Linux to use longer entries, but only +if you have no other NIS clients or servers in your network. + +To allow the creation of NIS maps with a longer entry, you need to add the +--no-limit-check option to the makedbm call in /var/yp/Makefile. + +The result should look like: + + ++-------------------------------------------------------------------------------------+ +|DBLOAD = $(YPBINDIR)/makedbm -c -m `$(YPBINDIR)/yphelper --hostname` --no-limit-check| ++-------------------------------------------------------------------------------------+ + + + +WARNING: This breaks the NIS protocol and even if Linux supports it, not all +applications running under Linux work with this change! + +There is another way of solving this problem for /etc/group entries.
This +idea is from Ken Cameron: + + ++---------------------------------------------------------------------------+ +|1. Break the entry into more than one line and name each group | +| slightly different. | +| | +|2. keep the GID the same for all. | +| | +|3. have the first entry with the right group name and the GID. | +| I don't put any user names in this one. | +| | +|What happens is that going by user name you pick up the GID when the code | +|reads it. Then going the other way it stops after the first match of GID | +|and takes that name. It's ugly but works! | ++---------------------------------------------------------------------------+ + + +----------------------------------------------------------------------------- + +12. Surviving a Reboot + +Once you have NIS correctly configured on the server and client, you do need +to be sure that the configuration will survive a reboot. + +There are two separate issues to check: the existence of an init script and +the correct storage of the NIS domain name. +----------------------------------------------------------------------------- + +12.1. NIS Init Script + +In your version of Linux, you need to check your directory of init scripts, +typically /etc/init.d, /etc/rc.d/init.d or /sbin/init.d to be sure there is a +startup script there for NIS. Usually this file is called ypbind or ypclient. +----------------------------------------------------------------------------- + +12.2. NIS Domain Name + +Perhaps the greatest issue that some people have with NIS is ensuring that +the NIS domain name is available after a reboot. According to Solaris 2.x, +the NIS domain name should be entered as a single line in: + + ++---------------------------------------------------------------------------+ +| /etc/defaultdomain | ++---------------------------------------------------------------------------+ + + + +However, most Linux distributions do not seem to use this file.
+----------------------------------------------------------------------------- + +12.3. Distribution-specific Issues + +At this time, the following information is known about how various Linux +distributions handle the storage of the NIS domainname. +----------------------------------------------------------------------------- + +12.3.1. Caldera 2.x + +Caldera uses the file /etc/nis.conf which has the same format as the normal / +etc/yp.conf. +----------------------------------------------------------------------------- + +12.3.2. Debian + +Debian appears to follow Sun's usage of /etc/defaultdomain. +----------------------------------------------------------------------------- + +12.3.3. Red Hat Linux 6.x, 7.x, 8.x and 9 + +Create or modify the variable NISDOMAIN in the file /etc/sysconfig/network. +----------------------------------------------------------------------------- + +12.3.4. SuSE Linux 6.x and 7.x + +Modify the variable YP_DOMAINNAME in /etc/rc.config and then run the command +SuSEconfig. +----------------------------------------------------------------------------- + +12.3.5. SuSE Linux 8.x and later + +Since version 8.0 SuSE Linux also follows Sun's usage of /etc/defaultdomain. +----------------------------------------------------------------------------- + +13. Changing passwords with rpasswd + +The standard way to change a NIS password is to call yppasswd; on some +systems this is only an alias for passwd. This command uses the yppasswd +protocol and needs a running rpc.yppasswdd process on the NIS master server. +The protocol has the disadvantage that the old password will be sent in +clear text over the network. This is not so problematic if the password +change was successful. In this case, the old password is replaced with the +new one. But if the password change fails, an attacker can use the clear +password to log in as this user.
Even worse: If the system administrator +changes the NIS password for another user, the root password of the NIS +master server is transferred in clear text over the network. And this one will +not be changed. + +One solution is to not use yppasswd for changing the password. Instead, a +good alternative is the rpasswd command from the pwdutils package. + + + ++-----------------------------------------------------------------------------+ +| Site Directory File Name | +| | +| ftp.kernel.org /pub/linux/utils/net/NIS pwdutils-2.3.tar.gz | +| ftp.suse.com /pub/people/kukuk/pam/pam_pwcheck pam_pwcheck-2.2.tar.bz2 | +| ftp.suse.com /pub/people/kukuk/pam/pam_unix2 pam_unix2-1.16.tar.bz2 | ++-----------------------------------------------------------------------------+ + + + +rpasswd changes passwords for user accounts on a remote server over a secure +SSL connection. A normal user may only change the password for their own +account. If the user knows the password of the administrator account (at the +moment this is the root password on the server), he may change the password +for any account by calling rpasswd with the -a option. +----------------------------------------------------------------------------- + +13.1. Server Configuration + +For the server you first need a certificate; the default filename for this +is /etc/rpasswdd.pem. The file can be created with the following command: + + + ++----------------------------------------------------------------------------------------+ +|openssl req -new -x509 -nodes -days 730 -out /etc/rpasswdd.pem -keyout /etc/rpasswdd.pem| ++----------------------------------------------------------------------------------------+ + + + +A PAM configuration file for rpasswdd is needed, too.
If the NIS accounts are +stored in /etc/passwd, the following is a good starting point for a working +configuration: + + + ++---------------------------------------------------------------------------+ +|#%PAM-1.0 | +|auth required pam_unix2.so | +|account required pam_unix2.so | +|password required pam_pwcheck.so | +|password required pam_unix2.so use_first_pass use_authtok | +|password required pam_make.so /var/yp | +|session required pam_unix2.so | ++---------------------------------------------------------------------------+ + + + +If the sources for the NIS password maps are stored in another location (for +example in /etc/yp), the nisdir option of pam_unix2 can be used to find the +source files in another place: + + + ++----------------------------------------------------------------------------------+ +|#%PAM-1.0 | +|auth required pam_unix2.so | +|account required pam_unix2.so | +|password required pam_pwcheck.so nisdir=/etc/yp | +|password required pam_unix2.so nisdir=/etc/yp use_first_pass use_authtok | +|password required pam_make.so /var/yp | +|session required pam_unix2.so | ++----------------------------------------------------------------------------------+ + + + +Now start the rpasswdd daemon on the NIS master server. + +Since the password change is done with PAM modules, rpasswdd is also able to +allow password changes for NIS+, LDAP or other services supported by a PAM +module. + +----------------------------------------------------------------------------- + +13.2. Client Configuration + +On every client only the configuration file /etc/rpasswd.conf, which contains +the name of the server, is needed.
If the server does not run on the default +port, the correct port can also be mentioned here: + + + ++---------------------------------------------------------------------------+ +|# rpasswdd runs on master.example.com | +|server master.example.com | +|# Port 774 is the default port | +|port 774 | ++---------------------------------------------------------------------------+ + + + +----------------------------------------------------------------------------- + +14. Common Problems and Troubleshooting NIS + +Here are some common problems reported by various users: + + 1. The libraries for 4.5.19 are broken. NIS won't work with them. + + 2. If you upgrade the libraries from 4.5.19 to 4.5.24 then the su command + breaks. You need to get the su command from the Slackware 1.2.0 + distribution. Incidentally that's where you can get the updated + libraries. + + 3. When a NIS server goes down and comes up again ypbind starts complaining + with messages like: + + + + +---------------------------------------------------------------+ + | yp_match: clnt_call: | + | RPC: Unable to receive; errno = Connection refused | + +---------------------------------------------------------------+ + + + + and logins are refused for those who are registered in the NIS database. + Try to log in as root and kill ypbind and start it up again. An update to + ypbind 3.3 or higher should also help. + + 4. After upgrading the libc to a version greater than 5.4.20, the YP tools + will not work any longer. You need yp-tools 1.2 or later for libc >= + 5.4.21 and glibc 2.x. For earlier libc versions you need yp-clients 2.2. + yp-tools 2.x should work for all libraries. + + 5. In libc 5.4.21 - 5.4.35 yp_maplist is broken, you need 5.4.36 or later, + or some YP programs like ypwhich will segfault. + + 6. libc 5 with traditional NIS doesn't support shadow passwords over NIS. + You need libc5 + NYS or glibc 2.x. + + 7. ypcat shadow doesn't show the shadow map.
This is correct: the name of + the shadow map is shadow.byname, not shadow. + + 8. Solaris doesn't always use privileged ports. So don't use password + mangling if you have a Solaris client. + +----------------------------------------------------------------------------- + +15. Frequently Asked Questions + +Most of your questions should be answered by now. If there are still +questions unanswered you might want to post a message to + + + ++---------------------------------------------------------------------------+ +| comp.os.linux.networking | ++---------------------------------------------------------------------------+ + + + + diff --git a/LDP/guide/docbook/Linux-Networking/Redundancy.xml b/LDP/guide/docbook/Linux-Networking/Redundancy.xml deleted file mode 100644 index f5db9081..00000000 --- a/LDP/guide/docbook/Linux-Networking/Redundancy.xml +++ /dev/null @@ -1,11 +0,0 @@ - 10.3. Redundant networking - - IP Address Takeover (IPAT). When a network adapter card fails, its IP - address should be taken by a working network card in the same node or - in another node. MAC Address Takeover: when an IP takeover occurs, it - should be made sure that all the nodes in the network update their ARP - caches (the mapping between IP and MAC addresses). - - See the High-Availability HOWTO for more details: - http://metalab.unc.edu/pub/Linux/ALPHA/linux-ha/High-Availability- - HOWTO.html diff --git a/LDP/guide/docbook/Linux-Networking/Redundant-Networking.xml b/LDP/guide/docbook/Linux-Networking/Redundant-Networking.xml deleted file mode 100644 index f7f9da15..00000000 --- a/LDP/guide/docbook/Linux-Networking/Redundant-Networking.xml +++ /dev/null @@ -1,19 +0,0 @@ - - -Redundant-Networking - - -IP Address Takeover (IPAT). When a network adapter card fails, its IP -address should be taken by a working network card in the same node or -in another node.
MAC Address Takeover: when an IP takeover occurs, it -should be made sure that all the nodes in the network update their ARP -caches (the mapping between IP and MAC addresses). - - - -See the High-Availability HOWTO for more details: -http://metalab.unc.edu/pub/Linux/ALPHA/linux-ha/High-Availability- -HOWTO.html - - - diff --git a/LDP/guide/docbook/Linux-Networking/Routing.xml b/LDP/guide/docbook/Linux-Networking/Routing.xml deleted file mode 100644 index ff147f62..00000000 --- a/LDP/guide/docbook/Linux-Networking/Routing.xml +++ /dev/null @@ -1,6446 +0,0 @@ - - -Routing - -12.3. Packets and routers - -What the browser wants to do is send a command to the Web server on -www.tldp.org that looks like this: -GET /LDP/HOWTO/Fundamentals.html HTTP/1.0 - -Here's how that happens. The command is made into a packet, a block of bits -like a telegram that is wrapped with three important things; the source -address (the IP address of your machine), the destination address -(152.19.254.81), and a service number or port number (80, in this case) that -indicates that it's a World Wide Web request. - -Your machine then ships the packet down the wire (your connection to your -ISP, or local network) until it gets to a specialized machine called a -router. The router has a map of the Internet in its memory ?? not always a -complete one, but one that completely describes your network neighborhood and -knows how to get to the routers for other neighborhoods on the Internet. - -Your packet may pass through several routers on the way to its destination. -Routers are smart. They watch how long it takes for other routers to -acknowledge having received a packet. They also use that information to -direct traffic over fast links. They use it to notice when another router (or -a cable) have dropped off the network, and compensate if possible by finding -another route. - -There's an urban legend that the Internet was designed to survive nuclear -war. 
This is not true, but the Internet's design is extremely good at getting -reliable performance out of flaky hardware in an uncertain world. This is -directly due to the fact that its intelligence is distributed through -thousands of routers rather than concentrated in a few massive and vulnerable -switches (like the phone network). This means that failures tend to be well -localized and the network can route around them. - -Once your packet gets to its destination machine, that machine uses the -service number to feed the packet to the web server. The web server can tell -where to reply to by looking at the command packet's source IP address. When -the web server returns this document, it will be broken up into a number of -packets. The size of the packets will vary according to the transmission -media in the network and the type of service. ------------------------------------------------------------------------------ - - 8.1. Router - - The Linux kernel has built-in support for routing functions. A Linux - box can act either as an IP or IPX router for a fraction of the cost - of a commercial router. Recent kernels include special options for - machines acting primarily as routers: - - · Multicasting: Allows the Linux machine to act as a router for IP - packets that have several destination addresses. It is needed on - the MBONE, a high bandwidth network on top of the Internet which - carries audio and video broadcasts. - - · IP policy routing: Normally a router decides what to do with a - received packet based solely on the packet's final destination - address, but routing can also take into account the originating - address and the network device from which the packet reached it. 
- - There are some related projects which include one aiming at building a - complete, running Linux router on a floppy disk: Linux router project - - - -Linux Advanced Routing & Traffic Control HOWTO - -Bert Hubert - -Netherlabs BV - - -Gregory Maxwell - - - -Remco van Mook - - - -Martijn van Oosterhout - - - -Paul B Schroeder - - - -Jasper Spaans - - - -Revision History -Revision 1.1 2002-07-22 -DocBook Edition - - -A very hands-on approach to iproute2, traffic shaping and a bit of netfilter. - ------------------------------------------------------------------------------ -Table of Contents -1. Dedication -2. Introduction - 2.1. Disclaimer & License - 2.2. Prior knowledge - 2.3. What Linux can do for you - 2.4. Housekeeping notes - 2.5. Access, CVS & submitting updates - 2.6. Mailing list - 2.7. Layout of this document - - -3. Introduction to iproute2 - 3.1. Why iproute2? - 3.2. iproute2 tour - 3.3. Prerequisites - 3.4. Exploring your current configuration - 3.5. ARP - - -4. Rules - routing policy database - 4.1. Simple source policy routing - 4.2. Routing for multiple uplinks/providers - - -5. GRE and other tunnels - 5.1. A few general remarks about tunnels: - 5.2. IP in IP tunneling - 5.3. GRE tunneling - 5.4. Userland tunnels - - -6. IPv6 tunneling with Cisco and/or 6bone - 6.1. IPv6 Tunneling - - -7. IPsec: secure IP over the Internet -8. Multicast routing -9. Queueing Disciplines for Bandwidth Management - 9.1. Queues and Queueing Disciplines explained - 9.2. Simple, classless Queueing Disciplines - 9.3. Advice for when to use which queue - 9.4. Terminology - 9.5. Classful Queueing Disciplines - 9.6. Classifying packets with filters - 9.7. The Intermediate queueing device (IMQ) - - -10. Load sharing over multiple interfaces - 10.1. Caveats - 10.2. Other possibilities - - -11. Netfilter & iproute - marking packets -12. Advanced filters for (re-)classifying packets - 12.1. The u32 classifier - 12.2. The route classifier - 12.3. Policing filters - 12.4. 
Hashing filters for very fast massive filtering - - -13. Kernel network parameters - 13.1. Reverse Path Filtering - 13.2. Obscure settings - - -14. Advanced & less common queueing disciplines - 14.1. bfifo/pfifo - 14.2. Clark-Shenker-Zhang algorithm (CSZ) - 14.3. DSMARK - 14.4. Ingress qdisc - 14.5. Random Early Detection (RED) - 14.6. Generic Random Early Detection - 14.7. VC/ATM emulation - 14.8. Weighted Round Robin (WRR) - - -15. Cookbook - 15.1. Running multiple sites with different SLAs - 15.2. Protecting your host from SYN floods - 15.3. Rate limit ICMP to prevent dDoS - 15.4. Prioritizing interactive traffic - 15.5. Transparent web-caching using netfilter, iproute2, ipchains and - squid - 15.6. Circumventing Path MTU Discovery issues with per route MTU settings - 15.7. Circumventing Path MTU Discovery issues with MSS Clamping (for - ADSL, cable, PPPoE & PPtP users) - 15.8. The Ultimate Traffic Conditioner: Low Latency, Fast Up & Downloads - 15.9. Rate limiting a single host or netmask - - -16. Building bridges, and pseudo-bridges with Proxy ARP - 16.1. State of bridging and iptables - 16.2. Bridging and shaping - 16.3. Pseudo-bridges with Proxy-ARP - - -17. Dynamic routing - OSPF and BGP -18. Other possibilities -19. Further reading -20. Acknowledgements - ------------------------------------------------------------------------------ -Chapter 1. Dedication - -This document is dedicated to lots of people, and is my attempt to do -something back. To list but a few: - - - -  * Rusty Russell - -  * Alexey N. Kuznetsov - -  * The good folks from Google - -  * The staff of Casema Internet - - ------------------------------------------------------------------------------ -Chapter 2. Introduction - -Welcome, gentle reader. - -This document hopes to enlighten you on how to do more with Linux 2.2/2.4 -routing. Unbeknownst to most users, you already run tools which allow you to -do spectacular things. 
Commands like route and ifconfig are actually very -thin wrappers for the very powerful iproute2 infrastructure. - -I hope that this HOWTO will become as readable as the ones by Rusty Russell -of (amongst other things) netfilter fame. - -You can always reach us by writing to the [mailto:HOWTO@ds9a.nl] HOWTO team. -However, please consider posting to the mailing list (see the relevant -section) if you have questions which are not directly related to this HOWTO. -We are no free helpdesk, but we often will answer questions asked on the -list. - -Before losing your way in this HOWTO, if all you want to do is simple traffic -shaping, skip everything and head to the Other possibilities chapter, and -read about CBQ.init. ------------------------------------------------------------------------------ - -2.1. Disclaimer & License - -This document is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. - -In short, if your STM-64 backbone breaks down and distributes pornography to -your most esteemed customers - it's never our fault. Sorry. - -Copyright (c) 2002 by bert hubert, Gregory Maxwell, Martijn van Oosterhout, -Remco van Mook, Paul B. Schroeder and others. This material may be -distributed only subject to the terms and conditions set forth in the Open -Publication License, v1.0 or later (the latest version is presently available -at http://www.opencontent.org/openpub/). - -Please freely copy and distribute (sell or give away) this document in any -format. It's requested that corrections and/or comments be forwarded to the -document maintainer. - -It is also requested that if you publish this HOWTO in hardcopy that you send -the authors some samples for "review purposes" :-) ------------------------------------------------------------------------------ - -2.2. Prior knowledge - -As the title implies, this is the "Advanced" HOWTO. 
While by no means rocket -science, some prior knowledge is assumed. - -Here are some other references which might help teach you more: - -[http://netfilter.samba.org/unreliable-guides/networking-concepts-HOWTO/ - index.html] Rusty Russell's networking-concepts-HOWTO - Very nice introduction, explaining what a network is, and how it is - connected to other networks. - -Linux Networking-HOWTO (Previously the Net-3 HOWTO) - Great stuff, although very verbose. It teaches you a lot of stuff that's - already configured if you are able to connect to the Internet. Should be - located in /usr/doc/HOWTO/NET3-4-HOWTO.txt but can be also be found - [http://www.linuxports.com/howto/networking] online. - - ------------------------------------------------------------------------------ -2.3. What Linux can do for you - -A small list of things that are possible: - -  * Throttle bandwidth for certain computers - -  * Throttle bandwidth TO certain computers - -  * Help you to fairly share your bandwidth - -  * Protect your network from DoS attacks - -  * Protect the Internet from your customers - -  * Multiplex several servers as one, for load balancing or enhanced - availability - -  * Restrict access to your computers - -  * Limit access of your users to other hosts - -  * Do routing based on user id (yes!), MAC address, source IP address, port, - type of service, time of day or content - - -Currently, not many people are using these advanced features. This is for -several reasons. While the provided documentation is verbose, it is not very -hands-on. Traffic control is almost undocumented. ------------------------------------------------------------------------------ - -2.4. Housekeeping notes - -There are several things which should be noted about this document. While I -wrote most of it, I really don't want it to stay that way. I am a strong -believer in Open Source, so I encourage you to send feedback, updates, -patches etcetera. 
Do not hesitate to inform me of typos or plain old errors. -If my English sounds somewhat wooden, please realize that I'm not a native -speaker. Feel free to send suggestions. - -If you feel to you are better qualified to maintain a section, or think that -you can author and maintain new sections, you are welcome to do so. The SGML -of this HOWTO is available via CVS, I very much envision more people working -on it. - -In aid of this, you will find lots of FIXME notices. Patches are always -welcome! Wherever you find a FIXME, you should know that you are treading in -unknown territory. This is not to say that there are no errors elsewhere, but -be extra careful. If you have validated something, please let us know so we -can remove the FIXME notice. - -About this HOWTO, I will take some liberties along the road. For example, I -postulate a 10Mbit Internet connection, while I know full well that those are -not very common. ------------------------------------------------------------------------------ - -2.5. Access, CVS & submitting updates - -The canonical location for the HOWTO is [http://www.ds9a.nl/lartc] here. - -We now have anonymous CVS access available to the world at large. This is -good in a number of ways. You can easily upgrade to newer versions of this -HOWTO and submitting patches is no work at all. - -Furthermore, it allows the authors to work on the source independently, which -is good too. -+---------------------------------------------------------------------------+ -|$ export CVSROOT=:pserver:anon@outpost.ds9a.nl:/var/cvsroot | -|$ cvs login | -|CVS password: [enter 'cvs' (without 's)] | -|$ cvs co 2.4routing | -|cvs server: Updating 2.4routing | -|U 2.4routing/2.4routing.sgml | -+---------------------------------------------------------------------------+ - -If you spot an error, or want to add something, just fix it locally, and run -cvs diff -u, and send the result off to us. 
- -A Makefile is supplied which should help you create postscript, dvi, pdf, -html and plain text. You may need to install docbook, docbook-utils, -ghostscript and tetex to get all formats. ------------------------------------------------------------------------------ - -2.6. Mailing list - -The authors receive an increasing amount of mail about this HOWTO. Because of -the clear interest of the community, it has been decided to start a -mailinglist where people can talk to each other about Advanced Routing and -Traffic Control. You can subscribe to the list [http://mailman.ds9a.nl/ -mailman/listinfo/lartc] here. - -It should be pointed out that the authors are very hesitant of answering -questions not asked on the list. We would like the archive of the list to -become some kind of knowledge base. If you have a question, please search the -archive, and then post to the mailinglist. ------------------------------------------------------------------------------ - -2.7. Layout of this document - -We will be doing interesting stuff almost immediately, which also means that -there will initially be parts that are explained incompletely or are not -perfect. Please gloss over these parts and assume that all will become clear. - -Routing and filtering are two distinct things. Filtering is documented very -well by Rusty's HOWTOs, available here: - -  * [http://netfilter.samba.org/unreliable-guides/] Rusty's Remarkably - Unreliable Guides - - -We will be focusing mostly on what is possible by combining netfilter and -iproute2. ------------------------------------------------------------------------------ - -Chapter 3. Introduction to iproute2 - -3.1. Why iproute2? - -Most Linux distributions, and most UNIX's, currently use the venerable arp, -ifconfig and route commands. While these tools work, they show some -unexpected behaviour under Linux 2.2 and up. For example, GRE tunnels are an -integral part of routing these days, but require completely different tools. 
- -With iproute2, tunnels are an integral part of the tool set. - -The 2.2 and above Linux kernels include a completely redesigned network -subsystem. This new networking code brings Linux performance and a feature -set with little competition in the general OS arena. In fact, the new -routing, filtering, and classifying code is more featureful than the one -provided by many dedicated routers and firewalls and traffic shaping -products. - -As new networking concepts have been invented, people have found ways to -plaster them on top of the existing framework in existing OSes. This constant -layering of cruft has lead to networking code that is filled with strange -behaviour, much like most human languages. In the past, Linux emulated -SunOS's handling of many of these things, which was not ideal. - -This new framework makes it possible to clearly express features previously -beyond Linux's reach. ------------------------------------------------------------------------------ - -3.2. iproute2 tour - -Linux has a sophisticated system for bandwidth provisioning called Traffic -Control. This system supports various method for classifying, prioritizing, -sharing, and limiting both inbound and outbound traffic. - -We'll start off with a tiny tour of iproute2 possibilities. ------------------------------------------------------------------------------ - -3.3. Prerequisites - -You should make sure that you have the userland tools installed. This package -is called 'iproute' on both RedHat and Debian, and may otherwise be found at -ftp://ftp.inr.ac.ru/ip-routing/iproute2-2.2.4-now-ss??????.tar.gz". - -You can also try [ftp://ftp.inr.ac.ru/ip-routing/iproute2-current.tar.gz] -here for the latest version. - -Some parts of iproute require you to have certain kernel options enabled. It -should also be noted that all releases of RedHat up to and including 6.2 come -without most of the traffic control features in the default kernel. - -RedHat 7.2 has everything in by default. 
- -Also make sure that you have netlink support, should you choose to roll your -own kernel. Iproute2 needs it. ------------------------------------------------------------------------------ - -3.4. Exploring your current configuration - -This may come as a surprise, but iproute2 is already configured! The current -commands ifconfig and route are already using the advanced syscalls, but -mostly with very default (ie. boring) settings. - -The ip tool is central, and we'll ask it to display our interfaces for us. ------------------------------------------------------------------------------ - -3.4.1. ip shows us our links - -+-------------------------------------------------------------------------------+ -|[ahu@home ahu]$ ip link list | -|1: lo: mtu 3924 qdisc noqueue | -| link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 | -|2: dummy: mtu 1500 qdisc noop | -| link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff | -|3: eth0: mtu 1400 qdisc pfifo_fast qlen 100 | -| link/ether 48:54:e8:2a:47:16 brd ff:ff:ff:ff:ff:ff | -|4: eth1: mtu 1500 qdisc pfifo_fast qlen 100 | -| link/ether 00:e0:4c:39:24:78 brd ff:ff:ff:ff:ff:ff | -|3764: ppp0: mtu 1492 qdisc pfifo_fast qlen 10 | -| link/ppp | -+-------------------------------------------------------------------------------+ - -Your mileage may vary, but this is what it shows on my NAT router at home. -I'll only explain part of the output as not everything is directly relevant. - -We first see the loopback interface. While your computer may function -somewhat without one, I'd advise against it. The MTU size (Maximum Transfer -Unit) is 3924 octets, and it is not supposed to queue. Which makes sense -because the loopback interface is a figment of your kernel's imagination. - -I'll skip the dummy interface for now, and it may not be present on your -computer. Then there are my two physical network interfaces, one at the side -of my cable modem, the other one serves my home ethernet segment. -Furthermore, we see a ppp0 interface. 
- -Note the absence of IP addresses. iproute disconnects the concept of 'links' -and 'IP addresses'. With IP aliasing, the concept of 'the' IP address had -become quite irrelevant anyhow. - -It does show us the MAC addresses though, the hardware identifier of our -ethernet interfaces. ------------------------------------------------------------------------------ - -3.4.2. ip shows us our IP addresses - -+-------------------------------------------------------------------------------+ -|[ahu@home ahu]$ ip address show | -|1: lo: mtu 3924 qdisc noqueue | -| link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 | -| inet 127.0.0.1/8 brd 127.255.255.255 scope host lo | -|2: dummy: mtu 1500 qdisc noop | -| link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff | -|3: eth0: mtu 1400 qdisc pfifo_fast qlen 100 | -| link/ether 48:54:e8:2a:47:16 brd ff:ff:ff:ff:ff:ff | -| inet 10.0.0.1/8 brd 10.255.255.255 scope global eth0 | -|4: eth1: mtu 1500 qdisc pfifo_fast qlen 100 | -| link/ether 00:e0:4c:39:24:78 brd ff:ff:ff:ff:ff:ff | -|3764: ppp0: mtu 1492 qdisc pfifo_fast qlen 10 | -| link/ppp | -| inet 212.64.94.251 peer 212.64.94.1/32 scope global ppp0 | -+-------------------------------------------------------------------------------+ - -This contains more information. It shows all our addresses, and to which -cards they belong. 'inet' stands for Internet (IPv4). There are lots of other -address families, but these don't concern us right now. - -Let's examine eth0 somewhat closer. It says that it is related to the inet -address '10.0.0.1/8'. What does this mean? The /8 stands for the number of -bits that are in the Network Address. There are 32 bits, so we have 24 bits -left that are part of our network. The first 8 bits of 10.0.0.1 correspond to -10.0.0.0, our Network Address, and our netmask is 255.0.0.0. - -The other bits are connected to this interface, so 10.250.3.13 is directly -available on eth0, as is 10.0.0.1 for example. 
- -With ppp0, the same concept goes, though the numbers are different. Its -address is 212.64.94.251, without a subnet mask. This means that we have a -point-to-point connection and that every address, with the exception of -212.64.94.251, is remote. There is more information, however. It tells us -that on the other side of the link there is, yet again, only one address, -212.64.94.1. The /32 tells us that there are no 'network bits'. - -It is absolutely vital that you grasp these concepts. Refer to the -documentation mentioned at the beginning of this HOWTO if you have trouble. - -You may also note 'qdisc', which stands for Queueing Discipline. This will -become vital later on. ------------------------------------------------------------------------------ - -3.4.3. ip shows us our routes - -Well, we now know how to find 10.x.y.z addresses, and we are able to reach -212.64.94.1. This is not enough however, so we need instructions on how to -reach the world. The Internet is available via our ppp connection, and it -appears that 212.64.94.1 is willing to spread our packets around the world, -and deliver results back to us. -+---------------------------------------------------------------------------+ -|[ahu@home ahu]$ ip route show | -|212.64.94.1 dev ppp0 proto kernel scope link src 212.64.94.251 | -|10.0.0.0/8 dev eth0 proto kernel scope link src 10.0.0.1 | -|127.0.0.0/8 dev lo scope link | -|default via 212.64.94.1 dev ppp0 | -+---------------------------------------------------------------------------+ - -This is pretty much self explanatory. The first 4 lines of output explicitly -state what was already implied by ip address show, the last line tells us -that the rest of the world can be found via 212.64.94.1, our default gateway. -We can see that it is a gateway because of the word via, which tells us that -we need to send packets to 212.64.94.1, and that it will take care of things. 
- -For reference, this is what the old route utility shows us: -+-----------------------------------------------------------------------------+ -|[ahu@home ahu]$ route -n | -|Kernel IP routing table | -|Destination Gateway Genmask Flags Metric Ref Use | -|Iface | -|212.64.94.1 0.0.0.0 255.255.255.255 UH 0 0 0 ppp0 | -|10.0.0.0 0.0.0.0 255.0.0.0 U 0 0 0 eth0 | -|127.0.0.0 0.0.0.0 255.0.0.0 U 0 0 0 lo | -|0.0.0.0 212.64.94.1 0.0.0.0 UG 0 0 0 ppp0 | -+-----------------------------------------------------------------------------+ ------------------------------------------------------------------------------ - -3.5. ARP - -ARP is the Address Resolution Protocol as described in [http://www.faqs.org/ -rfcs/rfc826.html] RFC 826. ARP is used by a networked machine to resolve the -hardware location/address of another machine on the same local network. -Machines on the Internet are generally known by their names which resolve to -IP addresses. This is how a machine on the foo.com network is able to -communicate with another machine which is on the bar.net network. An IP -address, though, cannot tell you the physical location of a machine. This is -where ARP comes into the picture. - -Let's take a very simple example. Suppose I have a network composed of -several machines. Two of the machines which are currently on my network are -foo with an IP address of 10.0.0.1 and bar with an IP address of 10.0.0.2. -Now foo wants to ping bar to see that he is alive, but alas, foo has no idea -where bar is. So when foo decides to ping bar he will need to send out an ARP -request. This ARP request is akin to foo shouting out on the network "Bar -(10.0.0.2)! Where are you?" As a result of this every machine on the network -will hear foo shouting, but only bar (10.0.0.2) will respond. Bar will then -send an ARP reply directly back to foo which is akin bar saying, "Foo -(10.0.0.1) I am here at 00:60:94:E9:08:12." 
After this simple transaction -that's used to locate his friend on the network, foo is able to communicate -with bar until he (his arp cache) forgets where bar is (typically after 15 -minutes on Unix). - -Now let's see how this works. You can view your machine's current arp/neighbor -cache/table like so: -+---------------------------------------------------------------------------+ -|[root@espa041 /home/src/iputils]# ip neigh show | -|9.3.76.42 dev eth0 lladdr 00:60:08:3f:e9:f9 nud reachable | -|9.3.76.1 dev eth0 lladdr 00:06:29:21:73:c8 nud reachable | -+---------------------------------------------------------------------------+ - -As you can see my machine espa041 (9.3.76.41) knows where to find espa042 -(9.3.76.42) and espagate (9.3.76.1). Now let's add another machine to the arp -cache. -+-------------------------------------------------------------------------------+ -|[root@espa041 /home/paulsch/.gnome-desktop]# ping -c 1 espa043 | -|PING espa043.austin.ibm.com (9.3.76.43) from 9.3.76.41 : 56(84) bytes of data. | -|64 bytes from 9.3.76.43: icmp_seq=0 ttl=255 time=0.9 ms | -| | -|--- espa043.austin.ibm.com ping statistics --- | -|1 packets transmitted, 1 packets received, 0% packet loss | -|round-trip min/avg/max = 0.9/0.9/0.9 ms | -| | -|[root@espa041 /home/src/iputils]# ip neigh show | -|9.3.76.43 dev eth0 lladdr 00:06:29:21:80:20 nud reachable | -|9.3.76.42 dev eth0 lladdr 00:60:08:3f:e9:f9 nud reachable | -|9.3.76.1 dev eth0 lladdr 00:06:29:21:73:c8 nud reachable | -+-------------------------------------------------------------------------------+ - -As a result of espa041 trying to contact espa043, espa043's hardware address/ -location has now been added to the arp/neighbor cache. So until the entry for -espa043 times out (as a result of no communication between the two) espa041 -knows where to find espa043 and has no need to send an ARP request.
- -Now let's delete espa043 from our arp cache: -+---------------------------------------------------------------------------+ -|[root@espa041 /home/src/iputils]# ip neigh delete 9.3.76.43 dev eth0 | -|[root@espa041 /home/src/iputils]# ip neigh show | -|9.3.76.43 dev eth0 nud failed | -|9.3.76.42 dev eth0 lladdr 00:60:08:3f:e9:f9 nud reachable | -|9.3.76.1 dev eth0 lladdr 00:06:29:21:73:c8 nud stale | -+---------------------------------------------------------------------------+ - -Now espa041 has again forgotten where to find espa043 and will need to send -another ARP request the next time he needs to communicate with espa043. You -can also see from the above output that espagate (9.3.76.1) has been changed -to the "stale" state. This means that the location shown is still valid, but -it will have to be confirmed at the first transaction to that machine. ------------------------------------------------------------------------------ - -Chapter 4. Rules - routing policy database - -If you have a large router, you may well cater for the needs of different -people, who should be served differently. The routing policy database allows -you to do this by having multiple sets of routing tables. - -If you want to use this feature, make sure that your kernel is compiled with -the "IP: advanced router" and "IP: policy routing" features. - -When the kernel needs to make a routing decision, it finds out which table -needs to be consulted. By default, there are three tables. The old 'route' -tool modifies the main and local tables, as does the ip tool (by default). - -The default rules: -+---------------------------------------------------------------------------+ -|[ahu@home ahu]$ ip rule list | -|0: from all lookup local | -|32766: from all lookup main | -|32767: from all lookup default | -+---------------------------------------------------------------------------+ - -This lists the priority of all rules. We see that all rules apply to all -packets ('from all'). 
We've seen the 'main' table before, it is output by ip -route ls, but the 'local' and 'default' table are new. - -If we want to do fancy things, we generate rules which point to different -tables which allow us to override system wide routing rules. - -For the exact semantics on what the kernel does when there are more matching -rules, see Alexey's ip-cref documentation. ------------------------------------------------------------------------------ - -4.1. Simple source policy routing - -Let's take a real example once again, I have 2 (actually 3, about time I -returned them) cable modems, connected to a Linux NAT ('masquerading') -router. People living here pay me to use the Internet. Suppose one of my -house mates only visits hotmail and wants to pay less. This is fine with me, -but they'll end up using the low-end cable modem. - -The 'fast' cable modem is known as 212.64.94.251 and is a PPP link to -212.64.94.1. The 'slow' cable modem is known by various ip addresses, -212.64.78.148 in this example and is a link to 195.96.98.253. - -The local table: -+---------------------------------------------------------------------------+ -|[ahu@home ahu]$ ip route list table local | -|broadcast 127.255.255.255 dev lo proto kernel scope link src 127.0.0.1 | -|local 10.0.0.1 dev eth0 proto kernel scope host src 10.0.0.1 | -|broadcast 10.0.0.0 dev eth0 proto kernel scope link src 10.0.0.1 | -|local 212.64.94.251 dev ppp0 proto kernel scope host src 212.64.94.251 | -|broadcast 10.255.255.255 dev eth0 proto kernel scope link src 10.0.0.1 | -|broadcast 127.0.0.0 dev lo proto kernel scope link src 127.0.0.1 | -|local 212.64.78.148 dev ppp2 proto kernel scope host src 212.64.78.148 | -|local 127.0.0.1 dev lo proto kernel scope host src 127.0.0.1 | -|local 127.0.0.0/8 dev lo proto kernel scope host src 127.0.0.1 | -+---------------------------------------------------------------------------+ - -Lots of obvious things, but things that need to be specified somewhere. 
Well, -here they are. The default table is empty. - -Let's view the 'main' table: -+---------------------------------------------------------------------------+ -|[ahu@home ahu]$ ip route list table main | -|195.96.98.253 dev ppp2 proto kernel scope link src 212.64.78.148 | -|212.64.94.1 dev ppp0 proto kernel scope link src 212.64.94.251 | -|10.0.0.0/8 dev eth0 proto kernel scope link src 10.0.0.1 | -|127.0.0.0/8 dev lo scope link | -|default via 212.64.94.1 dev ppp0 | -+---------------------------------------------------------------------------+ - -We now generate a new rule which we call 'John', for our hypothetical house -mate. Although we can work with pure numbers, it's far easier if we add our -tables to /etc/iproute2/rt_tables. -+---------------------------------------------------------------------------+ -|# echo 200 John >> /etc/iproute2/rt_tables | -|# ip rule add from 10.0.0.10 table John | -|# ip rule ls | -|0: from all lookup local | -|32765: from 10.0.0.10 lookup John | -|32766: from all lookup main | -|32767: from all lookup default | -+---------------------------------------------------------------------------+ - -Now all that is left is to generate John's table, and flush the route cache: -+---------------------------------------------------------------------------+ -|# ip route add default via 195.96.98.253 dev ppp2 table John | -|# ip route flush cache | -+---------------------------------------------------------------------------+ - -And we are done. It is left as an exercise for the reader to implement this -in ip-up. ------------------------------------------------------------------------------ - -4.2. Routing for multiple uplinks/providers - -A common configuration is the following, in which there are two providers -that connect a local network (or even a single machine) to the big Internet. 
-+---------------------------------------------------------------------------+ -| ________ | -| +------------+ / | -| | | | | -| +-------------+ Provider 1 +------- | -| __ | | | / | -| ___/ \_ +------+-------+ +------------+ | | -| _/ \__ | if1 | / | -| / \ | | | | -|| Local network -----+ Linux router | | Internet | -| \_ __/ | | | | -| \__ __/ | if2 | \ | -| \___/ +------+-------+ +------------+ | | -| | | | \ | -| +-------------+ Provider 2 +------- | -| | | | | -| +------------+ \________ | -+---------------------------------------------------------------------------+ -There are usually two questions given this setup. ------------------------------------------------------------------------------ - -4.2.1. Split access - -The first is how to route answers to packets coming in over a particular -provider, say Provider 1, back out again over that same provider. - -Let us first set some symbolical names. Let $IF1 be the name of the first -interface (if1 in the picture above) and $IF2 the name of the second -interface. Then let $IP1 be the IP address associated with $IF1 and $IP2 the -IP address associated with $IF2. Next, let $P1 be the IP address of the -gateway at Provider 1, and $P2 the IP address of the gateway at provider 2. -Finally, let $P1_NET be the IP network $P1 is in, and $P2_NET the IP network -$P2 is in. - -One creates two additional routing tables, say T1 and T2. These are added in -/etc/iproute2/rt_tables. 
Then you set up routing in these tables as follows: - -+---------------------------------------------------------------------------+ -| ip route add $P1_NET dev $IF1 src $IP1 table T1 | -| ip route add default via $P1 table T1 | -| ip route add $P2_NET dev $IF2 src $IP2 table T2 | -| ip route add default via $P2 table T2 | -| | -+---------------------------------------------------------------------------+ -Nothing spectacular, just build a route to the gateway and build a default -route via that gateway, as you would do in the case of a single upstream -provider, but put the routes in a separate table per provider. Note that the -network route suffices, as it tells you how to find any host in that network, -which includes the gateway, as specified above. - -Next you set up the main routing table. It is a good idea to route things to -the direct neighbour through the interface connected to that neighbour. Note -the `src' arguments, they make sure the right outgoing IP address is chosen. -+---------------------------------------------------------------------------+ -| ip route add $P1_NET dev $IF1 src $IP1 | -| ip route add $P2_NET dev $IF2 src $IP2 | -| | -+---------------------------------------------------------------------------+ -Then, your preference for default route: -+---------------------------------------------------------------------------+ -| ip route add default via $P1 | -| | -+---------------------------------------------------------------------------+ -Next, you set up the routing rules. These actually choose what routing table -to route with. 
You want to make sure that you route out a given interface if -you already have the corresponding source address: -+---------------------------------------------------------------------------+ -| ip rule add from $IP1 table T1 | -| ip rule add from $IP2 table T2 | -| | -+---------------------------------------------------------------------------+ -This set of commands makes sure all answers to traffic coming in on a -particular interface get answered from that interface. - -Now, this is just the very basic setup. It will work for all processes -running on the router itself, and for the local network, if it is -masqueraded. If it is not, then you either have IP space from both providers -or you are going to want to masquerade to one of the two providers. In both -cases you will want to add rules selecting which provider to route out from -based on the IP address of the machine in the local network. ------------------------------------------------------------------------------ - -4.2.2. Load balancing - -The second question is how to balance traffic going out over the two -providers. This is actually not hard if you already have set up split access -as above. - -Instead of choosing one of the two providers as your default route, you now -set up the default route to be a multipath route. In the default kernel this -will balance routes over the two providers. It is done as follows (once more -building on the example in the section on split-access): -+----------------------------------------------------------------------------------+ -| ip route add default scope global nexthop via $P1 dev $IF1 weight 1 \ | -| nexthop via $P2 dev $IF2 weight 1 | -| | -+----------------------------------------------------------------------------------+ -This will balance the routes over both providers. The weight parameters can -be tweaked to favor one provider over the other. - -Note that balancing will not be perfect, as it is route based, and routes are -cached. 
This means that routes to often-used sites will always be over the -same provider. - -Furthermore, if you really want to do this, you probably also want to look at -Julian Anastasov's patches at http://www.linuxvirtualserver.org/~julian/# -routes , Julian's route patch page. They will make things nicer to work with. ------------------------------------------------------------------------------ - -Chapter 5. GRE and other tunnels - -There are 3 kinds of tunnels in Linux. There's IP in IP tunneling, GRE -tunneling and tunnels that live outside the kernel (like, for example PPTP). ------------------------------------------------------------------------------ - -5.1. A few general remarks about tunnels: - -Tunnels can be used to do some very unusual and very cool stuff. They can -also make things go horribly wrong when you don't configure them right. Don't -point your default route to a tunnel device unless you know EXACTLY what you -are doing :-). Furthermore, tunneling increases overhead, because it needs an -extra set of IP headers. Typically this is 20 bytes per packet, so if the -normal packet size (MTU) on a network is 1500 bytes, a packet that is sent -through a tunnel can only be 1480 bytes big. This is not necessarily a -problem, but be sure to read up on IP packet fragmentation/reassembly when -you plan to connect large networks with tunnels. Oh, and of course, the -fastest way to dig a tunnel is to dig at both sides. ------------------------------------------------------------------------------ - -5.2. IP in IP tunneling - -This kind of tunneling has been available in Linux for a long time. It -requires 2 kernel modules, ipip.o and new_tunnel.o. - -Let's say you have 3 networks: Internal networks A and B, and intermediate -network C (or let's say, Internet). 
So we have network A: -+---------------------------------------------------------------------------+ -|network 10.0.1.0 | -|netmask 255.255.255.0 | -|router 10.0.1.1 | -+---------------------------------------------------------------------------+ - -The router has address 172.16.17.18 on network C. - -and network B: -+---------------------------------------------------------------------------+ -|network 10.0.2.0 | -|netmask 255.255.255.0 | -|router 10.0.2.1 | -+---------------------------------------------------------------------------+ - -The router has address 172.19.20.21 on network C. - -As far as network C is concerned, we assume that it will pass any packet sent -from A to B and vice versa. You might even use the Internet for this. - -Here's what you do: - -First, make sure the modules are installed: -+---------------------------------------------------------------------------+ -|insmod ipip.o | -|insmod new_tunnel.o | -+---------------------------------------------------------------------------+ - -Then, on the router of network A, you do the following: -+---------------------------------------------------------------------------+ -|ifconfig tunl0 10.0.1.1 pointopoint 172.19.20.21 | -|route add -net 10.0.2.0 netmask 255.255.255.0 dev tunl0 | -+---------------------------------------------------------------------------+ - -And on the router of network B: -+---------------------------------------------------------------------------+ -|ifconfig tunl0 10.0.2.1 pointopoint 172.16.17.18 | -|route add -net 10.0.1.0 netmask 255.255.255.0 dev tunl0 | -+---------------------------------------------------------------------------+ - -And if you're finished with your tunnel: -+---------------------------------------------------------------------------+ -|ifconfig tunl0 down | -+---------------------------------------------------------------------------+ - -Presto, you're done. You can't forward broadcast or IPv6 traffic through an -IP-in-IP tunnel, though. 
You just connect 2 IPv4 networks that normally -wouldn't be able to talk to each other, that's all. As far as compatibility -goes, this code has been around a long time, so it's compatible all the way -back to 1.3 kernels. Linux IP-in-IP tunneling doesn't work with other -Operating Systems or routers, as far as I know. It's simple, it works. Use it -if you have to, otherwise use GRE. ------------------------------------------------------------------------------ - -5.3. GRE tunneling - -GRE is a tunneling protocol that was originally developed by Cisco, and it -can do a few more things than IP-in-IP tunneling. For example, you can also -transport multicast traffic and IPv6 through a GRE tunnel. - -In Linux, you'll need the ip_gre.o module. ------------------------------------------------------------------------------ - -5.3.1. IPv4 Tunneling - -Let's do IPv4 tunneling first: - -Let's say you have 3 networks: Internal networks A and B, and intermediate -network C (or let's say, Internet). - -So we have network A: -+---------------------------------------------------------------------------+ -|network 10.0.1.0 | -|netmask 255.255.255.0 | -|router 10.0.1.1 | -+---------------------------------------------------------------------------+ -The router has address 172.16.17.18 on network C. Let's call this network -neta (ok, hardly original) - -and network B: -+---------------------------------------------------------------------------+ -|network 10.0.2.0 | -|netmask 255.255.255.0 | -|router 10.0.2.1 | -+---------------------------------------------------------------------------+ -The router has address 172.19.20.21 on network C. Let's call this network -netb (still not original) - -As far as network C is concerned, we assume that it will pass any packet sent -from A to B and vice versa. How and why, we do not care. 
- -On the router of network A, you do the following: -+---------------------------------------------------------------------------+ -|ip tunnel add netb mode gre remote 172.19.20.21 local 172.16.17.18 ttl 255 | -|ip link set netb up | -|ip addr add 10.0.1.1 dev netb | -|ip route add 10.0.2.0/24 dev netb | -+---------------------------------------------------------------------------+ - -Let's discuss this for a bit. In line 1, we added a tunnel device, and called -it netb (which is kind of obvious because that's where we want it to go). -Furthermore we told it to use the GRE protocol (mode gre), that the remote -address is 172.19.20.21 (the router at the other end), that our tunneling -packets should originate from 172.16.17.18 (which allows your router to have -several IP addresses on network C and let you decide which one to use for -tunneling) and that the TTL field of the packet should be set to 255 (ttl -255). - -The second line enables the device. - -In the third line we gave the newly born interface netb the address 10.0.1.1. -This is OK for smaller networks, but when you're starting up a mining -expedition (LOTS of tunnels), you might want to consider using another IP -range for tunneling interfaces (in this example, you could use 10.0.3.0). - -In the fourth line we set the route for network B. Note the different -notation for the netmask. If you're not familiar with this notation, here's -how it works: you write out the netmask in binary form, and you count all the -ones. If you don't know how to do that, just remember that 255.0.0.0 is /8, -255.255.0.0 is /16 and 255.255.255.0 is /24. Oh, and 255.255.254.0 is /23, in -case you were wondering. - -But enough about this, let's go on with the router of network B. 
-+---------------------------------------------------------------------------+ -|ip tunnel add neta mode gre remote 172.16.17.18 local 172.19.20.21 ttl 255 | -|ip link set neta up | -|ip addr add 10.0.2.1 dev neta | -|ip route add 10.0.1.0/24 dev neta | -+---------------------------------------------------------------------------+ -And when you want to remove the tunnel on router A: -+---------------------------------------------------------------------------+ -|ip link set netb down | -|ip tunnel del netb | -+---------------------------------------------------------------------------+ -Of course, you can replace netb with neta for router B. ------------------------------------------------------------------------------ - -5.3.2. IPv6 Tunneling - -See Section 6 for a short bit about IPv6 Addresses. - -On with the tunnels. - -Let's assume that you have the following IPv6 network, and you want to -connect it to 6bone, or a friend. - - -+---------------------------------------------------------------------------+ -|Network 3ffe:406:5:1:5:a:2:1/96 | -+---------------------------------------------------------------------------+ -Your IPv4 address is 172.16.17.18, and the 6bone router has IPv4 address -172.22.23.24. - - -+------------------------------------------------------------------------------+ -|ip tunnel add sixbone mode sit remote 172.22.23.24 local 172.16.17.18 ttl 255 | -|ip link set sixbone up | -|ip addr add 3ffe:406:5:1:5:a:2:1/96 dev sixbone | -|ip route add 3ffe::/15 dev sixbone | -+------------------------------------------------------------------------------+ - - -Let's discuss this. In the first line, we created a tunnel device called -sixbone. We gave it mode sit (which is IPv6 in IPv4 tunneling) and told it -where to go to (remote) and where to come from (local). TTL is set to -maximum, 255. Next, we made the device active (up). 
After that, we added our -own network address, and set a route for 3ffe::/15 (which is currently all of -6bone) through the tunnel. - -GRE tunnels are currently the preferred type of tunneling. It's a standard -that is also widely adopted outside the Linux community and therefore a Good -Thing. ------------------------------------------------------------------------------ - -5.4. Userland tunnels - -There are literally dozens of implementations of tunneling outside the -kernel. Best known are of course PPP and PPTP, but there are lots more (some -proprietary, some secure, some that don't even use IP) and that is really -beyond the scope of this HOWTO. ------------------------------------------------------------------------------ - -Chapter 6. IPv6 tunneling with Cisco and/or 6bone - -By Marco Davids - -NOTE to maintainer: - -As far as I am concerned, this IPv6-IPv4 tunneling is not per definition GRE -tunneling. You could tunnel IPv6 over IPv4 by means of GRE tunnel devices -(GRE tunnels ANY to IPv4), but the device used here ("sit") only tunnels IPv6 -over IPv4 and is therefore something different. ------------------------------------------------------------------------------ - -6.1. IPv6 Tunneling - -This is another application of the tunneling capabilities of Linux. It is -popular among the IPv6 early adopters, or pioneers if you like. The -'hands-on' example described below is certainly not the only way to do IPv6 -tunneling. However, it is the method that is often used to tunnel between -Linux and a Cisco IPv6 capable router and experience tells us that this is -just the thing many people are after. Ten to one this applies to you too ;-) - -A short bit about IPv6 addresses: - -IPv6 addresses are, compared to IPv4 addresses, really big: 128 bits against -32 bits. And this provides us just with the thing we need: many, many -IP-addresses: 340,282,366,920,938,463,463,374,607,431,768,211,456 to be -precise.
Apart from this, IPv6 (or IPng, for IP Next Generation) is supposed -to provide for smaller routing tables on the Internet's backbone routers, -simpler configuration of equipment, better security at the IP level and -better support for QoS. - -An example: 2002:836b:9820:0000:0000:0000:836b:9886 - -Writing down IPv6 addresses can be quite a burden. Therefore, to make life -easier there are some rules: - - - -  * Don't use leading zeroes. Same as in IPv4. - -  * Use colons to separate every 16 bits or two bytes. - -  * When you have lots of consecutive zeroes, you can write this down as ::. - You can only do this once in an address and only for quantities of 16 - bits, though. - - - - -The address 2002:836b:9820:0000:0000:0000:836b:9886 can be written down as -2002:836b:9820::836b:9886, which is somewhat friendlier. - -Another example, the address 3ffe:0000:0000:0000:0000:0020:34A1:F32C can be -written down as 3ffe::20:34A1:F32C, which is a lot shorter. - -IPv6 is intended to be the successor of the current IPv4. Because it is -relatively new technology, there is no worldwide native IPv6 network yet. To -be able to move forward swiftly, the 6bone was introduced. - -Native IPv6 networks are connected to each other by encapsulating the IPv6 -protocol in IPv4 packets and sending them over the existing IPv4 -infrastructure from one IPv6 site to another. - -That is precisely where the tunnel steps in. - -To be able to use IPv6, we should have a kernel that supports it. There are -many good documents on how to achieve this. But it all comes down to a few -steps: - -  * Get yourself a recent Linux distribution, with suitable glibc. - -  * Then get yourself an up-to-date kernel source. 
- - -If you are all set, then you can go ahead and compile an IPv6 capable kernel: - -  * Go to /usr/src/linux and type: - -  * make menuconfig - -  * Choose "Networking Options" - -  * Select "The IPv6 protocol", "IPv6: enable EUI-64 token format", "IPv6: - disable provider based addresses" - - -HINT: Don't go for the 'module' option. Often this won't work well. - -In other words, compile IPv6 as 'built-in' in your kernel. You can then save -your config like usual and go ahead with compiling the kernel. - -HINT: Before doing so, consider editing the Makefile: EXTRAVERSION = -x ; --> -; EXTRAVERSION = -x-IPv6 - -There is a lot of good documentation about compiling and installing a kernel, -however this document is about something else. If you run into problems at -this stage, go and look for documentation about compiling a Linux kernel -according to your own specifications. - -The file /usr/src/linux/README might be a good start. After you accomplished -all this, and rebooted with your brand new kernel, you might want to issue an -'/sbin/ifconfig -a' and notice the brand new 'sit0-device'. SIT stands for -Simple Internet Transition. You may give yourself a compliment; you are now -one major step closer to IP, the Next Generation ;-) - -Now on to the next step. You want to connect your host, or maybe even your -entire LAN to another IPv6 capable network. This might be the "6bone" that is -setup especially for this particular purpose. - -Let's assume that you have the following IPv6 network: 3ffe:604:6:8::/64 and -you want to connect it to 6bone, or a friend. Please note that the /64 subnet -notation works just like with regular IP addresses. 
- -Your IPv4 address is 145.100.24.181 and the 6bone router has IPv4 address -145.100.1.5 -+-----------------------------------------------------------------------------------+ -|# ip tunnel add sixbone mode sit remote 145.100.1.5 [local 145.100.24.181 ttl 255] | -|# ip link set sixbone up | -|# ip addr add 3FFE:604:6:7::2/126 dev sixbone | -|# ip route add 3ffe::0/16 dev sixbone | -+-----------------------------------------------------------------------------------+ - -Let's discuss this. In the first line, we created a tunnel device called -sixbone. We gave it mode sit (which is IPv6 in IPv4 tunneling) and told it -where to go to (remote) and where to come from (local). TTL is set to -maximum, 255. - -Next, we made the device active (up). After that, we added our own network -address, and set a route for 3ffe::/15 (which is currently all of 6bone) -through the tunnel. If the particular machine you run this on is your IPv6 -gateway, then consider adding the following lines: -+---------------------------------------------------------------------------+ -|# echo 1 >/proc/sys/net/ipv6/conf/all/forwarding | -|# /usr/local/sbin/radvd | -+---------------------------------------------------------------------------+ - -The latter, radvd is -like zebra- a router advertisement daemon, to support -IPv6's autoconfiguration features. Search for it with your favourite -search-engine if you like. 
You can check things like this: -+---------------------------------------------------------------------------+ -|# /sbin/ip -f inet6 addr | -+---------------------------------------------------------------------------+ - -If you happen to have radvd running on your IPv6 gateway and boot your IPv6 -capable Linux on a machine on your local LAN, you would be able to enjoy the -benefits of IPv6 autoconfiguration: -+------------------------------------------------------------------------------+ -|# /sbin/ip -f inet6 addr | -|1: lo: mtu 3924 qdisc noqueue inet6 ::1/128 scope host | -| | -|3: eth0: mtu 1500 qdisc pfifo_fast qlen 100 | -|inet6 3ffe:604:6:8:5054:4cff:fe01:e3d6/64 scope global dynamic | -|valid_lft forever preferred_lft 604646sec inet6 fe80::5054:4cff:fe01:e3d6/10 | -|scope link | -+------------------------------------------------------------------------------+ - -You could go ahead and configure your bind for IPv6 addresses. The A type has -an equivalent for IPv6: AAAA. The in-addr.arpa's equivalent is: ip6.int. -There's a lot of information available on this topic. - -There is an increasing number of IPv6-aware applications available, including -secure shell, telnet, inetd, Mozilla the browser, Apache the webserver and a -lot of others. But this is all outside the scope of this Routing document ;-) - -On the Cisco side the configuration would be something like this: -+---------------------------------------------------------------------------+ -|! | -|interface Tunnel1 | -|description IPv6 tunnel | -|no ip address | -|no ip directed-broadcast | -|ipv6 enable | -|ipv6 address 3FFE:604:6:7::1/126 | -|tunnel source Serial0 | -|tunnel destination 145.100.24.181 | -|tunnel mode ipv6ip | -|! | -|ipv6 route 3FFE:604:6:8::/64 Tunnel1 | -+---------------------------------------------------------------------------+ -But if you don't have a Cisco at your disposal, try one of the many IPv6 -tunnel brokers available on the Internet. 
They are willing to configure their -Cisco with an extra tunnel for you. Mostly by means of a friendly web -interface. Search for "ipv6 tunnel broker" on your favourite search engine. ------------------------------------------------------------------------------ - -Chapter 7. IPsec: secure IP over the Internet - -FIXME: editor vacancy. In the meantime, see: [http://www.freeswan.org/] The -FreeS/WAN project. Another IPSec implementation for Linux is Cerberus, by -NIST. However, their web pages have not been updated in over a year, and -their version tended to trail well behind the current Linux kernel. USAGI, an -alternative IPv6 implementation for Linux, also includes an IPSec -implementation, but that might only be for IPv6. ------------------------------------------------------------------------------ - -Chapter 8. Multicast routing - -FIXME: Editor Vacancy! - -The Multicast-HOWTO is ancient (relatively-speaking) and may be inaccurate or -misleading in places, for that reason. - -Before you can do any multicast routing, you need to configure the Linux -kernel to support the type of multicast routing you want to do. This, in -turn, requires you to decide what type of multicast routing you expect to be -using. There are essentially four "common" types - DVMRP (the Multicast -version of the RIP unicast protocol), MOSPF (the same, but for OSPF), PIM-SM -("Protocol Independent Multicasting - Sparse Mode", which assumes that users -of any multicast group are spread out, rather than clumped) and PIM-DM (the -same, but "Dense Mode", which assumes that there will be significant clumps -of users of the same multicast group). - -In the Linux kernel, you will notice that these options don't appear. This is -because the protocol itself is handled by a routing application, such as -Zebra, mrouted, or pimd. However, you still have to have a good idea of which -you're going to use, to select the right options in the kernel. 
- -For all multicast routing, you will definitely need to enable "multicasting" -and "multicast routing". For DVMRP and MOSPF, this is sufficient. If you are -going to use PIM, you must also enable PIMv1 or PIMv2, depending on whether -the network you are connecting to uses version 1 or 2 of the PIM protocol. - -Once you have all that sorted out, and your new Linux kernel compiled, you -will see that the IP protocols listed, at boot time, now include IGMP. This -is a protocol for managing multicast groups. At the time of writing, Linux -supports IGMP versions 1 and 2 only, although version 3 does exist and has -been documented. This doesn't really affect us that much, as IGMPv3 is still -new enough that the extra capabilities of IGMPv3 aren't going to be that much -use. Because IGMP deals with groups, only the features present in the -simplest version of IGMP over the entire group are going to be used. For the -most part, that will be IGMPv2, although IGMPv1 is still going to be -encountered. - -So far, so good. We've enabled multicasting. Now, we have to tell the Linux -kernel to actually do something with it, so we can start routing. This means -adding the Multicast virtual network to the router table: - -ip route add 224.0.0.0/4 dev eth0 - -(Assuming, of course, that you're multicasting over eth0! Substitute the -device of your choice, for this.) - -Now, tell Linux to forward packets... - -echo 1 > /proc/sys/net/ipv4/ip_forward - -At this point, you may be wondering if this is ever going to do anything. So, -to test our connection, we ping the default group, 224.0.0.1, to see if -anyone is alive. All machines on your LAN with multicasting enabled should -respond, but nothing else. You'll notice that none of the machines that -respond have an IP address of 224.0.0.1. What a surprise! :) This is a group -address (a "broadcast" to subscribers), and all members of the group will -respond with their own address, not the group address. 
- -ping -c 2 224.0.0.1 - -At this point, you're ready to do actual multicast routing. Well, assuming -that you have two networks to route between. - -(To Be Continued!) ------------------------------------------------------------------------------ - -Chapter 9. Queueing Disciplines for Bandwidth Management - -Now, when I discovered this, it really blew me away. Linux 2.2/2.4 comes with -everything to manage bandwidth in ways comparable to high-end dedicated -bandwidth management systems. - -Linux even goes far beyond what Frame and ATM provide. - -Just to prevent confusion, tc uses the following rules for bandwidth -specification: -mbps = 1024 kbps = 1024 * 1024 bps => byte/s -mbit = 1024 kbit => kilo bit/s. -mb = 1024 kb = 1024 * 1024 b => byte -mbit = 1024 kbit => kilo bit. -Internally, the number is stored in bps and b. - -But when tc prints the rate, it uses the following: -1Mbit = 1024 Kbit = 1024 * 1024 bps => bit/s ------------------------------------------------------------------------------ - -9.1. Queues and Queueing Disciplines explained - -With queueing we determine the way in which data is SENT. It is important to -realise that we can only shape data that we transmit. - -With the way the Internet works, we have no direct control of what people -send us. It's a bit like your (physical!) mailbox at home. There is no way -you can influence the world to modify the amount of mail they send you, short -of contacting everybody. - -However, the Internet is mostly based on TCP/IP which has a few features that -help us. TCP/IP has no way of knowing the capacity of the network between two -hosts, so it just starts sending data faster and faster ('slow start') and -when packets start getting lost, because there is no room to send them, it -will slow down. In fact it is a bit smarter than this, but more about that -later. - -This is the equivalent of not reading half of your mail, and hoping that -people will stop sending it to you. 
With the difference that it works for the -Internet :-) - -If you have a router and wish to prevent certain hosts within your network -from downloading too fast, you need to do your shaping on the *inner* -interface of your router, the one that sends data to your own computers. - -You also have to be sure you are controlling the bottleneck of the link. If -you have a 100Mbit NIC and you have a router that has a 256kbit link, you -have to make sure you are not sending more data than your router can handle. -Otherwise, it will be the router who is controlling the link and shaping the -available bandwidth. We need to 'own the queue' so to speak, and be the -slowest link in the chain. Luckily this is easily possible. ------------------------------------------------------------------------------ - -9.2. Simple, classless Queueing Disciplines - -As said, with queueing disciplines, we change the way data is sent. Classless -queueing disciplines are those that, by and large, accept data and only -reschedule, delay or drop it. - -These can be used to shape traffic for an entire interface, without any -subdivisions. It is vital that you understand this part of queueing before we -go on to the classful qdisc-containing-qdiscs! - -By far the most widely used discipline is the pfifo_fast qdisc - this is the -default. This also explains why these advanced features are so robust. They -are nothing more than 'just another queue'. - -Each of these queues has specific strengths and weaknesses. Not all of them -may be as well tested. ------------------------------------------------------------------------------ - -9.2.1. pfifo_fast - -This queue is, as the name says, First In, First Out, which means that no -packet receives special treatment. At least, not quite. This queue has 3 so -called 'bands'. Within each band, FIFO rules apply. However, as long as there -are packets waiting in band 0, band 1 won't be processed. Same goes for band -1 and band 2. 
- -The kernel honors the so called Type of Service flag of packets, and takes -care to insert 'minimum delay' packets in band 0. - -Do not confuse this classless simple qdisc with the classful PRIO one! -Although they behave similarly, pfifo_fast is classless and you cannot add -other qdiscs to it with the tc command. ------------------------------------------------------------------------------ - -9.2.1.1. Parameters & usage - -You can't configure the pfifo_fast qdisc as it is the hardwired default. This -is how it is configured by default: - -priomap - Determines how packet priorities, as assigned by the kernel, map to - bands. Mapping occurs based on the TOS octet of the packet, which looks - like this: - - - +---------------------------------------------------------------+ - | 0 1 2 3 4 5 6 7 | - |+-----+-----+-----+-----+-----+-----+-----+-----+ | - || | | | | - || PRECEDENCE | TOS | MBZ | | - || | | | | - |+-----+-----+-----+-----+-----+-----+-----+-----+ | - +---------------------------------------------------------------+ - - - The four TOS bits (the 'TOS field') are defined as: - +---------------------------------------------------------------+ - |Binary Decimcal Meaning | - |----------------------------------------- | - |1000 8 Minimize delay (md) | - |0100 4 Maximize throughput (mt) | - |0010 2 Maximize reliability (mr) | - |0001 1 Minimize monetary cost (mmc) | - |0000 0 Normal Service | - +---------------------------------------------------------------+ - - - As there is 1 bit to the right of these four bits, the actual value of - the TOS field is double the value of the TOS bits. Tcpdump -v -v shows - you the value of the entire TOS field, not just the four bits. 
It is the - value you see in the first column of this table: - - - +---------------------------------------------------------------+ - |TOS Bits Means Linux Priority Band | - |------------------------------------------------------------ | - |0x0 0 Normal Service 0 Best Effort 1 | - |0x2 1 Minimize Monetary Cost 1 Filler 2 | - |0x4 2 Maximize Reliability 0 Best Effort 1 | - |0x6 3 mmc+mr 0 Best Effort 1 | - |0x8 4 Maximize Throughput 2 Bulk 2 | - |0xa 5 mmc+mt 2 Bulk 2 | - |0xc 6 mr+mt 2 Bulk 2 | - |0xe 7 mmc+mr+mt 2 Bulk 2 | - |0x10 8 Minimize Delay 6 Interactive 0 | - |0x12 9 mmc+md 6 Interactive 0 | - |0x14 10 mr+md 6 Interactive 0 | - |0x16 11 mmc+mr+md 6 Interactive 0 | - |0x18 12 mt+md 4 Int. Bulk 1 | - |0x1a 13 mmc+mt+md 4 Int. Bulk 1 | - |0x1c 14 mr+mt+md 4 Int. Bulk 1 | - |0x1e 15 mmc+mr+mt+md 4 Int. Bulk 1 | - +---------------------------------------------------------------+ - - - Lots of numbers. The second column contains the value of the relevant - four TOS bits, followed by their translated meaning. For example, 15 - stands for a packet wanting Minimal Monetary Cost, Maximum Reliability, - Maximum Throughput AND Minimum Delay. I would call this a 'Dutch Packet'. - - The fourth column lists the way the Linux kernel interprets the TOS bits, - by showing to which Priority they are mapped. - - The last column shows the result of the default priomap. On the command - line, the default priomap looks like this: - +---------------------------------------------------------------+ - |1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 | - +---------------------------------------------------------------+ - - - This means that priority 4, for example, gets mapped to band number 1. - The priomap also allows you to list higher priorities (> 7) which do not - correspond to TOS mappings, but which are set by other means. 
- - This table from RFC 1349 (read it for more details) tells you how - applications might very well set their TOS bits: - +-----------------------------------------------------------------+ - |TELNET 1000 (minimize delay) | - |FTP | - | Control 1000 (minimize delay) | - | Data 0100 (maximize throughput) | - | | - |TFTP 1000 (minimize delay) | - | | - |SMTP | - | Command phase 1000 (minimize delay) | - | DATA phase 0100 (maximize throughput) | - | | - |Domain Name Service | - | UDP Query 1000 (minimize delay) | - | TCP Query 0000 | - | Zone Transfer 0100 (maximize throughput) | - | | - |NNTP 0001 (minimize monetary cost) | - | | - |ICMP | - | Errors 0000 | - | Requests 0000 (mostly) | - | Responses (mostly) | - +-----------------------------------------------------------------+ - - -txqueuelen - The length of this queue is gleaned from the interface configuration, - which you can see and set with ifconfig and ip. To set the queue length - to 10, execute: ifconfig eth0 txqueuelen 10 - - You can't set this parameter with tc! - - ------------------------------------------------------------------------------ -9.2.2. Token Bucket Filter - -The Token Bucket Filter (TBF) is a simple qdisc that only passes packets -arriving at a rate which is not exceeding some administratively set rate, but -with the possibility to allow short bursts in excess of this rate. - -TBF is very precise, network- and processor friendly. It should be your first -choice if you simply want to slow an interface down! - -The TBF implementation consists of a buffer (bucket), constantly filled by -some virtual pieces of information called tokens, at a specific rate (token -rate). The most important parameter of the bucket is its size, that is the -number of tokens it can store. - -Each arriving token collects one incoming data packet from the data queue and -is then deleted from the bucket. 
Associating this algorithm with the two -flows -- token and data, gives us three possible scenarios: - - - -  * The data arrives in TBF at a rate that's equal to the rate of incoming - tokens. In this case each incoming packet has its matching token and - passes the queue without delay. - -  * The data arrives in TBF at a rate that's smaller than the token rate. - Only a part of the tokens are deleted at output of each data packet - that's sent out the queue, so the tokens accumulate, up to the bucket - size. The unused tokens can then be used to send data a a speed that's - exceeding the standard token rate, in case short data bursts occur. - -  * The data arrives in TBF at a rate bigger than the token rate. This means - that the bucket will soon be devoid of tokens, which causes the TBF to - throttle itself for a while. This is called an 'overlimit situation'. If - packets keep coming in, packets will start to get dropped. - - - - -The last scenario is very important, because it allows to administratively -shape the bandwidth available to data that's passing the filter. - -The accumulation of tokens allows a short burst of overlimit data to be still -passed without loss, but any lasting overload will cause packets to be -constantly delayed, and then dropped. - -Please note that in the actual implementation, tokens correspond to bytes, -not packets. ------------------------------------------------------------------------------ - -9.2.2.1. Parameters & usage - -Even though you will probably not need to change them, tbf has some knobs -available. First the parameters that are always available: - -limit or latency - Limit is the number of bytes that can be queued waiting for tokens to - become available. You can also specify this the other way around by - setting the latency parameter, which specifies the maximum amount of time - a packet can sit in the TBF. 
The latter calculation takes into account - the size of the bucket, the rate and possibly the peakrate (if set). - -burst/buffer/maxburst - Size of the bucket, in bytes. This is the maximum amount of bytes that - tokens can be available for instantaneously. In general, larger shaping - rates require a larger buffer. For 10mbit/s on Intel, you need at least - 10kbyte buffer if you want to reach your configured rate! - - If your buffer is too small, packets may be dropped because more tokens - arrive per timer tick than fit in your bucket. - -mpu - A zero-sized packet does not use zero bandwidth. For ethernet, no packet - uses less than 64 bytes. The Minimum Packet Unit determines the minimal - token usage for a packet. - -rate - The speedknob. See remarks above about limits! - - -If the bucket contains tokens and is allowed to empty, by default it does so -at infinite speed. If this is unacceptable, use the following parameters: - -peakrate - If tokens are available, and packets arrive, they are sent out - immediately by default, at 'lightspeed' so to speak. That may not be what - you want, especially if you have a large bucket. - - The peakrate can be used to specify how quickly the bucket is allowed to - be depleted. If doing everything by the book, this is achieved by - releasing a packet, then waiting just long enough, and releasing the next. - We calculated our waits so we send just at peakrate. - - However, due to the default 10ms timer resolution of Unix, with 10.000 - bits average packets, we are limited to 1mbit/s of peakrate! - -mtu/minburst - The 1mbit/s peakrate is not very useful if your regular rate is more than - that. A higher peakrate is possible by sending out more packets per - timertick, which effectively means that we create a second bucket! - - This second bucket defaults to a single packet, which is not a bucket at - all. 
- - To calculate the maximum possible peakrate, multiply the configured mtu - by 100 (or more correctly, HZ, which is 100 on Intel, 1024 on Alpha). - - ------------------------------------------------------------------------------ -9.2.2.2. Sample configuration - -A simple but *very* useful configuration is this: -+---------------------------------------------------------------------------+ -|# tc qdisc add dev ppp0 root tbf rate 220kbit latency 50ms burst 1540 | -+---------------------------------------------------------------------------+ - - -Ok, why is this useful? If you have a networking device with a large queue, -like a DSL modem or a cable modem, and you talk to it over a fast device, -like over an ethernet interface, you will find that uploading absolutely -destroys interactivity. - -This is because uploading will fill the queue in the modem, which is probably -*huge* because this helps actually achieving good data throughput uploading. -But this is not what you want, you want to have the queue not too big so -interactivity remains and you can still do other stuff while sending data. - -The line above slows down sending to a rate that does not lead to a queue in -the modem - the queue will be in Linux, where we can control it to a limited -size. - -Change 220kbit to your uplink's *actual* speed, minus a few percent. If you -have a really fast modem, raise 'burst' a bit. ------------------------------------------------------------------------------ - -9.2.3. Stochastic Fairness Queueing - -Stochastic Fairness Queueing (SFQ) is a simple implementation of the fair -queueing algorithms family. It's less accurate than others, but it also -requires less calculations while being almost perfectly fair. - -The key word in SFQ is conversation (or flow), which mostly corresponds to a -TCP session or a UDP stream. Traffic is divided into a pretty large number of -FIFO queues, one for each conversation. 
Traffic is then sent in a round robin -fashion, giving each session the chance to send data in turn. - -This leads to very fair behaviour and disallows any single conversation from -drowning out the rest. SFQ is called 'Stochastic' because it doesn't really -allocate a queue for each session, it has an algorithm which divides traffic -over a limited number of queues using a hashing algorithm. - -Because of the hash, multiple sessions might end up in the same bucket, which -would halve each session's chance of sending a packet, thus halving the -effective speed available. To prevent this situation from becoming -noticeable, SFQ changes its hashing algorithm quite often so that any two -colliding sessions will only do so for a small number of seconds. - -It is important to note that SFQ is only useful in case your actual outgoing -interface is really full! If it isn't then there will be no queue on your -linux machine and hence no effect. Later on we will describe how to combine -SFQ with other qdiscs to get a best-of-both worlds situation. - -Specifically, setting SFQ on the ethernet interface heading to your cable -modem or DSL router is pointless without further shaping! ------------------------------------------------------------------------------ - -9.2.3.1. Parameters & usage - -The SFQ is pretty much self tuning: - -perturb - Reconfigure hashing once this many seconds. If unset, hash will never be - reconfigured. Not recommended. 10 seconds is probably a good value. - -quantum - Amount of bytes a stream is allowed to dequeue before the next queue gets - a turn. Defaults to 1 maximum sized packet (MTU-sized). Do not set below - the MTU! - - ------------------------------------------------------------------------------ -9.2.3.2. 
Sample configuration - -If you have a device which has identical link speed and actual available -rate, like a phone modem, this configuration will help promote fairness: -+--------------------------------------------------------------------------------+ -|# tc qdisc add dev ppp0 root sfq perturb 10 | -|# tc -s -d qdisc ls | -|qdisc sfq 800c: dev ppp0 quantum 1514b limit 128p flows 128/1024 perturb 10sec | -| Sent 4812 bytes 62 pkts (dropped 0, overlimits 0) | -+--------------------------------------------------------------------------------+ - - -The number 800c: is the automatically assigned handle number, limit means -that 128 packets can wait in this queue. There are 1024 hashbuckets available -for accounting, of which 128 can be active at a time (no more packets fit in -the queue!) Once every 10 seconds, the hashes are reconfigured. ------------------------------------------------------------------------------ - -9.3. Advice for when to use which queue - -Summarizing, these are the simple queues that actually manage traffic by -reordering, slowing or dropping packets. - -The following tips may help in choosing which queue to use. It mentions some -qdiscs described in the Chapter 14 chapter. - -  * To purely slow down outgoing traffic, use the Token Bucket Filter. Works - up to huge bandwidths, if you scale the bucket. - -  * If your link is truly full and you want to make sure that no single - session can dominate your outgoing bandwidth, use Stochastical Fairness - Queueing. - -  * If you have a big backbone and know what you are doing, consider Random - Early Drop (see Advanced chapter). - -  * To 'shape' incoming traffic which you are not forwarding, use the Ingress - Policer. Incoming shaping is called 'policing', by the way, not - 'shaping'. - -  * If you *are* forwarding it, use a TBF on the interface you are forwarding - the data to. 
Unless you want to shape traffic that may go out over - several interfaces, in which case the only common factor is the incoming - interface. In that case use the Ingress Policer. - -  * If you don't want to shape, but only want to see if your interface is so - loaded that it has to queue, use the pfifo queue (not pfifo_fast). It - lacks internal bands but does account the size of its backlog. - -  * Finally - you can also do "social shaping". You may not always be able to - use technology to achieve what you want. Users experience technical - constraints as hostile. A kind word may also help with getting your - bandwidth to be divided right! - - ------------------------------------------------------------------------------ -9.4. Terminology - -To properly understand more complicated configurations it is necessary to -explain a few concepts first. Because of the complexity and he relative youth -of the subject, a lot of different words are used when people in fact mean -the same thing. - -The following is loosely based on draft-ietf-diffserv-model-06.txt, An -Informal Management Model for Diffserv Routers. It can currently be found at -[http://www.ietf.org/internet-drafts/draft-ietf-diffserv-model-06.txt] http:/ -/www.ietf.org/internet-drafts/draft-ietf-diffserv-model-06.txt. - -Read it for the strict definitions of the terms used. - -Queueing Discipline - An algorithm that manages the queue of a device, either incoming - (ingress) or outgoing (egress). - -Classless qdisc - A qdisc with no configurable internal subdivisions. - -Classful qdisc - A classful qdisc contains multiple classes. Each of these classes - contains a further qdisc, which may again be classful, but need not be. - According to the strict definition, pfifo_fast *is* classful, because it - contains three bands which are, in fact, classes. However, from the - user's configuration perspective, it is classless as the classes can't be - touched with the tc tool. 
- -Classes - A classful qdisc may have many classes, each of which is internal to the - qdisc. Each of these classes may contain a real qdisc. - -Classifier - Each classful qdisc needs to determine to which class it needs to send a - packet. This is done using the classifier. - -Filter - Classification can be performed using filters. A filter contains a number - of conditions which, if matched, make the filter match. - -Scheduling - A qdisc may, with the help of a classifier, decide that some packets need - to go out earlier than others. This process is called Scheduling, and is - performed for example by the pfifo_fast qdisc mentioned earlier. - Scheduling is also called 'reordering', but this is confusing. - -Shaping - The process of delaying packets before they go out to make traffic - conform to a configured maximum rate. Shaping is performed on egress. - Colloquially, dropping packets to slow traffic down is also often called - Shaping. - -Policing - Delaying or dropping packets in order to make traffic stay below a - configured bandwidth. In Linux, policing can only drop a packet and not - delay it - there is no 'ingress queue'. - -Work-Conserving - A work-conserving qdisc always delivers a packet if one is available. In - other words, it never delays a packet if the network adaptor is ready to - send one (in the case of an egress qdisc). - -non-Work-Conserving - Some queues, like for example the Token Bucket Filter, may need to hold - on to a packet for a certain time in order to limit the bandwidth. This - means that they sometimes refuse to give up a packet, even though they - have one available. - - -Now that we have our terminology straight, let's see where all these things -are. 
- - -+---------------------------------------------------------------------------+ -| Userspace programs | -| ^ | -| | | -| +---------------+-----------------------------------------+ | -| | Y | | -| | -------> IP Stack | | -| | | | | | -| | | Y | | -| | | Y | | -| | ^ | | | -| | | / ----------> Forwarding -> | | -| | ^ / | | | -| | |/ Y | | -| | | | | | -| | ^ Y /-qdisc1-\ | | -| | | Egress /--qdisc2--\ | | -| --->->Ingress Classifier ---qdisc3---- | -> | -| | Qdisc \__qdisc4__/ | | -| | \-qdiscN_/ | | -| | | | -| +----------------------------------------------------------+ | -+---------------------------------------------------------------------------+ -Thanks to Jamal Hadi Salim for this ASCII representation. - -The big block represents the kernel. The leftmost arrow represents traffic -entering your machine from the network. It is then fed to the Ingress Qdisc -which may apply Filters to a packet, and decide to drop it. This is called -'Policing'. - -This happens at a very early stage, before it has seen a lot of the kernel. -It is therefore a very good place to drop traffic very early, without -consuming a lot of CPU power. - -If the packet is allowed to continue, it may be destined for a local -application, in which case it enters the IP stack in order to be processed, -and handed over to a userspace program. The packet may also be forwarded -without entering an application, in which case it is destined for egress. -Userspace programs may also deliver data, which is then examined and -forwarded to the Egress Classifier. - -There it is investigated and enqueued to any of a number of qdiscs. In the -unconfigured default case, there is only one egress qdisc installed, the -pfifo_fast, which always receives the packet. This is called 'enqueueing'. - -The packet now sits in the qdisc, waiting for the kernel to ask for it for -transmission over the network interface. This is called 'dequeueing'. 
- -This picture also holds in case there is only one network adaptor - the -arrows entering and leaving the kernel should not be taken too literally. -Each network adaptor has both ingress and egress hooks. ------------------------------------------------------------------------------ - -9.5. Classful Queueing Disciplines - -Classful qdiscs are very useful if you have different kinds of traffic which -should have differing treatment. One of the classful qdiscs is called 'CBQ' , -'Class Based Queueing' and it is so widely mentioned that people identify -queueing with classes solely with CBQ, but this is not the case. - -CBQ is merely the oldest kid on the block - and also the most complex one. It -may not always do what you want. This may come as something of a shock to -many who fell for the 'sendmail effect', which teaches us that any complex -technology which doesn't come with documentation must be the best available. - -More about CBQ and its alternatives shortly. ------------------------------------------------------------------------------ - -9.5.1. Flow within classful qdiscs & classes - -When traffic enters a classful qdisc, it needs to be sent to any of the -classes within - it needs to be 'classified'. To determine what to do with a -packet, the so called 'filters' are consulted. It is important to know that -the filters are called from within a qdisc, and not the other way around! - -The filters attached to that qdisc then return with a decision, and the qdisc -uses this to enqueue the packet into one of the classes. Each subclass may -try other filters to see if further instructions apply. If not, the class -enqueues the packet to the qdisc it contains. - -Besides containing other qdiscs, most classful qdiscs also perform shaping. -This is useful to perform both packet scheduling (with SFQ, for example) and -rate control. You need this in cases where you have a high speed interface -(for example, ethernet) to a slower device (a cable modem). 
- -If you were only to run SFQ, nothing would happen, as packets enter & leave -your router without delay: the output interface is far faster than your -actual link speed. There is no queue to schedule then. ------------------------------------------------------------------------------ - -9.5.2. The qdisc family: roots, handles, siblings and parents - -Each interface has one egress 'root qdisc', by default the earlier mentioned -classless pfifo_fast queueing discipline. Each qdisc can be assigned a -handle, which can be used by later configuration statements to refer to that -qdisc. Besides an egress qdisc, an interface may also have an ingress, which -polices traffic coming in. - -The handles of these qdiscs consist of two parts, a major number and a minor -number. It is habitual to name the root qdisc '1:', which is equal to '1:0'. -The minor number of a qdisc is always 0. - -Classes need to have the same major number as their parent. ------------------------------------------------------------------------------ - -9.5.2.1. How filters are used to classify traffic - -Recapping, a typical hierarchy might look like this: -+---------------------------------------------------------------------------+ -| root 1: | -| | | -| _1:1_ | -| / | \ | -| / | \ | -| / | \ | -| 10: 11: 12: | -| / \ / \ | -| 10:1 10:2 12:1 12:2 | -+---------------------------------------------------------------------------+ - - -But don't let this tree fool you! You should *not* imagine the kernel to be -at the apex of the tree and the network below, that is just not the case. -Packets get enqueued and dequeued at the root qdisc, which is the only thing -the kernel talks to. - -A packet might get classified in a chain like this: - -1: -> 1:1 -> 12: -> 12:2 - -The packet now resides in a queue in a qdisc attached to class 12:2. In this -example, a filter was attached to each 'node' in the tree, each choosing a -branch to take next. This can make sense. 
However, this is also possible: - -1: -> 12:2 - -In this case, a filter attached to the root decided to send the packet -directly to 12:2. ------------------------------------------------------------------------------ - -9.5.2.2. How packets are dequeued to the hardware - -When the kernel decides that it needs to extract packets to send to the -interface, the root qdisc 1: gets a dequeue request, which is passed to 1:1, -which is in turn passed to 10:, 11: and 12:, which each query their siblings, -and try to dequeue() from them. In this case, the kernel needs to walk the -entire tree, because only 12:2 contains a packet. - -In short, nested classes ONLY talk to their parent qdiscs, never to an -interface. Only the root qdisc gets dequeued by the kernel! - -The upshot of this is that classes never get dequeued faster than their -parents allow. And this is exactly what we want: this way we can have SFQ in -an inner class, which doesn't do any shaping, only scheduling, and have a -shaping outer qdisc, which does the shaping. ------------------------------------------------------------------------------ - -9.5.3. The PRIO qdisc - -The PRIO qdisc doesn't actually shape, it only subdivides traffic based on -how you configured your filters. You can consider the PRIO qdisc a kind of -pfifo_fast on steroids, whereby each band is a separate class instead of a -simple FIFO. - -When a packet is enqueued to the PRIO qdisc, a class is chosen based on the -filter commands you gave. By default, three classes are created. These -classes by default contain pure FIFO qdiscs with no internal structure, but -you can replace these by any qdisc you have available. - -Whenever a packet needs to be dequeued, class :1 is tried first. Higher -classes are only used if lower bands all did not give up a packet. - -This qdisc is very useful in case you want to prioritize certain kinds of -traffic without using only TOS-flags but using all the power of the tc -filters. 
It can also contain all kinds of qdiscs, whereas pfifo_fast is limited -to simple fifo qdiscs. - -Because it doesn't actually shape, the same warning as for SFQ holds: either -use it only if your physical link is really full or wrap it inside a classful -qdisc that does shape. The latter holds for almost all cable modems and DSL -devices. - -In formal words, the PRIO qdisc is a Work-Conserving scheduler. ------------------------------------------------------------------------------ - -9.5.3.1. PRIO parameters & usage - -The following parameters are recognized by tc: - -bands - Number of bands to create. Each band is in fact a class. If you change - this number, you must also change: - -priomap - If you do not provide tc filters to classify traffic, the PRIO qdisc - looks at the TC_PRIO priority to decide how to enqueue traffic. - - This works just like with the pfifo_fast qdisc mentioned earlier, see - there for lots of detail. - - -The bands are classes, and are called major:1 to major:3 by default, so if -your PRIO qdisc is called 12:, tc filter traffic to 12:1 to grant it more -priority. - -Reiterating, band 0 goes to minor number 1! Band 1 to minor number 2, etc. ------------------------------------------------------------------------------ - -9.5.3.2. Sample configuration - -We will create this tree: -+---------------------------------------------------------------------------+ -| root 1: prio | -| / | \ | -| 1:1 1:2 1:3 | -| | | | | -| 10: 20: 30: | -| sfq tbf sfq | -|band 0 1 2 | -+---------------------------------------------------------------------------+ - - -Bulk traffic will go to 30:, interactive traffic to 20: or 10:.
- -Command lines: -+-------------------------------------------------------------------------------------+ -|# tc qdisc add dev eth0 root handle 1: prio | -|## This *instantly* creates classes 1:1, 1:2, 1:3 | -| | -|# tc qdisc add dev eth0 parent 1:1 handle 10: sfq | -|# tc qdisc add dev eth0 parent 1:2 handle 20: tbf rate 20kbit buffer 1600 limit 3000 | -|# tc qdisc add dev eth0 parent 1:3 handle 30: sfq | -+-------------------------------------------------------------------------------------+ - - -Now let's see what we created: -+---------------------------------------------------------------------------+ -|# tc -s qdisc ls dev eth0 | -|qdisc sfq 30: quantum 1514b | -| Sent 0 bytes 0 pkts (dropped 0, overlimits 0) | -| | -| qdisc tbf 20: rate 20Kbit burst 1599b lat 667.6ms | -| Sent 0 bytes 0 pkts (dropped 0, overlimits 0) | -| | -| qdisc sfq 10: quantum 1514b | -| Sent 132 bytes 2 pkts (dropped 0, overlimits 0) | -| | -| qdisc prio 1: bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 | -| Sent 174 bytes 3 pkts (dropped 0, overlimits 0) | -+---------------------------------------------------------------------------+ -As you can see, band 0 has already had some traffic, and one packet was sent -while running this command! 
- -We now do some bulk data transfer with a tool that properly sets TOS flags, -and take another look: -+--------------------------------------------------------------------------------+ -|# scp tc ahu@10.0.0.11:./ | -|ahu@10.0.0.11's password: | -|tc 100% |*****************************| 353 KB 00:00 | -|# tc -s qdisc ls dev eth0 | -|qdisc sfq 30: quantum 1514b | -| Sent 384228 bytes 274 pkts (dropped 0, overlimits 0) | -| | -| qdisc tbf 20: rate 20Kbit burst 1599b lat 667.6ms | -| Sent 2640 bytes 20 pkts (dropped 0, overlimits 0) | -| | -| qdisc sfq 10: quantum 1514b | -| Sent 2230 bytes 31 pkts (dropped 0, overlimits 0) | -| | -| qdisc prio 1: bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 | -| Sent 389140 bytes 326 pkts (dropped 0, overlimits 0) | -+--------------------------------------------------------------------------------+ -As you can see, all traffic went to handle 30:, which is the lowest priority -band, just as intended. Now to verify that interactive traffic goes to higher -bands, we create some interactive traffic: - - -+---------------------------------------------------------------------------+ -|# tc -s qdisc ls dev eth0 | -|qdisc sfq 30: quantum 1514b | -| Sent 384228 bytes 274 pkts (dropped 0, overlimits 0) | -| | -| qdisc tbf 20: rate 20Kbit burst 1599b lat 667.6ms | -| Sent 2640 bytes 20 pkts (dropped 0, overlimits 0) | -| | -| qdisc sfq 10: quantum 1514b | -| Sent 14926 bytes 193 pkts (dropped 0, overlimits 0) | -| | -| qdisc prio 1: bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 | -| Sent 401836 bytes 488 pkts (dropped 0, overlimits 0) | -+---------------------------------------------------------------------------+ - - -It worked - all additional traffic has gone to 10:, which is our highest -priority qdisc. No traffic was sent to the lowest priority, which previously -received our entire scp. ------------------------------------------------------------------------------ - -9.5.4. 
The famous CBQ qdisc - -As said before, CBQ is the most complex qdisc available, the most hyped, the -least understood, and probably the trickiest one to get right. This is not -because the authors are evil or incompetent, far from it, it's just that the -CBQ algorithm isn't all that precise and doesn't really match the way Linux -works. - -Besides being classful, CBQ is also a shaper and it is in that aspect that it -really doesn't work very well. It should work like this. If you try to shape -a 10mbit/s connection to 1mbit/s, the link should be idle 90% of the time. If -it isn't, we need to throttle so that it IS idle 90% of the time. - -This is pretty hard to measure, so CBQ instead derives the idle time from the -number of microseconds that elapse between requests from the hardware layer -for more data. Combined, this can be used to approximate how full or empty -the link is. - -This is a rather roundabout method and doesn't always arrive at proper results. For -example, what is the actual link speed of an interface that is not really -able to transmit the full 100mbit/s of data, perhaps because of a badly -implemented driver? A PCMCIA network card will also never achieve 100mbit/s -because of the way the bus is designed - again, how do we calculate the idle -time? - -It gets even worse if we consider not-quite-real network devices like PPP -over Ethernet or PPTP over TCP/IP. The effective bandwidth in that case is -probably determined by the efficiency of pipes to userspace - which is huge. - -People who have done measurements discover that CBQ is not always very -accurate and sometimes completely misses the mark. - -In many circumstances however it works well. With the documentation provided -here, you should be able to configure it to work well in most cases. ------------------------------------------------------------------------------ - -9.5.4.1.
CBQ shaping in detail - -As said before, CBQ works by making sure that the link is idle just long -enough to bring down the real bandwidth to the configured rate. To do so, it -calculates the time that should pass between average packets. - -During operations, the effective idletime is measured using an exponential -weighted moving average (EWMA), which considers recent packets to be -exponentially more important than past ones. The UNIX loadaverage is -calculated in the same way. - -The calculated idle time is subtracted from the EWMA measured one, the -resulting number is called 'avgidle'. A perfectly loaded link has an avgidle -of zero: packets arrive exactly once every calculated interval. - -An overloaded link has a negative avgidle and if it gets too negative, CBQ -shuts down for a while and is then 'overlimit'. - -Conversely, an idle link might amass a huge avgidle, which would then allow -infinite bandwidths after a few hours of silence. To prevent this, avgidle is -capped at maxidle. - -If overlimit, in theory, the CBQ could throttle itself for exactly the amount -of time that was calculated to pass between packets, and then pass one -packet, and throttle again. But see the 'minburst' parameter below. - -These are parameters you can specify in order to configure shaping: - -avpkt - Average size of a packet, measured in bytes. Needed for calculating - maxidle, which is derived from maxburst, which is specified in packets. - -bandwidth - The physical bandwidth of your device, needed for idle time calculations. - -cell - The time a packet takes to be transmitted over a device may grow in - steps, based on the packet size. An 800 and an 806 size packet may take - just as long to send, for example - this sets the granularity. Most often - set to '8'. Must be an integral power of two. - -maxburst - This number of packets is used to calculate maxidle so that when avgidle - is at maxidle, this number of average packets can be burst before avgidle - drops to 0. 
Set it higher to be more tolerant of bursts. You can't set - maxidle directly, only via this parameter. - -minburst - As mentioned before, CBQ needs to throttle in case of overlimit. The - ideal solution is to do so for exactly the calculated idle time, and pass - 1 packet. However, Unix kernels generally have a hard time scheduling - events shorter than 10ms, so it is better to throttle for a longer - period, and then pass minburst packets in one go, and then sleep minburst - times longer. - - The time to wait is called the offtime. Higher values of minburst lead to - more accurate shaping in the long term, but to bigger bursts at - millisecond timescales. - -minidle - If avgidle is below 0, we are overlimits and need to wait until avgidle - will be big enough to send one packet. To prevent a sudden burst from - shutting down the link for a prolonged period of time, avgidle is reset - to minidle if it gets too low. - - Minidle is specified in negative microseconds, so 10 means that avgidle - is capped at -10us. - -mpu - Minimum packet size - needed because even a zero size packet is padded to - 64 bytes on ethernet, and so takes a certain time to transmit. CBQ needs - to know this to accurately calculate the idle time. - -rate - Desired rate of traffic leaving this qdisc - this is the 'speed knob'! - - -Internally, CBQ has a lot of fine tuning. For example, classes which are -known not to have data enqueued to them aren't queried. Overlimit classes are -penalized by lowering their effective priority. All very smart & complicated. ------------------------------------------------------------------------------ - -9.5.4.2. CBQ classful behaviour - -Besides shaping, using the aforementioned idletime approximations, CBQ also -acts like the PRIO queue in the sense that classes can have differing -priorities and that lower priority numbers will be polled before the higher -priority ones. 
- -Each time a packet is requested by the hardware layer to be sent out to the -network, a weighted round robin process ('WRR') starts, beginning with the -lower-numbered priority classes. - -These are then grouped and queried if they have data available. If so, it is -returned. After a class has been allowed to dequeue a number of bytes, the -next class within that priority is tried. - -The following parameters control the WRR process: - -allot - When the outer CBQ is asked for a packet to send out on the interface, it - will try all inner qdiscs (in the classes) in turn, in order of the - 'priority' parameter. Each time a class gets its turn, it can only send - out a limited amount of data. 'Allot' is the base unit of this amount. - See the 'weight' parameter for more information. - -prio - The CBQ can also act like the PRIO device. Inner classes with a lower - priority number are tried first and as long as they have traffic, other classes - are not polled for traffic. - -weight - Weight helps in the Weighted Round Robin process. Each class gets a - chance to send in turn. If you have classes with significantly more - bandwidth than other classes, it makes sense to allow them to send more - data in one round than the others. - - A CBQ adds up all weights under a class, and normalizes them, so you can - use arbitrary numbers: only the ratios are important. People have been - using 'rate/10' as a rule of thumb and it appears to work well. The - renormalized weight is multiplied by the 'allot' parameter to determine - how much data can be sent in one round. - - -Please note that all classes within a CBQ hierarchy need to share the same -major number! ------------------------------------------------------------------------------ - -9.5.4.3. CBQ parameters that determine link sharing & borrowing - -Besides purely limiting certain kinds of traffic, it is also possible to -specify which classes can borrow capacity from other classes or, conversely, -lend out bandwidth.
- -Isolated/sharing - A class that is configured with 'isolated' will not lend out bandwidth to - sibling classes. Use this if you have competing or mutually-unfriendly - agencies on your link who do not want to give each other freebies. - - The control program tc also knows about 'sharing', which is the reverse - of 'isolated'. - -bounded/borrow - A class can also be 'bounded', which means that it will not try to borrow - bandwidth from sibling classes. tc also knows about 'borrow', which is - the reverse of 'bounded'. - - -A typical situation might be where you have two agencies on your link which -are both 'isolated' and 'bounded', which means that they are really limited -to their assigned rate, and also won't allow each other to borrow. - -Within such an agency class, there might be other classes which are allowed -to swap bandwidth. ------------------------------------------------------------------------------ - -9.5.4.4. Sample configuration - -This configuration limits webserver traffic to 5mbit and SMTP traffic to 3 -mbit. Together, they may not get more than 6mbit. We have a 100mbit NIC and -the classes may borrow bandwidth from each other. -+---------------------------------------------------------------------------+ -|# tc qdisc add dev eth0 root handle 1:0 cbq bandwidth 100Mbit \ | -| avpkt 1000 cell 8 | -|# tc class add dev eth0 parent 1:0 classid 1:1 cbq bandwidth 100Mbit \ | -| rate 6Mbit weight 0.6Mbit prio 8 allot 1514 cell 8 maxburst 20 \ | -| avpkt 1000 bounded | -+---------------------------------------------------------------------------+ -This part installs the root and the customary 1:1 class. The 1:1 class is -bounded, so the total bandwidth can't exceed 6mbit. - -As said before, CBQ requires a *lot* of knobs. All parameters are explained -above, however. The corresponding HTB configuration is a lot simpler.
- - -+---------------------------------------------------------------------------+ -|# tc class add dev eth0 parent 1:1 classid 1:3 cbq bandwidth 100Mbit \ | -| rate 5Mbit weight 0.5Mbit prio 5 allot 1514 cell 8 maxburst 20 \ | -| avpkt 1000 | -|# tc class add dev eth0 parent 1:1 classid 1:4 cbq bandwidth 100Mbit \ | -| rate 3Mbit weight 0.3Mbit prio 5 allot 1514 cell 8 maxburst 20 \ | -| avpkt 1000 | -+---------------------------------------------------------------------------+ - - -These are our two classes. Note how we scale the weight with the configured -rate. Neither class is bounded, but they are connected to class 1:1 which -is bounded. So the combined bandwidth of the 2 classes will never be more than -6mbit. The classids need to be within the same major number as the parent -CBQ, by the way! - - -+---------------------------------------------------------------------------+ -|# tc qdisc add dev eth0 parent 1:3 handle 30: sfq | -|# tc qdisc add dev eth0 parent 1:4 handle 40: sfq | -+---------------------------------------------------------------------------+ - - -Both classes have a FIFO qdisc by default. But we replaced these with an SFQ -queue so each flow of data is treated equally. -+---------------------------------------------------------------------------+ -|# tc filter add dev eth0 parent 1:0 protocol ip prio 1 u32 match ip \ | -| sport 80 0xffff flowid 1:3 | -|# tc filter add dev eth0 parent 1:0 protocol ip prio 1 u32 match ip \ | -| sport 25 0xffff flowid 1:4 | -+---------------------------------------------------------------------------+ - - -These commands, attached directly to the root, send traffic to the right -qdiscs. - -Note that we use 'tc class add' to CREATE classes within a qdisc, but that we -use 'tc qdisc add' to actually add qdiscs to these classes. - -You may wonder what happens to traffic that is not classified by either of the -two rules. It appears that in this case, data will then be processed within -1:0, and be unlimited.
- -If SMTP+web together try to exceed the set limit of 6mbit/s, bandwidth will -be divided according to the weight parameter, giving 5/8 of traffic to the -webserver and 3/8 to the mail server. - -With this configuration you can also say that webserver traffic will always -get at minimum 5/8 * 6 mbit = 3.75 mbit. ------------------------------------------------------------------------------ - -9.5.4.5. Other CBQ parameters: split & defmap - -As said before, a classful qdisc needs to call filters to determine which -class a packet will be enqueued to. - -Besides calling the filter, CBQ offers other options, defmap & split. This is -pretty complicated to understand, and it is not vital. But as this is the -only known place where defmap & split are properly explained, I'm doing my -best. - -As you will often want to filter on the Type of Service field only, a special -syntax is provided. Whenever the CBQ needs to figure out where a packet needs -to be enqueued, it checks if this node is a 'split node'. If so, one of the -sub-qdiscs has indicated that it wishes to receive all packets with a certain -configured priority, as might be derived from the TOS field, or socket -options set by applications. - -The packets' priority bits are and-ed with the defmap field to see if a match -exists. In other words, this is a short-hand way of creating a very fast -filter, which only matches certain priorities. A defmap of ff (hex) will -match everything, a map of 0 nothing. A sample configuration may help make -things clearer: - - -+---------------------------------------------------------------------------+ -|# tc qdisc add dev eth1 root handle 1: cbq bandwidth 10Mbit allot 1514 \ | -| cell 8 avpkt 1000 mpu 64 | -| | -|# tc class add dev eth1 parent 1:0 classid 1:1 cbq bandwidth 10Mbit \ | -| rate 10Mbit allot 1514 cell 8 weight 1Mbit prio 8 maxburst 20 \ | -| avpkt 1000 | -+---------------------------------------------------------------------------+ -Standard CBQ preamble.
I never get used to the sheer amount of numbers -required! - -Defmap refers to TC_PRIO bits, which are defined as follows: - - -+---------------------------------------------------------------------------+ -|TC_PRIO.. Num Corresponds to TOS | -|------------------------------------------------- | -|BESTEFFORT 0 Maximize Reliablity | -|FILLER 1 Minimize Cost | -|BULK 2 Maximize Throughput (0x8) | -|INTERACTIVE_BULK 4 | -|INTERACTIVE 6 Minimize Delay (0x10) | -|CONTROL 7 | -+---------------------------------------------------------------------------+ - - -The TC_PRIO.. number corresponds to bits, counted from the right. See the -pfifo_fast section for more details how TOS bits are converted to priorities. - -Now the interactive and the bulk classes: - - -+---------------------------------------------------------------------------+ -|# tc class add dev eth1 parent 1:1 classid 1:2 cbq bandwidth 10Mbit \ | -| rate 1Mbit allot 1514 cell 8 weight 100Kbit prio 3 maxburst 20 \ | -| avpkt 1000 split 1:0 defmap c0 | -| | -|# tc class add dev eth1 parent 1:1 classid 1:3 cbq bandwidth 10Mbit \ | -| rate 8Mbit allot 1514 cell 8 weight 800Kbit prio 7 maxburst 20 \ | -| avpkt 1000 split 1:0 defmap 3f | -+---------------------------------------------------------------------------+ - - -The 'split qdisc' is 1:0, which is where the choice will be made. C0 is -binary for 11000000, 3F for 00111111, so these two together will match -everything. The first class matches bits 7 & 6, and thus corresponds to -'interactive' and 'control' traffic. The second class matches the rest. - -Node 1:0 now has a table like this: -+---------------------------------------------------------------------------+ -|priority send to | -|0 1:3 | -|1 1:3 | -|2 1:3 | -|3 1:3 | -|4 1:3 | -|5 1:3 | -|6 1:2 | -|7 1:2 | -+---------------------------------------------------------------------------+ - - -For additional fun, you can also pass a 'change mask', which indicates -exactly which priorities you wish to change. 
You only need to use this if you -are running 'tc class change'. For example, to add best effort traffic to 1: -2, we could run this: - - -+---------------------------------------------------------------------------+ -|# tc class change dev eth1 classid 1:2 cbq defmap 01/01 | -+---------------------------------------------------------------------------+ - - -The priority map over at 1:0 now looks like this: - - -+---------------------------------------------------------------------------+ -|priority send to | -|0 1:2 | -|1 1:3 | -|2 1:3 | -|3 1:3 | -|4 1:3 | -|5 1:3 | -|6 1:2 | -|7 1:2 | -+---------------------------------------------------------------------------+ - - -FIXME: did not test 'tc class change', only looked at the source. ------------------------------------------------------------------------------ - -9.5.5. Hierarchical Token Bucket - -Martin Devera () rightly realised that CBQ is complex and does not -seem optimized for many typical situations. His Hierarchical approach is well -suited for setups where you have a fixed amount of bandwidth which you want -to divide for different purposes, giving each purpose a guaranteed bandwidth, -with the possibility of specifying how much bandwidth can be borrowed. - -HTB works just like CBQ but does not resort to idle time calculations to -shape. Instead, it is a classful Token Bucket Filter - hence the name. It has -only a few parameters, which are well documented on his [http://luxik.cdi.cz/ -~devik/qos/htb/] site. - -As your HTB configuration gets more complex, your configuration scales well. -With CBQ it is already complex even in simple cases! HTB is not yet a part of -the standard kernel, but it should soon be! - -If you are in a position to patch your kernel, by all means consider HTB. ------------------------------------------------------------------------------ - -9.5.5.1. 
Sample configuration - -Functionally almost identical to the CBQ sample configuration above: - - -+------------------------------------------------------------------------------------+ -|# tc qdisc add dev eth0 root handle 1: htb default 30 | -| | -|# tc class add dev eth0 parent 1: classid 1:1 htb rate 6mbit burst 15k | -| | -|# tc class add dev eth0 parent 1:1 classid 1:10 htb rate 5mbit burst 15k | -|# tc class add dev eth0 parent 1:1 classid 1:20 htb rate 3mbit ceil 6mbit burst 15k | -|# tc class add dev eth0 parent 1:1 classid 1:30 htb rate 1kbit ceil 6mbit burst 15k | -+------------------------------------------------------------------------------------+ - - -The author then recommends SFQ for beneath these classes: -+---------------------------------------------------------------------------+ -|# tc qdisc add dev eth0 parent 1:10 handle 10: sfq perturb 10 | -|# tc qdisc add dev eth0 parent 1:20 handle 20: sfq perturb 10 | -|# tc qdisc add dev eth0 parent 1:30 handle 30: sfq perturb 10 | -+---------------------------------------------------------------------------+ - -Add the filters which direct traffic to the right classes: -+---------------------------------------------------------------------------+ -|# U32="tc filter add dev eth0 protocol ip parent 1:0 prio 1 u32" | -|# $U32 match ip dport 80 0xffff flowid 1:10 | -|# $U32 match ip sport 25 0xffff flowid 1:20 | -+---------------------------------------------------------------------------+ -And that's it - no unsightly unexplained numbers, no undocumented parameters. - -HTB certainly looks wonderful - if 10: and 20: both have their guaranteed -bandwidth, and more is left to divide, they borrow in a 5:3 ratio, just as -you would expect. - -Unclassified traffic gets routed to 30:, which has little bandwidth of its -own but can borrow everything that is left over. Because we chose SFQ -internally, we get fairness thrown in for free! 
------------------------------------------------------------------------------ - -9.6. Classifying packets with filters - -To determine which class shall process a packet, the so-called 'classifier -chain' is called each time a choice needs to be made. This chain consists of -all filters attached to the classful qdisc that needs to decide. - -To reiterate the tree, which is not a tree: -+---------------------------------------------------------------------------+ -| root 1: | -| | | -| _1:1_ | -| / | \ | -| / | \ | -| / | \ | -| 10: 11: 12: | -| / \ / \ | -| 10:1 10:2 12:1 12:2 | -+---------------------------------------------------------------------------+ - -When enqueueing a packet, at each branch the filter chain is consulted for a -relevant instruction. A typical setup might be to have a filter in 1:1 that -directs a packet to 12: and a filter on 12: that sends the packet to 12:2. - -You might also attach this latter rule to 1:1, but you can make efficiency -gains by having more specific tests lower in the chain. - -You can't filter a packet 'upwards', by the way. Also, with HTB, you should -attach all filters to the root! - -And again - packets are only enqueued downwards! When they are dequeued, they -go up again, where the interface lives. They do NOT fall off the end of the -tree to the network adaptor! ------------------------------------------------------------------------------ - -9.6.1. Some simple filtering examples - -As explained in the Classifier chapter, you can match on literally anything, -using a very complicated syntax. To start, we will show how to do the obvious -things, which luckily are quite easy. 
- -Let's say we have a PRIO qdisc called '10:' which contains three classes, and -we want to assign all traffic from and to port 22 to the highest priority -band, the filters would be: - - -+---------------------------------------------------------------------------+ -|# tc filter add dev eth0 protocol ip parent 10: prio 1 u32 match \ | -| ip dport 22 0xffff flowid 10:1 | -|# tc filter add dev eth0 protocol ip parent 10: prio 1 u32 match \ | -| ip sport 80 0xffff flowid 10:1 | -|# tc filter add dev eth0 protocol ip parent 10: prio 2 flowid 10:2 | -+---------------------------------------------------------------------------+ - - -What does this say? It says: attach to eth0, node 10: a priority 1 u32 filter -that matches on IP destination port 22 *exactly* and send it to band 10:1. -And it then repeats the same for source port 80. The last command says that -anything unmatched so far should go to band 10:2, the next-highest priority. - -You need to add 'eth0', or whatever your interface is called, because each -interface has a unique namespace of handles. - -To select on an IP address, use this: -+---------------------------------------------------------------------------+ -|# tc filter add dev eth0 parent 10:0 protocol ip prio 1 u32 \ | -| match ip dst 4.3.2.1/32 flowid 10:1 | -|# tc filter add dev eth0 parent 10:0 protocol ip prio 1 u32 \ | -| match ip src 1.2.3.4/32 flowid 10:1 | -|# tc filter add dev eth0 protocol ip parent 10: prio 2 \ | -| flowid 10:2 | -+---------------------------------------------------------------------------+ - - -This assigns traffic to 4.3.2.1 and traffic from 1.2.3.4 to the highest -priority queue, and the rest to the next-highest one. 
- -You can concatenate matches; to match on traffic from 4.3.2.1 and from port -80, do this: -+------------------------------------------------------------------------------------+ -|# tc filter add dev eth0 parent 10:0 protocol ip prio 1 u32 match ip src 4.3.2.1/32 | -| match ip sport 80 0xffff flowid 10:1 | -+------------------------------------------------------------------------------------+ - ------------------------------------------------------------------------------ - -9.6.2. All the filtering commands you will normally need - -Most shaping commands presented here start with this preamble: -+---------------------------------------------------------------------------+ -|# tc filter add dev eth0 parent 1:0 protocol ip prio 1 u32 .. | -+---------------------------------------------------------------------------+ -These are the so-called 'u32' matches, which can match on ANY part of a -packet. - -On source/destination address - Source mask 'match ip src 1.2.3.0/24', destination mask 'match ip dst - 4.3.2.0/24'. To match a single host, use /32, or omit the mask. - -On source/destination port, all IP protocols - Source: 'match ip sport 80 0xffff', destination: 'match ip dport 80 0xffff' - -On ip protocol (tcp, udp, icmp, gre, ipsec) - Use the numbers from /etc/protocols, for example, icmp is 1: 'match ip - protocol 1 0xff'. - -On fwmark - You can mark packets with either ipchains or iptables and have that mark survive - routing across interfaces. This is really useful to for example only - shape traffic on eth1 that came in on eth0. Syntax: # tc filter add dev - eth1 protocol ip parent 1:0 prio 1 handle 6 fw flowid 1:1 Note that this - is not a u32 match! - - You can place a mark like this: - +---------------------------------------------------------------+ - |# iptables -A PREROUTING -t mangle -i eth0 -j MARK --set-mark 6| - +---------------------------------------------------------------+ - The number 6 is arbitrary.
- - If you don't want to understand the full tc filter syntax, just use - iptables, and only learn to select on fwmark. - -On the TOS field - To select interactive, minimum delay traffic: - +---------------------------------------------------------------+ - |# tc filter add dev ppp0 parent 1:0 protocol ip prio 10 u32 \ | - | match ip tos 0x10 0xff \ | - | flowid 1:4 | - +---------------------------------------------------------------+ - Use 0x08 0xff for bulk traffic. - - -For more filtering commands, see the Advanced Filters chapter. ------------------------------------------------------------------------------ - -9.7. The Intermediate queueing device (IMQ) - -The Intermediate queueing device is not a qdisc but its usage is tightly -bound to qdiscs. Within linux, qdiscs are attached to network devices and -everything that is queued to the device is first queued to the qdisc. From -this concept, two limitations arise: - -1. Only egress shaping is possible (an ingress qdisc exists, but its -possibilities are very limited compared to classful qdiscs). - -2. A qdisc can only see traffic of one interface, global limitations can't be -placed. - -IMQ is there to help solve those two limitations. In short, you can put -everything you choose in a qdisc. Specially marked packets get intercepted in -netfilter NF_IP_PRE_ROUTING and NF_IP_POST_ROUTING hooks and pass through the -qdisc attached to an imq device. An iptables target is used for marking the -packets. - -This enables you to do ingress shaping as you can just mark packets coming in -from somewhere and/or treat interfaces as classes to set global limits. You -can also do lots of other stuff like just putting your http traffic in a -qdisc, put new connection requests in a qdisc, ... ------------------------------------------------------------------------------ - -9.7.1. Sample configuration - -The first thing that might come to mind is use ingress shaping to give -yourself a high guaranteed bandwidth. 
;) Configuration is just like with any -other interface: -+---------------------------------------------------------------------------+ -|tc qdisc add dev imq0 root handle 1: htb default 20 | -| | -|tc class add dev imq0 parent 1: classid 1:1 htb rate 2mbit burst 15k | -| | -|tc class add dev imq0 parent 1:1 classid 1:10 htb rate 1mbit | -|tc class add dev imq0 parent 1:1 classid 1:20 htb rate 1mbit | -| | -|tc qdisc add dev imq0 parent 1:10 handle 10: pfifo | -|tc qdisc add dev imq0 parent 1:20 handle 20: sfq | -| | -|tc filter add dev imq0 parent 10:0 protocol ip prio 1 u32 match \ | -| ip dst 10.0.0.230/32 flowid 1:10 | -+---------------------------------------------------------------------------+ -In this example u32 is used for classification. Other classifiers should work -as expected. Next, traffic has to be selected and marked to be enqueued to -imq0. -+---------------------------------------------------------------------------+ -|iptables -t mangle -A PREROUTING -i eth0 -j IMQ --todev 0 | -| | -|ip link set imq0 up | -+---------------------------------------------------------------------------+ - - -The IMQ iptables target is valid in the PREROUTING and POSTROUTING chains of -the mangle table. Its syntax is -+---------------------------------------------------------------------------+ -|IMQ [ --todev n ] n : number of imq device | -+---------------------------------------------------------------------------+ -An ip6tables target is also provided. - -Please note traffic is not enqueued when the target is hit but afterwards. -The exact location where traffic enters the imq device depends on the -direction of the traffic (in/out).
These are the predefined netfilter hooks -used by iptables: -+---------------------------------------------------------------------------+ -|enum nf_ip_hook_priorities { | -| NF_IP_PRI_FIRST = INT_MIN, | -| NF_IP_PRI_CONNTRACK = -200, | -| NF_IP_PRI_MANGLE = -150, | -| NF_IP_PRI_NAT_DST = -100, | -| NF_IP_PRI_FILTER = 0, | -| NF_IP_PRI_NAT_SRC = 100, | -| NF_IP_PRI_LAST = INT_MAX, | -|}; | -+---------------------------------------------------------------------------+ - - -For ingress traffic, imq registers itself with NF_IP_PRI_MANGLE + 1 priority -which means packets enter the imq device directly after the mangle PREROUTING -chain has been passed. - -For egress imq uses NF_IP_PRI_LAST which honours the fact that packets -dropped by the filter table won't occupy bandwidth. - -The patches and some more information can be found at the [http:// -luxik.cdi.cz/~patrick/imq/] imq site. ------------------------------------------------------------------------------ - -Chapter 10. Load sharing over multiple interfaces - -There are several ways of doing this. One of the easiest and straightforward -ways is 'TEQL' - "True" (or "trivial") link equalizer. Like most things -having to do with queueing, load sharing goes both ways. Both ends of a link -may need to participate for full effect. - -Imagine this situation: - - -+---------------------------------------------------------------------------+ -| +-------+ eth1 +-------+ | -| | |==========| | | -| 'network 1' ----| A | | B |---- 'network 2' | -| | |==========| | | -| +-------+ eth2 +-------+ | -+---------------------------------------------------------------------------+ - - -A and B are routers, and for the moment we'll assume both run Linux. If -traffic is going from network 1 to network 2, router A needs to distribute -the packets over both links to B. Router B needs to be configured to accept -this. 
Same goes the other way around, when packets go from network 2 to -network 1, router B needs to send the packets over both eth1 and eth2. - -The distributing part is done by a 'TEQL' device, like this (it couldn't be -easier): - - -+---------------------------------------------------------------------------+ -|# tc qdisc add dev eth1 root teql0 | -|# tc qdisc add dev eth2 root teql0 | -|# ip link set dev teql0 up | -+---------------------------------------------------------------------------+ - - -Don't forget the 'ip link set up' command! - -This needs to be done on both hosts. The device teql0 is basically a -round-robin distributor over eth1 and eth2, for sending packets. No data ever -comes in over a teql device, that just appears on the 'raw' eth1 and eth2. - -But now we just have devices, we also need proper routing. One way to do this -is to assign a /31 network to both links, and a /31 to the teql0 device as -well: - -FIXME: does this need something like 'nobroadcast'? A /31 is too small to -house a network address and a broadcast address - if this doesn't work as -planned, try a /30, and adjust the ip addresses accordingly. You might even -try to make eth1 and eth2 do without an IP address! - -On router A: -+---------------------------------------------------------------------------+ -|# ip addr add dev eth1 10.0.0.0/31 | -|# ip addr add dev eth2 10.0.0.2/31 | -|# ip addr add dev teql0 10.0.0.4/31 | -+---------------------------------------------------------------------------+ - - -On router B: -+---------------------------------------------------------------------------+ -|# ip addr add dev eth1 10.0.0.1/31 | -|# ip addr add dev eth2 10.0.0.3/31 | -|# ip addr add dev teql0 10.0.0.5/31 | -+---------------------------------------------------------------------------+ - - -Router A should now be able to ping 10.0.0.1, 10.0.0.3 and 10.0.0.5 over the -2 real links and the 1 equalized device. 
Router B should be able to ping -10.0.0.0, 10.0.0.2 and 10.0.0.4 over the links. - -If this works, Router A should make 10.0.0.5 its route for reaching network -2, and Router B should make 10.0.0.4 its route for reaching network 1. For -the special case where network 1 is your network at home, and network 2 is -the Internet, Router A should make 10.0.0.5 its default gateway. ------------------------------------------------------------------------------ - -10.1. Caveats - -Nothing is as easy as it seems. eth1 and eth2 on both router A and B need to -have return path filtering turned off, because they will otherwise drop -packets destined for ip addresses other than their own: - - -+---------------------------------------------------------------------------+ -|# echo 0 > /proc/sys/net/ipv4/conf/eth1/rp_filter | -|# echo 0 > /proc/sys/net/ipv4/conf/eth2/rp_filter | -+---------------------------------------------------------------------------+ - - -Then there is the nasty problem of packet reordering. Let's say 6 packets -need to be sent from A to B - eth1 might get 1, 3 and 5. eth2 would then do -2, 4 and 6. In an ideal world, router B would receive this in order, 1, 2, 3, -4, 5, 6. But the possibility is very real that the kernel gets it like this: -2, 1, 4, 3, 6, 5. The problem is that this confuses TCP/IP. While not a -problem for links carrying many different TCP/IP sessions, you won't be able -to bundle multiple links and get to ftp a single file lots faster, -except when your receiving or sending OS is Linux, which is not easily shaken -by some simple reordering. - -However, for lots of applications, link load balancing is a great idea. ------------------------------------------------------------------------------ - -10.2. Other possibilities - -William Stearns has used an advanced tunneling setup to achieve good use of -multiple, unrelated, internet connections together. It can be found on [http: -//www.stearns.org/tunnel/] his tunneling page. 
- -The HOWTO may feature more about this in the future. ------------------------------------------------------------------------------ - -Chapter 11. Netfilter & iproute - marking packets - -So far we've seen how iproute works, and netfilter was mentioned a few times. -This would be a good time to browse through [http://netfilter.samba.org/ -unreliable-guides/] Rusty's Remarkably Unreliable Guides. Netfilter itself -can be found [http://netfilter.filewatcher.org/] here. - -Netfilter allows us to filter packets, or mangle their headers. One special -feature is that we can mark a packet with a number. This is done with the ---set-mark facility. - -As an example, this command marks all packets destined for port 25, outgoing -mail: - - -+---------------------------------------------------------------------------+ -|# iptables -A PREROUTING -i eth0 -t mangle -p tcp --dport 25 \ | -| -j MARK --set-mark 1 | -+---------------------------------------------------------------------------+ - - -Let's say that we have multiple connections, one that is fast (and expensive, -per megabyte) and one that is slower, but flat fee. We would most certainly -like outgoing mail to go via the cheap route. 
- -We've already marked the packets with a '1', we now instruct the routing -policy database to act on this: - - -+---------------------------------------------------------------------------+ -|# echo 201 mail.out >> /etc/iproute2/rt_tables | -|# ip rule add fwmark 1 table mail.out | -|# ip rule ls | -|0: from all lookup local | -|32764: from all fwmark 1 lookup mail.out | -|32766: from all lookup main | -|32767: from all lookup default | -+---------------------------------------------------------------------------+ - - -Now we generate the mail.out table with a route to the slow but cheap link: -+---------------------------------------------------------------------------+ -|# /sbin/ip route add default via 195.96.98.253 dev ppp0 table mail.out | -+---------------------------------------------------------------------------+ - - -And we are done. Should we want to make exceptions, there are lots of ways to -achieve this. We can modify the netfilter statement to exclude certain hosts, -or we can insert a rule with a lower priority that points to the main table -for our excepted hosts. - -We can also use this feature to honour TOS bits by marking packets with a -different type of service with different numbers, and creating rules to act -on that. This way you can even dedicate, say, an ISDN line to interactive -sessions. - -Needless to say, this also works fine on a host that's doing NAT -('masquerading'). - -IMPORTANT: We received a report that MASQ and SNAT at least collide with -marking packets. Rusty Russell explains it in [http://lists.samba.org/ -pipermail/netfilter/2000-November/006089.html] this posting. Turn off the -reverse path filter to make it work properly. - -Note: to mark packets, you need to have some options enabled in your kernel: - - -+----------------------------------------------------------------------------+ -|IP: advanced router (CONFIG_IP_ADVANCED_ROUTER) [Y/n/?] | -|IP: policy routing (CONFIG_IP_MULTIPLE_TABLES) [Y/n/?] 
| -|IP: use netfilter MARK value as routing key (CONFIG_IP_ROUTE_FWMARK) [Y/n/?]| -+----------------------------------------------------------------------------+ - - -See also the Section 15.5 in the Cookbook. ------------------------------------------------------------------------------ - -Chapter 12. Advanced filters for (re-)classifying packets - -As explained in the section on classful queueing disciplines, filters are -needed to classify packets into any of the sub-queues. These filters are -called from within the classful qdisc. - -Here is an incomplete list of classifiers available: - -fw - Bases the decision on how the firewall has marked the packet. This can be - the easy way out if you don't want to learn tc filter syntax. See the - Queueing chapter for details. - -u32 - Bases the decision on fields within the packet (i.e. source IP address, - etc) - -route - Bases the decision on which route the packet will be routed by - -rsvp, rsvp6 - Routes packets based on [http://www.isi.edu/div7/rsvp/overview.html] RSVP - . Only useful on networks you control - the Internet does not respect - RSVP. - -tcindex - Used in the DSMARK qdisc, see the relevant section. - - -Note that in general there are many ways in which you can classify packet and -that it generally comes down to preference as to which system you wish to -use. - -Classifiers in general accept a few arguments in common. They are listed here -for convenience: - -protocol - The protocol this classifier will accept. Generally you will only be - accepting only IP traffic. Required. - -parent - The handle this classifier is to be attached to. This handle must be an - already existing class. Required. - -prio - The priority of this classifier. Lower numbers get tested first. - -handle - This handle means different things to different filters. - - -All the following sections will assume you are trying to shape the traffic -going to HostA. 
They will assume that the root class has been configured on -1: and that the class you want to send the selected traffic to is 1:1. ------------------------------------------------------------------------------ - -12.1. The u32 classifier - -The U32 filter is the most advanced filter available in the current -implementation. It is entirely based on hashing tables, which make it robust -when there are many filter rules. - -In its simplest form the U32 filter is a list of records, each consisting of -two fields: a selector and an action. The selectors, described below, are -compared with the currently processed IP packet until the first match occurs, -and then the associated action is performed. The simplest type of action -would be directing the packet into a defined CBQ class. - -The command line of tc filter program, used to configure the filter, consists -of three parts: filter specification, a selector and an action. The filter -specification can be defined as: - - -+---------------------------------------------------------------------------+ -|tc filter add dev IF [ protocol PROTO ] | -| [ (preference|priority) PRIO ] | -| [ parent CBQ ] | -+---------------------------------------------------------------------------+ - - -The protocol field describes the protocol that the filter will be applied to. We -will only discuss the case of the ip protocol. The preference field (priority can be -used alternatively) sets the priority of currently defined filter. This is -important, since you can have several filters (lists of rules) with different -priorities. Each list will be passed in the order the rules were added, then -list with lower priority (higher preference number) will be processed. The -parent field defines the CBQ tree top (e.g. 1:0), the filter should be -attached to. - -The options described above apply to all filters, not only U32. ------------------------------------------------------------------------------ - -12.1.1. 
U32 selector - -The U32 selector contains definition of the pattern, that will be matched to -the currently processed packet. Precisely, it defines which bits are to be -matched in the packet header and nothing more, but this simple method is very -powerful. Let's take a look at the following examples, taken directly from a -pretty complex, real-world filter: - - -+---------------------------------------------------------------------------+ -|# tc filter add dev eth0 protocol ip parent 1:0 pref 10 u32 \ | -| match u32 00100000 00ff0000 at 0 flowid 1:10 | -+---------------------------------------------------------------------------+ - - -For now, leave the first line alone - all these parameters describe the -filter's hash tables. Focus on the selector line, containing match keyword. -This selector will match to IP headers, whose second byte will be 0x10 -(0010). As you can guess, the 00ff number is the match mask, telling the -filter exactly which bits to match. Here it's 0xff, so the byte will match if -it's exactly 0x10. The at keyword means that the match is to be started at -specified offset (in bytes) -- in this case it's beginning of the packet. -Translating all that to human language, the packet will match if its Type of -Service field will have `low delay' bits set. Let's analyze another rule: - - -+---------------------------------------------------------------------------+ -|# tc filter add dev eth0 protocol ip parent 1:0 pref 10 u32 \ | -| match u32 00000016 0000ffff at nexthdr+0 flowid 1:10 | -+---------------------------------------------------------------------------+ - - -The nexthdr option means next header encapsulated in the IP packet, i.e. -header of upper-layer protocol. The match will also start here at the -beginning of the next header. The match should occur in the second, 32-bit -word of the header. In TCP and UDP protocols this field contains packet's -destination port. The number is given in big-endian format, i.e. 
older bits -first, so we simply read 0x0016 as 22 decimal, which stands for SSH service -if this was TCP. As you guess, this match is ambiguous without a context, and -we will discuss this later. - -Having understood all the above, we will find the following selector quite -easy to read: match c0a80100 ffffff00 at 16. What we got here is a three byte -match at 17-th byte, counting from the IP header start. This will match for -packets with destination address anywhere in 192.168.1/24 network. After -analyzing the examples, we can summarize what we have learned. ------------------------------------------------------------------------------ - -12.1.2. General selectors - -General selectors define the pattern, mask and offset the pattern will be -matched to the packet contents. Using the general selectors you can match -virtually any single bit in the IP (or upper layer) header. They are more -difficult to write and read, though, than specific selectors that described -below. The general selector syntax is: - - -+---------------------------------------------------------------------------+ -|match [ u32 | u16 | u8 ] PATTERN MASK [ at OFFSET | nexthdr+OFFSET] | -+---------------------------------------------------------------------------+ - - -One of the keywords u32, u16 or u8 specifies length of the pattern in bits. -PATTERN and MASK should follow, of length defined by the previous keyword. -The OFFSET parameter is the offset, in bytes, to start matching. If nexthdr+ -keyword is given, the offset is relative to start of the upper layer header. - -Some examples: - - -+---------------------------------------------------------------------------+ -|# tc filter add dev ppp14 parent 1:0 prio 10 u32 \ | -| match u8 64 0xff at 8 \ | -| flowid 1:4 | -+---------------------------------------------------------------------------+ - - -Packet will match to this rule, if its time to live (TTL) is 64. TTL is the -field starting just after 8-th byte of the IP header. 
- - -+---------------------------------------------------------------------------+ -|# tc filter add dev ppp14 parent 1:0 prio 10 u32 \ | -| match u8 0x10 0xff at nexthdr+13 \ | -| protocol tcp \ | -| flowid 1:3 | -+---------------------------------------------------------------------------+ - - -FIXME: it has been pointed out that this syntax does not work currently. - -Use this to match ACKs on packets smaller than 64 bytes: - - -+---------------------------------------------------------------------------+ -|## match acks the hard way, | -|## IP protocol 6, | -|## IP header length 0x5(32 bit words), | -|## IP Total length 0x34 (ACK + 12 bytes of TCP options) | -|## TCP ack set (bit 5, offset 33) | -|# tc filter add dev ppp14 parent 1:0 protocol ip prio 10 u32 \ | -| match ip protocol 6 0xff \ | -| match u8 0x05 0x0f at 0 \ | -| match u16 0x0000 0xffc0 at 2 \ | -| match u8 0x10 0xff at 33 \ | -| flowid 1:3 | -+---------------------------------------------------------------------------+ - - -This rule will only match TCP packets with ACK bit set, and no further -payload. Here we can see an example of using two selectors, the final result -will be logical AND of their results. If we take a look at TCP header -diagram, we can see that the ACK bit is second older bit (0x10) in the 14-th -byte of the TCP header (at nexthdr+13). As for the second selector, if we'd -like to make our life harder, we could write match u8 0x06 0xff at 9 instead -of using the specific selector protocol tcp, because 6 is the number of TCP -protocol, present in 10-th byte of the IP header. On the other hand, in this -example we couldn't use any specific selector for the first match - simply -because there's no specific selector to match TCP ACK bits. ------------------------------------------------------------------------------ - -12.1.3. Specific selectors - -The following table contains a list of all specific selectors the author of -this section has found in the tc program source code. 
They simply make your -life easier and increase readability of your filter's configuration. - -FIXME: table placeholder - the table is in separate file ,,selector.html'' - -FIXME: it's also still in Polish :-( - -FIXME: must be sgml'ized - -Some examples: - - -+---------------------------------------------------------------------------+ -|# tc filter add dev ppp0 parent 1:0 prio 10 u32 \ | -| match ip tos 0x10 0xff \ | -| flowid 1:4 | -+---------------------------------------------------------------------------+ - - -FIXME: tcp dst match does not work as described below: - -The above rule will match packets which have the TOS field set to 0x10. The -TOS field starts at second byte of the packet and is one byte big, so we -could write an equivalent general selector: match u8 0x10 0xff at 1. This -gives us hint to the internals of U32 filter -- the specific rules are always -translated to general ones, and in this form they are stored in the kernel -memory. This leads to another conclusion -- the tcp and udp selectors are -exactly the same and this is why you can't use single match tcp dst 53 0xffff -selector to match TCP packets sent to given port -- they will also match UDP -packets sent to this port. You must remember to also specify the protocol and -end up with the following rule: - - -+---------------------------------------------------------------------------+ -|# tc filter add dev ppp0 parent 1:0 prio 10 u32 \ | -| match tcp dst 53 0xffff \ | -| match ip protocol 0x6 0xff \ | -| flowid 1:2 | -+---------------------------------------------------------------------------+ - ------------------------------------------------------------------------------ - -12.2. The route classifier - -This classifier filters based on the results of the routing tables. When a -packet that is traversing through the classes reaches one that is marked with -the "route" filter, it splits the packets up based on information in the -routing table. 
- - -+---------------------------------------------------------------------------+ -|# tc filter add dev eth1 parent 1:0 protocol ip prio 100 route | -+---------------------------------------------------------------------------+ - - -Here we add a route classifier onto the parent node 1:0 with priority 100. -When a packet reaches this node (which, since it is the root, will happen -immediately) it will consult the routing table and if one matches will send -it to the given class and give it a priority of 100. Then, to finally kick it -into action, you add the appropriate routing entry: - -The trick here is to define 'realm' based on either destination or source. -The way to do it is like this: - - -+---------------------------------------------------------------------------+ -|# ip route add Host/Network via Gateway dev Device realm RealmNumber | -+---------------------------------------------------------------------------+ - - -For instance, we can define our destination network 192.168.10.0 with a realm -number 10: - - -+---------------------------------------------------------------------------+ -|# ip route add 192.168.10.0/24 via 192.168.10.1 dev eth1 realm 10 | -+---------------------------------------------------------------------------+ - -When adding route filters, we can use realm numbers to represent the networks -or hosts and specify how the routes match the filters. - - -+---------------------------------------------------------------------------+ -|# tc filter add dev eth1 parent 1:0 protocol ip prio 100 \ | -| route to 10 classid 1:10 | -+---------------------------------------------------------------------------+ - -The above rule says packets going to the network 192.168.10.0 match class id -1:10. - -Route filter can also be used to match source routes. For example, there is a -subnetwork attached to the Linux router on eth2. 
- - -+---------------------------------------------------------------------------+ -|# ip route add 192.168.2.0/24 dev eth2 realm 2 | -|# tc filter add dev eth1 parent 1:0 protocol ip prio 100 \ | -| route from 2 classid 1:2 | -+---------------------------------------------------------------------------+ - - -Here the filter specifies that packets from the subnetwork 192.168.2.0 (realm -2) will match class id 1:2. ------------------------------------------------------------------------------ - -12.3. Policing filters - -To make even more complicated setups possible, you can have filters that only -match up to a certain bandwidth. You can declare a filter to entirely cease -matching above a certain rate, or only to not match only the bandwidth -exceeding a certain rate. - -So if you decided to police at 4mbit/s, but 5mbit/s of traffic is present, -you can stop matching either the entire 5mbit/s, or only not match 1mbit/s, -and do send 4mbit/s to the configured class. - -If bandwidth exceeds the configured rate, you can drop a packet, reclassify -it, or see if another filter will match it. ------------------------------------------------------------------------------ - -12.3.1. Ways to police - -There are basically two ways to police. If you compiled the kernel with -'Estimators', the kernel can measure for each filter how much traffic it is -passing, more or less. These estimators are very easy on the CPU, as they -simply count 25 times per second how many data has been passed, and calculate -the bitrate from that. - -The other way works again via a Token Bucket Filter, this time living within -your filter. The TBF only matches traffic UP TO your configured bandwidth, if -more is offered, only the excess is subject to the configured overlimit -action. ------------------------------------------------------------------------------ - -12.3.1.1. With the kernel estimator - -This is very simple and has only one parameter: avrate. 
Either the flow -remains below avrate, and the filter classifies the traffic to the classid -configured, or your rate exceeds it in which case the specified action is -taken, which is 'reclassify' by default. - -The kernel uses an Exponential Weighted Moving Average for your bandwidth -which makes it less sensitive to short bursts. ------------------------------------------------------------------------------ - -12.3.1.2. With Token Bucket Filter - -Uses the following parameters: - -  * buffer/maxburst - -  * mtu/minburst - -  * mpu - -  * rate - - - - -Which behave mostly identical to those described in the Token Bucket Filter -section. Please note however that if you set the mtu of a TBF policer too -low, *no* packets will pass, whereas the egress TBF qdisc will just pass them -slower. - -Another difference is that a policer can only let a packet pass, or drop it. -It cannot hold on to it in order to delay it. ------------------------------------------------------------------------------ - -12.3.2. Overlimit actions - -If your filter decides that it is overlimit, it can take 'actions'. -Currently, four actions are available: - -continue - Causes this filter not to match, but perhaps other filters will. - -drop - This is a very fierce option which simply discards traffic exceeding a - certain rate. It is often used in the ingress policer and has limited - uses. For example, you may have a name server that falls over if offered - more than 5mbit/s of packets, in which case an ingress filter could be - used to make sure no more is ever offered. - -Pass/OK - Pass on traffic ok. Might be used to disable a complicated filter, but - leave it in place. - -reclassify - Most often comes down to reclassification to Best Effort. This is the - default action. - - ------------------------------------------------------------------------------ -12.3.3. Examples - -The only real example known is mentioned in the 'Protecting your host from -SYN floods' section. 
- -FIXME: if you have used this, please share your experience with us ------------------------------------------------------------------------------ - -12.4. Hashing filters for very fast massive filtering - -If you have a need for thousands of rules, for example if you have a lot of -clients or computers, all with different QoS specifications, you may find -that the kernel spends a lot of time matching all those rules. - -By default, all filters reside in one big chain which is matched in -descending order of priority. If you have 1000 rules, 1000 checks may be -needed to determine what to do with a packet. - -Matching would go much quicker if you would have 256 chains with each four -rules - if you could divide packets over those 256 chains, so that the right -rule will be there. - -Hashing makes this possible. Let's say you have 1024 cable modem customers in -your network, with IP addresses ranging from 1.2.0.0 to 1.2.3.255, and each -has to go in another bin, for example 'lite', 'regular' and 'premium'. You -would then have 1024 rules like this: - - -+---------------------------------------------------------------------------+ -|# tc filter add dev eth1 parent 1:0 protocol ip prio 100 match ip src \ | -| 1.2.0.0 classid 1:1 | -|# tc filter add dev eth1 parent 1:0 protocol ip prio 100 match ip src \ | -| 1.2.0.1 classid 1:1 | -|... | -|# tc filter add dev eth1 parent 1:0 protocol ip prio 100 match ip src \ | -| 1.2.3.254 classid 1:3 | -|# tc filter add dev eth1 parent 1:0 protocol ip prio 100 match ip src \ | -| 1.2.3.255 classid 1:2 | -+---------------------------------------------------------------------------+ - - -To speed this up, we can use the last part of the IP address as a 'hash key'. 
-We then get 256 tables, the first of which looks like this: -+---------------------------------------------------------------------------+ -|# tc filter add dev eth1 parent 1:0 protocol ip prio 100 match ip src \ | -| 1.2.0.0 classid 1:1 | -|# tc filter add dev eth1 parent 1:0 protocol ip prio 100 match ip src \ | -| 1.2.1.0 classid 1:1 | -|# tc filter add dev eth1 parent 1:0 protocol ip prio 100 match ip src \ | -| 1.2.2.0 classid 1:3 | -|# tc filter add dev eth1 parent 1:0 protocol ip prio 100 match ip src \ | -| 1.2.3.0 classid 1:2 | -+---------------------------------------------------------------------------+ - - -The next one starts like this: -+---------------------------------------------------------------------------+ -|# tc filter add dev eth1 parent 1:0 protocol ip prio 100 match ip src \ | -| 1.2.0.1 classid 1:1 | -|... | -+---------------------------------------------------------------------------+ - - -This way, only four checks are needed at most, two on average. - -Configuration is pretty complicated, but very worth it by the time you have -this many rules. 
First we make a filter root, then we create a table with 256 -entries: -+--------------------------------------------------------------------------------+ -|# tc filter add dev eth1 parent 1:0 prio 5 protocol ip u32 | -|# tc filter add dev eth1 parent 1:0 prio 5 handle 2: protocol ip u32 divisor 256| -+--------------------------------------------------------------------------------+ - - -Now we add some rules to entries in the created table: - - -+---------------------------------------------------------------------------+ -|# tc filter add dev eth1 protocol ip parent 1:0 prio 5 u32 ht 2:7b: \ | -| match ip src 1.2.0.123 flowid 1:1 | -|# tc filter add dev eth1 protocol ip parent 1:0 prio 5 u32 ht 2:7b: \ | -| match ip src 1.2.1.123 flowid 1:2 | -|# tc filter add dev eth1 protocol ip parent 1:0 prio 5 u32 ht 2:7b: \ | -| match ip src 1.2.2.123 flowid 1:3 | -|# tc filter add dev eth1 protocol ip parent 1:0 prio 5 u32 ht 2:7b: \ | -| match ip src 1.2.3.123 flowid 1:2 | -+---------------------------------------------------------------------------+ -This is entry 123, which contains matches for 1.2.0.123, 1.2.1.123, -1.2.2.123, 1.2.3.123, and sends them to 1:1, 1:2, 1:3 and 1:2 respectively. -Note that we need to specify our hash bucket in hex, 0x7b is 123. - -Next create a 'hashing filter' that directs traffic to the right entry in the -hashing table: -+---------------------------------------------------------------------------+ -|# tc filter add dev eth1 protocol ip parent 1:0 prio 5 u32 ht 800:: \ | -| match ip src 1.2.0.0/16 \ | -| hashkey mask 0x000000ff at 12 \ | -| link 2: | -+---------------------------------------------------------------------------+ -Ok, some numbers need explaining. The default hash table is called 800:: and -all filtering starts there. Then we select the source address, which lives at -positions 12, 13, 14 and 15 in the IP header, and indicate that we are only -interested in the last part. 
This we send to hash table 2:, which we created -earlier. - -It is quite complicated, but it does work in practice and performance will be -staggering. Note that this example could be improved to the ideal case where -each chain contains 1 filter! ------------------------------------------------------------------------------ - -Chapter 13. Kernel network parameters - -The kernel has lots of parameters which can be tuned for different -circumstances. While, as usual, the default parameters serve 99% of -installations very well, we don't call this the Advanced HOWTO for the fun of -it! - -The interesting bits are in /proc/sys/net, take a look there. Not everything -will be documented here initially, but we're working on it. - -In the meantime you may want to have a look at the Linux-Kernel sources; read -the file Documentation/filesystems/proc.txt. Most of the features are -explained there. - -(FIXME) ------------------------------------------------------------------------------ - -13.1. Reverse Path Filtering - -By default, routers route everything, even packets which 'obviously' don't -belong on your network. A common example is private IP space escaping onto -the Internet. If you have an interface with a route of 195.96.96.0/24 to it, -you do not expect packets from 212.64.94.1 to arrive there. - -Lots of people will want to turn this feature off, so the kernel hackers have -made it easy. There are files in /proc where you can tell the kernel to do -this for you. The method is called "Reverse Path Filtering". Basically, if -the reply to this packet wouldn't go out the interface this packet came in, -then this is a bogus packet and should be ignored. - -The following fragment will turn this on for all current and future -interfaces. 
- - -+---------------------------------------------------------------------------+ -|# for i in /proc/sys/net/ipv4/conf/*/rp_filter ; do | -|> echo 2 > $i | -|> done | -+---------------------------------------------------------------------------+ - - -Going by the example above, if a packet arrived on the Linux router on eth1 -claiming to come from the Office+ISP subnet, it would be dropped. Similarly, -if a packet came from the Office subnet, claiming to be from somewhere -outside your firewall, it would be dropped also. - -The above is full reverse path filtering. The default is to only filter based -on IPs that are on directly connected networks. This is because the full -filtering breaks in the case of asymmetric routing (where packets come in one -way and go out another, like satellite traffic, or if you have dynamic (bgp, -ospf, rip) routes in your network. The data comes down through the satellite -dish and replies go back through normal land-lines). - -If this exception applies to you (and you'll probably know if it does) you -can simply turn off the rp_filter on the interface where the satellite data -comes in. If you want to see if any packets are being dropped, the -log_martians file in the same directory will tell the kernel to log them to -your syslog. - - -+---------------------------------------------------------------------------+ -|# echo 1 >/proc/sys/net/ipv4/conf//log_martians | -+---------------------------------------------------------------------------+ - - -FIXME: is setting the conf/{default,all}/* files enough? - martijn ------------------------------------------------------------------------------ - -13.2. Obscure settings - -Ok, there are a lot of parameters which can be modified. We try to list them -all. Also documented (partly) in Documentation/ip-sysctl.txt. - -Some of these settings have different defaults based on whether you answered -'Yes' to 'Configure as router and not host' while compiling your kernel. 
------------------------------------------------------------------------------ - -13.2.1. Generic ipv4 - -As a generic note, most rate limiting features don't work on loopback, so -don't test them locally. The limits are supplied in 'jiffies', and are -enforced using the earlier mentioned token bucket filter. - -The kernel has an internal clock which runs at 'HZ' ticks (or 'jiffies') per -second. On Intel, 'HZ' is mostly 100. So setting a *_rate file to, say 50, -would allow for 2 packets per second. The token bucket filter is also -configured to allow for a burst of at most 6 packets, if enough tokens have -been earned. - -Several entries in the following list have been copied from /usr/src/linux/ -Documentation/networking/ip-sysctl.txt, written by Alexey Kuznetsov < -kuznet@ms2.inr.ac.ru> and Andi Kleen - -/proc/sys/net/ipv4/icmp_destunreach_rate - If the kernel decides that it can't deliver a packet, it will drop it, - and send the source of the packet an ICMP notice to this effect. - -/proc/sys/net/ipv4/icmp_echo_ignore_all - Don't act on echo packets at all. Please don't set this by default, but - if you are used as a relay in a DoS attack, it may be useful. - -/proc/sys/net/ipv4/icmp_echo_ignore_broadcasts [Useful] - If you ping the broadcast address of a network, all hosts are supposed to - respond. This makes for a dandy denial-of-service tool. Set this to 1 to - ignore these broadcast messages. - -/proc/sys/net/ipv4/icmp_echoreply_rate - The rate at which echo replies are sent to any one destination. - -/proc/sys/net/ipv4/icmp_ignore_bogus_error_responses - Set this to ignore ICMP errors caused by hosts in the network reacting - badly to frames sent to what they perceive to be the broadcast address. - -/proc/sys/net/ipv4/icmp_paramprob_rate - A relatively unknown ICMP message, which is sent in response to incorrect - packets with broken IP or TCP headers. With this file you can control the - rate at which it is sent. 
- -/proc/sys/net/ipv4/icmp_timeexceed_rate - This is the famous cause of the 'Solaris middle star' in traceroutes. Limits - the number of ICMP Time Exceeded messages sent. - -/proc/sys/net/ipv4/igmp_max_memberships - Maximum number of listening igmp (multicast) sockets on the host. FIXME: - Is this true? - -/proc/sys/net/ipv4/inet_peer_gc_maxtime - FIXME: Add a little explanation about the inet peer storage? Minimum - interval between garbage collection passes. This interval is in effect - under low (or absent) memory pressure on the pool. Measured in jiffies. - -/proc/sys/net/ipv4/inet_peer_gc_mintime - Minimum interval between garbage collection passes. This interval is in - effect under high memory pressure on the pool. Measured in jiffies. - -/proc/sys/net/ipv4/inet_peer_maxttl - Maximum time-to-live of entries. Unused entries will expire after this - period of time if there is no memory pressure on the pool (i.e. when the - number of entries in the pool is very small). Measured in jiffies. - -/proc/sys/net/ipv4/inet_peer_minttl - Minimum time-to-live of entries. Should be enough to cover fragment - time-to-live on the reassembling side. This minimum time-to-live is - guaranteed if the pool size is less than inet_peer_threshold. Measured in - jiffies. - -/proc/sys/net/ipv4/inet_peer_threshold - The approximate size of the INET peer storage. Starting from this - threshold entries will be thrown aggressively. This threshold also - determines entries' time-to-live and time intervals between garbage - collection passes. More entries, less time-to-live, less GC interval. - -/proc/sys/net/ipv4/ip_autoconfig - This file contains the number one if the host received its IP - configuration by RARP, BOOTP, DHCP or a similar mechanism. Otherwise it - is zero. - -/proc/sys/net/ipv4/ip_default_ttl - Time To Live of packets. Set to a safe 64. Raise it if you have a huge - network. Don't do so for fun - routing loops cause much more damage that - way. 
You might even consider lowering it in some circumstances. - -/proc/sys/net/ipv4/ip_dynaddr - You need to set this if you use dial-on-demand with a dynamic interface - address. Once your demand interface comes up, any local TCP sockets which - haven't seen replies will be rebound to have the right address. This - solves the problem that the connection that brings up your interface - itself does not work, but the second try does. - -/proc/sys/net/ipv4/ip_forward - If the kernel should attempt to forward packets. Off by default. - -/proc/sys/net/ipv4/ip_local_port_range - Range of local ports for outgoing connections. Actually quite small by - default, 1024 to 4999. - -/proc/sys/net/ipv4/ip_no_pmtu_disc - Set this if you want to disable Path MTU discovery - a technique to - determine the largest Maximum Transfer Unit possible on your path. See - also the section on Path MTU discovery in the Cookbook chapter. - -/proc/sys/net/ipv4/ipfrag_high_thresh - Maximum memory used to reassemble IP fragments. When ipfrag_high_thresh - bytes of memory is allocated for this purpose, the fragment handler will - toss packets until ipfrag_low_thresh is reached. - -/proc/sys/net/ipv4/ip_nonlocal_bind - Set this if you want your applications to be able to bind to an address - which doesn't belong to a device on your system. This can be useful when - your machine is on a non-permanent (or even dynamic) link, so your - services are able to start up and bind to a specific address when your - link is down. - -/proc/sys/net/ipv4/ipfrag_low_thresh - Minimum memory used to reassemble IP fragments. - -/proc/sys/net/ipv4/ipfrag_time - Time in seconds to keep an IP fragment in memory. - -/proc/sys/net/ipv4/tcp_abort_on_overflow - A boolean flag controlling the behaviour under lots of incoming - connections. When enabled, this causes the kernel to actively send RST - packets when a service is overloaded. 
- -/proc/sys/net/ipv4/tcp_fin_timeout - Time to hold socket in state FIN-WAIT-2, if it was closed by our side. - Peer can be broken and never close its side, or even died unexpectedly. - Default value is 60sec. Usual value used in 2.2 was 180 seconds, you may - restore it, but remember that if your machine is even underloaded WEB - server, you risk to overflow memory with kilotons of dead sockets, - FIN-WAIT-2 sockets are less dangerous than FIN-WAIT-1, because they eat - maximum 1.5K of memory, but they tend to live longer. Cf. - tcp_max_orphans. - -/proc/sys/net/ipv4/tcp_keepalive_time - How often TCP sends out keepalive messages when keepalive is enabled. - Default: 2 hours. - -/proc/sys/net/ipv4/tcp_keepalive_intvl - How frequent probes are retransmitted, when a probe isn't acknowledged. - Default: 75 seconds. - -/proc/sys/net/ipv4/tcp_keepalive_probes - How many keepalive probes TCP will send, until it decides that the - connection is broken. Default value: 9. Multiplied with - tcp_keepalive_intvl, this gives the time a link can be non-responsive - after a keepalive has been sent. - -/proc/sys/net/ipv4/tcp_max_orphans - Maximal number of TCP sockets not attached to any user file handle, held - by system. If this number is exceeded orphaned connections are reset - immediately and warning is printed. This limit exists only to prevent - simple DoS attacks, you _must_ not rely on this or lower the limit - artificially, but rather increase it (probably, after increasing - installed memory), if network conditions require more than default value, - and tune network services to linger and kill such states more - aggressively. Let me remind you again: each orphan eats up to ~64K of - unswappable memory. - -/proc/sys/net/ipv4/tcp_orphan_retries - How many times to retry before killing TCP connection, closed by our side. - Default value 7 corresponds to ~50sec-16min depending on RTO. 
If your - machine is a loaded WEB server, you should think about lowering this - value, such sockets may consume significant resources. Cf. - tcp_max_orphans. - -/proc/sys/net/ipv4/tcp_max_syn_backlog - Maximal number of remembered connection requests, which still did not - receive an acknowledgment from connecting client. Default value is 1024 - for systems with more than 128Mb of memory, and 128 for low memory - machines. If the server suffers from overload, try to increase this number. - Warning! If you make it greater than 1024, it would be better to change - TCP_SYNQ_HSIZE in include/net/tcp.h to keep TCP_SYNQ_HSIZE*16<= - tcp_max_syn_backlog and to recompile kernel. - -/proc/sys/net/ipv4/tcp_max_tw_buckets - Maximal number of timewait sockets held by system simultaneously. If this - number is exceeded time-wait socket is immediately destroyed and warning - is printed. This limit exists only to prevent simple DoS attacks, you - _must_ not lower the limit artificially, but rather increase it - (probably, after increasing installed memory), if network conditions - require more than default value. - -/proc/sys/net/ipv4/tcp_retrans_collapse - Bug-to-bug compatibility with some broken printers. On retransmit try to - send bigger packets to work around bugs in certain TCP stacks. - -/proc/sys/net/ipv4/tcp_retries1 - How many times to retry before deciding that something is wrong and it is - necessary to report this suspicion to network layer. Minimal RFC value is - 3, it is default, which corresponds to ~3sec-8min depending on RTO. - -/proc/sys/net/ipv4/tcp_retries2 - How many times to retry before killing an alive TCP connection. [http:// - www.ietf.org/rfc/rfc1122.txt] RFC 1122 says that the limit should be - longer than 100 sec. It is too small a number. Default value 15 corresponds - to ~13-30min depending on RTO. - -/proc/sys/net/ipv4/tcp_rfc1337 - This boolean enables a fix for 'time-wait assassination hazards in tcp', - described in RFC 1337. 
If enabled, this causes the kernel to drop RST - packets for sockets in the time-wait state. Default: 0 - -/proc/sys/net/ipv4/tcp_sack - Use Selective ACK which can be used to signify that specific packets are - missing - therefore helping fast recovery. - -/proc/sys/net/ipv4/tcp_stdurg - Use the Host requirements interpretation of the TCP urg pointer field. - Most hosts use the older BSD interpretation, so if you turn this on Linux - might not communicate correctly with them. Default: FALSE - -/proc/sys/net/ipv4/tcp_syn_retries - Number of SYN packets the kernel will send before giving up on the new - connection. - -/proc/sys/net/ipv4/tcp_synack_retries - To open the other side of the connection, the kernel sends a SYN with a - piggybacked ACK on it, to acknowledge the earlier received SYN. This is - part 2 of the threeway handshake. This setting determines the number of - SYN+ACK packets sent before the kernel gives up on the connection. - -/proc/sys/net/ipv4/tcp_timestamps - Timestamps are used, amongst other things, to protect against wrapping - sequence numbers. A 1 gigabit link might conceivably re-encounter a - previous sequence number with an out-of-line value, because it was of a - previous generation. The timestamp will let it recognize this 'ancient - packet'. - -/proc/sys/net/ipv4/tcp_tw_recycle - Enable fast recycling TIME-WAIT sockets. Default value is 1. It should - not be changed without advice/request of technical experts. - -/proc/sys/net/ipv4/tcp_window_scaling - TCP/IP normally allows windows up to 65535 bytes big. For really fast - networks, this may not be enough. The window scaling options allows for - almost gigabyte windows, which is good for high bandwidth*delay products. - - ------------------------------------------------------------------------------ -13.2.2. Per device settings - -DEV can either stand for a real interface, or for 'all' or 'default'. Default -also changes settings for interfaces yet to be created. 
- -/proc/sys/net/ipv4/conf/DEV/accept_redirects - If a router decides that you are using it for a wrong purpose (ie, it - needs to resend your packet on the same interface), it will send us a - ICMP Redirect. This is a slight security risk however, so you may want to - turn it off, or use secure redirects. - -/proc/sys/net/ipv4/conf/DEV/accept_source_route - Not used very much anymore. You used to be able to give a packet a list - of IP addresses it should visit on its way. Linux can be made to honor - this IP option. - -/proc/sys/net/ipv4/conf/DEV/bootp_relay - Accept packets with source address 0.b.c.d with destinations not to this - host as local ones. It is supposed that a BOOTP relay daemon will catch - and forward such packets. - - The default is 0, since this feature is not implemented yet (kernel - version 2.2.12). - -/proc/sys/net/ipv4/conf/DEV/forwarding - Enable or disable IP forwarding on this interface. - -/proc/sys/net/ipv4/conf/DEV/log_martians - See the section on Reverse Path Filtering. - -/proc/sys/net/ipv4/conf/DEV/mc_forwarding - If we do multicast forwarding on this interface - -/proc/sys/net/ipv4/conf/DEV/proxy_arp - If you set this to 1, this interface will respond to ARP requests for - addresses the kernel has routes to. Can be very useful when building 'ip - pseudo bridges'. Do take care that your netmasks are very correct before - enabling this! Also be aware that the rp_filter, mentioned elsewhere, - also operates on ARP queries! - -/proc/sys/net/ipv4/conf/DEV/rp_filter - See the section on Reverse Path Filtering. - -/proc/sys/net/ipv4/conf/DEV/secure_redirects - Accept ICMP redirect messages only for gateways, listed in default - gateway list. Enabled by default. - -/proc/sys/net/ipv4/conf/DEV/send_redirects - If we send the above mentioned redirects. - -/proc/sys/net/ipv4/conf/DEV/shared_media - If it is not set the kernel does not assume that different subnets on - this device can communicate directly. Default setting is 'yes'. 
- -/proc/sys/net/ipv4/conf/DEV/tag - FIXME: fill this in - - ------------------------------------------------------------------------------ -13.2.3. Neighbor policy - -DEV can either stand for a real interface, or for 'all' or 'default'. Default -also changes settings for interfaces yet to be created. - -/proc/sys/net/ipv4/neigh/DEV/anycast_delay - Maximum for random delay of answers to neighbor solicitation messages in - jiffies (1/100 sec). Not yet implemented (Linux does not have anycast - support yet). - -/proc/sys/net/ipv4/neigh/DEV/app_solicit - Determines the number of requests to send to the user level ARP daemon. - Use 0 to turn off. - -/proc/sys/net/ipv4/neigh/DEV/base_reachable_time - A base value used for computing the random reachable time value as - specified in RFC2461. - -/proc/sys/net/ipv4/neigh/DEV/delay_first_probe_time - Delay for the first time probe if the neighbor is reachable. (see - gc_stale_time) - -/proc/sys/net/ipv4/neigh/DEV/gc_stale_time - Determines how often to check for stale ARP entries. After an ARP entry - is stale it will be resolved again (which is useful when an IP address - migrates to another machine). When ucast_solicit is greater than 0 it - first tries to send an ARP packet directly to the known host. When that - fails and mcast_solicit is greater than 0, an ARP request is broadcast. - -/proc/sys/net/ipv4/neigh/DEV/locktime - An ARP/neighbor entry is only replaced with a new one if the old is at - least locktime old. This prevents ARP cache thrashing. - -/proc/sys/net/ipv4/neigh/DEV/mcast_solicit - Maximum number of retries for multicast solicitation. - -/proc/sys/net/ipv4/neigh/DEV/proxy_delay - Maximum time (real time is random [0..proxytime]) before answering to an - ARP request for which we have a proxy ARP entry. In some cases, this is - used to prevent network flooding. - -/proc/sys/net/ipv4/neigh/DEV/proxy_qlen - Maximum queue length of the delayed proxy arp timer. (see proxy_delay). 
- -/proc/sys/net/ipv4/neigh/DEV/retrans_time - The time, expressed in jiffies (1/100 sec), between retransmitted - Neighbor Solicitation messages. Used for address resolution and to - determine if a neighbor is unreachable. - -/proc/sys/net/ipv4/neigh/DEV/ucast_solicit - Maximum number of retries for unicast solicitation. - -/proc/sys/net/ipv4/neigh/DEV/unres_qlen - Maximum queue length for a pending arp request - the number of packets - which are accepted from other layers while the ARP address is still - resolved. - -Internet QoS: Architectures and Mechanisms for Quality of Service, Zheng - Wang, ISBN 1-55860-608-4 - Hardcover textbook covering topics related to Quality of Service. Good - for understanding basic concepts. - - ------------------------------------------------------------------------------ -13.2.4. Routing settings - -/proc/sys/net/ipv4/route/error_burst - These parameters are used to limit the warning messages written to the - kernel log from the routing code. The higher the error_cost factor is, - the fewer messages will be written. Error_burst controls when messages - will be dropped. The default settings limit warning messages to one every - five seconds. - -/proc/sys/net/ipv4/route/error_cost - These parameters are used to limit the warning messages written to the - kernel log from the routing code. The higher the error_cost factor is, - the fewer messages will be written. Error_burst controls when messages - will be dropped. The default settings limit warning messages to one every - five seconds. - -/proc/sys/net/ipv4/route/flush - Writing to this file results in a flush of the routing cache. - -/proc/sys/net/ipv4/route/gc_elasticity - Values to control the frequency and behavior of the garbage collection - algorithm for the routing cache. This can be important for when doing - fail over. At least gc_timeout seconds will elapse before Linux will skip - to another route because the previous one has died. 
By default set to - 300, you may want to lower it if you want to have a speedy fail over. - - Also see [http://mailman.ds9a.nl/pipermail/lartc/2002q1/002667.html] this - post by Ard van Breemen. - -/proc/sys/net/ipv4/route/gc_interval - See /proc/sys/net/ipv4/route/gc_elasticity. - -/proc/sys/net/ipv4/route/gc_min_interval - See /proc/sys/net/ipv4/route/gc_elasticity. - -/proc/sys/net/ipv4/route/gc_thresh - See /proc/sys/net/ipv4/route/gc_elasticity. - -/proc/sys/net/ipv4/route/gc_timeout - See /proc/sys/net/ipv4/route/gc_elasticity. - -/proc/sys/net/ipv4/route/max_delay - Delays for flushing the routing cache. - -/proc/sys/net/ipv4/route/max_size - Maximum size of the routing cache. Old entries will be purged once the - cache has reached this size. - -/proc/sys/net/ipv4/route/min_adv_mss - FIXME: fill this in - -/proc/sys/net/ipv4/route/min_delay - Delays for flushing the routing cache. - -/proc/sys/net/ipv4/route/min_pmtu - FIXME: fill this in - -/proc/sys/net/ipv4/route/mtu_expires - FIXME: fill this in - -/proc/sys/net/ipv4/route/redirect_load - Factors which determine if more ICMP redirects should be sent to a - specific host. No redirects will be sent once the load limit or the - maximum number of redirects has been reached. - -/proc/sys/net/ipv4/route/redirect_number - See /proc/sys/net/ipv4/route/redirect_load. - -/proc/sys/net/ipv4/route/redirect_silence - Timeout for redirects. After this period redirects will be sent again, - even if this has been stopped, because the load or number limit has been - reached. - - ------------------------------------------------------------------------------ -Chapter 14. Advanced & less common queueing disciplines - -Should you find that you have needs not addressed by the queues mentioned -earlier, the kernel contains some other more specialized queues mentioned -here. ------------------------------------------------------------------------------ - -14.1. 
bfifo/pfifo - -These classless queues are even simpler than pfifo_fast in that they lack the -internal bands - all traffic is really equal. They have one important benefit -though, they have some statistics. So even if you don't need shaping or -prioritizing, you can use this qdisc to determine the backlog on your -interface. - -pfifo has a length measured in packets, bfifo in bytes. ------------------------------------------------------------------------------ - -14.1.1. Parameters & usage - -limit - Specifies the length of the queue. Measured in bytes for bfifo, in - packets for pfifo. Defaults to the interface txqueuelen (see pfifo_fast - chapter) packets long or txqueuelen*mtu bytes for bfifo. - - ------------------------------------------------------------------------------ -14.2. Clark-Shenker-Zhang algorithm (CSZ) - -This is so theoretical that not even Alexey (the main CBQ author) claims to -understand it. From his source: - - - David D. Clark, Scott Shenker and Lixia Zhang Supporting Real-Time - Applications in an Integrated Services Packet Network: Architecture and - Mechanism. - - As I understand it, the main idea is to create WFQ flows for each - guaranteed service and to allocate the rest of bandwith to dummy flow-0. - Flow-0 comprises the predictive services and the best effort traffic; it - is handled by a priority scheduler with the highest priority band - allocated for predictive services, and the rest --- to the best effort - packets. - - Note that in CSZ flows are NOT limited to their bandwidth. It is supposed - that the flow passed admission control at the edge of the QoS network and - it doesn't need further shaping. Any attempt to improve the flow or to - shape it to a token bucket at intermediate hops will introduce undesired - delays and raise jitter. - - At the moment CSZ is the only scheduler that provides true guaranteed - service. Another schemes (including CBQ) do not provide guaranteed delay - and randomize jitter." 
- - Does not currently seem like a good candidate to use, unless you've read - and understand the article mentioned. - ------------------------------------------------------------------------------ -14.3. DSMARK - - - Esteve Camps - - - This text is an extract from my thesis on QoS Support in Linux, September - 2000. - - -Source documents: - -  * [ftp://icaftp.epfl.ch/pub/linux/diffserv/misc/dsid-01.txt.gz] - Draft-almesberger-wajhak-diffserv-linux-01.txt. - -  * Examples in iproute2 distribution. - -  * [http://www.qosforum.com/white-papers/qosprot_v3.pdf] White Paper-QoS - protocols and architectures and [http://www.qosforum.com/docs/faq] IP QoS - Frequently Asked Questions both by Quality of Service Forum. - - -This chapter was written by Esteve Camps. ------------------------------------------------------------------------------ - -14.3.1. Introduction - -First of all, it would be a great idea for you to read RFCs -written about this (RFC2474, RFC2475, RFC2597 and RFC2598) at [http:// -www.ietf.org/html.charters/diffserv-charter.html] IETF DiffServ working Group -web site and [http://diffserv.sf.net/] Werner Almesberger web site (he wrote -the code to support Differentiated Services on Linux). ------------------------------------------------------------------------------ - -14.3.2. What is Dsmark related to? - -Dsmark is a queueing discipline that offers the capabilities needed in -Differentiated Services (also called DiffServ or, simply, DS). DiffServ is -one of two current QoS architectures (the other one is called Integrated -Services) that is based on a value carried by packets in the DS field of the -IP header. - -One of the first solutions in IP designed to offer some QoS level was the -Type of Service field (TOS byte) in IP header. By changing that value, we -could choose a high/low level of throughput, delay or reliability. 
But this -didn't provide sufficient flexibility to the needs of new services (such as -real-time applications, interactive applications and others). After this, new -architectures appeared. One of these was DiffServ which kept the TOS bits and -renamed them the DS field. ------------------------------------------------------------------------------ - -14.3.3. Differentiated Services guidelines - -Differentiated Services is group-oriented. I mean, we don't know anything -about flows (this will be the Integrated Services purpose); we know about -flow aggregations and we will apply different behaviours depending on which -aggregation a packet belongs to. - -When a packet arrives to an edge node (entry node to a DiffServ domain) -entering to a DiffServ Domain we'll have to police, shape and/or mark those -packets (marking refers to assigning a value to the DS field. It's just like -the cows :-) ). This will be the mark/value that the internal/core nodes on -our DiffServ Domain will look at to determine which behaviour or QoS level -to apply. - -As you can deduce, Differentiated Services involves a domain on which all DS -rules will have to be applied. In fact you can think I will classify all the -packets entering my domain. Once they enter my domain they will be subjected -to the rules that my classification dictates and every traversed node will -apply that QoS level. - -In fact, you can apply your own policies into your local domains, but some -Service Level Agreements should be considered when connecting to other DS -domains. - -At this point, you may have a lot of questions. DiffServ is more than I've -explained. In fact, you can understand that I can not summarize more than 3 RFCs -in just 50 lines :-). ------------------------------------------------------------------------------ - -14.3.4. Working with Dsmark - -As the DiffServ bibliography specifies, we differentiate boundary nodes and -interior nodes. These are two important points in the traffic path. 
Both -types perform a classification when the packets arrive. Its result may be -used in different places along the DS process before the packet is released -to the network. It's just because of this that the diffserv code supplies a -structure called sk_buff, including a new field called skb->tc_index where -we'll store the result of initial classification that may be used in several -points in DS treatment. - -The skb->tc_index value will be initially set by the DSMARK qdisc, retrieving -it from the DS field in IP header of every received packet. Besides, -cls_tcindex classifier will read all or part of skb->tc_index value and use it -to select classes. - -But, first of all, take a look at DSMARK qdisc command and its parameters: -+---------------------------------------------------------------------------+ -|... dsmark indices INDICES [ default_index DEFAULT_INDEX ] [ set_tc_index ]| -+---------------------------------------------------------------------------+ -What do these parameters mean? - -  * indices: size of table of (mask,value) pairs. Maximum value is 2^n, where - n>=0. - -  * Default_index: the default table entry index if classifier finds no - match. - -  * Set_tc_index: instructs dsmark discipline to retrieve the DS field and - store it onto skb->tc_index. - - -Let's see the DSMARK process. ------------------------------------------------------------------------------ - -14.3.5. How SCH_DSMARK works. - -This qdisc will apply the next steps: - -  * If we have declared set_tc_index option in qdisc command, DS field is - retrieved and stored onto skb->tc_index variable. - -  * Classifier is invoked. The classifier will be executed and it will return - a class ID that will be stored in skb->tc_index variable. If no filter - matches are found, we consider the default_index option to be the classId - to store. If neither set_tc_index nor default_index has been declared - results may be unpredictable. 
- -  * After been sent to internal qdiscs where you can reuse the result of the - filter, the classid returned by the internal qdisc is stored into skb-> - tc_index. We will use this value in the future to index a mask- value - table. The final result to assign to the packet will be that resulting - from next operation: - +---------------------------------------------------------------+ - |New_Ds_field = ( Old_DS_field & mask ) | value | - +---------------------------------------------------------------+ - - -  * Thus, new value will result from "anding" ds_field and mask values and - next, this result "ORed" with value parameter. See next diagram to - understand all this process: - - -+---------------------------------------------------------------------------------------+ -| skb->ihp->tos | -|- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - > | -| | | ^ | -| | -- If you declare set_tc_index, we set DS | | <-----May change | -| | value into skb->tc_index variable | |O DS field | -| | A| |R | -| +-|-+ +------+ +---+-+ Internal +-+ +---N|-----|----+ | -| | | | | tc |--->| | |--> . . . -->| | | D| | | | -| | | |----->|index |--->| | | Qdisc | |---->| v | | | -| | | | |filter|--->| | | +---------------+ | ---->(mask,value) | | -|-->| O | +------+ +-|-+--------------^----+ / | (. , .) | | -| | | | ^ | | | | (. , .) | | -| | | +----------|---------|----------------|-------|--+ (. , .) | | -| | | sch_dsmark | | | | | | -| +-|------------|---------|----------------|-------|------------------+ | -| | | | <- tc_index -> | | | -| | |(read) | may change | | <--------------Index to the | -| | | | | | (mask,value) | -| v | v v | pairs table | -|- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -> | -| skb->tc_index | -+---------------------------------------------------------------------------------------+ - - -How to do marking? Just change the mask and value of the class you want to -remark. 
See next line of code: -+---------------------------------------------------------------------------+ -|tc class change dev eth0 classid 1:1 dsmark mask 0x3 value 0xb8 | -+---------------------------------------------------------------------------+ -This changes the (mask,value) pair in hash table, to remark packets belonging -to class 1:1.You have to "change" this values because of default values that -(mask,value) gets initially (see table below). - -Now, we'll explain how TC_INDEX filter works and how fits into this. Besides, -TCINDEX filter can be used in other configurations rather than those -including DS services. ------------------------------------------------------------------------------ - -14.3.6. TC_INDEX Filter - -This is the basic command to declare a TC_INDEX filter: -+---------------------------------------------------------------------------+ -|... tcindex [ hash SIZE ] [ mask MASK ] [ shift SHIFT ] | -| [ pass_on | fall_through ] | -| [ classid CLASSID ] [ police POLICE_SPEC ] | -+---------------------------------------------------------------------------+ -Next, we show the example used to explain TC_INDEX operation mode. Pay -attention to bolded words: tc qdisc add dev eth0 handle 1:0 root dsmark -indices 64 set_tc_index tc filter add dev eth0 parent 1:0 protocol ip prio 1 -tcindex mask 0xfc shift 2 tc qdisc add dev eth0 parent 1:0 handle 2:0 cbq -bandwidth 10Mbit cell 8 avpkt 1000 mpu 64 # EF traffic class tc class add dev -eth0 parent 2:0 classid 2:1 cbq bandwidth 10Mbit rate 1500Kbit avpkt 1000 -prio 1 bounded isolated allot 1514 weight 1 maxburst 10 # Packet fifo qdisc -for EF traffic tc qdisc add dev eth0 parent 2:1 pfifo limit 5 tc filter add -dev eth0 parent 2:0 protocol ip prio 1 handle 0x2e tcindex classid 2:1 -pass_on (This code is not complete. It's just an extract from EFCBQ example -included in iproute2 distribution). - -First of all, suppose we receive a packet marked as EF . 
If you read RFC2598, -you'll see that DSCP recommended value for EF traffic is 101110. This means -that DS field will be 10111000 (remember that less significant bits in TOS -byte are not used in DS) or 0xb8 in hexadecimal codification. - - -+----------------------------------------------------------------------------------+ -| TC INDEX | -| FILTER | -| +---+ +-------+ +---+-+ +------+ +-+ +-------+ | -| | | | | | | | |FILTER| +-+ +-+ | | | | | -| | |----->| MASK | -> | | | -> |HANDLE|->| | | | -> | | -> | | | -| | | . | =0xfc | | | | |0x2E | | +----+ | | | | | | -| | | . | | | | | +------+ +--------+ | | | | | -| | | . | | | | | | | | | | -|-->| | . | SHIFT | | | | | | | |--> | -| | | . | =2 | | | +----------------------------+ | | | | -| | | | | | | CBQ 2:0 | | | | -| | | +-------+ +---+--------------------------------+ | | | -| | | | | | -| | +-------------------------------------------------------------+ | | -| | DSMARK 1:0 | | -| +-------------------------------------------------------------------------+ | -+----------------------------------------------------------------------------------+ - - -The packet arrives, then, set with 0xb8 value at DS field. As we explained -before, dsmark qdisc identified by 1:0 id in the example, retrieves DS field -and store it in skb->tc_index variable. Next step in the example will -correspond to the filter associated to this qdisc (second line in the -example). This will perform next operations: -+---------------------------------------------------------------------------+ -|Value1 = skb->tc_index & MASK | -|Key = Value1 >> SHIFT | -+---------------------------------------------------------------------------+ - - -In the example, MASK=0xFC i SHIFT=2. 
-+---------------------------------------------------------------------------+ -|Value1 = 10111000 & 11111100 = 10111000 | -|Key = 10111000 >> 2 = 00101110 -> 0x2E in hexadecimal | -+---------------------------------------------------------------------------+ - - -The returned value will correspond to a qdisc internal filter handle (in the -example, identifier 2:0). If a filter with this id exists, policing and -metering conditions will be verified (in case that filter includes this) and -the classid will be returned (in our example, classid 2:1) and stored in skb- ->tc_index variable. - -But if no filter with that identifier is found, the result will depend on -fall_through flag declaration. If it is set, the key value is returned as classid. If -not, an error is returned and the process continues with the remaining filters. Be -careful if you use fall_through flag; this can be done if a simple relation -exists between values of skb->tc_index variable and class ids. - -The last parameters to comment on are hash and pass_on. The first one -relates to hash table size. Pass_on will be used to indicate that if no -classid equal to the result of this filter is found, try next filter. The -default action is fall_through (look at next table). - -Finally, let's see which possible values can be set to all these TCINDEX -parameters: -+---------------------------------------------------------------------------+ -|TC Name Value Default | -|----------------------------------------------------------------- | -|Hash 1...0x10000 Implementation dependent | -|Mask 0...0xffff 0xffff | -|Shift 0...15 0 | -|Fall through / Pass_on Flag Fall_through | -|Classid Major:minor None | -|Police ..... None | -+---------------------------------------------------------------------------+ - - -This kind of filter is very powerful. It's necessary to explore all -possibilities. Besides, this filter is not only used in DiffServ -configurations. You can use it as any other kind of filter. 
- -I recommend that you look at all DiffServ examples included in iproute2 -distribution. I promise I will try to complement this text as soon as I can. -Besides, all I have explained is the result of a lot of tests. I would be -grateful if you told me where I am wrong on any point. ------------------------------------------------------------------------------ - -14.4. Ingress qdisc - -All qdiscs discussed so far are egress qdiscs. Each interface however can -also have an ingress qdisc which is not used to send packets out to the -network adaptor. Instead, it allows you to apply tc filters to packets coming -in over the interface, regardless of whether they have a local destination or -are to be forwarded. - -As the tc filters contain a full Token Bucket Filter implementation, and are -also able to match on the kernel flow estimator, there is a lot of -functionality available. This effectively allows you to police incoming -traffic, before it even enters the IP stack. ------------------------------------------------------------------------------ - -14.4.1. Parameters & usage - -The ingress qdisc itself does not require any parameters. It differs from -other qdiscs in that it does not occupy the root of a device. Attach it like -this: -+---------------------------------------------------------------------------+ -|# tc qdisc add dev eth0 ingress | -+---------------------------------------------------------------------------+ -This allows you to have other, sending, qdiscs on your device besides the -ingress qdisc. - -For a contrived example how the ingress qdisc could be used, see the -Cookbook. ------------------------------------------------------------------------------ - -14.5. Random Early Detection (RED) - -This section is meant as an introduction to backbone routing, which often -involves >100 megabit bandwidths, which requires a different approach than -your ADSL modem at home. - -The normal behaviour of router queues on the Internet is called tail-drop. 
-Tail-drop works by queueing up to a certain amount, then dropping all traffic -that 'spills over'. This is very unfair, and also leads to retransmit -synchronization. When retransmit synchronization occurs, the sudden burst of -drops from a router that has reached its fill will cause a delayed burst of -retransmits, which will over fill the congested router again. - -In order to cope with transient congestion on links, backbone routers will -often implement large queues. Unfortunately, while these queues are good for -throughput, they can substantially increase latency and cause TCP connections -to behave very burstily during congestion. - -These issues with tail-drop are becoming increasingly troublesome on the -Internet because the use of network unfriendly applications is increasing. -The Linux kernel offers us RED, short for Random Early Detect, also called -Random Early Drop, as that is how it works. - -RED isn't a cure-all for this, applications which inappropriately fail to -implement exponential backoff still get an unfair share of the bandwidth, -however, with RED they do not cause as much harm to the throughput and -latency of other connections. - -RED statistically drops packets from flows before it reaches its hard limit. -This causes a congested backbone link to slow more gracefully, and prevents -retransmit synchronization. This also helps TCP find its 'fair' speed faster -by allowing some packets to get dropped sooner keeping queue sizes low and -latency under control. The probability of a packet being dropped from a -particular connection is proportional to its bandwidth usage rather than the -number of packets it transmits. - -RED is a good queue for backbones, where you can't afford the complexity of -per-session state tracking needed by fairness queueing. - -In order to use RED, you must decide on three parameters: Min, Max, and -burst. 
Min sets the minimum queue size in bytes before dropping will begin, -Max is a soft maximum that the algorithm will attempt to stay under, and -burst sets the maximum number of packets that can 'burst through'. - -You should set the min by calculating the highest acceptable base queueing -latency you wish, and multiply it by your bandwidth. For instance, on my -64kbit/s ISDN link, I might want a base queueing latency of 200ms so I set -min to 1600 bytes. Setting min too small will degrade throughput and too -large will degrade latency. Setting a small min is not a replacement for -reducing the MTU on a slow link to improve interactive response. - -You should make max at least twice min to prevent synchronization. On slow -links with small Min's it might be wise to make max perhaps four or more -times larger than min. - -Burst controls how the RED algorithm responds to bursts. Burst must be set -larger than min/avpkt. Experimentally, I've found (min+min+max)/(3*avpkt) to -work ok. - -Additionally, you need to set limit and avpkt. Limit is a safety value, after -there are limit bytes in the queue, RED 'turns into' tail-drop. I typically set -limit to eight times max. Avpkt should be your average packet size. 1000 -works OK on high speed Internet links with a 1500byte MTU. - -Read [http://www.aciri.org/floyd/papers/red/red.html] the paper on RED -queueing by Sally Floyd and Van Jacobson for technical information. ------------------------------------------------------------------------------ - -14.6. Generic Random Early Detection - -Not a lot is known about GRED. It looks like RED with several internal -queues, whereby the internal queue is chosen based on the Diffserv tcindex -field. According to a slide found [http://www.davin.ottawa.on.ca/ols/ -img22.htm] here, it contains the capabilities of Cisco's 'Distributed -Weighted RED', as well as Dave Clark's RIO. - -Each virtual queue can have its own Drop Parameters specified. 
- -FIXME: get Jamal or Werner to tell us more ------------------------------------------------------------------------------ - -14.7. VC/ATM emulation - -This is quite a major effort by Werner Almesberger to allow you to build -Virtual Circuits over TCP/IP sockets. A Virtual Circuit is a concept from ATM -network theory. - -For more information, see the [http://linux-atm.sourceforge.net/] ATM on -Linux homepage. ------------------------------------------------------------------------------ - -14.8. Weighted Round Robin (WRR) - -This qdisc is not included in the standard kernels but can be downloaded from -[http://wipl-wrr.dkik.dk/wrr/] ??. Currently the qdisc is only tested with -Linux 2.2 kernels but it will probably work with 2.4/2.5 kernels too. - -The WRR qdisc distributes bandwidth between its classes using the weighted -round robin scheme. That is, like the CBQ qdisc it contains classes into -which arbitrary qdiscs can be plugged. All classes which have sufficient -demand will get bandwidth proportional to the weights associated with the -classes. The weights can be set manually using the tc program. But they can -also be made automatically decreasing for classes transferring much data. - -The qdisc has a built-in classifier which assigns packets coming from or sent -to different machines to different classes. Either the MAC or IP and either -source or destination addresses can be used. The MAC address can only be used -when the Linux box is acting as an ethernet bridge, however. The classes are -automatically assigned to machines based on the packets seen. - -The qdisc can be very useful at sites such as dorms where a lot of unrelated -individuals share an Internet connection. A set of scripts setting up a -relevant behavior for such a site is a central part of the WRR distribution. ------------------------------------------------------------------------------ - -Chapter 15. 
Cookbook - -This section contains 'cookbook' entries which may help you solve problems. A -cookbook is no replacement for understanding however, so try and comprehend -what is going on. ------------------------------------------------------------------------------ - -15.1. Running multiple sites with different SLAs - -You can do this in several ways. Apache has some support for this with a -module, but we'll show how Linux can do this for you, and do so for other -services as well. These commands are stolen from a presentation by Jamal Hadi -that's referenced below. - -Let's say we have two customers, with http, ftp and streaming audio, and we -want to sell them a limited amount of bandwidth. We do so on the server -itself. - -Customer A should have at most 2 megabits, customer B has paid for 5 -megabits. We separate our customers by creating virtual IP addresses on our -server. - - -+---------------------------------------------------------------------------+ -|# ip address add 188.177.166.1 dev eth0 | -|# ip address add 188.177.166.2 dev eth0 | -+---------------------------------------------------------------------------+ - - -It is up to you to attach the different servers to the right IP address. All -popular daemons have support for this. 
- -We first attach a CBQ qdisc to eth0: -+--------------------------------------------------------------------------------+ -|# tc qdisc add dev eth0 root handle 1: cbq bandwidth 10Mbit cell 8 avpkt 1000 \ | -| mpu 64 | -+--------------------------------------------------------------------------------+ - - -We then create classes for our customers: - - -+---------------------------------------------------------------------------+ -|# tc class add dev eth0 parent 1:0 classid 1:1 cbq bandwidth 10Mbit rate \ | -| 2MBit avpkt 1000 prio 5 bounded isolated allot 1514 weight 1 maxburst 21 | -|# tc class add dev eth0 parent 1:0 classid 1:2 cbq bandwidth 10Mbit rate \ | -| 5Mbit avpkt 1000 prio 5 bounded isolated allot 1514 weight 1 maxburst 21 | -+---------------------------------------------------------------------------+ - - -Then we add filters for our two classes: -+-------------------------------------------------------------------------------+ -|##FIXME: Why this line, what does it do?, what is a divisor?: | -|##FIXME: A divisor has something to do with a hash table, and the number of | -|## buckets - ahu | -|# tc filter add dev eth0 parent 1:0 protocol ip prio 5 handle 1: u32 divisor 1 | -|# tc filter add dev eth0 parent 1:0 prio 5 u32 match ip src 188.177.166.1 | -| flowid 1:1 | -|# tc filter add dev eth0 parent 1:0 prio 5 u32 match ip src 188.177.166.2 | -| flowid 1:2 | -+-------------------------------------------------------------------------------+ - - -And we're done. - -FIXME: why no token bucket filter? is there a default pfifo_fast fallback -somewhere? ------------------------------------------------------------------------------ - -15.2. Protecting your host from SYN floods - ->From Alexey's iproute documentation, adapted to netfilter and with more -plausible paths. If you use this, take care to adjust the numbers to -reasonable values for your system. - -If you want to protect an entire network, skip this script, which is best -suited for a single host. 
- -It appears that you need the very latest version of the iproute2 tools to get -this to work with 2.4.0. - - -+---------------------------------------------------------------------------+ -|#! /bin/sh -x | -|# | -|# sample script on using the ingress capabilities | -|# this script shows how one can rate limit incoming SYNs | -|# Useful for TCP-SYN attack protection. You can use | -|# IPchains to have more powerful additions to the SYN (eg | -|# in addition the subnet) | -|# | -|#path to various utilities; | -|#change to reflect yours. | -|# | -|TC=/sbin/tc | -|IP=/sbin/ip | -|IPTABLES=/sbin/iptables | -|INDEV=eth2 | -|# | -|# tag all incoming SYN packets through $INDEV as mark value 1 | -|############################################################ | -|$iptables -A PREROUTING -i $INDEV -t mangle -p tcp --syn \ | -| -j MARK --set-mark 1 | -|############################################################ | -|# | -|# install the ingress qdisc on the ingress interface | -|############################################################ | -|$TC qdisc add dev $INDEV handle ffff: ingress | -|############################################################ | -| | -|# | -|# | -|# SYN packets are 40 bytes (320 bits) so three SYNs equals | -|# 960 bits (approximately 1kbit); so we rate limit below | -|# the incoming SYNs to 3/sec (not very useful really; but | -|#serves to show the point - JHS | -|############################################################ | -|$TC filter add dev $INDEV parent ffff: protocol ip prio 50 handle 1 fw \ | -|police rate 1kbit burst 40 mtu 9k drop flowid :1 | -|############################################################ | -| | -| | -|# | -|echo "---- qdisc parameters Ingress ----------" | -|$TC qdisc ls dev $INDEV | -|echo "---- Class parameters Ingress ----------" | -|$TC class ls dev $INDEV | -|echo "---- filter parameters Ingress ----------" | -|$TC filter ls dev $INDEV parent ffff: | -| | -|#deleting the ingress qdisc | -|#$TC qdisc del $INDEV ingress 
| -+---------------------------------------------------------------------------+ - ------------------------------------------------------------------------------ - -15.3. Rate limit ICMP to prevent dDoS - -Recently, distributed denial of service attacks have become a major nuisance -on the Internet. By properly filtering and rate limiting your network, you -can both prevent becoming a casualty or the cause of these attacks. - -You should filter your networks so that you do not allow non-local IP source -addressed packets to leave your network. This stops people from anonymously -sending junk to the Internet. - -Rate limiting goes much as shown earlier. To refresh your memory, our -ASCIIgram again: - - -+---------------------------------------------------------------------------+ -|[The Internet] ------ [Linux router] --- [Office+ISP] | -| eth1 eth0 | -+---------------------------------------------------------------------------+ - - -We first set up the prerequisite parts: - - -+-----------------------------------------------------------------------------+ -|# tc qdisc add dev eth0 root handle 10: cbq bandwidth 10Mbit avpkt 1000 | -|# tc class add dev eth0 parent 10:0 classid 10:1 cbq bandwidth 10Mbit rate \ | -| 10Mbit allot 1514 prio 5 maxburst 20 avpkt 1000 | -+-----------------------------------------------------------------------------+ - - -If you have 100Mbit, or more, interfaces, adjust these numbers. Now you need -to determine how much ICMP traffic you want to allow. You can perform -measurements with tcpdump, by having it write to a file for a while, and -seeing how much ICMP passes your network. Do not forget to raise the snapshot -length! - -If measurement is impractical, you might want to choose 5% of your available -bandwidth. 
Let's set up our class: -+-------------------------------------------------------------------------------+ -|# tc class add dev eth0 parent 10:1 classid 10:100 cbq bandwidth 10Mbit rate \ | -| 100Kbit allot 1514 weight 800Kbit prio 5 maxburst 20 avpkt 250 \ | -| bounded | -+-------------------------------------------------------------------------------+ - - -This limits at 100Kbit. Now we need a filter to assign ICMP traffic to this -class: -+---------------------------------------------------------------------------+ -|# tc filter add dev eth0 parent 10:0 protocol ip prio 100 u32 match ip | -| protocol 1 0xFF flowid 10:100 | -+---------------------------------------------------------------------------+ - ------------------------------------------------------------------------------ - -15.4. Prioritizing interactive traffic - -If lots of data is coming down your link, or going up for that matter, and -you are trying to do some maintenance via telnet or ssh, this may not go too -well. Other packets are blocking your keystrokes. Wouldn't it be great if -there were a way for your interactive packets to sneak past the bulk traffic? -Linux can do this for you! - -As before, we need to handle traffic going both ways. Evidently, this works -best if there are Linux boxes on both ends of your link, although other -UNIX's are able to do this. Consult your local Solaris/BSD guru for this. - -The standard pfifo_fast scheduler has 3 different 'bands'. Traffic in band 0 -is transmitted first, after which traffic in band 1 and 2 gets considered. It -is vital that our interactive traffic be in band 0! - -We blatantly adapt from the (soon to be obsolete) ipchains HOWTO: - -There are four seldom-used bits in the IP header, called the Type of Service -(TOS) bits. They effect the way packets are treated; the four bits are -"Minimum Delay", "Maximum Throughput", "Maximum Reliability" and "Minimum -Cost". Only one of these bits is allowed to be set. 
Rob van Nieuwkerk, the -author of the ipchains TOS-mangling code, puts it as follows: - - -+---------------------------------------------------------------------------+ -|Especially the "Minimum Delay" is important for me. I switch it on for | -|"interactive" packets in my upstream (Linux) router. I'm | -|behind a 33k6 modem link. Linux prioritizes packets in 3 queues. This | -|way I get acceptable interactive performance while doing bulk | -|downloads at the same time. | -+---------------------------------------------------------------------------+ - - -The most common use is to set telnet & ftp control connections to "Minimum -Delay" and FTP data to "Maximum Throughput". This would be done as follows, -on your upstream router: - - -+---------------------------------------------------------------------------+ -|# iptables -A PREROUTING -t mangle -p tcp --sport telnet \ | -| -j TOS --set-tos Minimize-Delay | -|# iptables -A PREROUTING -t mangle -p tcp --sport ftp \ | -| -j TOS --set-tos Minimize-Delay | -|# iptables -A PREROUTING -t mangle -p tcp --sport ftp-data \ | -| -j TOS --set-tos Maximize-Throughput | -+---------------------------------------------------------------------------+ - - -Now, this only works for data going from your telnet foreign host to your -local computer. The other way around appears to be done for you, ie, telnet, -ssh & friends all set the TOS field on outgoing packets automatically. - -Should you have an application that does not do this, you can always do it -with netfilter. 
On your local box: - - -+---------------------------------------------------------------------------+ -|# iptables -A OUTPUT -t mangle -p tcp --dport telnet \ | -| -j TOS --set-tos Minimize-Delay | -|# iptables -A OUTPUT -t mangle -p tcp --dport ftp \ | -| -j TOS --set-tos Minimize-Delay | -|# iptables -A OUTPUT -t mangle -p tcp --dport ftp-data \ | -| -j TOS --set-tos Maximize-Throughput | -+---------------------------------------------------------------------------+ - ------------------------------------------------------------------------------ - -15.5. Transparent web-caching using netfilter, iproute2, ipchains and squid - -This section was sent in by reader Ram Narula from Internet for Education -(Thailand). - -The regular technique in accomplishing this in Linux is probably with use of -ipchains AFTER making sure that the "outgoing" port 80(web) traffic gets -routed through the server running squid. - -There are 3 common methods to make sure "outgoing" port 80 traffic gets -routed to the server running squid and 4th one is being introduced here. - -Making the gateway router do it. - If you can tell your gateway router to match packets that has outgoing - destination port of 80 to be sent to the IP address of squid server. - - BUT - - This would put additional load on the router and some commercial routers - might not even support this. - -Using a Layer 4 switch. - Layer 4 switches can handle this without any problem. - - BUT - - The cost for this equipment is usually very high. Typical layer 4 switch - would normally cost more than a typical router+good linux server. - -Using cache server as network's gateway. - You can force ALL traffic through cache server. - - BUT - - This is quite risky because Squid does utilize lots of CPU power which - might result in slower over-all network performance or the server itself - might crash and no one on the network will be able to access the Internet - if that occurs. - -Linux+NetFilter router. 
- By using NetFilter another technique can be implemented which is using - NetFilter for "mark"ing the packets with destination port 80 and using - iproute2 to route the "mark"ed packets to the Squid server. - - -+---------------------------------------------------------------------------+ -||----------------| | -|| Implementation | | -||----------------| | -| | -| Addresses used | -| 10.0.0.1 naret (NetFilter server) | -| 10.0.0.2 silom (Squid server) | -| 10.0.0.3 donmuang (Router connected to the Internet) | -| 10.0.0.4 kaosarn (other server on network) | -| 10.0.0.5 RAS | -| 10.0.0.0/24 main network | -| 10.0.0.0/19 total network | -| | -||---------------| | -||Network diagram| | -||---------------| | -| | -|Internet | -|| | -|donmuang | -|| | -|------------hub/switch---------- | -|| | | | | -|naret silom kaosarn RAS etc. | -+---------------------------------------------------------------------------+ -First, make all traffic pass through naret by making sure it is the default -gateway except for silom. Silom's default gateway has to be donmuang -(10.0.0.3) or this would create web traffic loop. - -(all servers on my network had 10.0.0.1 as the default gateway which was the -former IP address of donmuang router so what I did was changed the IP address -of donmuang to 10.0.0.3 and gave naret ip address of 10.0.0.1) - - -+---------------------------------------------------------------------------+ -|Silom | -|----- | -|-setup squid and ipchains | -+---------------------------------------------------------------------------+ - - -Setup Squid server on silom, make sure it does support transparent caching/ -proxying, the default port is usually 3128, so all traffic for port 80 has to -be redirected to port 3128 locally. 
This can be done by using ipchains with -the following: - - -+---------------------------------------------------------------------------+ -|silom# ipchains -N allow1 | -|silom# ipchains -A allow1 -p TCP -s 10.0.0.0/19 -d 0/0 80 -j REDIRECT 3128 | -|silom# ipchains -I input -j allow1 | -+---------------------------------------------------------------------------+ - - -Or, in netfilter lingo: -+-----------------------------------------------------------------------------------------+ -|silom# iptables -t nat -A PREROUTING -i eth0 -p tcp --dport 80 -j REDIRECT --to-port 3128| -+-----------------------------------------------------------------------------------------+ - - -(note: you might have other entries as well) - -For more information on setting Squid server please refer to Squid FAQ page -on [http://squid.nlanr.net] http://squid.nlanr.net). - -Make sure ip forwarding is enabled on this server and the default gateway for -this server is donmuang router (NOT naret). - - -+---------------------------------------------------------------------------+ -|Naret | -|----- | -|-setup iptables and iproute2 | -|-disable icmp REDIRECT messages (if needed) | -+---------------------------------------------------------------------------+ - - - - - 1. "Mark" packets of destination port 80 with value 2 - +--------------------------------------------------------------------+ - | | - |naret# iptables -A PREROUTING -i eth0 -t mangle -p tcp --dport 80 \ | - | -j MARK --set-mark 2 | - +--------------------------------------------------------------------+ - - - 2. 
Setup iproute2 so it will route packets with "mark" 2 to silom - +----------------------------------------------------------------+ - |naret# echo 202 www.out >> /etc/iproute2/rt_tables | - |naret# ip rule add fwmark 2 table www.out | - |naret# ip route add default via 10.0.0.2 dev eth0 table www.out | - |naret# ip route flush cache | - +----------------------------------------------------------------+ - - - If donmuang and naret is on the same subnet then naret should not send - out icmp REDIRECT messages. In this case it is, so icmp REDIRECTs has to - be disabled by: - +---------------------------------------------------------------+ - |naret# echo 0 > /proc/sys/net/ipv4/conf/all/send_redirects | - |naret# echo 0 > /proc/sys/net/ipv4/conf/default/send_redirects | - |naret# echo 0 > /proc/sys/net/ipv4/conf/eth0/send_redirects | - +---------------------------------------------------------------+ - - - - - -The setup is complete, check the configuration - - -+--------------------------------------------------------------------------------------+ -|On naret: | -| | -|naret# iptables -t mangle -L | -|Chain PREROUTING (policy ACCEPT) | -|target prot opt source destination | -|MARK tcp -- anywhere anywhere tcp dpt:www MARK set 0x2 | -| | -|Chain OUTPUT (policy ACCEPT) | -|target prot opt source destination | -| | -|naret# ip rule ls | -|0: from all lookup local | -|32765: from all fwmark 2 lookup www.out | -|32766: from all lookup main | -|32767: from all lookup default | -| | -|naret# ip route list table www.out | -|default via 203.114.224.8 dev eth0 | -| | -|naret# ip route | -|10.0.0.1 dev eth0 scope link | -|10.0.0.0/24 dev eth0 proto kernel scope link src 10.0.0.1 | -|127.0.0.0/8 dev lo scope link | -|default via 10.0.0.3 dev eth0 | -| | -|(make sure silom belongs to one of the above lines, in this case | -|it's the line with 10.0.0.0/24) | -| | -||------| | -||-DONE-| | -||------| | -| | 
-+--------------------------------------------------------------------------------------+ - ------------------------------------------------------------------------------ - -15.5.1. Traffic flow diagram after implementation - -+---------------------------------------------------------------------------+ -||-----------------------------------------| | -||Traffic flow diagram after implementation| | -||-----------------------------------------| | -| | -|INTERNET | -|/\ | -||| | -|\/ | -|-----------------donmuang router--------------------- | -|/\ /\ || | -||| || || | -||| \/ || | -|naret silom || | -|*destination port 80 traffic=========>(cache) || | -|/\ || || | -||| \/ \/ | -|\\===================================kaosarn, RAS, etc. | -+---------------------------------------------------------------------------+ - -Note that the network is asymmetric as there is one extra hop on general -outgoing path. - - -+---------------------------------------------------------------------------+ -|Here is run down for packet traversing the network from kaosarn | -|to and from the Internet. | -| | -|For web/http traffic: | -|kaosarn http request->naret->silom->donmuang->internet | -|http replies from Internet->donmuang->silom->kaosarn | -| | -|For non-web/http requests(eg. telnet): | -|kaosarn outgoing data->naret->donmuang->internet | -|incoming data from Internet->donmuang->kaosarn | -+---------------------------------------------------------------------------+ - ------------------------------------------------------------------------------ - -15.6. Circumventing Path MTU Discovery issues with per route MTU settings - -For sending bulk data, the Internet generally works better when using larger -packets. Each packet implies a routing decision, when sending a 1 megabyte -file, this can either mean around 700 packets when using packets that are as -large as possible, or 4000 if using the smallest default. 
- -However, not all parts of the Internet support full 1460 bytes of payload per -packet. It is therefore necessary to try and find the largest packet that -will 'fit', in order to optimize a connection. - -This process is called 'Path MTU Discovery', where MTU stands for 'Maximum -Transmission Unit.' - -When a router encounters a packet that's too big to send in one piece, AND -it has been flagged with the "Don't Fragment" bit, it returns an ICMP message -stating that it was forced to drop a packet because of this. The sending host -acts on this hint by sending smaller packets, and by iterating it can find -the optimum packet size for a connection over a certain path. - -This used to work well until the Internet was discovered by hooligans who do -their best to disrupt communications. This in turn led administrators to -either block or shape ICMP traffic in a misguided attempt to improve security -or robustness of their Internet service. - -What has happened now is that Path MTU Discovery is working less and less -well and fails for certain routes, which leads to strange TCP/IP sessions -which die after a while. - -Although I have no proof for this, two sites with which I used to have this problem -both run Alteon Acedirectors before the affected systems - perhaps -somebody more knowledgeable can provide clues as to why this happens. ------------------------------------------------------------------------------ - -15.6.1. Solution - -When you encounter sites that suffer from this problem, you can disable Path -MTU discovery by setting it manually. Koos van den Hout, slightly edited, -writes: - - - The following problem: I set the mtu/mru of my leased line running ppp to - 296 because it's only 33k6 and I cannot influence the queueing on the - other side. At 296, the response to a key press is within a reasonable - time frame. - - And, on my side I have a masqrouter running (of course) Linux. 
- - Recently I split 'server' and 'router' so most applications are run on a - different machine than the routing happens on. - - I then had trouble logging into irc. Big panic! Some digging did find out - that I got connected to irc, even showed up as 'connected' on irc but I - did not receive the motd from irc. I checked what could be wrong and - noted that I already had some previous trouble reaching certain websites - related to the MTU, since I had no trouble reaching them when the MTU was - 1500, the problem just showed when the MTU was set to 296. Since irc - servers block about every kind of traffic not needed for their immediate - operation, they also block icmp. - - I managed to convince the operators of a webserver that this was the - cause of a problem, but the irc server operators were not going to fix - this. - - So, I had to make sure outgoing masqueraded traffic started with the - lower mtu of the outside link. But I want local ethernet traffic to have - the normal mtu (for things like nfs traffic). - - Solution: - +-----------------------------------------------------------------------+ - |ip route add default via 10.0.0.1 mtu 296 | - +-----------------------------------------------------------------------+ - - (10.0.0.1 being the default gateway, the inside address of the - masquerading router) - -In general, it is possible to override PMTU Discovery by setting specific -routes. For example, if only a certain subnet is giving problems, this should -help: -+---------------------------------------------------------------------------+ -|ip route add 195.96.96.0/24 via 10.0.0.1 mtu 1000 | -+---------------------------------------------------------------------------+ ------------------------------------------------------------------------------ - -15.7. Circumventing Path MTU Discovery issues with MSS Clamping (for ADSL, -cable, PPPoE & PPtP users) - -As explained above, Path MTU Discovery doesn't work as well as it should -anymore. 
If you know for a fact that a hop somewhere in your network has a -limited (<1500) MTU, you cannot rely on PMTU Discovery finding this out. - -Besides MTU, there is yet another way to set the maximum packet size, the so -called Maximum Segment Size. This is a field in the TCP Options part of a SYN -packet. - -Recent Linux kernels, and a few PPPoE drivers (notably, the excellent Roaring -Penguin one), feature the possibility to 'clamp the MSS'. - -The good thing about this is that by setting the MSS value, you are telling -the remote side unequivocally 'do not ever try to send me packets bigger than -this value'. No ICMP traffic is needed to get this to work. - -The bad thing is that it's an obvious hack - it breaks 'end to end' by -modifying packets. Having said that, we use this trick in many places and it -works like a charm. - -In order for this to work you need at least iptables-1.2.1a and Linux 2.4.3 -or higher. The basic command line is: -+-----------------------------------------------------------------------------------+ -|# iptables -A FORWARD -p tcp --tcp-flags SYN,RST SYN -j TCPMSS --clamp-mss-to-pmtu| -+-----------------------------------------------------------------------------------+ - - -This calculates the proper MSS for your link. If you are feeling brave, or -think that you know best, you can also do something like this: - - -+----------------------------------------------------------------------------+ -|# iptables -A FORWARD -p tcp --tcp-flags SYN,RST SYN -j TCPMSS --set-mss 128| -+----------------------------------------------------------------------------+ - - -This sets the MSS of passing SYN packets to 128. Use this if you have VoIP -with tiny packets, and huge http packets which are causing chopping in your -voice calls. ------------------------------------------------------------------------------ - -15.8. 
The Ultimate Traffic Conditioner: Low Latency, Fast Up & Downloads - -Note: This script has recently been upgraded and previously only worked for -Linux clients in your network! So you might want to update if you have -Windows machines or Macs in your network and noticed that they were not able -to download faster while others were uploading. - -I attempted to create the holy grail: - -Maintain low latency for interactive traffic at all times - This means that downloading or uploading files should not disturb SSH or - even telnet. These are the most important things, even 200ms latency is - sluggish to work over. - -Allow 'surfing' at reasonable speeds while up or downloading - Even though http is 'bulk' traffic, other traffic should not drown it out - too much. - -Make sure uploads don't harm downloads, and the other way around - This is a much observed phenomenon where upstream traffic simply destroys - download speed. - - -It turns out that all this is possible, at the cost of a tiny bit of -bandwidth. The reason that uploads, downloads and ssh hurt each other is the -presence of large queues in many domestic access devices like cable or DSL -modems. - -The next section explains in depth what causes the delays, and how we can fix -them. You can safely skip it and head straight for the script if you don't -care how the magic is performed. ------------------------------------------------------------------------------ - -15.8.1. Why it doesn't work well by default - -ISPs know that they are benchmarked solely on how fast people can download. -Besides available bandwidth, download speed is influenced heavily by packet -loss, which seriously hampers TCP/IP performance. Large queues can help -prevent packet loss, and speed up downloads. So ISPs configure large queues. - -These large queues however damage interactivity. A keystroke must first -travel the upstream queue, which may be seconds (!) long and go to your -remote host. 
It is then displayed, which leads to a packet coming back, which -must then traverse the downstream queue, located at your ISP, before it -appears on your screen. - -This HOWTO teaches you how to mangle and process the queue in many ways, but -sadly, not all queues are accessible to us. The queue over at the ISP is -completely off-limits, whereas the upstream queue probably lives inside your -cable modem or DSL device. You may or may not be able to configure it. Most -probably not. - -So, what next? As we can't control either of those queues, they must be -eliminated, and moved to your Linux router. Luckily this is possible. - -Limit upload speed - By limiting our upload speed to slightly less than the truly available - rate, no queues are built up in our modem. The queue is now moved to - Linux. - -Limit download speed - This is slightly trickier as we can't really influence how fast the - internet ships us data. We can however drop packets that are coming in - too fast, which causes TCP/IP to slow down to just the rate we want. - Because we don't want to drop traffic unnecessarily, we configure a - 'burst' size we allow at higher speed. - - -Now, once we have done this, we have eliminated the downstream queue totally -(except for short bursts), and gain the ability to manage the upstream queue -with all the power Linux offers. - -What remains to be done is to make sure interactive traffic jumps to the -front of the upstream queue. To make sure that uploads don't hurt downloads, -we also move ACK packets to the front of the queue. This is what normally -causes the huge slowdown observed when generating bulk traffic both ways. The -ACKnowledgements for downstream traffic must compete with upstream traffic, -and get delayed in the process. 
- -If we do all this we get the following measurements using an excellent ADSL -connection from xs4all in the Netherlands: - - -+---------------------------------------------------------------------------+ -|Baseline latency: | -|round-trip min/avg/max = 14.4/17.1/21.7 ms | -| | -|Without traffic conditioner, while downloading: | -|round-trip min/avg/max = 560.9/573.6/586.4 ms | -| | -|Without traffic conditioner, while uploading: | -|round-trip min/avg/max = 2041.4/2332.1/2427.6 ms | -| | -|With conditioner, during 220kbit/s upload: | -|round-trip min/avg/max = 15.7/51.8/79.9 ms | -| | -|With conditioner, during 850kbit/s download: | -|round-trip min/avg/max = 20.4/46.9/74.0 ms | -| | -|When uploading, downloads proceed at ~80% of the available speed. Uploads | -|at around 90%. Latency then jumps to 850 ms, still figuring out why. | -+---------------------------------------------------------------------------+ - - -What you can expect from this script depends a lot on your actual uplink -speed. When uploading at full speed, there will always be a single packet -ahead of your keystroke. That is the lower limit to the latency you can -achieve - divide your MTU by your upstream speed to calculate. Typical values -will be somewhat higher than that. Lower your MTU for better effects! - -Next, two versions of this script, one with Devik's excellent HTB, the other -with CBQ which is in each Linux kernel, unlike HTB. Both are tested and work -well. ------------------------------------------------------------------------------ - -15.8.2. The actual script (CBQ) - -Works on all kernels. Within the CBQ qdisc we place two Stochastic Fairness -Queues that make sure that multiple bulk streams don't drown each other out. - -Downstream traffic is policed using a tc filter containing a Token Bucket -Filter. - -You might improve on this script by adding 'bounded' to the line that starts -with 'tc class add .. classid 1:20'. 
If you lowered your MTU, also lower the -allot & avpkt numbers! - - -+-----------------------------------------------------------------------------+ -|#!/bin/bash | -| | -|# The Ultimate Setup For Your Internet Connection At Home | -|# | -|# | -|# Set the following values to somewhat less than your actual download | -|# and uplink speed. In kilobits | -|DOWNLINK=800 | -|UPLINK=220 | -|DEV=ppp0 | -| | -|# clean existing down- and uplink qdiscs, hide errors | -|tc qdisc del dev $DEV root 2> /dev/null > /dev/null | -|tc qdisc del dev $DEV ingress 2> /dev/null > /dev/null | -| | -|###### uplink | -| | -|# install root CBQ | -| | -|tc qdisc add dev $DEV root handle 1: cbq avpkt 1000 bandwidth 10mbit | -| | -|# shape everything at $UPLINK speed - this prevents huge queues in your | -|# DSL modem which destroy latency: | -|# main class | -| | -|tc class add dev $DEV parent 1: classid 1:1 cbq rate ${UPLINK}kbit \ | -|allot 1500 prio 5 bounded isolated | -| | -|# high prio class 1:10: | -| | -|tc class add dev $DEV parent 1:1 classid 1:10 cbq rate ${UPLINK}kbit \ | -| allot 1600 prio 1 avpkt 1000 | -| | -|# bulk and default class 1:20 - gets slightly less traffic, | -|# and a lower priority: | -| | -|tc class add dev $DEV parent 1:1 classid 1:20 cbq rate $[9*$UPLINK/10]kbit \ | -| allot 1600 prio 2 avpkt 1000 | -| | -|# both get Stochastic Fairness: | -|tc qdisc add dev $DEV parent 1:10 handle 10: sfq perturb 10 | -|tc qdisc add dev $DEV parent 1:20 handle 20: sfq perturb 10 | -| | -|# start filters | -|# TOS Minimum Delay (ssh, NOT scp) in 1:10: | -|tc filter add dev $DEV parent 1:0 protocol ip prio 10 u32 \ | -| match ip tos 0x10 0xff flowid 1:10 | -| | -|# ICMP (ip protocol 1) in the interactive class 1:10 so we | -|# can do measurements & impress our friends: | -|tc filter add dev $DEV parent 1:0 protocol ip prio 11 u32 \ | -| match ip protocol 1 0xff flowid 1:10 | -| | -|# To speed up downloads while an upload is going on, put ACK packets in | -|# the interactive 
class: | -| | -|tc filter add dev $DEV parent 1: protocol ip prio 12 u32 \ | -| match ip protocol 6 0xff \ | -| match u8 0x05 0x0f at 0 \ | -| match u16 0x0000 0xffc0 at 2 \ | -| match u8 0x10 0xff at 33 \ | -| flowid 1:10 | -| | -|# rest is 'non-interactive' ie 'bulk' and ends up in 1:20 | -| | -|tc filter add dev $DEV parent 1: protocol ip prio 13 u32 \ | -| match ip dst 0.0.0.0/0 flowid 1:20 | -| | -|########## downlink ############# | -|# slow downloads down to somewhat less than the real speed to prevent | -|# queuing at our ISP. Tune to see how high you can set it. | -|# ISPs tend to have *huge* queues to make sure big downloads are fast | -|# | -|# attach ingress policer: | -| | -|tc qdisc add dev $DEV handle ffff: ingress | -| | -|# filter *everything* to it (0.0.0.0/0), drop everything that's | -|# coming in too fast: | -| | -|tc filter add dev $DEV parent ffff: protocol ip prio 50 u32 match ip src \ | -| 0.0.0.0/0 police rate ${DOWNLINK}kbit burst 10k drop flowid :1 | -+-----------------------------------------------------------------------------+ -If you want this script to be run by ppp on connect, copy it to /etc/ppp/ -ip-up.d. - -If the last two lines give an error, update your tc tool to a newer version! ------------------------------------------------------------------------------ - -15.8.3. The actual script (HTB) - -The following script achieves all goals using the wonderful HTB queue, see -the relevant chapter. Well worth patching your kernel for! -+-----------------------------------------------------------------------------+ -|#!/bin/bash | -| | -|# The Ultimate Setup For Your Internet Connection At Home | -|# | -|# | -|# Set the following values to somewhat less than your actual download | -|# and uplink speed. 
In kilobits | -|DOWNLINK=800 | -|UPLINK=220 | -|DEV=ppp0 | -| | -|# clean existing down- and uplink qdiscs, hide errors | -|tc qdisc del dev $DEV root 2> /dev/null > /dev/null | -|tc qdisc del dev $DEV ingress 2> /dev/null > /dev/null | -| | -|###### uplink | -| | -|# install root HTB, point default traffic to 1:20: | -| | -|tc qdisc add dev $DEV root handle 1: htb default 20 | -| | -|# shape everything at $UPLINK speed - this prevents huge queues in your | -|# DSL modem which destroy latency: | -| | -|tc class add dev $DEV parent 1: classid 1:1 htb rate ${UPLINK}kbit burst 6k | -| | -|# high prio class 1:10: | -| | -|tc class add dev $DEV parent 1:1 classid 1:10 htb rate ${UPLINK}kbit \ | -| burst 6k prio 1 | -| | -|# bulk & default class 1:20 - gets slightly less traffic, | -|# and a lower priority: | -| | -|tc class add dev $DEV parent 1:1 classid 1:20 htb rate $[9*$UPLINK/10]kbit \ | -| burst 6k prio 2 | -| | -|# both get Stochastic Fairness: | -|tc qdisc add dev $DEV parent 1:10 handle 10: sfq perturb 10 | -|tc qdisc add dev $DEV parent 1:20 handle 20: sfq perturb 10 | -| | -|# TOS Minimum Delay (ssh, NOT scp) in 1:10: | -|tc filter add dev $DEV parent 1:0 protocol ip prio 10 u32 \ | -| match ip tos 0x10 0xff flowid 1:10 | -| | -|# ICMP (ip protocol 1) in the interactive class 1:10 so we | -|# can do measurements & impress our friends: | -|tc filter add dev $DEV parent 1:0 protocol ip prio 10 u32 \ | -| match ip protocol 1 0xff flowid 1:10 | -| | -|# To speed up downloads while an upload is going on, put ACK packets in | -|# the interactive class: | -| | -|tc filter add dev $DEV parent 1: protocol ip prio 10 u32 \ | -| match ip protocol 6 0xff \ | -| match u8 0x05 0x0f at 0 \ | -| match u16 0x0000 0xffc0 at 2 \ | -| match u8 0x10 0xff at 33 \ | -| flowid 1:10 | -| | -|# rest is 'non-interactive' ie 'bulk' and ends up in 1:20 | -| | -| | -|########## downlink ############# | -|# slow downloads down to somewhat less than the real speed to prevent | -|# queuing 
at our ISP. Tune to see how high you can set it. | -|# ISPs tend to have *huge* queues to make sure big downloads are fast | -|# | -|# attach ingress policer: | -| | -|tc qdisc add dev $DEV handle ffff: ingress | -| | -|# filter *everything* to it (0.0.0.0/0), drop everything that's | -|# coming in too fast: | -| | -|tc filter add dev $DEV parent ffff: protocol ip prio 50 u32 match ip src \ | -| 0.0.0.0/0 police rate ${DOWNLINK}kbit burst 10k drop flowid :1 | -+-----------------------------------------------------------------------------+ - - -If you want this script to be run by ppp on connect, copy it to /etc/ppp/ -ip-up.d. - -If the last two lines give an error, update your tc tool to a newer version! ------------------------------------------------------------------------------ - -15.9. Rate limiting a single host or netmask - -Although this is described in stupendous details elsewhere and in our -manpages, this question gets asked a lot and happily there is a simple answer -that does not need full comprehension of traffic control. - -This three line script does the trick: - -+--------------------------------------------------------------------------------+ -| tc qdisc add dev $DEV root handle 1: cbq avpkt 1000 bandwidth 10mbit | -| | -| tc class add dev $DEV parent 1: classid 1:1 cbq rate 512kbit \ | -| allot 1500 prio 5 bounded isolated | -| | -| tc filter add dev $DEV parent 1: protocol ip prio 16 u32 \ | -| match ip dst 195.96.96.97 flowid 1:1 | -| | -+--------------------------------------------------------------------------------+ - -The first line installs a class based queue on your interface, and tells the -kernel that for calculations, it can be assumed to be a 10mbit interface. If -you get this wrong, no real harm is done. But getting it right will make -everything more precise. - -The second line creates a 512kbit class with some reasonable defaults. For -details, see the cbq manpages and Chapter 9. 
- -The last line tells which traffic should go to the shaped class. Traffic not -matched by this rule is NOT shaped. To make more complicated matches -(subnets, source ports, destination ports), see Section 9.6.2. - -If you changed anything and want to reload the script, execute 'tc qdisc del -dev $DEV root' to clean up your existing configuration. - -The script can further be improved by adding a last optional line 'tc qdisc -add dev $DEV parent 1:1 sfq perturb 10'. See Section 9.2.3 for details on -what this does. ------------------------------------------------------------------------------ - -Chapter 16. Building bridges, and pseudo-bridges with Proxy ARP - -Bridges are devices which can be installed in a network without any -reconfiguration. A network switch is basically a many-port bridge. A bridge -is often a 2-port switch. Linux does however support multiple interfaces in a -bridge, making it a true switch. - -Bridges are often deployed when confronted with a broken network that needs -to be fixed without any alterations. Because the bridge is a layer-2 device, -one layer below IP, routers and servers are not aware of its existence. This -means that you can transparently block or modify certain packets, or do -shaping. - -Another good thing is that a bridge can often be replaced by a cross cable or -a hub, should it break down. - -The bad news is that a bridge can cause great confusion unless it is very -well documented. It does not appear in traceroutes, but somehow packets -disappear or get changed from point A to point B ('this network is HAUNTED! -'). You should also wonder if an organization that 'does not want to change -anything' is doing the right thing. - -The Linux 2.4/2.5 bridge is documented on [ http://bridge.sourceforge.net/] -this page. ------------------------------------------------------------------------------ - -16.1. State of bridging and iptables - -As of Linux 2.4.14, bridging and iptables do not 'see' each other without -help. 
If you bridge packets from eth0 to eth1, they do not 'pass' by -iptables. This means that you cannot do filtering, or NAT or mangling or -whatever. - -There are several projects going on to fix this, the truly right one is by -the author of the Linux 2.4 bridging code, Lennert Buytenhek. He recently -informed us that as of bridge-nf 0.0.2 (see the url above), the code is -stable and usable in production environments. He is now asking the kernel -people if and how the patch can be merged, stay tuned! ------------------------------------------------------------------------------ - -16.2. Bridging and shaping - -This does work as advertised. Be sure to figure out which side each interface -is on, otherwise you might be shaping outbound traffic in your internal -interface, which won't work. Use tcpdump if needed. ------------------------------------------------------------------------------ - -16.3. Pseudo-bridges with Proxy-ARP - -If you just want to implement a Pseudo-bridge, skip down a few sections to -'Implementing it', but it is wise to read a bit about how it works in -practice. - -A Pseudo-bridge works a bit differently. By default, a bridge passes packets -unaltered from one interface to the other. It only looks at the hardware -address of packets to determine what goes where. This in turn means that you -can bridge traffic that Linux does not understand, as long as it has a -hardware address that Linux does understand. - -A 'Pseudo-bridge' works differently and looks more like a hidden router than -a bridge, but like a bridge, it has little impact on network design. - -An advantage of the fact that it is not a bridge lies in the fact that -packets really pass through the kernel, and can be filtered, changed, -redirected or rerouted. - -A real bridge can also be made to perform these feats, but it needs special -code, like the Ethernet Frame Diverter, or the above mentioned patch. 
- -Another advantage of a pseudo-bridge is that it does not pass packets it does -not understand - thus cleaning your network of a lot of cruft. In cases where -you need this cruft (like SAP packets, or NetBEUI), use a real bridge. ------------------------------------------------------------------------------ - -16.3.1. ARP & Proxy-ARP - -When a host wants to talk to another host on the same physical network -segment, it sends out an Address Resolution Protocol packet, which, somewhat -simplified, reads like this 'who has 10.0.0.1, tell 10.0.0.7'. In response to -this, 10.0.0.1 replies with a short 'here' packet. - -10.0.0.7 then sends packets to the hardware address mentioned in the 'here' -packet. It caches this hardware address for a relatively long time, and after -the cache expires, it re-asks the question. - -When building a Pseudo-bridge, we instruct the bridge to reply to these ARP -packets, which causes the hosts in the network to send their packets to the -bridge. The bridge then processes these packets, and sends them to the -relevant interface. - -So, in short, whenever a host on one side of the bridge asks for the hardware -address of a host on the other, the bridge replies with a packet that says -'hand it to me'. - -This way, all data traffic gets transmitted to the right place, and always -passes through the bridge. ------------------------------------------------------------------------------ - -16.3.2. Implementing it - -In the bad old days, it used to be possible to instruct the Linux Kernel to -perform 'proxy-ARP' for just any subnet. So, to configure a pseudo-bridge, -you would have to specify both the proper routes to both sides of the bridge -AND create matching proxy-ARP rules. This is bad in that it requires a lot of -typing, but also because it easily allows you to make mistakes which make -your bridge respond to ARP queries for networks it does not know how to -route. 
- -With Linux 2.4/2.5 (and possibly 2.2), this possibility has been withdrawn -and has been replaced by a flag in the /proc directory, called 'proxy_arp'. -The procedure for building a pseudo-bridge is then: - - - - 1. Assign an IP address to both interfaces, the 'left' and the 'right' one - - 2. Create routes so your machine knows which hosts reside on the left, and - which on the right - - 3. Turn on proxy-ARP on both interfaces, echo 1 > /proc/sys/net/ipv4/conf/ - ethL/proxy_arp, echo 1 > /proc/sys/net/ipv4/conf/ethR/proxy_arp, where L - and R stand for the numbers of your interfaces on the left and on the - right side - - - - -Also, do not forget to turn on the ip_forwarding flag! When converting from a -true bridge, you may find that this flag was turned off as it is not needed -when bridging. - -Another thing you might note when converting is that you need to clear the -arp cache of computers in the network - the arp cache might contain old -pre-bridge hardware addresses which are no longer correct. - -On a Cisco, this is done using the command 'clear arp-cache', under Linux, -use 'arp -d ip.address'. You can also wait for the cache to expire manually, -which can take rather long. - -You can speed this up using the wonderful 'arping' tool, which on many -distributions is part of the 'iputils' package. Using 'arping' you can send -out unsolicited ARP messages so as to update remote arp caches. - -This is a very powerful technique that is also used by 'black hats' to -subvert your routing! - -Note On Linux 2.4, you may need to execute 'echo 1 > /proc/sys/net/ipv4/ - ip_nonlocal_bind' before being able to send out unsolicited ARP - messages! - -You may also discover that your network was misconfigured if you are/were of -the habit of specifying routes without netmasks. To explain, some versions of -route may have guessed your netmask right in the past, or guessed wrong -without you noticing. 
When doing surgical routing as described above, it is -*vital* that you check your netmasks! ------------------------------------------------------------------------------ - -Chapter 17. Dynamic routing - OSPF and BGP - -Once your network starts to get really big, or you start to consider 'the -internet' as your network, you need tools which dynamically route your data. -Sites are often connected to each other with multiple links, and more are -popping up all the time. - -The Internet has mostly standardized on OSPF and BGP4 (rfc1771). Linux -supports both, by way of gated and zebra. - -While currently not within the scope of this document, we would like to point -you to the definitive works: - -Overview: - -Cisco Systems [http://www.cisco.com/univercd/cc/td/doc/cisintwk/idg4/ -nd2003.htm] Designing large-scale IP Internetworks - -For OSPF: - -Moy, John T. "OSPF. The anatomy of an Internet routing protocol" Addison -Wesley. Reading, MA. 1998. - -Halabi has also written a good guide to OSPF routing design, but this appears -to have been dropped from the Cisco web site. - -For BGP: - -Halabi, Bassam "Internet routing architectures" Cisco Press (New Riders -Publishing). Indianapolis, IN. 1997. - -also - -Cisco Systems - -[http://www.cisco.com/univercd/cc/td/doc/cisintwk/ics/icsbgp4.htm] Using the -Border Gateway Protocol for interdomain routing - -Although the examples are Cisco-specific, they are remarkably similar to the -configuration language in Zebra :-) ------------------------------------------------------------------------------ - -Chapter 18. Other possibilities - -This chapter is a list of projects having to do with advanced Linux routing & -traffic shaping. Some of these links may deserve chapters of their own, some -are documented very well by themselves, and don't need more HOWTO. 
- -802.1Q VLAN Implementation for Linux [http://scry.wanfear.com/~greear/ - vlan.html] (site) - VLANs are a very cool way to segregate your networks in a more virtual - than physical way. Good information on VLANs can be found [ftp:// - ftp.netlab.ohio-state.edu/pub/jain/courses/cis788-97/virtual_lans/ - index.htm] here. With this implementation, you can have your Linux box - talk VLANs with machines like Cisco Catalyst, 3Com: {Corebuilder, - Netbuilder II, SuperStack II switch 630}, Extreme Ntwks Summit 48, - Foundry: {ServerIronXL, FastIron}. - - A great HOWTO about VLANs can be found [http://scry.wanfear.com/~greear/ - vlan/cisco_howto.html] here. - - Update: has been included in the kernel as of 2.4.14 (perhaps 13). - -Alternate 802.1Q VLAN Implementation for Linux [http://vlan.sourceforge.net ] - (site) - Alternative VLAN implementation for linux. This project was started out - of disagreement with the 'established' VLAN project's architecture and - coding style, resulting in a cleaner overall design. - -Linux Virtual Server [http://www.LinuxVirtualServer.org/] (site) - These people are brilliant. The Linux Virtual Server is a highly scalable - and highly available server built on a cluster of real servers, with the - load balancer running on the Linux operating system. The architecture of - the cluster is transparent to end users. End users only see a single - virtual server. - - In short whatever you need to load balance, at whatever level of traffic, - LVS will have a way of doing it. Some of their techniques are positively - evil! For example, they let several machines have the same IP address on - a segment, but turn off ARP on them. Only the LVS machine does ARP - it - then decides which of the backend hosts should handle an incoming packet, - and sends it directly to the right MAC address of the backend server. 
- Outgoing traffic will flow directly to the router, and not via the LVS - machine, which does therefore not need to see your 5Gbit/s of content - flowing to the world, and cannot be a bottleneck. - - The LVS is implemented as a kernel patch in Linux 2.0 and 2.2, but as a - Netfilter module in 2.4/2.5, so it does not need kernel patches! Their - 2.4 support is still in early development, so beat on it and give - feedback or send patches. - -CBQ.init [ftp://ftp.equinox.gu.net/pub/linux/cbq/] (site) - Configuring CBQ can be a bit daunting, especially if all you want to do - is shape some computers behind a router. CBQ.init can help you configure - Linux with a simplified syntax. - - For example, if you want all computers in your 192.168.1.0/24 subnet (on - 10mbit eth1) to be limited to 28kbit/s download speed, put this in the - CBQ.init configuration file: - - - +---------------------------------------------------------------+ - |DEVICE=eth1,10Mbit,1Mbit | - |RATE=28Kbit | - |WEIGHT=2Kbit | - |PRIO=5 | - |RULE=192.168.1.0/24 | - +---------------------------------------------------------------+ - - - By all means use this program if the 'how and why' don't interest you. - We're using CBQ.init in production and it works very well. It can even do - some more advanced things, like time dependent shaping. The documentation - is embedded in the script, which explains why you can't find a README. - -Chronox easy shaping scripts [http://www.chronox.de] (site) - Stephan Mueller (smueller@chronox.de) wrote two useful scripts, - 'limit.conn' and 'shaper'. The first one allows you to easily throttle a - single download session, like this: - - - +---------------------------------------------------------------+ - |# limit.conn -s SERVERIP -p SERVERPORT -l LIMIT | - +---------------------------------------------------------------+ - - - It works on Linux 2.2 and 2.4/2.5. 
- - The second script is more complicated, and can be used to make lots of - different queues based on iptables rules, which are used to mark packets - which are then shaped. - -Virtual Router Redundancy Protocol implementation [http://w3.arobas.net/ - ~jetienne/vrrpd/index.html] (site) - This is purely for redundancy. Two machines with their own IP address and - MAC Address together create a third IP Address and MAC Address, which is - virtual. Originally intended purely for routers, which need constant MAC - addresses, it also works for other servers. - - The beauty of this approach is the incredibly easy configuration. No - kernel compiling or patching required, all userspace. - - Just run this on all machines participating in a service: - +---------------------------------------------------------------+ - |# vrrpd -i eth0 -v 50 10.0.0.22 | - +---------------------------------------------------------------+ - - - And you are in business! 10.0.0.22 is now carried by one of your servers, - probably the first one to run the vrrp daemon. Now disconnect that - computer from the network and very rapidly one of the other computers - will assume the 10.0.0.22 address, as well as the MAC address. - - I tried this over here and had it up and running in 1 minute. For some - strange reason it decided to drop my default gateway, but the -n flag - prevented that. - - This is a 'live' fail over: - - - +---------------------------------------------------------------+ - |64 bytes from 10.0.0.22: icmp_seq=3 ttl=255 time=0.2 ms | - |64 bytes from 10.0.0.22: icmp_seq=4 ttl=255 time=0.2 ms | - |64 bytes from 10.0.0.22: icmp_seq=5 ttl=255 time=16.8 ms | - |64 bytes from 10.0.0.22: icmp_seq=6 ttl=255 time=1.8 ms | - |64 bytes from 10.0.0.22: icmp_seq=7 ttl=255 time=1.7 ms | - +---------------------------------------------------------------+ - - - Not *one* ping packet was lost! 
Just after packet 4, I disconnected my - P200 from the network, and my 486 took over, which you can see from the - higher latency. - - ------------------------------------------------------------------------------ -Chapter 19. Further reading - -[http://snafu.freedom.org/linux2.2/iproute-notes.html] http:// - snafu.freedom.org/linux2.2/iproute-notes.html - Contains lots of technical information, comments from the kernel - -[http://www.davin.ottawa.on.ca/ols/] http://www.davin.ottawa.on.ca/ols/ - Slides by Jamal Hadi Salim, one of the authors of Linux traffic control - -[http://defiant.coinet.com/iproute2/ip-cref/] http://defiant.coinet.com/ - iproute2/ip-cref/ - HTML version of Alexeys LaTeX documentation - explains part of iproute2 - in great detail - -[http://www.aciri.org/floyd/cbq.html] http://www.aciri.org/floyd/cbq.html - Sally Floyd has a good page on CBQ, including her original papers. None - of it is Linux specific, but it does a fair job discussing the theory and - uses of CBQ. Very technical stuff, but good reading for those so - inclined. - -Differentiated Services on Linux - This [ftp://icaftp.epfl.ch/pub/linux/diffserv/misc/dsid-01.txt.gz] - document by Werner Almesberger, Jamal Hadi Salim and Alexey Kuznetsov - describes DiffServ facilities in the Linux kernel, amongst which are TBF, - GRED, the DSMARK qdisc and the tcindex classifier. - -[http://ceti.pl/~kravietz/cbq/NET4_tc.html] http://ceti.pl/~kravietz/cbq/ - NET4_tc.html - Yet another HOWTO, this time in Polish! You can copy/paste command lines - however, they work just the same in every language. The author is - cooperating with us and may soon author sections of this HOWTO. - -[http://www.cisco.com/univercd/cc/td/doc/product/software/ios111/cc111/ - car.htm] IOS Committed Access Rate - >From the helpful folks of Cisco who have the laudable habit of putting - their documentation online. 
Cisco syntax is different but the concepts - are the same, except that we can do more and do it without routers the - price of cars :-) - -Docum experimental site[http://www.docum.org] (site) - Stef Coene is busy convincing his boss to sell Linux support, and so he - is experimenting a lot, especially with managing bandwidth. His site has - a lot of practical information, examples, tests and also points out some - CBQ/tc bugs. - -TCP/IP Illustrated, volume 1, W. Richard Stevens, ISBN 0-201-63346-9 - Required reading if you truly want to understand TCP/IP. Entertaining as - well. - - ------------------------------------------------------------------------------ -Chapter 20. Acknowledgements - -It is our goal to list everybody who has contributed to this HOWTO, or helped -us demystify how things work. While there are currently no plans for a -Netfilter type scoreboard, we do like to recognize the people who are -helping. - - - -  * Junk Alins - - - -  * Joe Van Andel - -  * Michael T. Babcock - - - -  * Christopher Barton - - - -  * Ard van Breemen - - - -  * Ron Brinker - - - -  * ?ukasz Bromirski - - - -  * Lennert Buytenhek - - - -  * Esteve Camps - - - -  * Stef Coene - - - -  * Don Cohen - - - -  * Jonathan Corbet - - - -  * Gerry N5JXS Creager - - - -  * Marco Davids - - - -  * Jonathan Day - - - -  * Martin aka devik Devera - - - -  * Stephan "Kobold" Gehring - - - -  * Jacek Glinkowski - - - -  * Andrea Glorioso - - - -  * Nadeem Hasan - - - -  * Erik Hensema - - - -  * Vik Heyndrickx - - - -  * Spauldo Da Hippie - - - -  * Koos van den Hout - - - -  * Stefan Huelbrock - -  * Alexander W. Janssen - -  * Gareth John - -  * Martin Josefsson - -  * Andi Kleen - -  * Andreas J. 
Koenig - -  * Pawel Krawczyk - -  * Amit Kucheria - -  * Edmund Lau - -  * Philippe Latu - -  * Arthur van Leeuwen - -  * Jason Lunz - -  * Stuart Lynne - -  * Alexey Mahotkin - -  * Predrag Malicevic - -  * Patrick McHardy - -  * Andreas Mohr - -  * Andrew Morton - -  * Wim van der Most - -  * Stephan Mueller - -  * Togan Muftuoglu - -  * Chris Murray - -  * Patrick Nagelschmidt - -  * Ram Narula - -  * Jorge Novo - -  * Patrik - -  * P?l Osgy?ny - -  * Lutz Preßler - -  * Jason Pyeron - -  * Rusty Russell - -  * Mihai RUSU - -  * Jamal Hadi Salim - -  * David Sauer - -  * Sheharyar Suleman Shaikh - -  * Stewart Shields - -  * Nick Silberstein - -  * Konrads Smelkov - -  * William Stearns - - - -  * Andreas Steinmetz - -  * Jason Tackaberry - -  * Charles Tassell - -  * Glen Turner - -  * Tea Sponsor: Eric Veldhuyzen - -  * Song Wang - -  * Lazar Yanackiev - - - - -Linksys Blue Box Router HOWTO - -Eric Steven Raymond - -[http://www.catb.org/~esr/] Thyrsus Enterprises -Revision History -Revision 1.6 2004-02-26 Revised by: esr -Added Link-n-Log -Revision 1.5 2003-07-31 Revised by: esr -Added the Seattle wireless.net link. -Revision 1.4 2003-07-03 Revised by: esr -Linksys has released source code. -Revision 1.3 2003-06-08 Revised by: esr -Added notes about SNMP security problems, casemodding, Linksys tech support. -The Linksys turns out to have Linux inside. -Revision 1.2 2003-04-29 Revised by: esr -Typo corrections. -Revision 1.1 2003-04-25 Revised by: esr -Added link to the linksysmon project. More configuration tips. -Revision 1.0 2003-04-09 Revised by: esr -Initial release, reviewed by LDP. - - -Linksys makes a line of cheap, ubiquitous router/firewall boxes (models -BEFSR41 and up) well-suited for use on a home DSL connection and popular -among Linux hackers. This HOWTO gives hints and tips for managing Linksys -routers from a Linux system, including the firmware upgrade procedure. 
- ------------------------------------------------------------------------------ -Table of Contents -1. Introduction - 1.1. Why this document? - 1.2. New versions of this document - 1.3. License and Copyright - - -2. How and where to deploy -3. Lost the manual? -4. Configuration hints -5. Software -6. Modding and reverse-engineering -7. Troubleshooting tips - 7.1. Occasional catatonia and epilepsy - 7.2. Mozilla interface quirks under 1.38 and earlier firmware - - -8. Upgrading the firmware -9. Related Resources - -1. Introduction - -1.1. Why this document? - -Linksys makes a line of cheap, ubiquitous router/firewall boxes well-suited -for use on a home DSL connection and popular among Linux hackers. This HOWTO -gives hints and tips for managing Linksys routers from a Linux system. - -The specific recipes described here are derived from long experience with a -BEFSR41, the 4-port router/firewall box. I have also configured a BEFW11S4v2, -the 4-port router with 80211b wireless, and it behaves so similarly to the -BEFSR41 that I suspect they're using the firmware images mostly generated -from common source code ?? in fact, it wouldn't surprise me if it were the -same firmware, doing port tests to figure out what pieces of the user -interface it should enable. The firmware and web interfaces on all these blue -boxes are very similar, and most of the advice should generalize. ------------------------------------------------------------------------------ - -1.2. New versions of this document - -You can also view the latest version of this HOWTO on the World Wide Web via -the URL [http://www.tldp.org/HOWTO/Linksys-Blue-Box-Router-HOWTO.html] http: -//www.tldp.org/HOWTO/Linksys-Blue-Box-HOWTO.html. ------------------------------------------------------------------------------ - -1.3. License and Copyright - -Copyright (c) 2003, Eric S. Raymond. 
- -Permission is granted to copy, distribute and/or modify this document under -the terms of the GNU Free Documentation License, Version 1.2 or any later -version published by the Free Software Foundation; with no Invariant -Sections, no Front-Cover Texts, and no Back-Cover Texts. A copy of the -license is located at [http://www.gnu.org/copyleft/fdl.html] www.gnu.org/ -copyleft/fdl.html. - -Feel free to mail any questions or comments about this HOWTO to Eric S. -Raymond, . But please don't ask me to troubleshoot -your general networking problems; if you do, I'll just ignore you. ------------------------------------------------------------------------------ - -2. How and where to deploy - -The Linksys BEFSR41 and its higher-end siblings are designed to be used as -gateway boxes on a home Ethernet. Typically, you'll hook one up to a DSL or -cable modem, which will automatically switch into bridge mode and simply pass -packets between your ISP's router and the Linksys box. - -If you want to use a general-purpose PC running Linux as a firewall, have fun -?? but these little boxes are more efficient. The nicest thing about Linksys -boxes is that they run out of firmware and are too stupid to be cracked. -Also, they don't generate fan noise or heat. Finally, they have no moving -parts, so you can expect a good long mean time between failures. - -At minimum, your Linksys box will do the following things for you: - - 1. Act as an Ethernet router. You can plug all your lines and hubs and hosts - into it to exchange packets even when your outside link is down. - - 2. Act as a smart gateway. When you configure the Linksys with a public - static IP address (or tell it to grab a dynamic IP address from your ISP - at startup time), it will gateway between hosts on your private network - and the Internet, performing all the IP masquerading and address - translation required to route your traffic. - - 3. Firewall your connection. 
You can tell it to block out all but the - minimum sevice channels you need. You can specify separately, for each - service, to which of your internal machines the traffic should be routed. - - -Some of the higher-end versions will do extras like virtual private -networking and wireless. - -I give my Linksys box the standard private-network gateway address, -192.168.1.1. I then give all my boxes 192.168.1.x addresses and tell them the -Linksys is their gateway. Everything works. ------------------------------------------------------------------------------ - -3. Lost the manual? - -If you've lost the manual, or acquired a secondhand unit that doesn't have -one with it, never fear. Under the Help tab there are links to the PDF and to -the Linksys corporate website. - -Unfortunately, you're in trouble if you have to call Linksys tech support. On -the one occasion that I called them, the first tech I raised couldn't even -speak English, and the second was barely competent at it. Both were complete -and utter idiots whose response to any nontrivial question was to put me on -infinite hold while they went off to query someone else ?? and then garbled -the answer. ------------------------------------------------------------------------------ - -4. Configuration hints - -For security and performance, do these things: - -First, make sure AOL Parental Controls (under Security) is turned off (off is -the default); otherwise the Linksys won't pass packets for your Unix box at -all. - -For security, make sure the DMZ host feature is disabled (under Advanced->DMZ -Host). Port forward specific services instead, and as few of those as you can -get away with. A good minimum set is 22 (ssh), and 80 (http). If you want to -receive mail add 25. If you need to serve DNS queries, add 53. - -Disable Universal Plug and Play (under Password). There is a radio button for -this under the "Password" tab. 
UPnP is a notorious security hole in Windows, -and up to at least firmware version 1.44 there was a lot of Web scuttlebutt -that the Linksys implementation is flaky. While this won't affect operating -systems written by competent people, there is no point in having traffic from -a bunch of script-kiddie probes even reach your network. - -If you want to run a server, you also need to make sure stateful packet -inspection is off ?? this feature restricts incoming packets to those -associated with an outbound connection and is intended for heightened -security on client-only systems. On the Filters page, make sure SPI is off. -If you don't see a radiobutton for SPI, relax ?? the feature isn't present in -all versions of the firmware, and in fact was removed in 1.43 for stability -reasons. - -To speed up sending of outbound mail, go to Advanced->Forwarding and click -the Port Triggering button. Specify 25,25 a the trigger port range and -113,113 as its incoming-port range. What this will do is punch a temporary -hole through the firewall during each outbound SMTP session that will allow -the receiving system to get to port 113, which is identd service. This will -enable the receiving SMTP to do an identd check on your connection rather -than timing out. - -Some bug was introduced in firmware revision 1.42.3 that broke traceroute. -This was fixed in 1.42.6; just upgrade to the latest version. ------------------------------------------------------------------------------ - -5. Software - -There is a Unix utility called linksysmon that talks with these boxes via -SNMP. There is a [http://woogie.net/linksysmon/] Linksysmon project site. - -Linksysmon is a tool for monitoring Linksys BEFSR41 and BEFSR11 firewalls -under Linux and other Unix-like operating systems. It accepts log messages -from the Linksys, and logs the messages to /var/log/linksys.log. 
It handles -the standard activity logs, as well as the "secret" extended logging, and can -handle logs from multiple firewalls. When using extended logging, it can -detect external IP address changes (if you are using either DHCP or PPPOE) -and can call an external program to process the change. - -Link-n-Log is a similar tool that includes a GUI and logs to an SQL database. -Details at the Link-n-Log project page. ------------------------------------------------------------------------------ - -6. Modding and reverse-engineering - -There is a [http://www.bextreme.net/wap11web/] page that tells you how to -casemod the Linksys wireless router (they just call it the WAP11 but it -appears to be one of the BEFW11S4 variants. - -The Linksys has Linux inside. Intrepid hacker Erik Andersen tells us: -#!/bin/sh -# This is what I did to open up the Linksys rom... - -wget ftp://ftp.linksys.com/pub/network/WRT54G_1.02.1_US_code.bin - -# I noticed a GZIP signature for a file name "piggy" at offset -# 60 bytes from the start, suggesting we have a compressed Linux -# kernel -dd if=WRT54G_1.02.1_US_code.bin bs=60 skip=1 | zcat > kernel - -# Noticed there was a cramfs magic signature (bytes 45 3D CD 28 -followed shortly by "Compressed ROMFS") at offset 786464 -dd if=WRT54G_1.02.1_US_code.bin of=cramfs.image bs=786464 skip=1 -file cramfs.image - -sudo mount -o loop,ro -t cramfs ./cramfs.image /mnt -ls -la /mnt/bin -file /mnt/bin/busybox -strings /mnt/bin/busybox | grep BusyBox -# Use uClibc's ldd to get useful answers for non-x86 binaries -/usr/i386-linux-uclibc/bin/i386-uclibc-ldd /mnt/bin/busybox - -Linksys now supplies [http://www.linksys.com/support/gpl.asp] source code on -its site (I don't know what's in the various archives, though). Several other -similar products, including the Buffalo Technology box, seem to use the same -firmware. - -There's an interesting site on hacking the Wrt54g by Seattle wireless.net. 
------------------------------------------------------------------------------ - -7. Troubleshooting tips - -7.1. Occasional catatonia and epilepsy - -Linksys boxes freeze up occasionally (once every few months) and have to be -power-cycled. Suspect this is happening if your outside Web access suddenly -stops working; ping the Linksys box to check. - -These catatonic episodes may be related to dirty power; at least, they seems -to happen more frequently in association with electrical storms and -brownouts. If you think this has happened, just pull the power connector out -of the back and plug it back in. The Linksys should reboot itself within 30 -seconds or so. - -There is a more severe failure mode that I've only seen once; it's more like -an epileptic seizure than catatonia, and involves strange blink patterns on -the Link, Collision, and 100Mbit diagnostic lights (the 100Mbit light should -not normally ever blink). - -If this happens, power-cycling the Linksys won't suffice; you'll have to -hard-reset the thing. Some versions (like the BEFSR41) have a reset pin that -you poke with a paperclip end through a small hole in the front panel labeled -Reset. Some versions (like the BEFW11S4) have a reset button on the back. You -have to hold these down for about thirty seconds to hard-reset the -nonvolatile RAM. This will lose your configuration settings. - -Linksys boxes support MRTG, the Multi Router Traffic Grapher, which queries -devices via SNMP. It is not clear what the extent of the SNMP support is, as -it's not documented. This security advisory from January 2002 claims Linkys -doesn't distribute a MIB because their implementation of SNMP is broken. ------------------------------------------------------------------------------ - -7.2. Mozilla interface quirks under 1.38 and earlier firmware - -Linksys blue boxes have a webserver embedded in their firmware. The normal -way to administer one is to point a browser at its IP address on your -network. 
You program the box by filling out HTML forms. - -This is a nice bit of design that neatly avoids having OS-specific client -software. But some older versions of the webserver firmware have a quirk that -interacts with a bug in Mozilla (at least at release 1.0.1) to make the -interface almost unusable. Fortunately, the recovery procedure is trivial. -This bug was known to be present as late as 1.40, and also interfered with -Netscape; it is absent in 1.44 and a good reason to upgrade. We have a report -that Mozilla 1.3 fails with 1.43, so whatever change fixed the problem likely -came in with 1.44. - -The symptom you're likely to see is a broken-image icon at the upper left -hand corner of each page. The broken image is a series of file-folder tabs -for an image map. That image map is how you get to the other web pages. - -You can recover by right-clicking on the broken-image icon. Select "View -Image", then back out. This will build the image map correctly. - -You will almost always have to do this on the first page, but it often won't -trigger on later page loads. - -Here's what's going on. Mozilla tries to stream multiple concurrent requests -at the webservers it talks to in order to speed up page loading. The -dimwitted little firmware webserver in the Linksys is only single-threaded -and doesn't handle concurrent requests. So there's a race condition. When you -hit the window just right, you get an aborted request and a broken graphic. - -Most other browsers are immune to this problem. Konqueror doesn't trigger it. -Neither does Internet Explorer. ------------------------------------------------------------------------------ - -8. Upgrading the firmware - -Before you upgrade, here is a tip the documentation does not mention: -disconnect all the patch cables except the one from the machine you are using -to upgrade the box. Handling a lot of other network traffic while the -firmware load is gong on can corrupt the firmware. 
- -There are three ways you can upgrade your Linksys firmware. - -One is to click the "Upgrade firmware" link on the help page. Unfortunately, -this required Java in the browser under the 1.38 firmware. That has changed -under 1.44. It looks as though you can now fill in the field that says " -Please select a file to upgrade:", click the Upgrade button, and have the -right thing happen. - -Another way is to use one of Linkys's firmware-upgrade floppy images from -their website. This requires that you boot Windows or use WINE. - -The third way is to use tftp. This is how I did it. There is a tftp client -included with Red Hat Linux. To upgrade your firmware this way, do the -following steps: - - 1. Capture a copy of your settings. The firmware upgrade may wipe some of - them. Older versions nuked everything back to factory defaults; newer - versions preserve your basic settings but clear some advanced ones. - - 2. Download a copy of the new firmware. You should find it at Firmware - Upgrades for your Linksys Products on the Linksys site. Note that what - you get may well be marked "For Windows Users" and be a zip archive. Open - it in a scratch directory, because it will rudely create several Windows - files wherever you unpack it. The file you need will be called CODE.BIN. - - 3. Disable the router password Note that every attempt I made to do this - with Mozilla failed (both under 1.38 and 1.44). Konqueror worked fine. Go - to the Password tab, backspace over both sets of asterisks until both the - Password and Confirm fields are blank, and click Apply. - - 4. Cross your fingers and load the firmware The command session you want - will to see will look something like this, with your router's IP address - substituted for 192.168.1.1: - tftp 192.168.1.1 - tftp> binary - tftp> put code.bin - Sent 386048 bytes in 10.3 seconds - tftp> - - Don't panic if the client hangs for a bit before returning and do not - abort the transfer. 
The command is writing to firmware, and the Linksys - hasn't got much of a brain. Wait for it to finish. - - 5. Re-enable your router password and other settings. You'll be able to tell - the upgrade worked because the firmware version number has changed. - - -You're done. ------------------------------------------------------------------------------ - -9. Related Resources - -There is a site called [http://www.hansenonline.net/Networking/ -linksysFW.html] HansenOnline.net that seems to be mainly devoted to tracking -and critiquing the Linksys firmware releases. Alas, the monitoring software -it offers is for Windows. - -There is a Linksys tips and tricks [http://www.dslreports.com/faq/linksys] -FAQ; it's mostly Windows stuff, but a few of the war stories may be useful. - -There is a good article on configuring the BEFSR41, and its limitations, at -[http://www.arstechnica.com/reviews/3q00/linksys/befsr41-2.html] Linksys -EtherFast Cable/DSL Router, Model BEFSR41. It dates from August of 2000. - - diff --git a/LDP/guide/docbook/Linux-Networking/Services.xml b/LDP/guide/docbook/Linux-Networking/Services.xml index a8764f8a..3f70082c 100644 --- a/LDP/guide/docbook/Linux-Networking/Services.xml +++ b/LDP/guide/docbook/Linux-Networking/Services.xml @@ -5984,4 +5984,1055 @@ capabilities and implementation of traffic control under Linux. + + +Load-Balancing + + +Demand for load balancing usually arises in database/web access when +many clients make simultaneous requests to a server. It would be +desirable to have multiple identical servers and redirect requests to +the less loaded server. This can be achieved through Network Address +Translation techniques (NAT) of which IP masquerading is a subset. +Network administrators can replace a single server providing Web +services - or any other application - with a logical pool of servers +sharing a common IP address. Incoming connections are directed to a +particular server using one load-balancing algorithm. 
The virtual +server rewrites incoming and outgoing packets to give clients the +appearance that only one server exists. + + + +Linux IP-NAT information may be found here + + + + + + +Bandwidth-Limiting + + +This section describes how to set up your Linux server to limit download +bandwidth or incoming traffic and how to use your internet link more +efficiently. It is meant to provide an easy solution for limiting +incoming traffic, thus preventing our LAN users from consuming all the +bandwidth of our internet link. This is useful when our internet link +is slow or our LAN users download tons of mp3s and the newest Linux +distro's *.iso files. + + +* Bandwidth Limiting HOWTO + +6. Miscellaneous + +6.1. Useful resources + +Squid Web Proxy Cache +[http://www.squid-cache.org] http://www.squid-cache.org + +Squid 2.4 Stable 1 Configuration manual +[http://www.visolve.com/squidman/Configuration%20Guide.html] http:// +www.visolve.com/squidman/Configuration%20Guide.html +[http://www.visolve.com/squidman/Delaypool%20parameters.htm] http:// +www.visolve.com/squidman/Delaypool%20parameters.htm + +Squid FAQ +[http://www.squid-cache.org/Doc/FAQ/FAQ-19.html#ss19.8] http:// +www.squid-cache.org/Doc/FAQ/FAQ-19.html#ss19.8 + +cbq-init script +[ftp://ftp.equinox.gu.net/pub/linux/cbq/] ftp://ftp.equinox.gu.net/pub/linux/ +cbq/ + +Linux 2.4 Advanced Routing HOWTO +[http://www.linuxdoc.org/HOWTO/Adv-Routing-HOWTO.html] http:// +www.linuxdoc.org/HOWTO/Adv-Routing-HOWTO.html + +Traffic control (in Polish) +[http://ceti.pl/~kravietz/cbq/] http://ceti.pl/~kravietz/cbq/ + +Securing and Optimizing Linux Red Hat Edition - A Hands on Guide +[http://www.linuxdoc.org/guides.html] http://www.linuxdoc.org/guides.html + +IPTraf +[http://cebu.mozcom.com/riker/iptraf/] http://cebu.mozcom.com/riker/iptraf/ + +IPCHAINS +[http://www.linuxdoc.org/HOWTO/IPCHAINS-HOWTO.html] http://www.linuxdoc.org/ +HOWTO/IPCHAINS-HOWTO.html + +Nylon socks proxy server +[http://mesh.eecs.umich.edu/projects/nylon/] 
http://mesh.eecs.umich.edu/ +projects/nylon/ + +Indonesian translation of this HOWTO by Rahmat Rafiudin mjl_id@yahoo.com +[http://raf.unisba.ac.id/resources/BandwidthLimitingHOWTO/index.html] http:// +raf.unisba.ac.id/resources/BandwidthLimitingHOWTO/index.html + + + + + +Compressed-TCP + + +In the past, we used to compress files in order to save disk space. +Today, disk space is cheap - but bandwidth is limited. By compressing +data streams such as TCP/IP-Sessions using SSH-like tools, you achieve +two goals: + + + 1) You save bandwidth/transferred volume (that is important if you have + to pay for traffic or if your network is loaded). + 2) You speed up low-bandwidth connections (Modem, GSM, ISDN). + + +This HowTo explains how to save both bandwidth and connection time by +using tools like SSH1, SSH2, OpenSSH or LSH. + + +2. Compressing HTTP/FTP,... + + +My office is connected with a 64KBit ISDN line to the internet, so the +maximum transfer rate is about 7K/s. You can speed up the connection +by compressing it: when I download files, Netscape shows a transfer +rate of up to 40K/s (logfiles are compressible by a factor of 15). SSH is a +tool that is mainly designed to build up secure connections over +unsecured networks. Furthermore, SSH is able to compress connections +and to do port forwarding (like rinetd or redir). So it is the +appropriate tool to compress any simple TCP/IP connection. "Simple" +means that only one TCP-connection is opened. FTP-connections or +the connection between M$-Outlook and MS-Exchange are not simple, as +several connections are established. SSH uses the Lempel-Ziv (LZ77) +compression algorithm - so you will achieve the same high compression +rate as winzip/pkzip. In order to compress all HTTP-connections from +my intranet to the internet, I just have to execute one command on my +dial-in machine: + + + + +ssh -l LOGIN_ID HOSTNAME -C -L8080:PROXY_AT_ISP:80 -f sleep +10000 + + + + + + HOSTNAME = host that is located at my ISP. SSH-access is required.
+ LOGIN_ID = my login-ID on HOSTNAME + PROXY_AT_ISP = the web proxy of my ISP + + + + +My browser is configured to use localhost:8080 as proxy. My laptop +connects to the same socket. The connection is compressed and +forwarded to the real proxy by SSH. The infrastructure looks like: + + + + + 64KBit ISDN + My PC--------------------------------A PC (Unix/Linux/Win-NT) at my ISP + SSH-Client compressed SSH-Server, Port 22 + Port 8080 | + | | + | | + | | + |10MBit Ethernet |100MBit + |not compressed |not compressed + | | + | | + My second PC ISP's WWW-proxy + with Netscape,... Port 80 + (Laptop) + + + +3. Compressing Email + +3.1. Incoming Emails (POP3, IMAP4) + + +Most people fetch their email from the mailserver via POP3. POP3 is a +protocol with many disadvantages: + + + 1. POP3 transfers the password in clear text. (There are SSL- + implementations of POP/IMAP and a challenge/response + authentication, defined in RFC-2095/2195). + + 2. POP3 causes much protocol overhead: first the client requests a + message, then the server sends the message. After that the client + requests the transferred article to be deleted. The server confirms + the deletion. After that the server is ready for the next + transaction. So 4 transactions are needed for each email. + + 3. POP3 transfers the mails without compression although email is + highly compressible (factor=3.5). + + +You could compress POP3 by forwarding localhost:110 through a +compressed connection to your ISP's POP3-socket. After that you have +to tell your mail client to connect to localhost:110 in order to +download mail. That secures and speeds up the connection -- but the +download time still suffers from the POP3-inherent protocol overhead. + + + +It makes sense to replace POP3 with a more efficient protocol. The +idea is to download the entire mailbox at once without generating +protocol overhead. Furthermore it makes sense to compress the +connections. The appropriate tool which offers both features is SCP.
You can download your mail-file like this: + + + + + scp -C loginid@mailserver:/var/spool/mail/loginid /tmp/newmail + + + + +But there is a problem: what happens if a new email arrives at the +server during the download of your mailbox? The new mail would be +lost. Therefore it makes more sense to use the following commands: + + + + + ssh -l loginid mailserver -f mv /var/spool/mail/loginid + /tmp/loginid_fetchme + scp -C loginid@mailserver:/tmp/loginid_fetchme /tmp/newmail + + + + +A move (mv) is an elementary (atomic) operation, so you won't get into trouble if +you receive new mail during the execution of the commands. But if the +mail server directories /tmp/ and /var/spool/mail are not on the same +disc you might get problems. A solution is to create a lockfile on the +server before you execute the mv: touch /var/spool/mail/loginid.lock. +You should remove it afterwards. A better solution is to move the +file loginid within the same directory: + + + + + ssh -l loginid mailserver -f mv /var/spool/mail/loginid + /var/spool/mail/loginid_fetchme + + + + +After that you can use formail together with procmail in order to filter +/tmp/newmail into the right folder(s): + + + + +formail -s procmail < /tmp/newmail + + + +3.2. Outgoing Email (SMTP) + + +You can send email over compressed and encrypted SSH-connections, in order +to: + + + · Save network traffic + · Secure the connection (This does not make sense if the mail is + transported over untrusted networks later.) + · Authenticate the sender. Many mail servers deny mail relaying in + order to prevent abuse. If you send an email over an SSH- + connection, the remote mail server (e.g. sendmail or MS-Exchange) + thinks it is connected locally.
+ + +If you have SSH-access on the mail server, you need the following +command: + + + + + ssh -C -l loginid mailserver -L2525:mailserver:25 + + + + +If you don't have SSH-access on the mail server, but do have access to a server that +is allowed to use your mail server as relay, the command is: + + + + + ssh -C -l loginid other_server -L2525:mailserver:25 + + + + +After that you can configure your mail client (or mail server: see +"smarthost") to send out mails to localhost port 2525. + + +4. Thoughts about performance. + + +Of course compression/encryption takes CPU time. It turned out that an +old Pentium-133 is able to encrypt and compress about 1GB/hour -- +that's quite a lot. If you compile SSH with the option "--with-none" +you can tell SSH to use no encryption. That saves a little +performance. Here is a comparison between several download methods +(during the test, a noncompressed 6MB-file was transferred from a +133MHz-Pentium-1 to a 233MHz Pentium2 laptop over a 10MBit ethernet +without other load). + + + + + +-------------------+--------+----------+-----------+----------------------+ + | | FTP |encrypted |compressed |compressed & encrypted| + +-------------------+--------+----------+-----------+----------------------+ + | Elapsed Time | 17.6s | 26s | 9s | 23s | + +-------------------+--------+----------+-----------+----------------------+ + | Throughput | 790K/s | 232K/s | 320K/s | 264K/s | + +-------------------+--------+----------+-----------+----------------------+ + |Compression Factor | 1 | 1 | 3.8 | 3.8 | + +-------------------+--------+----------+-----------+----------------------+ + + + + + + + +IP-Accounting + + +This option of the Linux kernel keeps track of IP network traffic, +performs packet logging and produces some statistics. A series of +rules may be defined so that when a packet matches a given pattern, some +action is performed: a counter is increased, it is accepted/rejected, +etc. + + + +6.3.
IP Accounting (for Linux-2.0) +The IP accounting features of the Linux kernel allow you to collect +and analyze some network usage data. The data collected comprises the +number of packets and the number of bytes accumulated since the +figures were last reset. You may specify a variety of rules to +categorize the figures to suit whatever purpose you may have. This +option was removed in kernel 2.1.102, because the old ipfwadm- +based firewalling was replaced by ``ipfwchains''. + + + + + Kernel Compile Options: + + Networking options ---> + [*] IP: accounting + + + + +After you have compiled and installed the kernel you need to use the +ipfwadm command to configure IP accounting. There are many different +ways of breaking down the accounting information that you might +choose. I've picked a simple example of what might be useful; +you should read the ipfwadm man page for more information. +Scenario: You have an ethernet network that is linked to the internet +via a PPP link. On the ethernet you have a machine that offers a +number of services and you are interested in knowing how much +traffic is generated by each of ftp and world wide web traffic, as +well as total tcp and udp traffic. 
+ + + +You might use a command set that looks like the following, which is +shown as a shell script: + + + + + #!/bin/sh + # + # Flush the accounting rules + ipfwadm -A -f + # + # Set shortcuts + localnet=44.136.8.96/29 + any=0/0 + # Add rules for local ethernet segment + ipfwadm -A in -a -P tcp -D $localnet ftp-data + ipfwadm -A out -a -P tcp -S $localnet ftp-data + ipfwadm -A in -a -P tcp -D $localnet www + ipfwadm -A out -a -P tcp -S $localnet www + ipfwadm -A in -a -P tcp -D $localnet + ipfwadm -A out -a -P tcp -S $localnet + ipfwadm -A in -a -P udp -D $localnet + ipfwadm -A out -a -P udp -S $localnet + # + # Rules for default + ipfwadm -A in -a -P tcp -D $any ftp-data + ipfwadm -A out -a -P tcp -S $any ftp-data + ipfwadm -A in -a -P tcp -D $any www + ipfwadm -A out -a -P tcp -S $any www + ipfwadm -A in -a -P tcp -D $any + ipfwadm -A out -a -P tcp -S $any + ipfwadm -A in -a -P udp -D $any + ipfwadm -A out -a -P udp -S $any + # + # List the rules + ipfwadm -A -l -n + # + + + + +The names ``ftp-data'' and ``www'' refer to lines in /etc/services. +The last command lists each of the Accounting rules and displays the +collected totals. + + + +An important point to note when analyzing IP accounting is that totals +for all rules that match will be incremented, so to obtain +differential figures you need to perform the appropriate arithmetic. For +example if I wanted to know how much data was neither ftp nor www I would +subtract the individual totals from the rule that matches all ports. 
+ + + + + root# ipfwadm -A -l -n + IP accounting rules + pkts bytes dir prot source destination ports + 0 0 in tcp 0.0.0.0/0 44.136.8.96/29 * -> 20 + 0 0 out tcp 44.136.8.96/29 0.0.0.0/0 20 -> * + 10 1166 in tcp 0.0.0.0/0 44.136.8.96/29 * -> 80 + 10 572 out tcp 44.136.8.96/29 0.0.0.0/0 80 -> * + 252 10943 in tcp 0.0.0.0/0 44.136.8.96/29 * -> * + 231 18831 out tcp 44.136.8.96/29 0.0.0.0/0 * -> * + 0 0 in udp 0.0.0.0/0 44.136.8.96/29 * -> * + 0 0 out udp 44.136.8.96/29 0.0.0.0/0 * -> * + 0 0 in tcp 0.0.0.0/0 0.0.0.0/0 * -> 20 + 0 0 out tcp 0.0.0.0/0 0.0.0.0/0 20 -> * + 10 1166 in tcp 0.0.0.0/0 0.0.0.0/0 * -> 80 + 10 572 out tcp 0.0.0.0/0 0.0.0.0/0 80 -> * + 253 10983 in tcp 0.0.0.0/0 0.0.0.0/0 * -> * + 231 18831 out tcp 0.0.0.0/0 0.0.0.0/0 * -> * + 0 0 in udp 0.0.0.0/0 0.0.0.0/0 * -> * + 0 0 out udp 0.0.0.0/0 0.0.0.0/0 * -> * + + + + +6.4. IP Accounting (for Linux-2.2) + +The new accounting code is accessed via ``IP Firewall Chains''. See +the IP chains home page for more information. Among other things, +you'll now need to use ipchains instead of ipfwadm to configure your +filters. (From Documentation/Changes in the latest kernel sources). + + + + + + +IP-Aliasing + + +This is a cookbook recipe on how to set up and run IP aliasing on a Linux box +and how to set up the machine to receive e-mail on the aliased IP addresses. + + + +This feature of the Linux kernel provides the possibility of setting +multiple network addresses on the same low-level network device driver +(e.g. two IP addresses in one Ethernet card). It is typically used for +services that act differently based on the address they listen on +(e.g. "multihosting" or "virtual domains" or "virtual hosting +services"). + + + +There are some applications where being able to configure multiple IP +addresses to a single network device is useful. Internet Service +Providers often use this facility to provide a `customized' interface to their +World Wide Web and ftp offerings for their customers. 
You can refer to +the ``IP-Alias mini-HOWTO'' for more information than you find here. + + + +Quickstart: + + + +After compiling and installing your kernel with IP_Alias support, +configuration is very simple. The aliases are added to virtual network +devices associated with the actual network device. A simple naming +convention applies to these devices, namely ifname:number, +e.g. eth0:0, ppp0:10 etc. Note that the ifname:number device can +only be configured after the main interface has been set up. + + + +For example, if you have an ethernet network that supports two +different IP subnetworks simultaneously and you wish your machine to +have direct access to both, you could use something like: + + + + + root# ifconfig eth0 192.168.1.1 netmask 255.255.255.0 up + root# route add -net 192.168.1.0 netmask 255.255.255.0 eth0 + root# ifconfig eth0:0 192.168.10.1 netmask 255.255.255.0 up + root# route add -net 192.168.10.0 netmask 255.255.255.0 eth0:0 + + + +----------------------------------------------------------------------------- + +1. My Setup + + + +  * IP Alias is standard in kernels 2.0.x and 2.2.x, and available as a + compile-time option in 2.4.x (IP Alias has been deprecated in 2.4.x and + replaced by a more powerful firewalling mechanism.) +  * IP Alias compiled as a loadable module. You would have indicated in the + "make config" command to make your kernel, that you want the IP Alias to + be compiled as a (M)odule. Check the Modules HOW-TO (if that exists) or + check the info in /usr/src/linux/Documentation/modules.txt. +  * I have to support 2 additional IPs over and above the IP already + allocated to me. +  * A D-Link DE620 pocket adapter (not important, works with any Linux + supported network adapter). + + + + + Kernel Compile Options: + + Networking options ---> + .... + [*] Network aliasing + .... + <*> IP: aliasing support + + + + +----------------------------------------------------------------------------- + + +2. Commands + + + +1. 
Load the IP Alias module (you can skip this step if you compiled the +module into the kernel): + + + + + /sbin/insmod /lib/modules/`uname -r`/ipv4/ip_alias.o + + + + +2. Set up the loopback, eth0, and all the IP addresses beginning with the +main IP address for the eth0 interface: + + + + + /sbin/ifconfig lo 127.0.0.1 + /sbin/ifconfig eth0 up + /sbin/ifconfig eth0 172.16.3.1 + /sbin/ifconfig eth0:0 172.16.3.10 + /sbin/ifconfig eth0:1 172.16.3.100 + + + + +172.16.3.1 is the main IP address, while .10 and .100 are the aliases. +The magic is the eth0:x where x=0,1,2,...n for the different IP +addresses. The main IP address does not need to be aliased. + + + +3. Set up the routes. First route the loopback, then the net, and finally, +the various IP addresses starting with the default (originally allocated) +one: + + + + + /sbin/route add -net 127.0.0.0 + /sbin/route add -net 172.16.3.0 dev eth0 + /sbin/route add -host 172.16.3.1 dev eth0 + /sbin/route add -host 172.16.3.10 dev eth0:0 + /sbin/route add -host 172.16.3.100 dev eth0:1 + /sbin/route add default gw 172.16.3.200 + + + + +That's it. + + + +In the example above, I am using the Private IP addresses (RFC +1918) for illustrative purposes. Substitute them with your own official or +private IP addresses. + + + +The example shows only 3 IP addresses. The max is defined to be 256 in +/usr/include/linux/net_alias.h. 256 IP addresses on ONE card is a lot :-)! 
+ + + +Here's what my /sbin/ifconfig looks like: + + + + +lo Link encap:Local Loopback + inet addr:127.0.0.1 Bcast:127.255.255.255 Mask:255.0.0.0 + UP BROADCAST LOOPBACK RUNNING MTU:3584 Metric:1 + RX packets:5088 errors:0 dropped:0 overruns:0 + TX packets:5088 errors:0 dropped:0 overruns:0 + +eth0 Link encap:10Mbps Ethernet HWaddr 00:8E:B8:83:19:20 + inet addr:172.16.3.1 Bcast:172.16.3.255 Mask:255.255.255.0 + UP BROADCAST RUNNING PROMISC MULTICAST MTU:1500 Metric:1 + RX packets:334036 errors:0 dropped:0 overruns:0 + TX packets:11605 errors:0 dropped:0 overruns:0 + Interrupt:7 Base address:0x378 + +eth0:0 Link encap:10Mbps Ethernet HWaddr 00:8E:B8:83:19:20 + inet addr:172.16.3.10 Bcast:172.16.3.255 Mask:255.255.255.0 + UP BROADCAST RUNNING MTU:1500 Metric:1 + RX packets:0 errors:0 dropped:0 overruns:0 + TX packets:0 errors:0 dropped:0 overruns:0 + +eth0:1 Link encap:10Mbps Ethernet HWaddr 00:8E:B8:83:19:20 + inet addr:172.16.3.100 Bcast:172.16.3.255 Mask:255.255.255.0 + UP BROADCAST RUNNING MTU:1500 Metric:1 + RX packets:1 errors:0 dropped:0 overruns:0 + TX packets:0 errors:0 dropped:0 overruns:0 + + + + +And /proc/net/aliases: + + + + +device family address +eth0:0 2 172.16.3.10 +eth0:1 2 172.16.3.100 + + + + +And /proc/net/alias_types: + + + + +type name n_attach +2 ip 2 + + + + +Of course, the stuff in /proc/net was created by the ifconfig command and not +by hand! + +----------------------------------------------------------------------------- + + +3. Troubleshooting: Questions and Answers + + + +3.1. Question: How can I keep the settings through a reboot? + + + +Answer: Whether you are using BSD-style or SysV-style (Redhat?? for example) +init, you can always include it in /etc/rc.d/rc.local. Here's what I have on +my SysV init system (Redhat?? 3.0.3 and 4.0): + + + +My /etc/rc.d/rc.local: (edited to show the relevant portions) + + + + +#setting up IP alias interfaces +echo "Setting 172.16.3.1, 172.16.3.10, 172.16.3.100 IP Aliases ..." 
+/sbin/ifconfig lo 127.0.0.1 +/sbin/ifconfig eth0 up +/sbin/ifconfig eth0 172.16.3.1 +/sbin/ifconfig eth0:0 172.16.3.10 +/sbin/ifconfig eth0:1 172.16.3.100 +#setting up the routes +echo "Setting IP routes ..." +/sbin/route add -net 127.0.0.0 +/sbin/route add -net 172.16.3.0 dev eth0 +/sbin/route add -host 172.16.3.1 eth0 +/sbin/route add -host 172.16.3.10 eth0:0 +/sbin/route add -host 172.16.3.100 eth0:1 +/sbin/route add default gw 172.16.3.200 +# + + +----------------------------------------------------------------------------- + + +3.2. Question: How do I set up the IP aliased machine to receive e-mail on +the various aliased IP addresses (on a machine using sendmail)? + + + +Answer: Create (if it doesn't already exist) a file called, /etc/ +mynames.cw,for example. The file does not have to be this exact name nor in +the /etc directory. + + + +In that file, place the official domain names of the aliased IP addresses. If +these aliased IP addresses do not have a domain name, then you can place the +IP address itself. + + + +The /etc/mynames.cw might look like this: + + + + +# /etc/mynames.cw - include all aliases for your machine here; # is a comment +domain.one.net +domain.two.com +domain.three.org +4.5.6.7 + + + + +In your sendmail.cf file, where it defines a file class macro Fw, add the +following: + + + + +################## +# local info # +################## + +# file containing names of hosts for which we receive email +Fw/etc/mynames.cw + +That should do it. Test out the new setting by invoking sendmail in test +mode. The following is an example: +ganymede$ /usr/lib/sendmail -bt +ADDRESS TEST MODE (ruleset 3 NOT automatically invoked) +Enter < ruleset> < address> +> 0 me@4.5.6.7 +rewrite: ruleset 0 input: me @ 4 . 5 . 6 . 7 +rewrite: ruleset 98 input: me @ 4 . 5 . 6 . 7 +rewrite: ruleset 98 returns: me @ 4 . 5 . 6 . 7 +rewrite: ruleset 97 input: me @ 4 . 5 . 6 . 7 +rewrite: ruleset 3 input: me @ 4 . 5 . 6 . 7 +rewrite: ruleset 96 input: me < @ 4 . 5 . 6 . 
7 > +rewrite: ruleset 96 returns: me < @ 4 . 5 . 6 . 7 . > +rewrite: ruleset 3 returns: me < @ 4 . 5 . 6 . 7 . > +rewrite: ruleset 0 input: me < @ 4 . 5 . 6 . 7 . > +rewrite: ruleset 98 input: me < @ 4 . 5 . 6 . 7 . > +rewrite: ruleset 98 returns: me < @ 4 . 5 . 6 . 7 . > +rewrite: ruleset 0 returns: $# local $: me +rewrite: ruleset 97 returns: $# local $: me +rewrite: ruleset 0 returns: $# local $: me +> 0 me@4.5.6.8 +rewrite: ruleset 0 input: me @ 4 . 5 . 6 . 8 +rewrite: ruleset 98 input: me @ 4 . 5 . 6 . 8 +rewrite: ruleset 98 returns: me @ 4 . 5 . 6 . 8 +rewrite: ruleset 97 input: me @ 4 . 5 . 6 . 8 +rewrite: ruleset 3 input: me @ 4 . 5 . 6 . 8 +rewrite: ruleset 96 input: me < @ 4 . 5 . 6 . 8 > +rewrite: ruleset 96 returns: me < @ 4 . 5 . 6 . 8 > +rewrite: ruleset 3 returns: me < @ 4 . 5 . 6 . 8 > +rewrite: ruleset 0 input: me < @ 4 . 5 . 6 . 8 > +rewrite: ruleset 98 input: me < @ 4 . 5 . 6 . 8 > +rewrite: ruleset 98 returns: me < @ 4 . 5 . 6 . 8 > +rewrite: ruleset 95 input: < > me < @ 4 . 5 . 6 . 8 > +rewrite: ruleset 95 returns: me < @ 4 . 5 . 6 . 8 > +rewrite: ruleset 0 returns: $# smtp $@ 4 . 5 . 6 . 8 $: me < @ 4 . 5 . 6 . 8 > +rewrite: ruleset 97 returns: $# smtp $@ 4 . 5 . 6 . 8 $: me < @ 4 . 5 . 6 . 8 > +rewrite: ruleset 0 returns: $# smtp $@ 4 . 5 . 6 . 8 $: me < @ 4 . 5 . 6 . 8 > +> + + + + +Notice when I tested me@4.5.6.7, it delivered the mail to the local machine, +while me@4.5.6.8 was handed off to the smtp mailer. That is the correct +response. + + + +3.3. Question: How do I delete an alias? + + + +Answer: To delete an alias you simply add a `-' to the end of its name and +refer to it and is as simple as: + + + + + root# ifconfig eth0:0- 0 + + + + +All routes associated with that alias will also be deleted +automatically. + + + + + +You are all set now. + + + + + + +Multicasting + + + * Multicast HOWTO + + A good page providing comparisons between reliable multicast protocols + is + + . 
+ + A very good and up-to-date site, with lots of interesting links + (Internet drafts, RFCs, papers, links to other sites) is: + + . + + is also a good source of + information on the subject. + + Katia Obraczka's "Multicast Transport Protocols: A Survey and + Taxonomy" article gives short descriptions for each protocol and tries + to classify them according to different features. You can read it in + the IEEE Communications magazine, January 1998, vol. 36, No. 1. + + + + 10. References. + + 10.1. RFCs. + + + o RFC 1112 "Host Extensions for IP Multicasting". Steve Deering. + August 1989. + + o RFC 2236 "Internet Group Management Protocol, version 2". W. + Fenner. November 1997. + + o RFC 1458 "Requirements for Multicast Protocols". Braudes, R and + Zabele, S. May 1993. + + o RFC 1469 "IP Multicast over Token-Ring Local Area Networks". T. + Pusateri. June 1993. + + o RFC 1390 "Transmission of IP and ARP over FDDI Networks". D. Katz. + January 1993. + + o RFC 1583 "OSPF Version 2". John Moy. March 1994. + + o RFC 1584 "Multicast Extensions to OSPF". John Moy. March 1994. + + o RFC 1585 "MOSPF: Analysis and Experience". John Moy. March 1994. + + o RFC 1812 "Requirements for IP version 4 Routers". Fred Baker, + Editor. June 1995 + + o RFC 2117 "Protocol Independent Multicast-Sparse Mode (PIM-SM): + Protocol Specification". D. Estrin, D. Farinacci, A. Helmy, D. + Thaler; S. Deering, M. Handley, V. Jacobson, C. Liu, P. Sharma, and + L. Wei. July 1997. + + o RFC 2189 "Core Based Trees (CBT version 2) Multicast Routing". A. + Ballardie. September 1997. + + o RFC 2201 "Core Based Trees (CBT) Multicast Routing Architecture". + A. Ballardie. September 1997. + + + + 10.2. Internet Drafts. + + + o "Introduction to IP Multicast Routing". draft-ietf-mboned-intro- + multicast- 03.txt. T. Maufer, C. Semeria. July 1997. + + o "Administratively Scoped IP Multicast". draft-ietf-mboned-admin-ip- + space-03.txt. D. Meyer. June 10, 1997. + + 10.3. Web pages. 
+ + + o Linux Multicast Homepage. + + + o Linux Multicast FAQ. + + o Multicast and MBONE on Linux. + + + o Christian Daudt's MBONE-Linux Page. + + + o Reliable Multicast Links + + + o Multicast Transport Protocols + + 10.4. Books. + + o "TCP/IP Illustrated: Volume 1 The Protocols". Stevens, W. Richard. + Addison Wesley Publishing Company, Reading MA, 1994 + + o "TCP/IP Illustrated: Volume 2, The Implementation". Wright, Gary + and W. Richard Stevens. Addison Wesley Publishing Company, Reading + MA, 1995 + + o "UNIX Network Programming Volume 1. Networking APIs: Sockets and + XTI". Stevens, W. Richard. Second Edition, Prentice Hall, Inc. + 1998. + + o "Internetworking with TCP/IP Volume 1 Principles, Protocols, and + Architecture". Comer, Douglas E. Second Edition, Prentice Hall, + Inc. Englewood Cliffs, New Jersey, 1991 + + + + + +Network-Management + + +There is an impressive number of tools focused on network management +and remote administration under Linux. Some interesting remote administration +projects are linuxconf and webmin: + + + +· Webmin +· Linuxconf + + + +Other tools include network traffic analysis tools, network security +tools, monitoring tools, configuration tools, etc. An archive of many +of these tools may be found at Metalab. + + + +9.2. SNMP + + +The Simple Network Management Protocol is a protocol for Internet +network management services. It allows for remote monitoring and +configuration of routers, bridges, network cards, switches, etc. +There is a large number of libraries, clients, daemons and SNMP-based +monitoring programs available for Linux. A good page dealing with SNMP +and Linux software may be found at: http://linas.org/linux/NMS.html + + +10. Enterprise Linux Networking + + +In certain situations it is necessary for the networking +infrastructure to have proper mechanisms to guarantee network +availability nearly 100% of the time. Some related techniques are +described in the following sections. 
Most of the following material +can be found at the excellent Linas website: +http://linas.org/linux/index.html and in the Linux High-Availability +HOWTO. + + +10.1. High Availability + + +Redundancy is used to prevent the overall IT system from having single +points of failure. A server with only one network card or a single +SCSI disk has two single points of failure. The objective is to mask +unplanned outages from users in a manner that lets users quickly +resume work. High availability software is a set of scripts and tools +that automatically monitor and detect failures, taking the appropriate +steps to restore normal operation and to notify system +administrators. + + + + + + +Redundant-Networking + + +IP Address Takeover (IPAT). When a network adapter card fails, its IP +address should be taken by a working network card in the same node or +in another node. MAC Address Takeover: when an IP takeover occurs, it +must be ensured that all the nodes in the network update their ARP +caches (the mapping between IP and MAC addresses). + + + +See the High-Availability HOWTO for more details: +http://metalab.unc.edu/pub/Linux/ALPHA/linux-ha/High-Availability- +HOWTO.html + + + + +10.3. Redundant networking + + IP Address Takeover (IPAT). When a network adapter card fails, its IP + address should be taken by a working network card in the same node or + in another node. MAC Address Takeover: when an IP takeover occurs, it + must be ensured that all the nodes in the network update their ARP + caches (the mapping between IP and MAC addresses). + + See the High-Availability HOWTO for more details: + http://metalab.unc.edu/pub/Linux/ALPHA/linux-ha/High-Availability- + HOWTO.html + +