#!/bin/bash
# Network Traffic Control for Home Network

# Normally called from /etc/systemd/system/netpolice.J.service .  The script
# should run after the network starts up, but net services like DNS are not
# required.  Script exit values (per LSB):  0 = success, 1 = generic error,
# 3 = bad command line argument, 5 = missing binaries.

# Needs 1 command line argument (as with LSB scripts): start, stop, status.
# Reload (and variants) are OK and do the same thing as start.  -v (before the
# action) causes the traffic control structures to be displayed.  -s causes
# statistics to be displayed, and the rest of the command line is ignored,
# i.e. the traffic control is not reloaded or cleared.

# See voip-qos-1609.html for discussion about why I'm ignoring the
# ToS/QoS/DiffServ field in the IP header.  Basically, nobody actually sets
# a QoS code point when sending to me.  Much better to give priority to
# lower bandwidth flows.

# Also note that our ONT enforces our SLA data rates, and also does something
# with traffic control so frog flows (lower bandwidth non saturating, vs. hog
# flows) get priority.  The ONT is sensitive to queue residence time and
# the target upper bound is 50ms.  Even so I'm doing my own traffic control.

# The basic plan is like this:  (QD = queue discipline)
# Root:   TBF to limit the rate to 95% of the SLA (service level agreement).
#         This avoids queues in other routers, specifically the ONT.
# Middle: PRIO, with classes for ranges of data rates.  Lower rate flows
#         (frogs) get to go out first.
# Inner:  FQ_CODEL.  The FQ component makes the flows take turns sending
#         packets, round robin.  This is normally irrelevant for frogs since
#         they have priority and prior packets will already have gone out
#         (except during net problems).  But it's important for hogs, which
#         will share the interface approximately equally.
# Leaf:   CODEL (automatically provided by FQ_CODEL).  If packets stay in the
#         queue too long (default 5ms), ECN is used to signal the endpoints to
#         reduce their data rate.  There is a hard limit on queue size in case
#         ECN is ignored or botched.

# Planned extension: to replace the dumb ingress policer with an IFB/IMQ device
# holding a copy of the egress QD assembly.  Also, separately manage the
# IPv6 tunnel.

# According to https://wiki.archlinux.org/index.php/Advanced_traffic_control
# the TC root has a fq_codel qdisc on it by default, starting with systemd-217.

# How to test: Create a hog flow, use wget on a giant file.  Test both outgoing
# and incoming hogs (individually).  The partner should be outside my ISP's
# net, to provoke queuing at many points often traversed in normal use.
#   . Measure the actual data rate (of the hog only) with wget statistics.
#   . Use ping to estimate latency and jitter (roughly).
#   . Interactive use with slogin: subjective quality.
#   . VoIP call, subjective quality in both directions.
#   . Web browsing, commercial site with advertising.  Measure the time to
#     refresh a particular page.
# I should test with no traffic control, the old netpolice.J, and the new one
# (this program).  Also test the frogs with no hog flow (shaping is
# irrelevant).

# References:
#   http://linux-tc-notes.sourceforge.net/tc/doc/

# Original credits:
# Plagiarized by Jim Carter from HOWTO by
# Bert Hubert (editor):
# http://lartc.org/howto/ (dated approx. 2004).
# See section 15.8.3 in the above HOWTO, and for docs on HTB (Hierarchical
# Token Bucket) queueing discipline, see sect. 9.5.5 and Devik's writeup:
# http://luxik.cdi.cz/~devik/qos/htb/manual/userg.htm
# The current script was rewritten by jimc in 2016-09-07 incorporating more
# modern traffic control features and data rate based priority.
# --- Program Begins ---

# Check for missing binaries (stale symlinks should not happen).
# Note: Special treatment of stop for LSB conformance: when tc is missing,
# "stop" (as the first argument) exits 0, anything else exits 5.
TC=/usr/sbin/tc
if [[ ! -x "$TC" ]] ; then
    echo "$TC (package iproute2) not installed" 1>&2
    if [[ "${1:-}" == "stop" ]] ; then exit 0 ; else exit 5 ; fi
fi

# Interpret command line option(s).  Every leading argument that begins with
# "-" sets the flag variable opt_<c>=1, where <c> is the single character
# following the dash (e.g. -v sets opt_v, -s sets opt_s).  The character is
# checked to be a legal identifier character and assigned with printf -v
# instead of eval, so command line text is never executed as shell code.
# A malformed option (e.g. a bare "-" or "-;") is a bad argument: exit 3.
while [[ "${1:-}" == -* ]] ; do
    case "${1:1:1}" in
        [A-Za-z0-9_])
            printf -v "opt_${1:1:1}" '%s' 1
            ;;
        *)
            echo "$0: bad option '$1'" 1>&2
            exit 3
            ;;
    esac
    shift
done

# Configuration parms should be in sysconfig but instead are hardwired here.
# Parameter format: decimal number followed by optional unit, e.g. 0.4mbps
# Rates: mbps, kbps, bps, or bare number = bytes/sec (times power of 2)
#        mbit, kbit = bits/sec
# Amounts: mb, m, kb, k, b, or bare number = bytes (times power of 2)
#        mbit, kbit = bits.
# Durations (seconds): [s, sec, secs], [ms, msec, msecs] (milliseconds),
#        [us, usec, usecs, or bare number] (microseconds).
# NOTE(review): the meaning of a bare number and of the k/m prefixes has
# differed between iproute2 releases -- confirm against the installed tc(8).
# This script always gives rates an explicit "kbit" unit.

# Service level "agreements": Rates are implicitly in mbit (2^20 bit/second)
# Date          Carrier         Up/Down DOWNLINK        UPLINK          MTU
# 2016-09-xx    Frontier FIOS   50/50   50 mbit         50 mbit         1500b
# 2015-xx-xx    Verizon FIOS    25/25   25 mbit         25 mbit         1500b
# 2013-xx-xx    Verizon FIOS    15/5    15 mbit         5 mbit          1500b
# 2012-04-10    Time Warner     15/2    15 mbit         2 mbit          576b
# 2000-xx-xx    Verizon DSL     0.768   730 kbit        120 kbit        1500b
DOWNLINK=50     #Data rate, wild side to us, mbit (mega bits/second)
UPLINK=50       #Data rate, us to wild side, mbit (mega bits/second)
MTU=1500        #Maximum packet size, bytes
LATEN=50        #Targeted maximum latency, msec (milliseconds)

# Devices being controlled
WILD=eth1
IPv6=he-ipv6
IFB4=ifb0
IFB6=ifb1
# Add controlled devices as they become operative.
DEVS=( "$WILD" "$IFB4" "$IPv6" "$IFB6" )

# The arguments of "try" are executed as a command line.  If it fails, 1 is
# returned, and bad=1.  On success 0 is returned and bad is left untouched,
# so a single failure anywhere stays recorded until the script exits.
bad=0
function try () {
    if "$@" ; then
        return 0
    fi
    bad=1
    return 1
}

# Shows the statistics. List devices to show as $1 $2 ...
function do_show () {
    # For every device named in the arguments, print the qdisc, class and
    # filter statistics reported by tc.
    local dev
    for dev in "$@" ; do
        echo " ---- Statistics for device $dev ----"
        echo " ---- Queue Disciplines ($dev) ----"
        "$TC" -g -s qdisc show dev "$dev"
        echo " ---- Sub Classes ($dev) ----"
        "$TC" -g -s class show dev "$dev"
        echo " ---- Filters on root ($dev) ----"
        "$TC" -g -s filter show dev "$dev" root
        #DEBUG -- https://bugzilla.redhat.com/show_bug.cgi?id=1297230
        echo " ---- Filters on prio queue ($dev) ----"
        "$TC" -g -s filter show dev "$dev" parent 2:
    done
}

# Get rid of pre-existing setup (silently).  List the devices to be cleared
# as $1 $2 ...  Devices that do not exist under /sys/class/net are skipped.
# Always returns 0.
function do_stop () {
    local dev
    for dev in "$@" ; do
        if [[ ! -e /sys/class/net/$dev ]] ; then continue ; fi
        # Failure here just means there was nothing to delete.
        "$TC" qdisc del dev "$dev" root > /dev/null 2>&1
        "$TC" qdisc del dev "$dev" ingress > /dev/null 2>&1
        if [[ $dev =~ ^ifb.* ]] ; then
            ip link set "$dev" down
        fi
    done
    modprobe -r ifb
    return 0
}

# Set up the queueing discipline.  (do_stop first even on initial setup.)
# $1 = net device e.g. eth1;
# $2 = SLA data rate in mbit (mega bits/second).
# Every tc command goes through "try", so any failure sets bad=1.
function do_generic () {
    local NET=$1 SLA=$2
    local RATE=$(( (SLA * 19000) / 20 ))  # 95% of SLA data rate in kbit/s

        # BURST = max number of bytes that can go out during each clock
        # tick.  Not sure how to handle BURST on a no-tick kernel.
        # Likely it is irrelevant.  I'm setting HZ=1000 anyway.
        # It looks like the man page for tc-tbf recommends BURST=10000
        # (bytes) when RATE == 10mbit and HZ=1000 ticks/sec.  This
        # scaling is used below.  HZ is taken from the environment;
        # if it is unset or empty, 1000 is assumed.
    local MYHZ=${HZ:-1000}
    local BURST=$(( (RATE * 1000) / MYHZ ))

        # Token Bucket Filter to choke the flow to 95% of the SLA
        # rate.  Choking is by not dequeuing (vs. dropping packets).
        # (The limit or latency parameters apply to the default bfifo
        # child QD and are irrelevant in this use case, but are
        # required.)
    try "$TC" qdisc add dev "$NET" root handle 1: \
        tbf rate "${RATE}kbit" burst "${BURST}" latency "${LATEN}ms"

        # Creates an inner PRIO QD.  There will be 2 bands; packets in
        # lower numbered bands go out before higher numbers.
        # The priomap is for handling RFC
        # 1349 ToS (deprecated).  The default priomap assumes 3 bands,
        # so it has to be revised, but is never used.  Priomap band
        # numbers are 0,1 whereas you use 1,2 in filter classid.
        # NOTE(review): the filters below are "protocol IP" only, so
        # presumably non-IPv4 packets (e.g. everything on the IPv6
        # tunnel device) are classified by this priomap after all --
        # confirm.
    try "$TC" qdisc add dev "$NET" parent 1:0 handle 2: \
        prio bands 2 priomap 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

        # LIMIT = number of packets ($MTU size) that take 1.5*$LATEN
        # msec to be transmitted.  Mind the units.
        # local LIMIT=$(( 1.5 * (LATEN/1000) * (SLA*1000000) / (8*MTU) ))
        # -- Simplify to (integer arithmetic, 187.5 truncated to 187):
    local LIMIT=$(( (LATEN * SLA * 187) / MTU ))

        # Band 1, for frogs, has a pure codel (all frog packets in 1
        # queue) with a policing filter that brings in all packets in
        # flows whose data rate is under SLA/4.
    try "$TC" qdisc add dev "$NET" parent 2:1 handle 11: \
        codel limit "$LIMIT" ecn

        # This filter places frog packets (rate < $RMAX) in band 1.
        # Filter options:
        # pref 1 -- lower numbered filters are tried first.
        # protocol IP -- applies only to IP packets, vs. IPX, DECnet,
        #       Appletalk, etc.  ICMP, TCP, UDP etc. are Next Header
        #       values identifying sub-protocols contained inside a
        #       IP packet.
        # u32 -- Type of filter (bitmatching).
        # estimator 250ms 4sec -- (1) Count traffic for this interval,
        #       (2) twice the halflife for an exponential moving
        #       average of the interval counts.  Legal values for both
        #       are 250ms and powers of 2 times that.  8x interval is a
        #       good value for the 2*halflife (the command below
        #       actually uses 16x, i.e. 4sec).
        # List of matches -- Required even if matching all packets.
        #       u8 0 0 at 0 means for byte 0, mask off all bits and see
        #       if the result is 0.
        # police -- Test on data rate.
        # avrate $RMAX -- estimator required, match if rate < $RMAX
        # conform-exceed continue/pass -- If conforming (rate < $RMAX)
        #       put the packet in this codel (pass).  If not, do
        #       nothing (continue), let it go to the filter for the
        #       next band.  Keyword is backward, should be
        #       exceed-conform.
        # classid 2:1 -- If all matches are true, put in the CODEL
        #       for band 1
    local RMAX=$(( (SLA * 1000) / 4 ))    # 1/4 SLA in kbit/sec
    try "$TC" filter add dev "$NET" parent 2: pref 1 protocol IP \
        estimator 250ms 4sec \
        u32 match u8 0 0 at 0 \
        police avrate "${RMAX}kbit" conform-exceed continue/pass classid 2:1

        # Band 2 gets traffic from flows whose data rate is $SLA/4 or
        # more (hog flows), i.e. whatever the band 1 policing filter
        # did not take.  It is stuffed with a FQ_CODEL.
        # FQ_CODEL does round robin on
        # each flow.  See tc-codel and tc-drr for some of the
        # parameters.  The important ones are:
        # limit: hard limit on queue size in packets, in case a flow
        #       ignores or botches ECN.  Default is 10240 packets (oink).
        #       I'm using 1.5*latency*SLA/MTU.  Computation above.
        # flows 1024 (default): size of flows hash table.
        # target 5ms (default): queue delay where management starts.
        # interval 100ms (default): history length, use the maximum
        #       expected roundtrip time through the most congested path.
        # quantum 1514b (default): how much data is dequeued at each
        #       robin turn, default is one Ethernet MTU.  See tc-drr.
        # ecn: Use ECN rather than dropping packets, when target cannot
        #       be met.  (A hog is always congested.)
    try "$TC" qdisc add dev "$NET" parent 2:2 handle 12: \
        fq_codel limit "$LIMIT" ecn

        # This is the catch-all filter that snags into band 2 all
        # (IP) packets not taken into band 1.
    try "$TC" filter add dev "$NET" parent 2: pref 2 protocol IP \
        u32 match u8 0 0 at 0 classid 2:2
}

# Sets up an ingress filter.
# $1 = net device e.g. eth1
# $2 = IFB device e.g. ifb0
# $3 = data rate limit (SLA rate) in Mbit/sec.
# Returns 1 (with bad=1) without touching $1 if the IFB device cannot be
# brought up; otherwise returns the status of the final tc command.
function do_ingress () {
    local NET=$1 IFB=$2 SLA=$3

        # Create an IFB interface (Intermediate Functional Block).
        # Idempotent: OK to load the module and bring up the link twice.
        # modprobe stays best-effort; bringing the link up is the real
        # check.  If the IFB cannot be brought up, stop here: the
        # redirect installed below would send every incoming packet of
        # $NET to a device that is not up.
    modprobe ifb
    try ip link set "$IFB" up || return 1

        # Configure the standard queue discipline
        # (TBF - PRIO - FQ_CODEL) on the IFB.
    do_generic "$IFB" "$SLA"

        # Set up downlink (incoming traffic on $1) ingress tap.
        # Handle must be ffff: (or ffff:0).  Arbitrary handle not OK.
        # I don't see this command in any of the man pages for
        # the tc family.  Various blog and wiki posts copy this motif
        # exactly.
    try "$TC" qdisc add dev "$NET" handle ffff:0 ingress

        # Flip the incoming packets over to the IFB that has the
        # complete queue discipline.  On the ingress tap itself you
        # can only have one policing filter, which is too simple and
        # can't do ECN.  Command line arguments:
        # u32: Type of filter (bitmatching).
        # match u8 0 0: Accept the packet if 0 == 0, i.e. all packets.
        # action: If the match(es) match, this is what you do.  There
        #   can be several actions and an outcome of "pipe" means to
        #   do the next action (or same as "pass" if no more).
        # mirred: Mirror or redirect action.  Loads module act_mirred.
        # egress: On the destination ($IFB) the packets look like this
        #   box is sending them (but they are still being sent to this
        #   box, something like lo:).  Ingress is not yet implemented.
        # redirect: Move the packet over; mirror = clone the packet.
        # dev $IFB: where you want to send the packet.
    try "$TC" filter add dev "$NET" parent ffff:0 u32 \
        match u8 0 0 \
        action mirred egress redirect dev "$IFB"
}

# Show the statistics.  Exit, don't change traffic control.
if [[ -n "${opt_s:-}" ]] ; then
    echo " ---- Stats from -s ----"
    do_show "${DEVS[@]}"
    exit 0
fi

case "${1:-}" in
    start | restart | try-restart | reload | force-reload )
        echo -n "${1}ing netpolice " 1>&2
        do_stop "${DEVS[@]}"
        do_generic "$WILD" "$UPLINK"
        do_ingress "$WILD" "$IFB4" "$DOWNLINK"
        do_generic "$IPv6" "$UPLINK"
        do_ingress "$IPv6" "$IFB6" "$DOWNLINK"
        ;;
    stop)
        echo -n "Stopping netpolice " 1>&2
        do_stop "${DEVS[@]}"
        ;;
    status)
        echo -n "Checking for service netpolice " 1>&2
        # Check for a few keywords in the "show" output.  This would
        # detect absence of the mechanism but many incomplete setups
        # would not be detected.  The listing is fetched once and
        # searched three times; an empty listing (tc failed) fails
        # the first search.
        qdisc_listing=$("$TC" qdisc show dev "$WILD")
        if grep -w tbf <<< "$qdisc_listing" > /dev/null &&
           grep -w fq_codel <<< "$qdisc_listing" > /dev/null &&
           grep -w ffff:fff1 <<< "$qdisc_listing" > /dev/null
        then
            :
        else
            bad=3   # Different LSB exit codes for status check
        fi
        ;;
    *)
        echo "Usage: $0 {start|stop|status|try-restart|restart|force-reload|reload}" 1>&2
        bad=3
        ;;
esac

# Report the outcome; with -v also dump the resulting structures.
if (( bad != 0 )) ; then word=failed ; else word=OK ; fi
echo " ... $word" 1>&2
if [[ -n "${opt_v:-}" ]] ; then
    echo " ---- Stats from -v ----"
    do_show "${DEVS[@]}"
    echo "Exit code: $bad" 1>&2
fi
exit $bad