path: root/net/ipv4
author    Linus Torvalds <torvalds@ppc970.osdl.org>  2005-04-16 15:20:36 -0700
committer Linus Torvalds <torvalds@ppc970.osdl.org>  2005-04-16 15:20:36 -0700
commit    1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree      0bba044c4ce775e45a88a51686b5d9f90697ea9d /net/ipv4
Linux-2.6.12-rc2
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig | 411
-rw-r--r--  net/ipv4/Makefile | 33
-rw-r--r--  net/ipv4/af_inet.c | 1188
-rw-r--r--  net/ipv4/ah4.c | 335
-rw-r--r--  net/ipv4/arp.c | 1425
-rw-r--r--  net/ipv4/datagram.c | 73
-rw-r--r--  net/ipv4/devinet.c | 1508
-rw-r--r--  net/ipv4/esp4.c | 510
-rw-r--r--  net/ipv4/fib_frontend.c | 611
-rw-r--r--  net/ipv4/fib_hash.c | 1086
-rw-r--r--  net/ipv4/fib_lookup.h | 43
-rw-r--r--  net/ipv4/fib_rules.c | 437
-rw-r--r--  net/ipv4/fib_semantics.c | 1332
-rw-r--r--  net/ipv4/icmp.c | 1143
-rw-r--r--  net/ipv4/igmp.c | 2473
-rw-r--r--  net/ipv4/inetpeer.c | 460
-rw-r--r--  net/ipv4/ip_forward.c | 127
-rw-r--r--  net/ipv4/ip_fragment.c | 691
-rw-r--r--  net/ipv4/ip_gre.c | 1290
-rw-r--r--  net/ipv4/ip_input.c | 431
-rw-r--r--  net/ipv4/ip_options.c | 625
-rw-r--r--  net/ipv4/ip_output.c | 1359
-rw-r--r--  net/ipv4/ip_sockglue.c | 1093
-rw-r--r--  net/ipv4/ipcomp.c | 524
-rw-r--r--  net/ipv4/ipconfig.c | 1507
-rw-r--r--  net/ipv4/ipip.c | 905
-rw-r--r--  net/ipv4/ipmr.c | 1900
-rw-r--r--  net/ipv4/ipvs/Kconfig | 244
-rw-r--r--  net/ipv4/ipvs/Makefile | 34
-rw-r--r--  net/ipv4/ipvs/ip_vs_app.c | 658
-rw-r--r--  net/ipv4/ipvs/ip_vs_conn.c | 920
-rw-r--r--  net/ipv4/ipvs/ip_vs_core.c | 1191
-rw-r--r--  net/ipv4/ipvs/ip_vs_ctl.c | 2391
-rw-r--r--  net/ipv4/ipvs/ip_vs_dh.c | 258
-rw-r--r--  net/ipv4/ipvs/ip_vs_est.c | 200
-rw-r--r--  net/ipv4/ipvs/ip_vs_ftp.c | 400
-rw-r--r--  net/ipv4/ipvs/ip_vs_lblc.c | 624
-rw-r--r--  net/ipv4/ipvs/ip_vs_lblcr.c | 888
-rw-r--r--  net/ipv4/ipvs/ip_vs_lc.c | 123
-rw-r--r--  net/ipv4/ipvs/ip_vs_nq.c | 161
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto.c | 244
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_ah.c | 177
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_esp.c | 175
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_icmp.c | 182
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_tcp.c | 640
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_udp.c | 427
-rw-r--r--  net/ipv4/ipvs/ip_vs_rr.c | 118
-rw-r--r--  net/ipv4/ipvs/ip_vs_sched.c | 251
-rw-r--r--  net/ipv4/ipvs/ip_vs_sed.c | 163
-rw-r--r--  net/ipv4/ipvs/ip_vs_sh.c | 255
-rw-r--r--  net/ipv4/ipvs/ip_vs_sync.c | 892
-rw-r--r--  net/ipv4/ipvs/ip_vs_wlc.c | 151
-rw-r--r--  net/ipv4/ipvs/ip_vs_wrr.c | 235
-rw-r--r--  net/ipv4/ipvs/ip_vs_xmit.c | 563
-rw-r--r--  net/ipv4/multipath.c | 55
-rw-r--r--  net/ipv4/multipath_drr.c | 265
-rw-r--r--  net/ipv4/multipath_random.c | 128
-rw-r--r--  net/ipv4/multipath_rr.c | 115
-rw-r--r--  net/ipv4/multipath_wrandom.c | 344
-rw-r--r--  net/ipv4/netfilter/Kconfig | 696
-rw-r--r--  net/ipv4/netfilter/Makefile | 89
-rw-r--r--  net/ipv4/netfilter/arp_tables.c | 1333
-rw-r--r--  net/ipv4/netfilter/arpt_mangle.c | 104
-rw-r--r--  net/ipv4/netfilter/arptable_filter.c | 214
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_amanda.c | 167
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_core.c | 1247
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_ftp.c | 501
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_irc.c | 313
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_proto_generic.c | 75
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_proto_icmp.c | 279
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_proto_sctp.c | 649
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 1098
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_proto_udp.c | 146
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_standalone.c | 961
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_tftp.c | 159
-rw-r--r--  net/ipv4/netfilter/ip_nat_amanda.c | 88
-rw-r--r--  net/ipv4/netfilter/ip_nat_core.c | 556
-rw-r--r--  net/ipv4/netfilter/ip_nat_ftp.c | 183
-rw-r--r--  net/ipv4/netfilter/ip_nat_helper.c | 430
-rw-r--r--  net/ipv4/netfilter/ip_nat_irc.c | 125
-rw-r--r--  net/ipv4/netfilter/ip_nat_proto_icmp.c | 115
-rw-r--r--  net/ipv4/netfilter/ip_nat_proto_tcp.c | 178
-rw-r--r--  net/ipv4/netfilter/ip_nat_proto_udp.c | 165
-rw-r--r--  net/ipv4/netfilter/ip_nat_proto_unknown.c | 70
-rw-r--r--  net/ipv4/netfilter/ip_nat_rule.c | 319
-rw-r--r--  net/ipv4/netfilter/ip_nat_snmp_basic.c | 1347
-rw-r--r--  net/ipv4/netfilter/ip_nat_standalone.c | 349
-rw-r--r--  net/ipv4/netfilter/ip_nat_tftp.c | 70
-rw-r--r--  net/ipv4/netfilter/ip_queue.c | 741
-rw-r--r--  net/ipv4/netfilter/ip_tables.c | 1964
-rw-r--r--  net/ipv4/netfilter/ipt_CLASSIFY.c | 92
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c | 761
-rw-r--r--  net/ipv4/netfilter/ipt_CONNMARK.c | 118
-rw-r--r--  net/ipv4/netfilter/ipt_DSCP.c | 106
-rw-r--r--  net/ipv4/netfilter/ipt_ECN.c | 175
-rw-r--r--  net/ipv4/netfilter/ipt_LOG.c | 485
-rw-r--r--  net/ipv4/netfilter/ipt_MARK.c | 162
-rw-r--r--  net/ipv4/netfilter/ipt_MASQUERADE.c | 207
-rw-r--r--  net/ipv4/netfilter/ipt_NETMAP.c | 117
-rw-r--r--  net/ipv4/netfilter/ipt_NOTRACK.c | 76
-rw-r--r--  net/ipv4/netfilter/ipt_REDIRECT.c | 129
-rw-r--r--  net/ipv4/netfilter/ipt_REJECT.c | 335
-rw-r--r--  net/ipv4/netfilter/ipt_SAME.c | 211
-rw-r--r--  net/ipv4/netfilter/ipt_TCPMSS.c | 262
-rw-r--r--  net/ipv4/netfilter/ipt_TOS.c | 105
-rw-r--r--  net/ipv4/netfilter/ipt_ULOG.c | 419
-rw-r--r--  net/ipv4/netfilter/ipt_addrtype.c | 77
-rw-r--r--  net/ipv4/netfilter/ipt_ah.c | 117
-rw-r--r--  net/ipv4/netfilter/ipt_comment.c | 59
-rw-r--r--  net/ipv4/netfilter/ipt_connmark.c | 81
-rw-r--r--  net/ipv4/netfilter/ipt_conntrack.c | 136
-rw-r--r--  net/ipv4/netfilter/ipt_dscp.c | 63
-rw-r--r--  net/ipv4/netfilter/ipt_ecn.c | 131
-rw-r--r--  net/ipv4/netfilter/ipt_esp.c | 118
-rw-r--r--  net/ipv4/netfilter/ipt_hashlimit.c | 731
-rw-r--r--  net/ipv4/netfilter/ipt_helper.c | 113
-rw-r--r--  net/ipv4/netfilter/ipt_iprange.c | 99
-rw-r--r--  net/ipv4/netfilter/ipt_length.c | 64
-rw-r--r--  net/ipv4/netfilter/ipt_limit.c | 157
-rw-r--r--  net/ipv4/netfilter/ipt_mac.c | 79
-rw-r--r--  net/ipv4/netfilter/ipt_mark.c | 64
-rw-r--r--  net/ipv4/netfilter/ipt_multiport.c | 212
-rw-r--r--  net/ipv4/netfilter/ipt_owner.c | 217
-rw-r--r--  net/ipv4/netfilter/ipt_physdev.c | 134
-rw-r--r--  net/ipv4/netfilter/ipt_pkttype.c | 70
-rw-r--r--  net/ipv4/netfilter/ipt_realm.c | 76
-rw-r--r--  net/ipv4/netfilter/ipt_recent.c | 1002
-rw-r--r--  net/ipv4/netfilter/ipt_sctp.c | 203
-rw-r--r--  net/ipv4/netfilter/ipt_state.c | 74
-rw-r--r--  net/ipv4/netfilter/ipt_tcpmss.c | 127
-rw-r--r--  net/ipv4/netfilter/ipt_tos.c | 64
-rw-r--r--  net/ipv4/netfilter/ipt_ttl.c | 79
-rw-r--r--  net/ipv4/netfilter/iptable_filter.c | 194
-rw-r--r--  net/ipv4/netfilter/iptable_mangle.c | 260
-rw-r--r--  net/ipv4/netfilter/iptable_raw.c | 156
-rw-r--r--  net/ipv4/proc.c | 382
-rw-r--r--  net/ipv4/protocol.c | 101
-rw-r--r--  net/ipv4/raw.c | 888
-rw-r--r--  net/ipv4/route.c | 3177
-rw-r--r--  net/ipv4/syncookies.c | 279
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c | 698
-rw-r--r--  net/ipv4/tcp.c | 2386
-rw-r--r--  net/ipv4/tcp_diag.c | 802
-rw-r--r--  net/ipv4/tcp_input.c | 4959
-rw-r--r--  net/ipv4/tcp_ipv4.c | 2663
-rw-r--r--  net/ipv4/tcp_minisocks.c | 1077
-rw-r--r--  net/ipv4/tcp_output.c | 1739
-rw-r--r--  net/ipv4/tcp_timer.c | 656
-rw-r--r--  net/ipv4/udp.c | 1575
-rw-r--r--  net/ipv4/utils.c | 59
-rw-r--r--  net/ipv4/xfrm4_input.c | 160
-rw-r--r--  net/ipv4/xfrm4_output.c | 141
-rw-r--r--  net/ipv4/xfrm4_policy.c | 281
-rw-r--r--  net/ipv4/xfrm4_state.c | 126
-rw-r--r--  net/ipv4/xfrm4_tunnel.c | 144
155 files changed, 82733 insertions, 0 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
new file mode 100644
index 00000000000..6d3e8b1bd1f
--- /dev/null
+++ b/net/ipv4/Kconfig
@@ -0,0 +1,411 @@
+#
+# IP configuration
+#
+config IP_MULTICAST
+ bool "IP: multicasting"
+ depends on INET
+ help
+ This is code for addressing several networked computers at once,
+ enlarging your kernel by about 2 KB. You need multicasting if you
+ intend to participate in the MBONE, a high bandwidth network on top
+ of the Internet which carries audio and video broadcasts. More
+ information about the MBONE is on the WWW at
+ <http://www-itg.lbl.gov/mbone/>. Information about the multicast
+ capabilities of the various network cards is contained in
+ <file:Documentation/networking/multicast.txt>. For most people, it's
+ safe to say N.
+
+config IP_ADVANCED_ROUTER
+ bool "IP: advanced router"
+ depends on INET
+ ---help---
+ If you intend to run your Linux box mostly as a router, i.e. as a
+ computer that forwards and redistributes network packets, say Y; you
+ will then be presented with several options that allow more precise
+ control about the routing process.
+
+ The answer to this question won't directly affect the kernel:
+ answering N will just cause the configurator to skip all the
+ questions about advanced routing.
+
+ Note that your box can only act as a router if you enable IP
+ forwarding in your kernel; you can do that by saying Y to "/proc
+ file system support" and "Sysctl support" below and executing the
+ line
+
+ echo "1" > /proc/sys/net/ipv4/ip_forward
+
+ at boot time after the /proc file system has been mounted.
+
+ If you turn on IP forwarding, you will also get the rp_filter, which
+ automatically rejects incoming packets if the routing table entry
+ for their source address doesn't match the network interface they're
+ arriving on. This has security advantages because it prevents the
+ so-called IP spoofing, however it can pose problems if you use
+ asymmetric routing (packets from you to a host take a different path
+ than packets from that host to you) or if you operate a non-routing
+ host which has several IP addresses on different interfaces. To turn
+ rp_filter off use:
+
+ echo 0 > /proc/sys/net/ipv4/conf/<device>/rp_filter
+ or
+ echo 0 > /proc/sys/net/ipv4/conf/all/rp_filter
+
+ If unsure, say N here.
+
+config IP_MULTIPLE_TABLES
+ bool "IP: policy routing"
+ depends on IP_ADVANCED_ROUTER
+ ---help---
+ Normally, a router decides what to do with a received packet based
+ solely on the packet's final destination address. If you say Y here,
+ the Linux router will also be able to take the packet's source
+ address into account. Furthermore, the TOS (Type-Of-Service) field
+ of the packet can be used for routing decisions as well.
+
+ If you are interested in this, please see the preliminary
+ documentation at <http://www.compendium.com.ar/policy-routing.txt>
+ and <ftp://post.tepkom.ru/pub/vol2/Linux/docs/advanced-routing.tex>.
+ You will need supporting software from
+ <ftp://ftp.tux.org/pub/net/ip-routing/>.
+
+ If unsure, say N.
+
+config IP_ROUTE_FWMARK
+ bool "IP: use netfilter MARK value as routing key"
+ depends on IP_MULTIPLE_TABLES && NETFILTER
+ help
+ If you say Y here, you will be able to specify different routes for
+ packets with different mark values (see iptables(8), MARK target).
+
+config IP_ROUTE_MULTIPATH
+ bool "IP: equal cost multipath"
+ depends on IP_ADVANCED_ROUTER
+ help
+ Normally, the routing tables specify a single action to be taken in
+ a deterministic manner for a given packet. If you say Y here
+ however, it becomes possible to attach several actions to a packet
+ pattern, in effect specifying several alternative paths to travel
+ for those packets. The router considers all these paths to be of
+ equal "cost" and chooses one of them in a non-deterministic fashion
+ if a matching packet arrives.
+
+config IP_ROUTE_MULTIPATH_CACHED
+ bool "IP: equal cost multipath with caching support (EXPERIMENTAL)"
+ depends on IP_ROUTE_MULTIPATH
+ help
+ Normally, equal cost multipath routing is not supported by the
+ routing cache. If you say Y here, alternative routes are cached
+ and on cache lookup a route is chosen in a configurable fashion.
+
+ If unsure, say N.
+
+config IP_ROUTE_MULTIPATH_RR
+ tristate "MULTIPATH: round robin algorithm"
+ depends on IP_ROUTE_MULTIPATH_CACHED
+ help
+ Multipath routes are chosen according to a round-robin algorithm.
+
+config IP_ROUTE_MULTIPATH_RANDOM
+ tristate "MULTIPATH: random algorithm"
+ depends on IP_ROUTE_MULTIPATH_CACHED
+ help
+ Multipath routes are chosen in a random fashion; no weight is
+ attached to a route. The advantage of this policy is that it is
+ implemented statelessly and therefore introduces only a very
+ small delay.
+
+config IP_ROUTE_MULTIPATH_WRANDOM
+ tristate "MULTIPATH: weighted random algorithm"
+ depends on IP_ROUTE_MULTIPATH_CACHED
+ help
+ Multipath routes are chosen in a weighted random fashion.
+ The per-route weights are the weights visible via ip route 2. As
+ the corresponding state management introduces some overhead, the
+ routing delay is increased.
+
+config IP_ROUTE_MULTIPATH_DRR
+ tristate "MULTIPATH: interface round robin algorithm"
+ depends on IP_ROUTE_MULTIPATH_CACHED
+ help
+ Connections are distributed in a round robin fashion over the
+ available interfaces. This policy makes sense if the connections
+ should be primarily distributed on interfaces and not on routes.
+
+config IP_ROUTE_VERBOSE
+ bool "IP: verbose route monitoring"
+ depends on IP_ADVANCED_ROUTER
+ help
+ If you say Y here, which is recommended, then the kernel will print
+ verbose messages regarding the routing, for example warnings about
+ received packets which look strange and could be evidence of an
+ attack or a misconfigured system somewhere. The information is
+ handled by the klogd daemon which is responsible for kernel messages
+ ("man klogd").
+
+config IP_PNP
+ bool "IP: kernel level autoconfiguration"
+ depends on INET
+ help
+ This enables automatic configuration of IP addresses of devices and
+ of the routing table during kernel boot, based on either information
+ supplied on the kernel command line or by BOOTP or RARP protocols.
+ You need to say Y only for diskless machines requiring network
+ access to boot (in which case you want to say Y to "Root file system
+ on NFS" as well), because all other machines configure the network
+ in their startup scripts.
+
+config IP_PNP_DHCP
+ bool "IP: DHCP support"
+ depends on IP_PNP
+ ---help---
+ If you want your Linux box to mount its whole root file system (the
+ one containing the directory /) from some other computer over the
+ net via NFS and you want the IP address of your computer to be
+ discovered automatically at boot time using the DHCP protocol (a
+ special protocol designed for doing this job), say Y here. In case
+ the boot ROM of your network card was designed for booting Linux and
+ does DHCP itself, providing all necessary information on the kernel
+ command line, you can say N here.
+
+ If unsure, say Y. Note that if you want to use DHCP, a DHCP server
+ must be operating on your network. Read
+ <file:Documentation/nfsroot.txt> for details.
+
+config IP_PNP_BOOTP
+ bool "IP: BOOTP support"
+ depends on IP_PNP
+ ---help---
+ If you want your Linux box to mount its whole root file system (the
+ one containing the directory /) from some other computer over the
+ net via NFS and you want the IP address of your computer to be
+ discovered automatically at boot time using the BOOTP protocol (a
+ special protocol designed for doing this job), say Y here. In case
+ the boot ROM of your network card was designed for booting Linux and
+ does BOOTP itself, providing all necessary information on the kernel
+ command line, you can say N here. If unsure, say Y. Note that if you
+ want to use BOOTP, a BOOTP server must be operating on your network.
+ Read <file:Documentation/nfsroot.txt> for details.
+
+config IP_PNP_RARP
+ bool "IP: RARP support"
+ depends on IP_PNP
+ help
+ If you want your Linux box to mount its whole root file system (the
+ one containing the directory /) from some other computer over the
+ net via NFS and you want the IP address of your computer to be
+ discovered automatically at boot time using the RARP protocol (an
+ older protocol which is being obsoleted by BOOTP and DHCP), say Y
+ here. Note that if you want to use RARP, a RARP server must be
+ operating on your network. Read <file:Documentation/nfsroot.txt> for
+ details.
+
+# not yet ready..
+# bool ' IP: ARP support' CONFIG_IP_PNP_ARP
+config NET_IPIP
+ tristate "IP: tunneling"
+ depends on INET
+ select INET_TUNNEL
+ ---help---
+ Tunneling means encapsulating data of one protocol type within
+ another protocol and sending it over a channel that understands the
+ encapsulating protocol. This particular tunneling driver implements
+ encapsulation of IP within IP, which sounds kind of pointless, but
+ can be useful if you want to make your (or some other) machine
+ appear on a different network than it physically is, or to use
+ mobile-IP facilities (allowing laptops to seamlessly move between
+ networks without changing their IP addresses).
+
+ Saying Y to this option will produce two modules ( = code which can
+ be inserted in and removed from the running kernel whenever you
+ want). Most people won't need this and can say N.
+
+config NET_IPGRE
+ tristate "IP: GRE tunnels over IP"
+ depends on INET
+ select XFRM
+ help
+ Tunneling means encapsulating data of one protocol type within
+ another protocol and sending it over a channel that understands the
+ encapsulating protocol. This particular tunneling driver implements
+ GRE (Generic Routing Encapsulation) and at this time allows
+ encapsulating of IPv4 or IPv6 over existing IPv4 infrastructure.
+ This driver is useful if the other endpoint is a Cisco router: Cisco
+ likes GRE much better than the other Linux tunneling driver ("IP
+ tunneling" above). In addition, GRE allows multicast redistribution
+ through the tunnel.
+
+config NET_IPGRE_BROADCAST
+ bool "IP: broadcast GRE over IP"
+ depends on IP_MULTICAST && NET_IPGRE
+ help
+ One application of GRE/IP is to construct a broadcast WAN (Wide Area
+ Network), which looks like a normal Ethernet LAN (Local Area
+ Network), but can be distributed all over the Internet. If you want
+ to do that, say Y here and to "IP multicast routing" below.
+
+config IP_MROUTE
+ bool "IP: multicast routing"
+ depends on IP_MULTICAST
+ help
+ This is used if you want your machine to act as a router for IP
+ packets that have several destination addresses. It is needed on the
+ MBONE, a high bandwidth network on top of the Internet which carries
+ audio and video broadcasts. In order to do that, you would most
+ likely run the program mrouted. Information about the multicast
+ capabilities of the various network cards is contained in
+ <file:Documentation/networking/multicast.txt>. If you haven't heard
+ about it, you don't need it.
+
+config IP_PIMSM_V1
+ bool "IP: PIM-SM version 1 support"
+ depends on IP_MROUTE
+ help
+ Kernel side support for Sparse Mode PIM (Protocol Independent
+ Multicast) version 1. This multicast routing protocol is used widely
+ because Cisco supports it. You need special software to use it
+ (pimd-v1). Please see <http://netweb.usc.edu/pim/> for more
+ information about PIM.
+
+ Say Y if you want to use PIM-SM v1. Note that you can say N here if
+ you just want to use Dense Mode PIM.
+
+config IP_PIMSM_V2
+ bool "IP: PIM-SM version 2 support"
+ depends on IP_MROUTE
+ help
+ Kernel side support for Sparse Mode PIM version 2. In order to use
+ this, you need an experimental routing daemon supporting it (pimd or
+ gated-5). This routing protocol is not used widely, so say N unless
+ you want to play with it.
+
+config ARPD
+ bool "IP: ARP daemon support (EXPERIMENTAL)"
+ depends on INET && EXPERIMENTAL
+ ---help---
+ Normally, the kernel maintains an internal cache which maps IP
+ addresses to hardware addresses on the local network, so that
+ Ethernet/Token Ring/ etc. frames are sent to the proper address on
+ the physical networking layer. For small networks having a few
+ hundred directly connected hosts or less, keeping this address
+ resolution (ARP) cache inside the kernel works well. However,
+ maintaining an internal ARP cache does not work well for very large
+ switched networks, and will use a lot of kernel memory if TCP/IP
+ connections are made to many machines on the network.
+
+ If you say Y here, the kernel's internal ARP cache will never grow
+ to more than 256 entries (the oldest entries are expired in a LIFO
+ manner) and communication will be attempted with the user space ARP
+ daemon arpd. Arpd then answers the address resolution request either
+ from its own cache or by asking the net.
+
+ This code is experimental and also obsolete. If you want to use it,
+ you need to find a version of the daemon arpd on the net somewhere,
+ and you should also say Y to "Kernel/User network link driver",
+ below. If unsure, say N.
+
+config SYN_COOKIES
+ bool "IP: TCP syncookie support (disabled per default)"
+ depends on INET
+ ---help---
+ Normal TCP/IP networking is open to an attack known as "SYN
+ flooding". This denial-of-service attack prevents legitimate remote
+ users from being able to connect to your computer during an ongoing
+ attack and requires very little work from the attacker, who can
+ operate from anywhere on the Internet.
+
+ SYN cookies provide protection against this type of attack. If you
+ say Y here, the TCP/IP stack will use a cryptographic challenge
+ protocol known as "SYN cookies" to enable legitimate users to
+ continue to connect, even when your machine is under attack. There
+ is no need for the legitimate users to change their TCP/IP software;
+ SYN cookies work transparently to them. For technical information
+ about SYN cookies, check out <http://cr.yp.to/syncookies.html>.
+
+ If you are SYN flooded, the source address reported by the kernel is
+ likely to have been forged by the attacker; it is only reported as
+ an aid in tracing the packets to their actual source and should not
+ be taken as absolute truth.
+
+ SYN cookies may prevent correct error reporting on clients when the
+ server is really overloaded. If this happens frequently, it is
+ better to turn them off.
+
+ If you say Y here, note that SYN cookies aren't enabled by default;
+ you can enable them by saying Y to "/proc file system support" and
+ "Sysctl support" below and executing the command
+
+ echo 1 >/proc/sys/net/ipv4/tcp_syncookies
+
+ at boot time after the /proc file system has been mounted.
+
+ If unsure, say N.
+
+config INET_AH
+ tristate "IP: AH transformation"
+ depends on INET
+ select XFRM
+ select CRYPTO
+ select CRYPTO_HMAC
+ select CRYPTO_MD5
+ select CRYPTO_SHA1
+ ---help---
+ Support for IPsec AH.
+
+ If unsure, say Y.
+
+config INET_ESP
+ tristate "IP: ESP transformation"
+ depends on INET
+ select XFRM
+ select CRYPTO
+ select CRYPTO_HMAC
+ select CRYPTO_MD5
+ select CRYPTO_SHA1
+ select CRYPTO_DES
+ ---help---
+ Support for IPsec ESP.
+
+ If unsure, say Y.
+
+config INET_IPCOMP
+ tristate "IP: IPComp transformation"
+ depends on INET
+ select XFRM
+ select INET_TUNNEL
+ select CRYPTO
+ select CRYPTO_DEFLATE
+ ---help---
+ Support for IP Payload Compression Protocol (IPComp) (RFC3173),
+ typically needed for IPsec.
+
+ If unsure, say Y.
+
+config INET_TUNNEL
+ tristate "IP: tunnel transformation"
+ depends on INET
+ select XFRM
+ ---help---
+ Support for generic IP tunnel transformation, which is required by
+ the IP tunneling module as well as tunnel mode IPComp.
+
+ If unsure, say Y.
+
+config IP_TCPDIAG
+ tristate "IP: TCP socket monitoring interface"
+ depends on INET
+ default y
+ ---help---
+ Support for TCP socket monitoring interface used by native Linux
+ tools such as ss. ss is included in iproute2, currently downloadable
+ at <http://developer.osdl.org/dev/iproute2>. If you want IPv6 support
+ and have selected IPv6 as a module, you need to build this as a
+ module too.
+
+ If unsure, say Y.
+
+config IP_TCPDIAG_IPV6
+ def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6)
+
+source "net/ipv4/ipvs/Kconfig"
+
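The options above are compile-time switches, but several of the help texts also describe run-time knobs under /proc/sys/net/ipv4 (ip_forward, rp_filter, tcp_syncookies) that are flipped with echo once /proc is mounted. Below is a minimal user-space sketch of that same interface in C; it is illustrative only, not part of this patch, and the read_sysctl()/write_sysctl() helpers are ad-hoc names rather than kernel or libc APIs.

/*
 * Illustrative only: query and set two of the sysctls quoted in the
 * help texts above through the /proc files they describe.
 * Writing requires root.
 */
#include <stdio.h>
#include <string.h>

static int read_sysctl(const char *path, char *buf, size_t len)
{
	FILE *f = fopen(path, "r");

	if (!f || !fgets(buf, len, f)) {
		if (f)
			fclose(f);
		return -1;
	}
	fclose(f);
	buf[strcspn(buf, "\n")] = '\0';	/* strip trailing newline */
	return 0;
}

static int write_sysctl(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	char val[32];

	if (read_sysctl("/proc/sys/net/ipv4/ip_forward", val, sizeof(val)) == 0)
		printf("ip_forward = %s\n", val);

	/* same effect as: echo 1 > /proc/sys/net/ipv4/tcp_syncookies */
	if (write_sysctl("/proc/sys/net/ipv4/tcp_syncookies", "1") != 0)
		perror("tcp_syncookies");

	return 0;
}

The echo commands quoted in the help texts remain the usual way to do this from boot scripts; the C version only makes explicit that these knobs are ordinary file writes.
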
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
new file mode 100644
index 00000000000..8b379627ebb
--- /dev/null
+++ b/net/ipv4/Makefile
@@ -0,0 +1,33 @@
+#
+# Makefile for the Linux TCP/IP (INET) layer.
+#
+
+obj-y := utils.o route.o inetpeer.o protocol.o \
+ ip_input.o ip_fragment.o ip_forward.o ip_options.o \
+ ip_output.o ip_sockglue.o \
+ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \
+ datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
+ sysctl_net_ipv4.o fib_frontend.o fib_semantics.o fib_hash.o
+
+obj-$(CONFIG_PROC_FS) += proc.o
+obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
+obj-$(CONFIG_IP_MROUTE) += ipmr.o
+obj-$(CONFIG_NET_IPIP) += ipip.o
+obj-$(CONFIG_NET_IPGRE) += ip_gre.o
+obj-$(CONFIG_SYN_COOKIES) += syncookies.o
+obj-$(CONFIG_INET_AH) += ah4.o
+obj-$(CONFIG_INET_ESP) += esp4.o
+obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
+obj-$(CONFIG_INET_TUNNEL) += xfrm4_tunnel.o
+obj-$(CONFIG_IP_PNP) += ipconfig.o
+obj-$(CONFIG_IP_ROUTE_MULTIPATH_RR) += multipath_rr.o
+obj-$(CONFIG_IP_ROUTE_MULTIPATH_RANDOM) += multipath_random.o
+obj-$(CONFIG_IP_ROUTE_MULTIPATH_WRANDOM) += multipath_wrandom.o
+obj-$(CONFIG_IP_ROUTE_MULTIPATH_DRR) += multipath_drr.o
+obj-$(CONFIG_NETFILTER) += netfilter/
+obj-$(CONFIG_IP_VS) += ipvs/
+obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o
+obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
+
+obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
+ xfrm4_output.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
new file mode 100644
index 00000000000..c34dab67e46
--- /dev/null
+++ b/net/ipv4/af_inet.c
@@ -0,0 +1,1188 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * PF_INET protocol family socket handler.
+ *
+ * Version: $Id: af_inet.c,v 1.137 2002/02/01 22:01:03 davem Exp $
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ * Alan Cox, <A.Cox@swansea.ac.uk>
+ *
+ * Changes (see also sock.c)
+ *
+ * piggy,
+ * Karl Knutson : Socket protocol table
+ * A.N.Kuznetsov : Socket death error in accept().
+ * John Richardson : Fix non blocking error in connect()
+ * so sockets that fail to connect
+ * don't return -EINPROGRESS.
+ * Alan Cox : Asynchronous I/O support
+ * Alan Cox : Keep correct socket pointer on sock
+ * structures
+ * when accept() ed
+ * Alan Cox : Semantics of SO_LINGER aren't state
+ * moved to close when you look carefully.
+ * With this fixed and the accept bug fixed
+ * some RPC stuff seems happier.
+ * Niibe Yutaka : 4.4BSD style write async I/O
+ * Alan Cox,
+ * Tony Gale : Fixed reuse semantics.
+ * Alan Cox : bind() shouldn't abort existing but dead
+ * sockets. Stops FTP netin:.. I hope.
+ * Alan Cox : bind() works correctly for RAW sockets.
+ * Note that FreeBSD at least was broken
+ * in this respect so be careful with
+ * compatibility tests...
+ * Alan Cox : routing cache support
+ * Alan Cox : memzero the socket structure for
+ * compactness.
+ * Matt Day : nonblock connect error handler
+ * Alan Cox : Allow large numbers of pending sockets
+ * (eg for big web sites), but only if
+ * specifically application requested.
+ * Alan Cox : New buffering throughout IP. Used
+ * dumbly.
+ * Alan Cox : New buffering now used smartly.
+ * Alan Cox : BSD rather than common sense
+ * interpretation of listen.
+ * Germano Caronni : Assorted small races.
+ * Alan Cox : sendmsg/recvmsg basic support.
+ * Alan Cox : Only sendmsg/recvmsg now supported.
+ * Alan Cox : Locked down bind (see security list).
+ * Alan Cox : Loosened bind a little.
+ * Mike McLagan : ADD/DEL DLCI Ioctls
+ * Willy Konynenberg : Transparent proxying support.
+ * David S. Miller : New socket lookup architecture.
+ * Some other random speedups.
+ * Cyrus Durgin : Cleaned up file for kmod hacks.
+ * Andi Kleen : Fix inet_stream_connect TCP race.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+
+#include <linux/smp_lock.h>
+#include <linux/inet.h>
+#include <linux/igmp.h>
+#include <linux/netdevice.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/arp.h>
+#include <net/route.h>
+#include <net/ip_fib.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/raw.h>
+#include <net/icmp.h>
+#include <net/ipip.h>
+#include <net/inet_common.h>
+#include <net/xfrm.h>
+#ifdef CONFIG_IP_MROUTE
+#include <linux/mroute.h>
+#endif
+
+DEFINE_SNMP_STAT(struct linux_mib, net_statistics);
+
+#ifdef INET_REFCNT_DEBUG
+atomic_t inet_sock_nr;
+#endif
+
+extern void ip_mc_drop_socket(struct sock *sk);
+
+/* The inetsw table contains everything that inet_create needs to
+ * build a new socket.
+ */
+static struct list_head inetsw[SOCK_MAX];
+static DEFINE_SPINLOCK(inetsw_lock);
+
+/* New destruction routine */
+
+void inet_sock_destruct(struct sock *sk)
+{
+ struct inet_sock *inet = inet_sk(sk);
+
+ __skb_queue_purge(&sk->sk_receive_queue);
+ __skb_queue_purge(&sk->sk_error_queue);
+
+ if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) {
+ printk("Attempt to release TCP socket in state %d %p\n",
+ sk->sk_state, sk);
+ return;
+ }
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ printk("Attempt to release alive inet socket %p\n", sk);
+ return;
+ }
+
+ BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
+ BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
+ BUG_TRAP(!sk->sk_wmem_queued);
+ BUG_TRAP(!sk->sk_forward_alloc);
+
+ if (inet->opt)
+ kfree(inet->opt);
+ dst_release(sk->sk_dst_cache);
+#ifdef INET_REFCNT_DEBUG
+ atomic_dec(&inet_sock_nr);
+ printk(KERN_DEBUG "INET socket %p released, %d are still alive\n",
+ sk, atomic_read(&inet_sock_nr));
+#endif
+}
+
+/*
+ * The routines beyond this point handle the behaviour of an AF_INET
+ * socket object. Mostly it punts to the subprotocols of IP to do
+ * the work.
+ */
+
+/*
+ * Automatically bind an unbound socket.
+ */
+
+static int inet_autobind(struct sock *sk)
+{
+ struct inet_sock *inet;
+ /* We may need to bind the socket. */
+ lock_sock(sk);
+ inet = inet_sk(sk);
+ if (!inet->num) {
+ if (sk->sk_prot->get_port(sk, 0)) {
+ release_sock(sk);
+ return -EAGAIN;
+ }
+ inet->sport = htons(inet->num);
+ }
+ release_sock(sk);
+ return 0;
+}
+
+/*
+ * Move a socket into listening state.
+ */
+int inet_listen(struct socket *sock, int backlog)
+{
+ struct sock *sk = sock->sk;
+ unsigned char old_state;
+ int err;
+
+ lock_sock(sk);
+
+ err = -EINVAL;
+ if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
+ goto out;
+
+ old_state = sk->sk_state;
+ if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+ goto out;
+
+ /* Really, if the socket is already in listen state
+ * we can only allow the backlog to be adjusted.
+ */
+ if (old_state != TCP_LISTEN) {
+ err = tcp_listen_start(sk);
+ if (err)
+ goto out;
+ }
+ sk->sk_max_ack_backlog = backlog;
+ err = 0;
+
+out:
+ release_sock(sk);
+ return err;
+}
+
+/*
+ * Create an inet socket.
+ */
+
+static int inet_create(struct socket *sock, int protocol)
+{
+ struct sock *sk;
+ struct list_head *p;
+ struct inet_protosw *answer;
+ struct inet_sock *inet;
+ struct proto *answer_prot;
+ unsigned char answer_flags;
+ char answer_no_check;
+ int err;
+
+ sock->state = SS_UNCONNECTED;
+
+ /* Look for the requested type/protocol pair. */
+ answer = NULL;
+ rcu_read_lock();
+ list_for_each_rcu(p, &inetsw[sock->type]) {
+ answer = list_entry(p, struct inet_protosw, list);
+
+ /* Check the non-wild match. */
+ if (protocol == answer->protocol) {
+ if (protocol != IPPROTO_IP)
+ break;
+ } else {
+ /* Check for the two wild cases. */
+ if (IPPROTO_IP == protocol) {
+ protocol = answer->protocol;
+ break;
+ }
+ if (IPPROTO_IP == answer->protocol)
+ break;
+ }
+ answer = NULL;
+ }
+
+ err = -ESOCKTNOSUPPORT;
+ if (!answer)
+ goto out_rcu_unlock;
+ err = -EPERM;
+ if (answer->capability > 0 && !capable(answer->capability))
+ goto out_rcu_unlock;
+ err = -EPROTONOSUPPORT;
+ if (!protocol)
+ goto out_rcu_unlock;
+
+ sock->ops = answer->ops;
+ answer_prot = answer->prot;
+ answer_no_check = answer->no_check;
+ answer_flags = answer->flags;
+ rcu_read_unlock();
+
+ BUG_TRAP(answer_prot->slab != NULL);
+
+ err = -ENOBUFS;
+ sk = sk_alloc(PF_INET, GFP_KERNEL, answer_prot, 1);
+ if (sk == NULL)
+ goto out;
+
+ err = 0;
+ sk->sk_no_check = answer_no_check;
+ if (INET_PROTOSW_REUSE & answer_flags)
+ sk->sk_reuse = 1;
+
+ inet = inet_sk(sk);
+
+ if (SOCK_RAW == sock->type) {
+ inet->num = protocol;
+ if (IPPROTO_RAW == protocol)
+ inet->hdrincl = 1;
+ }
+
+ if (ipv4_config.no_pmtu_disc)
+ inet->pmtudisc = IP_PMTUDISC_DONT;
+ else
+ inet->pmtudisc = IP_PMTUDISC_WANT;
+
+ inet->id = 0;
+
+ sock_init_data(sock, sk);
+
+ sk->sk_destruct = inet_sock_destruct;
+ sk->sk_family = PF_INET;
+ sk->sk_protocol = protocol;
+ sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
+
+ inet->uc_ttl = -1;
+ inet->mc_loop = 1;
+ inet->mc_ttl = 1;
+ inet->mc_index = 0;
+ inet->mc_list = NULL;
+
+#ifdef INET_REFCNT_DEBUG
+ atomic_inc(&inet_sock_nr);
+#endif
+
+ if (inet->num) {
+ /* It assumes that any protocol which allows
+ * the user to assign a number at socket
+ * creation time automatically
+ * shares.
+ */
+ inet->sport = htons(inet->num);
+ /* Add to protocol hash chains. */
+ sk->sk_prot->hash(sk);
+ }
+
+ if (sk->sk_prot->init) {
+ err = sk->sk_prot->init(sk);
+ if (err)
+ sk_common_release(sk);
+ }
+out:
+ return err;
+out_rcu_unlock:
+ rcu_read_unlock();
+ goto out;
+}
+
+
+/*
+ * The peer socket should always be NULL (or else). When we call this
+ * function we are destroying the object and from then on nobody
+ * should refer to it.
+ */
+int inet_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ if (sk) {
+ long timeout;
+
+ /* Applications forget to leave groups before exiting */
+ ip_mc_drop_socket(sk);
+
+ /* If linger is set, we don't return until the close
+ * is complete. Otherwise we return immediately. The
+ * actually closing is done the same either way.
+ *
+ * If the close is due to the process exiting, we never
+ * linger..
+ */
+ timeout = 0;
+ if (sock_flag(sk, SOCK_LINGER) &&
+ !(current->flags & PF_EXITING))
+ timeout = sk->sk_lingertime;
+ sock->sk = NULL;
+ sk->sk_prot->close(sk, timeout);
+ }
+ return 0;
+}
+
+/* It is off by default, see below. */
+int sysctl_ip_nonlocal_bind;
+
+int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+ struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
+ struct sock *sk = sock->sk;
+ struct inet_sock *inet = inet_sk(sk);
+ unsigned short snum;
+ int chk_addr_ret;
+ int err;
+
+ /* If the socket has its own bind function then use it. (RAW) */
+ if (sk->sk_prot->bind) {
+ err = sk->sk_prot->bind(sk, uaddr, addr_len);
+ goto out;
+ }
+ err = -EINVAL;
+ if (addr_len < sizeof(struct sockaddr_in))
+ goto out;
+
+ chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr);
+
+ /* Not specified by any standard per-se, however it breaks too
+ * many applications when removed. It is unfortunate since
+ * allowing applications to make a non-local bind solves
+ * several problems with systems using dynamic addressing.
+ * (ie. your servers still start up even if your ISDN link
+ * is temporarily down)
+ */
+ err = -EADDRNOTAVAIL;
+ if (!sysctl_ip_nonlocal_bind &&
+ !inet->freebind &&
+ addr->sin_addr.s_addr != INADDR_ANY &&
+ chk_addr_ret != RTN_LOCAL &&
+ chk_addr_ret != RTN_MULTICAST &&
+ chk_addr_ret != RTN_BROADCAST)
+ goto out;
+
+ snum = ntohs(addr->sin_port);
+ err = -EACCES;
+ if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
+ goto out;
+
+ /* We keep a pair of addresses. rcv_saddr is the one
+ * used by hash lookups, and saddr is used for transmit.
+ *
+ * In the BSD API these are the same except where it
+ * would be illegal to use them (multicast/broadcast) in
+ * which case the sending device address is used.
+ */
+ lock_sock(sk);
+
+ /* Check these errors (active socket, double bind). */
+ err = -EINVAL;
+ if (sk->sk_state != TCP_CLOSE || inet->num)
+ goto out_release_sock;
+
+ inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr;
+ if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
+ inet->saddr = 0; /* Use device */
+
+ /* Make sure we are allowed to bind here. */
+ if (sk->sk_prot->get_port(sk, snum)) {
+ inet->saddr = inet->rcv_saddr = 0;
+ err = -EADDRINUSE;
+ goto out_release_sock;
+ }
+
+ if (inet->rcv_saddr)
+ sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
+ if (snum)
+ sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
+ inet->sport = htons(inet->num);
+ inet->daddr = 0;
+ inet->dport = 0;
+ sk_dst_reset(sk);
+ err = 0;
+out_release_sock:
+ release_sock(sk);
+out:
+ return err;
+}
+
+int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
+ int addr_len, int flags)
+{
+ struct sock *sk = sock->sk;
+
+ if (uaddr->sa_family == AF_UNSPEC)
+ return sk->sk_prot->disconnect(sk, flags);
+
+ if (!inet_sk(sk)->num && inet_autobind(sk))
+ return -EAGAIN;
+ return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
+}
+
+static long inet_wait_for_connect(struct sock *sk, long timeo)
+{
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+
+ /* Basic assumption: if someone sets sk->sk_err, he _must_
+ * change state of the socket from TCP_SYN_*.
+ * Connect() does not allow to get error notifications
+ * without closing the socket.
+ */
+ while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+ release_sock(sk);
+ timeo = schedule_timeout(timeo);
+ lock_sock(sk);
+ if (signal_pending(current) || !timeo)
+ break;
+ prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+ }
+ finish_wait(sk->sk_sleep, &wait);
+ return timeo;
+}
+
+/*
+ * Connect to a remote host. There is regrettably still a little
+ * TCP 'magic' in here.
+ */
+int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
+{
+ struct sock *sk = sock->sk;
+ int err;
+ long timeo;
+
+ lock_sock(sk);
+
+ if (uaddr->sa_family == AF_UNSPEC) {
+ err = sk->sk_prot->disconnect(sk, flags);
+ sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+ goto out;
+ }
+
+ switch (sock->state) {
+ default:
+ err = -EINVAL;
+ goto out;
+ case SS_CONNECTED:
+ err = -EISCONN;
+ goto out;
+ case SS_CONNECTING:
+ err = -EALREADY;
+ /* Fall out of switch with err, set for this state */
+ break;
+ case SS_UNCONNECTED:
+ err = -EISCONN;
+ if (sk->sk_state != TCP_CLOSE)
+ goto out;
+
+ err = sk->sk_prot->connect(sk, uaddr, addr_len);
+ if (err < 0)
+ goto out;
+
+ sock->state = SS_CONNECTING;
+
+ /* Just entered SS_CONNECTING state; the only
+ * difference is that return value in non-blocking
+ * case is EINPROGRESS, rather than EALREADY.
+ */
+ err = -EINPROGRESS;
+ break;
+ }
+
+ timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
+
+ if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+ /* Error code is set above */
+ if (!timeo || !inet_wait_for_connect(sk, timeo))
+ goto out;
+
+ err = sock_intr_errno(timeo);
+ if (signal_pending(current))
+ goto out;
+ }
+
+ /* Connection was closed by RST, timeout, ICMP error
+ * or another process disconnected us.
+ */
+ if (sk->sk_state == TCP_CLOSE)
+ goto sock_error;
+
+ /* sk->sk_err may not be zero now, if RECVERR was ordered by user
+ * and an error was received after the socket entered established state.
+ * Hence, it is handled normally after connect() returns successfully.
+ */
+
+ sock->state = SS_CONNECTED;
+ err = 0;
+out:
+ release_sock(sk);
+ return err;
+
+sock_error:
+ err = sock_error(sk) ? : -ECONNABORTED;
+ sock->state = SS_UNCONNECTED;
+ if (sk->sk_prot->disconnect(sk, flags))
+ sock->state = SS_DISCONNECTING;
+ goto out;
+}
+
+/*
+ * Accept a pending connection. The TCP layer now gives BSD semantics.
+ */
+
+int inet_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+ struct sock *sk1 = sock->sk;
+ int err = -EINVAL;
+ struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err);
+
+ if (!sk2)
+ goto do_err;
+
+ lock_sock(sk2);
+
+ BUG_TRAP((1 << sk2->sk_state) &
+ (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE));
+
+ sock_graft(sk2, newsock);
+
+ newsock->state = SS_CONNECTED;
+ err = 0;
+ release_sock(sk2);
+do_err:
+ return err;
+}
+
+
+/*
+ * This does both peername and sockname.
+ */
+int inet_getname(struct socket *sock, struct sockaddr *uaddr,
+ int *uaddr_len, int peer)
+{
+ struct sock *sk = sock->sk;
+ struct inet_sock *inet = inet_sk(sk);
+ struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+
+ sin->sin_family = AF_INET;
+ if (peer) {
+ if (!inet->dport ||
+ (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
+ peer == 1))
+ return -ENOTCONN;
+ sin->sin_port = inet->dport;
+ sin->sin_addr.s_addr = inet->daddr;
+ } else {
+ __u32 addr = inet->rcv_saddr;
+ if (!addr)
+ addr = inet->saddr;
+ sin->sin_port = inet->sport;
+ sin->sin_addr.s_addr = addr;
+ }
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ *uaddr_len = sizeof(*sin);
+ return 0;
+}
+
+int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+ size_t size)
+{
+ struct sock *sk = sock->sk;
+
+ /* We may need to bind the socket. */
+ if (!inet_sk(sk)->num && inet_autobind(sk))
+ return -EAGAIN;
+
+ return sk->sk_prot->sendmsg(iocb, sk, msg, size);
+}
+
+
+static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
+{
+ struct sock *sk = sock->sk;
+
+ /* We may need to bind the socket. */
+ if (!inet_sk(sk)->num && inet_autobind(sk))
+ return -EAGAIN;
+
+ if (sk->sk_prot->sendpage)
+ return sk->sk_prot->sendpage(sk, page, offset, size, flags);
+ return sock_no_sendpage(sock, page, offset, size, flags);
+}
+
+
+int inet_shutdown(struct socket *sock, int how)
+{
+ struct sock *sk = sock->sk;
+ int err = 0;
+
+ /* This should really check to make sure
+ * the socket is a TCP socket. (WHY AC...)
+ */
+ how++; /* maps 0->1 has the advantage of making bit 1 rcvs and
+ 1->2 bit 2 snds.
+ 2->3 */
+ if ((how & ~SHUTDOWN_MASK) || !how) /* MAXINT->0 */
+ return -EINVAL;
+
+ lock_sock(sk);
+ if (sock->state == SS_CONNECTING) {
+ if ((1 << sk->sk_state) &
+ (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))
+ sock->state = SS_DISCONNECTING;
+ else
+ sock->state = SS_CONNECTED;
+ }
+
+ switch (sk->sk_state) {
+ case TCP_CLOSE:
+ err = -ENOTCONN;
+ /* Hack to wake up other listeners, who can poll for
+ POLLHUP, even on eg. unconnected UDP sockets -- RR */
+ default:
+ sk->sk_shutdown |= how;
+ if (sk->sk_prot->shutdown)
+ sk->sk_prot->shutdown(sk, how);
+ break;
+
+ /* Remaining two branches are temporary solution for missing
+ * close() in multithreaded environment. It is _not_ a good idea,
+ * but we have no choice until close() is repaired at VFS level.
+ */
+ case TCP_LISTEN:
+ if (!(how & RCV_SHUTDOWN))
+ break;
+ /* Fall through */
+ case TCP_SYN_SENT:
+ err = sk->sk_prot->disconnect(sk, O_NONBLOCK);
+ sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+ break;
+ }
+
+ /* Wake up anyone sleeping in poll. */
+ sk->sk_state_change(sk);
+ release_sock(sk);
+ return err;
+}
+
+/*
+ * ioctl() calls you can issue on an INET socket. Most of these are
+ * device configuration and stuff and very rarely used. Some ioctls
+ * pass on to the socket itself.
+ *
+ * NOTE: I like the idea of a module for the config stuff. ie ifconfig
+ * loads the devconfigure module, does its configuring and unloads it.
+ * There's a good 20K of config code hanging around the kernel.
+ */
+
+int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ struct sock *sk = sock->sk;
+ int err = 0;
+
+ switch (cmd) {
+ case SIOCGSTAMP:
+ err = sock_get_timestamp(sk, (struct timeval __user *)arg);
+ break;
+ case SIOCADDRT:
+ case SIOCDELRT:
+ case SIOCRTMSG:
+ err = ip_rt_ioctl(cmd, (void __user *)arg);
+ break;
+ case SIOCDARP:
+ case SIOCGARP:
+ case SIOCSARP:
+ err = arp_ioctl(cmd, (void __user *)arg);
+ break;
+ case SIOCGIFADDR:
+ case SIOCSIFADDR:
+ case SIOCGIFBRDADDR:
+ case SIOCSIFBRDADDR:
+ case SIOCGIFNETMASK:
+ case SIOCSIFNETMASK:
+ case SIOCGIFDSTADDR:
+ case SIOCSIFDSTADDR:
+ case SIOCSIFPFLAGS:
+ case SIOCGIFPFLAGS:
+ case SIOCSIFFLAGS:
+ err = devinet_ioctl(cmd, (void __user *)arg);
+ break;
+ default:
+ if (!sk->sk_prot->ioctl ||
+ (err = sk->sk_prot->ioctl(sk, cmd, arg)) ==
+ -ENOIOCTLCMD)
+ err = dev_ioctl(cmd, (void __user *)arg);
+ break;
+ }
+ return err;
+}
+
+struct proto_ops inet_stream_ops = {
+ .family = PF_INET,
+ .owner = THIS_MODULE,
+ .release = inet_release,
+ .bind = inet_bind,
+ .connect = inet_stream_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = inet_accept,
+ .getname = inet_getname,
+ .poll = tcp_poll,
+ .ioctl = inet_ioctl,
+ .listen = inet_listen,
+ .shutdown = inet_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = sock_common_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = tcp_sendpage
+};
+
+struct proto_ops inet_dgram_ops = {
+ .family = PF_INET,
+ .owner = THIS_MODULE,
+ .release = inet_release,
+ .bind = inet_bind,
+ .connect = inet_dgram_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = inet_getname,
+ .poll = udp_poll,
+ .ioctl = inet_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = inet_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = sock_common_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = inet_sendpage,
+};
+
+/*
+ * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
+ * udp_poll
+ */
+static struct proto_ops inet_sockraw_ops = {
+ .family = PF_INET,
+ .owner = THIS_MODULE,
+ .release = inet_release,
+ .bind = inet_bind,
+ .connect = inet_dgram_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = inet_getname,
+ .poll = datagram_poll,
+ .ioctl = inet_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = inet_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = sock_common_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = inet_sendpage,
+};
+
+static struct net_proto_family inet_family_ops = {
+ .family = PF_INET,
+ .create = inet_create,
+ .owner = THIS_MODULE,
+};
+
+
+extern void tcp_init(void);
+extern void tcp_v4_init(struct net_proto_family *);
+
+/* Upon startup we insert all the elements in inetsw_array[] into
+ * the linked list inetsw.
+ */
+static struct inet_protosw inetsw_array[] =
+{
+ {
+ .type = SOCK_STREAM,
+ .protocol = IPPROTO_TCP,
+ .prot = &tcp_prot,
+ .ops = &inet_stream_ops,
+ .capability = -1,
+ .no_check = 0,
+ .flags = INET_PROTOSW_PERMANENT,
+ },
+
+ {
+ .type = SOCK_DGRAM,
+ .protocol = IPPROTO_UDP,
+ .prot = &udp_prot,
+ .ops = &inet_dgram_ops,
+ .capability = -1,
+ .no_check = UDP_CSUM_DEFAULT,
+ .flags = INET_PROTOSW_PERMANENT,
+ },
+
+
+ {
+ .type = SOCK_RAW,
+ .protocol = IPPROTO_IP, /* wild card */
+ .prot = &raw_prot,
+ .ops = &inet_sockraw_ops,
+ .capability = CAP_NET_RAW,
+ .no_check = UDP_CSUM_DEFAULT,
+ .flags = INET_PROTOSW_REUSE,
+ }
+};
+
+#define INETSW_ARRAY_LEN (sizeof(inetsw_array) / sizeof(struct inet_protosw))
+
+void inet_register_protosw(struct inet_protosw *p)
+{
+ struct list_head *lh;
+ struct inet_protosw *answer;
+ int protocol = p->protocol;
+ struct list_head *last_perm;
+
+ spin_lock_bh(&inetsw_lock);
+
+ if (p->type >= SOCK_MAX)
+ goto out_illegal;
+
+ /* If we are trying to override a permanent protocol, bail. */
+ answer = NULL;
+ last_perm = &inetsw[p->type];
+ list_for_each(lh, &inetsw[p->type]) {
+ answer = list_entry(lh, struct inet_protosw, list);
+
+ /* Check only the non-wild match. */
+ if (INET_PROTOSW_PERMANENT & answer->flags) {
+ if (protocol == answer->protocol)
+ break;
+ last_perm = lh;
+ }
+
+ answer = NULL;
+ }
+ if (answer)
+ goto out_permanent;
+
+ /* Add the new entry after the last permanent entry if any, so that
+ * the new entry does not override a permanent entry when matched with
+ * a wild-card protocol. But it is allowed to override any existing
+ * non-permanent entry. This means that when we remove this entry, the
+ * system automatically returns to the old behavior.
+ */
+ list_add_rcu(&p->list, last_perm);
+out:
+ spin_unlock_bh(&inetsw_lock);
+
+ synchronize_net();
+
+ return;
+
+out_permanent:
+ printk(KERN_ERR "Attempt to override permanent protocol %d.\n",
+ protocol);
+ goto out;
+
+out_illegal:
+ printk(KERN_ERR
+ "Ignoring attempt to register invalid socket type %d.\n",
+ p->type);
+ goto out;
+}
+
+void inet_unregister_protosw(struct inet_protosw *p)
+{
+ if (INET_PROTOSW_PERMANENT & p->flags) {
+ printk(KERN_ERR
+ "Attempt to unregister permanent protocol %d.\n",
+ p->protocol);
+ } else {
+ spin_lock_bh(&inetsw_lock);
+ list_del_rcu(&p->list);
+ spin_unlock_bh(&inetsw_lock);
+
+ synchronize_net();
+ }
+}
+
+#ifdef CONFIG_IP_MULTICAST
+static struct net_protocol igmp_protocol = {
+ .handler = igmp_rcv,
+};
+#endif
+
+static struct net_protocol tcp_protocol = {
+ .handler = tcp_v4_rcv,
+ .err_handler = tcp_v4_err,
+ .no_policy = 1,
+};
+
+static struct net_protocol udp_protocol = {
+ .handler = udp_rcv,
+ .err_handler = udp_err,
+ .no_policy = 1,
+};
+
+static struct net_protocol icmp_protocol = {
+ .handler = icmp_rcv,
+};
+
+static int __init init_ipv4_mibs(void)
+{
+ net_statistics[0] = alloc_percpu(struct linux_mib);
+ net_statistics[1] = alloc_percpu(struct linux_mib);
+ ip_statistics[0] = alloc_percpu(struct ipstats_mib);
+ ip_statistics[1] = alloc_percpu(struct ipstats_mib);
+ icmp_statistics[0] = alloc_percpu(struct icmp_mib);
+ icmp_statistics[1] = alloc_percpu(struct icmp_mib);
+ tcp_statistics[0] = alloc_percpu(struct tcp_mib);
+ tcp_statistics[1] = alloc_percpu(struct tcp_mib);
+ udp_statistics[0] = alloc_percpu(struct udp_mib);
+ udp_statistics[1] = alloc_percpu(struct udp_mib);
+ if (!(net_statistics[0] && net_statistics[1] &&
+       ip_statistics[0] && ip_statistics[1] &&
+       tcp_statistics[0] && tcp_statistics[1] &&
+       udp_statistics[0] && udp_statistics[1]))
+ return -ENOMEM;
+
+ (void) tcp_mib_init();
+
+ return 0;
+}
+
+static int ipv4_proc_init(void);
+extern void ipfrag_init(void);
+
+static int __init inet_init(void)
+{
+ struct sk_buff *dummy_skb;
+ struct inet_protosw *q;
+ struct list_head *r;
+ int rc = -EINVAL;
+
+ if (sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb)) {
+ printk(KERN_CRIT "%s: panic\n", __FUNCTION__);
+ goto out;
+ }
+
+ rc = proto_register(&tcp_prot, 1);
+ if (rc)
+ goto out;
+
+ rc = proto_register(&udp_prot, 1);
+ if (rc)
+ goto out_unregister_tcp_proto;
+
+ rc = proto_register(&raw_prot, 1);
+ if (rc)
+ goto out_unregister_udp_proto;
+
+ /*
+ * Tell SOCKET that we are alive...
+ */
+
+ (void)sock_register(&inet_family_ops);
+
+ /*
+ * Add all the base protocols.
+ */
+
+ if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
+ printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");
+ if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
+ printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");
+ if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
+ printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");
+#ifdef CONFIG_IP_MULTICAST
+ if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
+ printk(KERN_CRIT "inet_init: Cannot add IGMP protocol\n");
+#endif
+
+ /* Register the socket-side information for inet_create. */
+ for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
+ INIT_LIST_HEAD(r);
+
+ for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
+ inet_register_protosw(q);
+
+ /*
+ * Set the ARP module up
+ */
+
+ arp_init();
+
+ /*
+ * Set the IP module up
+ */
+
+ ip_init();
+
+ tcp_v4_init(&inet_family_ops);
+
+ /* Setup TCP slab cache for open requests. */
+ tcp_init();
+
+
+ /*
+ * Set the ICMP layer up
+ */
+
+ icmp_init(&inet_family_ops);
+
+ /*
+ * Initialise the multicast router
+ */
+#if defined(CONFIG_IP_MROUTE)
+ ip_mr_init();
+#endif
+ /*
+ * Initialise per-cpu ipv4 mibs
+ */
+
+ if (init_ipv4_mibs())
+ printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n");
+
+ ipv4_proc_init();
+
+ ipfrag_init();
+
+ rc = 0;
+out:
+ return rc;
+out_unregister_tcp_proto:
+ proto_unregister(&tcp_prot);
+out_unregister_udp_proto:
+ proto_unregister(&udp_prot);
+ goto out;
+}
+
+module_init(inet_init);
+
+/* ------------------------------------------------------------------------ */
+
+#ifdef CONFIG_PROC_FS
+extern int fib_proc_init(void);
+extern void fib_proc_exit(void);
+extern int ip_misc_proc_init(void);
+extern int raw_proc_init(void);
+extern void raw_proc_exit(void);
+extern int tcp4_proc_init(void);
+extern void tcp4_proc_exit(void);
+extern int udp4_proc_init(void);
+extern void udp4_proc_exit(void);
+
+static int __init ipv4_proc_init(void)
+{
+ int rc = 0;
+
+ if (raw_proc_init())
+ goto out_raw;
+ if (tcp4_proc_init())
+ goto out_tcp;
+ if (udp4_proc_init())
+ goto out_udp;
+ if (fib_proc_init())
+ goto out_fib;
+ if (ip_misc_proc_init())
+ goto out_misc;
+out:
+ return rc;
+out_misc:
+ fib_proc_exit();
+out_fib:
+ udp4_proc_exit();
+out_udp:
+ tcp4_proc_exit();
+out_tcp:
+ raw_proc_exit();
+out_raw:
+ rc = -ENOMEM;
+ goto out;
+}
+
+#else /* CONFIG_PROC_FS */
+static int __init ipv4_proc_init(void)
+{
+ return 0;
+}
+#endif /* CONFIG_PROC_FS */
+
+MODULE_ALIAS_NETPROTO(PF_INET);
+
+EXPORT_SYMBOL(inet_accept);
+EXPORT_SYMBOL(inet_bind);
+EXPORT_SYMBOL(inet_dgram_connect);
+EXPORT_SYMBOL(inet_dgram_ops);
+EXPORT_SYMBOL(inet_getname);
+EXPORT_SYMBOL(inet_ioctl);
+EXPORT_SYMBOL(inet_listen);
+EXPORT_SYMBOL(inet_register_protosw);
+EXPORT_SYMBOL(inet_release);
+EXPORT_SYMBOL(inet_sendmsg);
+EXPORT_SYMBOL(inet_shutdown);
+EXPORT_SYMBOL(inet_sock_destruct);
+EXPORT_SYMBOL(inet_stream_connect);
+EXPORT_SYMBOL(inet_stream_ops);
+EXPORT_SYMBOL(inet_unregister_protosw);
+EXPORT_SYMBOL(net_statistics);
+
+#ifdef INET_REFCNT_DEBUG
+EXPORT_SYMBOL(inet_sock_nr);
+#endif
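The wild-card matching loop in inet_create() above is what lets user space pass protocol 0 and still get a concrete protocol: the SOCK_STREAM and SOCK_DGRAM entries in inetsw_array match the IPPROTO_IP wild card and fill in TCP or UDP, while the SOCK_RAW entry carries .capability = CAP_NET_RAW and is refused for unprivileged callers. A small user-space sketch (not part of this patch, plain POSIX socket calls) of how those paths are exercised:

/*
 * Illustrative only: exercise the inetsw matching in inet_create().
 */
#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>

int main(void)
{
	/* protocol 0 is the IPPROTO_IP wild card; the SOCK_STREAM entry
	 * in inetsw_array resolves it to IPPROTO_TCP */
	int tcp = socket(AF_INET, SOCK_STREAM, 0);

	/* an explicit, matching protocol takes the non-wild branch */
	int udp = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);

	/* the SOCK_RAW entry requires CAP_NET_RAW, so this fails with
	 * EPERM for an unprivileged process */
	int raw = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);

	printf("tcp=%d udp=%d raw=%d%s%s\n", tcp, udp, raw,
	       raw < 0 ? " raw error: " : "",
	       raw < 0 ? strerror(errno) : "");

	if (tcp >= 0)
		close(tcp);
	if (udp >= 0)
		close(udp);
	if (raw >= 0)
		close(raw);
	return 0;
}

A type/protocol pair that matches no inetsw entry falls out of the loop with answer == NULL, and the caller sees -ESOCKTNOSUPPORT, as set at the top of the error path in inet_create().
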
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
new file mode 100644
index 00000000000..0e98f2235b6
--- /dev/null
+++ b/net/ipv4/ah4.c
@@ -0,0 +1,335 @@
+#include <linux/config.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/ah.h>
+#include <linux/crypto.h>
+#include <linux/pfkeyv2.h>
+#include <net/icmp.h>
+#include <asm/scatterlist.h>
+
+
+/* Clear mutable options and find final destination to substitute
+ * into IP header for icv calculation. Options are already checked
+ * for validity, so paranoia is not required. */
+
+static int ip_clear_mutable_options(struct iphdr *iph, u32 *daddr)
+{
+ unsigned char * optptr = (unsigned char*)(iph+1);
+ int l = iph->ihl*4 - sizeof(struct iphdr);
+ int optlen;
+
+ while (l > 0) {
+ switch (*optptr) {
+ case IPOPT_END:
+ return 0;
+ case IPOPT_NOOP:
+ l--;
+ optptr++;
+ continue;
+ }
+ optlen = optptr[1];
+ if (optlen<2 || optlen>l)
+ return -EINVAL;
+ switch (*optptr) {
+ case IPOPT_SEC:
+ case 0x85: /* Some "Extended Security" crap. */
+ case 0x86: /* Another "Commercial Security" crap. */
+ case IPOPT_RA:
+ case 0x80|21: /* RFC1770 */
+ break;
+ case IPOPT_LSRR:
+ case IPOPT_SSRR:
+ if (optlen < 6)
+ return -EINVAL;
+ memcpy(daddr, optptr+optlen-4, 4);
+ /* Fall through */
+ default:
+ memset(optptr+2, 0, optlen-2);
+ }
+ l -= optlen;
+ optptr += optlen;
+ }
+ return 0;
+}
+
+static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err;
+ struct iphdr *iph, *top_iph;
+ struct ip_auth_hdr *ah;
+ struct ah_data *ahp;
+ union {
+ struct iphdr iph;
+ char buf[60];
+ } tmp_iph;
+
+ top_iph = skb->nh.iph;
+ iph = &tmp_iph.iph;
+
+ iph->tos = top_iph->tos;
+ iph->ttl = top_iph->ttl;
+ iph->frag_off = top_iph->frag_off;
+
+ if (top_iph->ihl != 5) {
+ iph->daddr = top_iph->daddr;
+ memcpy(iph+1, top_iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
+ err = ip_clear_mutable_options(top_iph, &top_iph->daddr);
+ if (err)
+ goto error;
+ }
+
+ ah = (struct ip_auth_hdr *)((char *)top_iph+top_iph->ihl*4);
+ ah->nexthdr = top_iph->protocol;
+
+ top_iph->tos = 0;
+ top_iph->tot_len = htons(skb->len);
+ top_iph->frag_off = 0;
+ top_iph->ttl = 0;
+ top_iph->protocol = IPPROTO_AH;
+ top_iph->check = 0;
+
+ ahp = x->data;
+ ah->hdrlen = (XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
+ ahp->icv_trunc_len) >> 2) - 2;
+
+ ah->reserved = 0;
+ ah->spi = x->id.spi;
+ ah->seq_no = htonl(++x->replay.oseq);
+ ahp->icv(ahp, skb, ah->auth_data);
+
+ top_iph->tos = iph->tos;
+ top_iph->ttl = iph->ttl;
+ top_iph->frag_off = iph->frag_off;
+ if (top_iph->ihl != 5) {
+ top_iph->daddr = iph->daddr;
+ memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
+ }
+
+ ip_send_check(top_iph);
+
+ err = 0;
+
+error:
+ return err;
+}
+
+static int ah_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb)
+{
+ int ah_hlen;
+ struct iphdr *iph;
+ struct ip_auth_hdr *ah;
+ struct ah_data *ahp;
+ char work_buf[60];
+
+ if (!pskb_may_pull(skb, sizeof(struct ip_auth_hdr)))
+ goto out;
+
+ ah = (struct ip_auth_hdr*)skb->data;
+ ahp = x->data;
+ ah_hlen = (ah->hdrlen + 2) << 2;
+
+ if (ah_hlen != XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_full_len) &&
+ ah_hlen != XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len))
+ goto out;
+
+ if (!pskb_may_pull(skb, ah_hlen))
+ goto out;
+
+ /* We are going to _remove_ the AH header to keep sockets happy,
+ * so... This may change later. */
+ if (skb_cloned(skb) &&
+ pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ goto out;
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ ah = (struct ip_auth_hdr*)skb->data;
+ iph = skb->nh.iph;
+
+ memcpy(work_buf, iph, iph->ihl*4);
+
+ iph->ttl = 0;
+ iph->tos = 0;
+ iph->frag_off = 0;
+ iph->check = 0;
+ if (iph->ihl != 5) {
+ u32 dummy;
+ if (ip_clear_mutable_options(iph, &dummy))
+ goto out;
+ }
+ {
+ u8 auth_data[MAX_AH_AUTH_LEN];
+
+ memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
+ skb_push(skb, skb->data - skb->nh.raw);
+ ahp->icv(ahp, skb, ah->auth_data);
+ if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) {
+ x->stats.integrity_failed++;
+ goto out;
+ }
+ }
+ ((struct iphdr*)work_buf)->protocol = ah->nexthdr;
+ skb->nh.raw = skb_pull(skb, ah_hlen);
+ memcpy(skb->nh.raw, work_buf, iph->ihl*4);
+ skb->nh.iph->tot_len = htons(skb->len);
+ skb_pull(skb, skb->nh.iph->ihl*4);
+ skb->h.raw = skb->data;
+
+ return 0;
+
+out:
+ return -EINVAL;
+}
+
+static void ah4_err(struct sk_buff *skb, u32 info)
+{
+ struct iphdr *iph = (struct iphdr*)skb->data;
+ struct ip_auth_hdr *ah = (struct ip_auth_hdr*)(skb->data+(iph->ihl<<2));
+ struct xfrm_state *x;
+
+ if (skb->h.icmph->type != ICMP_DEST_UNREACH ||
+ skb->h.icmph->code != ICMP_FRAG_NEEDED)
+ return;
+
+ x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET);
+ if (!x)
+ return;
+ printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n",
+ ntohl(ah->spi), ntohl(iph->daddr));
+ xfrm_state_put(x);
+}
+
+static int ah_init_state(struct xfrm_state *x, void *args)
+{
+ struct ah_data *ahp = NULL;
+ struct xfrm_algo_desc *aalg_desc;
+
+ if (!x->aalg)
+ goto error;
+
+ /* null auth can use a zero length key */
+ if (x->aalg->alg_key_len > 512)
+ goto error;
+
+ if (x->encap)
+ goto error;
+
+ ahp = kmalloc(sizeof(*ahp), GFP_KERNEL);
+ if (ahp == NULL)
+ return -ENOMEM;
+
+ memset(ahp, 0, sizeof(*ahp));
+
+ ahp->key = x->aalg->alg_key;
+ ahp->key_len = (x->aalg->alg_key_len+7)/8;
+ ahp->tfm = crypto_alloc_tfm(x->aalg->alg_name, 0);
+ if (!ahp->tfm)
+ goto error;
+ ahp->icv = ah_hmac_digest;
+
+ /*
+ * Lookup the algorithm description maintained by xfrm_algo,
+ * verify crypto transform properties, and store information
+ * we need for AH processing. This lookup cannot fail here
+ * after a successful crypto_alloc_tfm().
+ */
+ aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
+ BUG_ON(!aalg_desc);
+
+ if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
+ crypto_tfm_alg_digestsize(ahp->tfm)) {
+ printk(KERN_INFO "AH: %s digestsize %u != %hu\n",
+ x->aalg->alg_name, crypto_tfm_alg_digestsize(ahp->tfm),
+ aalg_desc->uinfo.auth.icv_fullbits/8);
+ goto error;
+ }
+
+ ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
+ ahp->icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8;
+
+ BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN);
+
+ ahp->work_icv = kmalloc(ahp->icv_full_len, GFP_KERNEL);
+ if (!ahp->work_icv)
+ goto error;
+
+ x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len);
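+ /* Tunnel mode adds an outer IPv4 header on top of the AH header. */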
+ if (x->props.mode)
+ x->props.header_len += sizeof(struct iphdr);
+ x->data = ahp;
+
+ return 0;
+
+error:
+ if (ahp) {
+ if (ahp->work_icv)
+ kfree(ahp->work_icv);
+ if (ahp->tfm)
+ crypto_free_tfm(ahp->tfm);
+ kfree(ahp);
+ }
+ return -EINVAL;
+}
+
+static void ah_destroy(struct xfrm_state *x)
+{
+ struct ah_data *ahp = x->data;
+
+ if (!ahp)
+ return;
+
+ if (ahp->work_icv) {
+ kfree(ahp->work_icv);
+ ahp->work_icv = NULL;
+ }
+ if (ahp->tfm) {
+ crypto_free_tfm(ahp->tfm);
+ ahp->tfm = NULL;
+ }
+ kfree(ahp);
+}
+
+
+static struct xfrm_type ah_type =
+{
+ .description = "AH4",
+ .owner = THIS_MODULE,
+ .proto = IPPROTO_AH,
+ .init_state = ah_init_state,
+ .destructor = ah_destroy,
+ .input = ah_input,
+ .output = ah_output
+};
+
+static struct net_protocol ah4_protocol = {
+ .handler = xfrm4_rcv,
+ .err_handler = ah4_err,
+ .no_policy = 1,
+};
+
+static int __init ah4_init(void)
+{
+ if (xfrm_register_type(&ah_type, AF_INET) < 0) {
+ printk(KERN_INFO "ip ah init: can't add xfrm type\n");
+ return -EAGAIN;
+ }
+ if (inet_add_protocol(&ah4_protocol, IPPROTO_AH) < 0) {
+ printk(KERN_INFO "ip ah init: can't add protocol\n");
+ xfrm_unregister_type(&ah_type, AF_INET);
+ return -EAGAIN;
+ }
+ return 0;
+}
+
+static void __exit ah4_fini(void)
+{
+ if (inet_del_protocol(&ah4_protocol, IPPROTO_AH) < 0)
+ printk(KERN_INFO "ip ah close: can't remove protocol\n");
+ if (xfrm_unregister_type(&ah_type, AF_INET) < 0)
+ printk(KERN_INFO "ip ah close: can't remove xfrm type\n");
+}
+
+module_init(ah4_init);
+module_exit(ah4_fini);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
new file mode 100644
index 00000000000..a642fd61285
--- /dev/null
+++ b/net/ipv4/arp.c
@@ -0,0 +1,1425 @@
+/* linux/net/inet/arp.c
+ *
+ * Version: $Id: arp.c,v 1.99 2001/08/30 22:55:42 davem Exp $
+ *
+ * Copyright (C) 1994 by Florian La Roche
+ *
+ * This module implements the Address Resolution Protocol ARP (RFC 826),
+ * which is used to convert IP addresses (or in the future maybe other
+ * high-level addresses) into a low-level hardware address (like an Ethernet
+ * address).
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ * Alan Cox : Removed the Ethernet assumptions in
+ * Florian's code
+ * Alan Cox : Fixed some small errors in the ARP
+ * logic
+ * Alan Cox : Allow >4K in /proc
+ * Alan Cox : Make ARP add its own protocol entry
+ * Ross Martin : Rewrote arp_rcv() and arp_get_info()
+ * Stephen Henson : Add AX25 support to arp_get_info()
+ * Alan Cox : Drop data when a device is downed.
+ * Alan Cox : Use init_timer().
+ * Alan Cox : Double lock fixes.
+ * Martin Seine : Move the arphdr structure
+ * to if_arp.h for compatibility
+ * with BSD-based programs.
+ * Andrew Tridgell : Added ARP netmask code and
+ * re-arranged proxy handling.
+ * Alan Cox : Changed to use notifiers.
+ * Niibe Yutaka : Reply for this device or proxies only.
+ * Alan Cox : Don't proxy across hardware types!
+ * Jonathan Naylor : Added support for NET/ROM.
+ * Mike Shaver : RFC1122 checks.
+ * Jonathan Naylor : Only lookup the hardware address for
+ * the correct hardware type.
+ * Germano Caronni : Assorted subtle races.
+ * Craig Schlenter : Don't modify permanent entry
+ * during arp_rcv.
+ * Russ Nelson : Tidied up a few bits.
+ * Alexey Kuznetsov: Major changes to caching and behaviour,
+ * e.g. intelligent ARP probing and
+ * generation of host down events.
+ * Alan Cox : Missing unlock in device events.
+ * Eckes : ARP ioctl control errors.
+ * Alexey Kuznetsov: Arp free fix.
+ * Manuel Rodriguez: Gratuitous ARP.
+ * Jonathan Layes : Added arpd support through kerneld
+ * message queue (960314)
+ * Mike Shaver : /proc/sys/net/ipv4/arp_* support
+ * Mike McLagan : Routing by source
+ * Stuart Cheshire : Metricom and grat arp fixes
+ * *** FOR 2.1 clean this up ***
+ * Lawrence V. Stefani: (08/12/96) Added FDDI support.
+ * Alan Cox : Took the AP1000 nasty FDDI hack and
+ * folded into the mainstream FDDI code.
+ * Ack spit, Linus how did you allow that
+ * one in...
+ * Jes Sorensen : Make FDDI work again in 2.1.x and
+ * clean up the APFDDI & gen. FDDI bits.
+ * Alexey Kuznetsov: new arp state machine;
+ * now it is in net/core/neighbour.c.
+ * Krzysztof Halasa: Added Frame Relay ARP support.
+ * Arnaldo C. Melo : convert /proc/net/arp to seq_file
+ * Shmulik Hen: Split arp_send to arp_create and
+ * arp_xmit so intermediate drivers like
+ * bonding can change the skb before
+ * sending (e.g. insert 8021q tag).
+ * Harald Welte : convert to make use of jenkins hash
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/config.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/mm.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/fddidevice.h>
+#include <linux/if_arp.h>
+#include <linux/trdevice.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <linux/jhash.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/route.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/arp.h>
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+#include <net/ax25.h>
+#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
+#include <net/netrom.h>
+#endif
+#endif
+#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
+#include <net/atmclip.h>
+struct neigh_table *clip_tbl_hook;
+#endif
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/netfilter_arp.h>
+
+/*
+ * Interface to generic neighbour cache.
+ */
+static u32 arp_hash(const void *pkey, const struct net_device *dev);
+static int arp_constructor(struct neighbour *neigh);
+static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
+static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
+static void parp_redo(struct sk_buff *skb);
+
+static struct neigh_ops arp_generic_ops = {
+ .family = AF_INET,
+ .solicit = arp_solicit,
+ .error_report = arp_error_report,
+ .output = neigh_resolve_output,
+ .connected_output = neigh_connected_output,
+ .hh_output = dev_queue_xmit,
+ .queue_xmit = dev_queue_xmit,
+};
+
+static struct neigh_ops arp_hh_ops = {
+ .family = AF_INET,
+ .solicit = arp_solicit,
+ .error_report = arp_error_report,
+ .output = neigh_resolve_output,
+ .connected_output = neigh_resolve_output,
+ .hh_output = dev_queue_xmit,
+ .queue_xmit = dev_queue_xmit,
+};
+
+static struct neigh_ops arp_direct_ops = {
+ .family = AF_INET,
+ .output = dev_queue_xmit,
+ .connected_output = dev_queue_xmit,
+ .hh_output = dev_queue_xmit,
+ .queue_xmit = dev_queue_xmit,
+};
+
+struct neigh_ops arp_broken_ops = {
+ .family = AF_INET,
+ .solicit = arp_solicit,
+ .error_report = arp_error_report,
+ .output = neigh_compat_output,
+ .connected_output = neigh_compat_output,
+ .hh_output = dev_queue_xmit,
+ .queue_xmit = dev_queue_xmit,
+};
+
+struct neigh_table arp_tbl = {
+ .family = AF_INET,
+ .entry_size = sizeof(struct neighbour) + 4,
+ .key_len = 4,
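+ /* entry_size reserves 4 extra bytes so the IPv4 primary key lives inline after struct neighbour. */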
+ .hash = arp_hash,
+ .constructor = arp_constructor,
+ .proxy_redo = parp_redo,
+ .id = "arp_cache",
+ .parms = {
+ .tbl = &arp_tbl,
+ .base_reachable_time = 30 * HZ,
+ .retrans_time = 1 * HZ,
+ .gc_staletime = 60 * HZ,
+ .reachable_time = 30 * HZ,
+ .delay_probe_time = 5 * HZ,
+ .queue_len = 3,
+ .ucast_probes = 3,
+ .mcast_probes = 3,
+ .anycast_delay = 1 * HZ,
+ .proxy_delay = (8 * HZ) / 10,
+ .proxy_qlen = 64,
+ .locktime = 1 * HZ,
+ },
+ .gc_interval = 30 * HZ,
+ .gc_thresh1 = 128,
+ .gc_thresh2 = 512,
+ .gc_thresh3 = 1024,
+};
+
+int arp_mc_map(u32 addr, u8 *haddr, struct net_device *dev, int dir)
+{
+ switch (dev->type) {
+ case ARPHRD_ETHER:
+ case ARPHRD_FDDI:
+ case ARPHRD_IEEE802:
+ ip_eth_mc_map(addr, haddr);
+ return 0;
+ case ARPHRD_IEEE802_TR:
+ ip_tr_mc_map(addr, haddr);
+ return 0;
+ case ARPHRD_INFINIBAND:
+ ip_ib_mc_map(addr, haddr);
+ return 0;
+ default:
+ if (dir) {
+ memcpy(haddr, dev->broadcast, dev->addr_len);
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+
+
+static u32 arp_hash(const void *pkey, const struct net_device *dev)
+{
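+ /* Mix the IPv4 address with the device index under the table's random hash seed. */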
+ return jhash_2words(*(u32 *)pkey, dev->ifindex, arp_tbl.hash_rnd);
+}
+
+static int arp_constructor(struct neighbour *neigh)
+{
+ u32 addr = *(u32*)neigh->primary_key;
+ struct net_device *dev = neigh->dev;
+ struct in_device *in_dev;
+ struct neigh_parms *parms;
+
+ neigh->type = inet_addr_type(addr);
+
+ rcu_read_lock();
+ in_dev = rcu_dereference(__in_dev_get(dev));
+ if (in_dev == NULL) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+
+ parms = in_dev->arp_parms;
+ __neigh_parms_put(neigh->parms);
+ neigh->parms = neigh_parms_clone(parms);
+ rcu_read_unlock();
+
+ if (dev->hard_header == NULL) {
+ neigh->nud_state = NUD_NOARP;
+ neigh->ops = &arp_direct_ops;
+ neigh->output = neigh->ops->queue_xmit;
+ } else {
+ /* Good devices (checked by reading texts, but only Ethernet is
+ tested)
+
+ ARPHRD_ETHER: (ethernet, apfddi)
+ ARPHRD_FDDI: (fddi)
+ ARPHRD_IEEE802: (tr)
+ ARPHRD_METRICOM: (strip)
+ ARPHRD_ARCNET:
+ etc. etc. etc.
+
+ ARPHRD_IPDDP will also work, if its author repairs it.
+ I did not do it, because this driver does not work even
+ in the old paradigm.
+ */
+
+#if 1
+ /* So... these "amateur" devices are hopeless.
+ The only thing that I can say now:
+ it is very sad that we need to keep ugly obsolete
+ code around to make them happy.
+
+ They should be moved to a more reasonable state; right now
+ they use rebuild_header INSTEAD OF hard_start_xmit!!!
+ Besides that, they are rather out of date
+ (a lot of redundant clones/copies, useless in 2.1);
+ I wonder why people believe that they work.
+ */
+ switch (dev->type) {
+ default:
+ break;
+ case ARPHRD_ROSE:
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+ case ARPHRD_AX25:
+#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
+ case ARPHRD_NETROM:
+#endif
+ neigh->ops = &arp_broken_ops;
+ neigh->output = neigh->ops->output;
+ return 0;
+#endif
+ ;}
+#endif
+ if (neigh->type == RTN_MULTICAST) {
+ neigh->nud_state = NUD_NOARP;
+ arp_mc_map(addr, neigh->ha, dev, 1);
+ } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) {
+ neigh->nud_state = NUD_NOARP;
+ memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
+ } else if (neigh->type == RTN_BROADCAST || dev->flags&IFF_POINTOPOINT) {
+ neigh->nud_state = NUD_NOARP;
+ memcpy(neigh->ha, dev->broadcast, dev->addr_len);
+ }
+ if (dev->hard_header_cache)
+ neigh->ops = &arp_hh_ops;
+ else
+ neigh->ops = &arp_generic_ops;
+ if (neigh->nud_state&NUD_VALID)
+ neigh->output = neigh->ops->connected_output;
+ else
+ neigh->output = neigh->ops->output;
+ }
+ return 0;
+}
+
+static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb)
+{
+ dst_link_failure(skb);
+ kfree_skb(skb);
+}
+
+static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
+{
+ u32 saddr = 0;
+ u8 *dst_ha = NULL;
+ struct net_device *dev = neigh->dev;
+ u32 target = *(u32*)neigh->primary_key;
+ int probes = atomic_read(&neigh->probes);
+ struct in_device *in_dev = in_dev_get(dev);
+
+ if (!in_dev)
+ return;
+
+ switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
+ default:
+ case 0: /* By default announce any local IP */
+ if (skb && inet_addr_type(skb->nh.iph->saddr) == RTN_LOCAL)
+ saddr = skb->nh.iph->saddr;
+ break;
+ case 1: /* Restrict announcements of saddr in same subnet */
+ if (!skb)
+ break;
+ saddr = skb->nh.iph->saddr;
+ if (inet_addr_type(saddr) == RTN_LOCAL) {
+ /* saddr should be known to target */
+ if (inet_addr_onlink(in_dev, target, saddr))
+ break;
+ }
+ saddr = 0;
+ break;
+ case 2: /* Avoid secondary IPs, get a primary/preferred one */
+ break;
+ }
+
+ if (in_dev)
+ in_dev_put(in_dev);
+ if (!saddr)
+ saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);
+
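+ /* Spend the probe budget in order: the first ucast_probes go unicast to the
+ * cached hardware address, the next app_probes are handed to a user-space
+ * ARP daemon, and the remainder fall through as broadcast (dst_ha == NULL).
+ */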
+ if ((probes -= neigh->parms->ucast_probes) < 0) {
+ if (!(neigh->nud_state&NUD_VALID))
+ printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n");
+ dst_ha = neigh->ha;
+ read_lock_bh(&neigh->lock);
+ } else if ((probes -= neigh->parms->app_probes) < 0) {
+#ifdef CONFIG_ARPD
+ neigh_app_ns(neigh);
+#endif
+ return;
+ }
+
+ arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
+ dst_ha, dev->dev_addr, NULL);
+ if (dst_ha)
+ read_unlock_bh(&neigh->lock);
+}
+
+static int arp_ignore(struct in_device *in_dev, struct net_device *dev,
+ u32 sip, u32 tip)
+{
+ int scope;
+
+ switch (IN_DEV_ARP_IGNORE(in_dev)) {
+ case 0: /* Reply, the tip is already validated */
+ return 0;
+ case 1: /* Reply only if tip is configured on the incoming interface */
+ sip = 0;
+ scope = RT_SCOPE_HOST;
+ break;
+ case 2: /*
+ * Reply only if tip is configured on the incoming interface
+ * and is in the same subnet as sip
+ */
+ scope = RT_SCOPE_HOST;
+ break;
+ case 3: /* Do not reply for scope host addresses */
+ sip = 0;
+ scope = RT_SCOPE_LINK;
+ dev = NULL;
+ break;
+ case 4: /* Reserved */
+ case 5:
+ case 6:
+ case 7:
+ return 0;
+ case 8: /* Do not reply */
+ return 1;
+ default:
+ return 0;
+ }
+ return !inet_confirm_addr(dev, sip, tip, scope);
+}
+
+static int arp_filter(__u32 sip, __u32 tip, struct net_device *dev)
+{
+ struct flowi fl = { .nl_u = { .ip4_u = { .daddr = sip,
+ .saddr = tip } } };
+ struct rtable *rt;
+ int flag = 0;
+ /*unsigned long now; */
+
+ if (ip_route_output_key(&rt, &fl) < 0)
+ return 1;
+ if (rt->u.dst.dev != dev) {
+ NET_INC_STATS_BH(LINUX_MIB_ARPFILTER);
+ flag = 1;
+ }
+ ip_rt_put(rt);
+ return flag;
+}
+
+/* OBSOLETE FUNCTIONS */
+
+/*
+ * Find an arp mapping in the cache. If not found, post a request.
+ *
+ * It is a very UGLY routine: it DOES NOT use skb->dst->neighbour,
+ * even if it exists. It is assumed that skb->dev was mangled
+ * by a virtual device (eql, shaper). Nobody but broken devices
+ * is allowed to use this function; it is scheduled to be removed. --ANK
+ */
+
+static int arp_set_predefined(int addr_hint, unsigned char * haddr, u32 paddr, struct net_device * dev)
+{
+ switch (addr_hint) {
+ case RTN_LOCAL:
+ printk(KERN_DEBUG "ARP: arp called for own IP address\n");
+ memcpy(haddr, dev->dev_addr, dev->addr_len);
+ return 1;
+ case RTN_MULTICAST:
+ arp_mc_map(paddr, haddr, dev, 1);
+ return 1;
+ case RTN_BROADCAST:
+ memcpy(haddr, dev->broadcast, dev->addr_len);
+ return 1;
+ }
+ return 0;
+}
+
+
+int arp_find(unsigned char *haddr, struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ u32 paddr;
+ struct neighbour *n;
+
+ if (!skb->dst) {
+ printk(KERN_DEBUG "arp_find is called with dst==NULL\n");
+ kfree_skb(skb);
+ return 1;
+ }
+
+ paddr = ((struct rtable*)skb->dst)->rt_gateway;
+
+ if (arp_set_predefined(inet_addr_type(paddr), haddr, paddr, dev))
+ return 0;
+
+ n = __neigh_lookup(&arp_tbl, &paddr, dev, 1);
+
+ if (n) {
+ n->used = jiffies;
+ if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) {
+ read_lock_bh(&n->lock);
+ memcpy(haddr, n->ha, dev->addr_len);
+ read_unlock_bh(&n->lock);
+ neigh_release(n);
+ return 0;
+ }
+ neigh_release(n);
+ } else
+ kfree_skb(skb);
+ return 1;
+}
+
+/* END OF OBSOLETE FUNCTIONS */
+
+int arp_bind_neighbour(struct dst_entry *dst)
+{
+ struct net_device *dev = dst->dev;
+ struct neighbour *n = dst->neighbour;
+
+ if (dev == NULL)
+ return -EINVAL;
+ if (n == NULL) {
+ u32 nexthop = ((struct rtable*)dst)->rt_gateway;
+ if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT))
+ nexthop = 0;
+ n = __neigh_lookup_errno(
+#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
+ dev->type == ARPHRD_ATM ? clip_tbl_hook :
+#endif
+ &arp_tbl, &nexthop, dev);
+ if (IS_ERR(n))
+ return PTR_ERR(n);
+ dst->neighbour = n;
+ }
+ return 0;
+}
+
+/*
+ * Check if we can use proxy ARP for this path
+ */
+
+static inline int arp_fwd_proxy(struct in_device *in_dev, struct rtable *rt)
+{
+ struct in_device *out_dev;
+ int imi, omi = -1;
+
+ if (!IN_DEV_PROXY_ARP(in_dev))
+ return 0;
+
+ if ((imi = IN_DEV_MEDIUM_ID(in_dev)) == 0)
+ return 1;
+ if (imi == -1)
+ return 0;
+
+ /* place to check for proxy_arp for routes */
+
+ if ((out_dev = in_dev_get(rt->u.dst.dev)) != NULL) {
+ omi = IN_DEV_MEDIUM_ID(out_dev);
+ in_dev_put(out_dev);
+ }
+ return (omi != imi && omi != -1);
+}
+
+/*
+ * Interface to link layer: send routine and receive handler.
+ */
+
+/*
+ * Create an arp packet. If (dest_hw == NULL), we create a broadcast
+ * message.
+ */
+struct sk_buff *arp_create(int type, int ptype, u32 dest_ip,
+ struct net_device *dev, u32 src_ip,
+ unsigned char *dest_hw, unsigned char *src_hw,
+ unsigned char *target_hw)
+{
+ struct sk_buff *skb;
+ struct arphdr *arp;
+ unsigned char *arp_ptr;
+
+ /*
+ * Allocate a buffer
+ */
+
+ skb = alloc_skb(sizeof(struct arphdr)+ 2*(dev->addr_len+4)
+ + LL_RESERVED_SPACE(dev), GFP_ATOMIC);
+ if (skb == NULL)
+ return NULL;
+
+ skb_reserve(skb, LL_RESERVED_SPACE(dev));
+ skb->nh.raw = skb->data;
+ arp = (struct arphdr *) skb_put(skb,sizeof(struct arphdr) + 2*(dev->addr_len+4));
+ skb->dev = dev;
+ skb->protocol = htons(ETH_P_ARP);
+ if (src_hw == NULL)
+ src_hw = dev->dev_addr;
+ if (dest_hw == NULL)
+ dest_hw = dev->broadcast;
+
+ /*
+ * Fill the device header for the ARP frame
+ */
+ if (dev->hard_header &&
+ dev->hard_header(skb,dev,ptype,dest_hw,src_hw,skb->len) < 0)
+ goto out;
+
+ /*
+ * Fill out the arp protocol part.
+ *
+ * The arp hardware type should match the device type, except for FDDI,
+ * which (according to RFC 1390) should always equal 1 (Ethernet).
+ */
+ /*
+ * Exceptions everywhere. AX.25 uses the AX.25 PID value, not the
+ * DIX code, for the protocol. These should probably become device structure fields.
+ */
+ switch (dev->type) {
+ default:
+ arp->ar_hrd = htons(dev->type);
+ arp->ar_pro = htons(ETH_P_IP);
+ break;
+
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+ case ARPHRD_AX25:
+ arp->ar_hrd = htons(ARPHRD_AX25);
+ arp->ar_pro = htons(AX25_P_IP);
+ break;
+
+#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
+ case ARPHRD_NETROM:
+ arp->ar_hrd = htons(ARPHRD_NETROM);
+ arp->ar_pro = htons(AX25_P_IP);
+ break;
+#endif
+#endif
+
+#ifdef CONFIG_FDDI
+ case ARPHRD_FDDI:
+ arp->ar_hrd = htons(ARPHRD_ETHER);
+ arp->ar_pro = htons(ETH_P_IP);
+ break;
+#endif
+#ifdef CONFIG_TR
+ case ARPHRD_IEEE802_TR:
+ arp->ar_hrd = htons(ARPHRD_IEEE802);
+ arp->ar_pro = htons(ETH_P_IP);
+ break;
+#endif
+ }
+
+ arp->ar_hln = dev->addr_len;
+ arp->ar_pln = 4;
+ arp->ar_op = htons(type);
+
+ arp_ptr=(unsigned char *)(arp+1);
+
+ memcpy(arp_ptr, src_hw, dev->addr_len);
+ arp_ptr+=dev->addr_len;
+ memcpy(arp_ptr, &src_ip,4);
+ arp_ptr+=4;
+ if (target_hw != NULL)
+ memcpy(arp_ptr, target_hw, dev->addr_len);
+ else
+ memset(arp_ptr, 0, dev->addr_len);
+ arp_ptr+=dev->addr_len;
+ memcpy(arp_ptr, &dest_ip, 4);
+
+ return skb;
+
+out:
+ kfree_skb(skb);
+ return NULL;
+}
+
+/*
+ * Send an arp packet.
+ */
+void arp_xmit(struct sk_buff *skb)
+{
+ /* Send it off, maybe filter it using firewalling first. */
+ NF_HOOK(NF_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit);
+}
+
+/*
+ * Create and send an arp packet.
+ */
+void arp_send(int type, int ptype, u32 dest_ip,
+ struct net_device *dev, u32 src_ip,
+ unsigned char *dest_hw, unsigned char *src_hw,
+ unsigned char *target_hw)
+{
+ struct sk_buff *skb;
+
+ /*
+ * No arp on this interface.
+ */
+
+ if (dev->flags&IFF_NOARP)
+ return;
+
+ skb = arp_create(type, ptype, dest_ip, dev, src_ip,
+ dest_hw, src_hw, target_hw);
+ if (skb == NULL) {
+ return;
+ }
+
+ arp_xmit(skb);
+}
+
+static void parp_redo(struct sk_buff *skb)
+{
+ nf_reset(skb);
+ arp_rcv(skb, skb->dev, NULL);
+}
+
+/*
+ * Process an arp request.
+ */
+
+static int arp_process(struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ struct in_device *in_dev = in_dev_get(dev);
+ struct arphdr *arp;
+ unsigned char *arp_ptr;
+ struct rtable *rt;
+ unsigned char *sha, *tha;
+ u32 sip, tip;
+ u16 dev_type = dev->type;
+ int addr_type;
+ struct neighbour *n;
+
+ /* arp_rcv below verifies the ARP header and verifies the device
+ * is ARP'able.
+ */
+
+ if (in_dev == NULL)
+ goto out;
+
+ arp = skb->nh.arph;
+
+ switch (dev_type) {
+ default:
+ if (arp->ar_pro != htons(ETH_P_IP) ||
+ htons(dev_type) != arp->ar_hrd)
+ goto out;
+ break;
+#ifdef CONFIG_NET_ETHERNET
+ case ARPHRD_ETHER:
+#endif
+#ifdef CONFIG_TR
+ case ARPHRD_IEEE802_TR:
+#endif
+#ifdef CONFIG_FDDI
+ case ARPHRD_FDDI:
+#endif
+#ifdef CONFIG_NET_FC
+ case ARPHRD_IEEE802:
+#endif
+#if defined(CONFIG_NET_ETHERNET) || defined(CONFIG_TR) || \
+ defined(CONFIG_FDDI) || defined(CONFIG_NET_FC)
+ /*
+ * Ethernet, Token Ring and Fibre Channel (which are IEEE 802
+ * devices, according to RFC 2625) will accept ARP
+ * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2).
+ * The same applies to FDDI: RFC 1390 says that FDDI
+ * devices should accept an ARP hardware type of 1 (Ethernet);
+ * however, to be more robust, we'll accept either 1 (Ethernet)
+ * or 6 (IEEE 802.2).
+ */
+ if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
+ arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
+ arp->ar_pro != htons(ETH_P_IP))
+ goto out;
+ break;
+#endif
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+ case ARPHRD_AX25:
+ if (arp->ar_pro != htons(AX25_P_IP) ||
+ arp->ar_hrd != htons(ARPHRD_AX25))
+ goto out;
+ break;
+#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
+ case ARPHRD_NETROM:
+ if (arp->ar_pro != htons(AX25_P_IP) ||
+ arp->ar_hrd != htons(ARPHRD_NETROM))
+ goto out;
+ break;
+#endif
+#endif
+ }
+
+ /* Understand only these message types */
+
+ if (arp->ar_op != htons(ARPOP_REPLY) &&
+ arp->ar_op != htons(ARPOP_REQUEST))
+ goto out;
+
+/*
+ * Extract fields
+ */
+ arp_ptr= (unsigned char *)(arp+1);
+ sha = arp_ptr;
+ arp_ptr += dev->addr_len;
+ memcpy(&sip, arp_ptr, 4);
+ arp_ptr += 4;
+ tha = arp_ptr;
+ arp_ptr += dev->addr_len;
+ memcpy(&tip, arp_ptr, 4);
+/*
+ * Check for bad requests for 127.x.x.x and requests for multicast
+ * addresses. If this is one such, drop it.
+ */
+ if (LOOPBACK(tip) || MULTICAST(tip))
+ goto out;
+
+/*
+ * Special case: We must set Frame Relay source Q.922 address
+ */
+ if (dev_type == ARPHRD_DLCI)
+ sha = dev->broadcast;
+
+/*
+ * Process entry. The idea here is we want to send a reply if it is a
+ * request for us or if it is a request for someone else that we hold
+ * a proxy for. We want to add an entry to our cache if it is a reply
+ * to us or if it is a request for our address.
+ * (The assumption for this last is that if someone is requesting our
+ * address, they are probably intending to talk to us, so it saves time
+ * if we cache their address. Their address is also probably not in
+ * our cache, since ours is not in their cache.)
+ *
+ * Putting this another way, we only care about replies if they are to
+ * us, in which case we add them to the cache. For requests, we care
+ * about those for us and those for our proxies. We reply to both,
+ * and in the case of requests for us we add the requester to the arp
+ * cache.
+ */
+
+ /* Special case: IPv4 duplicate address detection packet (RFC2131) */
+ if (sip == 0) {
+ if (arp->ar_op == htons(ARPOP_REQUEST) &&
+ inet_addr_type(tip) == RTN_LOCAL &&
+ !arp_ignore(in_dev,dev,sip,tip))
+ arp_send(ARPOP_REPLY,ETH_P_ARP,tip,dev,tip,sha,dev->dev_addr,dev->dev_addr);
+ goto out;
+ }
+
+ if (arp->ar_op == htons(ARPOP_REQUEST) &&
+ ip_route_input(skb, tip, sip, 0, dev) == 0) {
+
+ rt = (struct rtable*)skb->dst;
+ addr_type = rt->rt_type;
+
+ if (addr_type == RTN_LOCAL) {
+ n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
+ if (n) {
+ int dont_send = 0;
+
+ if (!dont_send)
+ dont_send |= arp_ignore(in_dev,dev,sip,tip);
+ if (!dont_send && IN_DEV_ARPFILTER(in_dev))
+ dont_send |= arp_filter(sip,tip,dev);
+ if (!dont_send)
+ arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
+
+ neigh_release(n);
+ }
+ goto out;
+ } else if (IN_DEV_FORWARD(in_dev)) {
+ if ((rt->rt_flags&RTCF_DNAT) ||
+ (addr_type == RTN_UNICAST && rt->u.dst.dev != dev &&
+ (arp_fwd_proxy(in_dev, rt) || pneigh_lookup(&arp_tbl, &tip, dev, 0)))) {
+ n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
+ if (n)
+ neigh_release(n);
+
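+ /* Delay proxy replies via the proxy queue unless the request was
+ * locally re-enqueued, was unicast to us, or proxy_delay is zero;
+ * the delay gives the real host a chance to answer first.
+ */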
+ if (skb->stamp.tv_sec == LOCALLY_ENQUEUED ||
+ skb->pkt_type == PACKET_HOST ||
+ in_dev->arp_parms->proxy_delay == 0) {
+ arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
+ } else {
+ pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb);
+ in_dev_put(in_dev);
+ return 0;
+ }
+ goto out;
+ }
+ }
+ }
+
+ /* Update our ARP tables */
+
+ n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
+
+#ifdef CONFIG_IP_ACCEPT_UNSOLICITED_ARP
+ /* Unsolicited ARP is not accepted by default.
+ It is possible that this option should be enabled for some
+ devices (strip is a candidate)
+ */
+ if (n == NULL &&
+ arp->ar_op == htons(ARPOP_REPLY) &&
+ inet_addr_type(sip) == RTN_UNICAST)
+ n = __neigh_lookup(&arp_tbl, &sip, dev, -1);
+#endif
+
+ if (n) {
+ int state = NUD_REACHABLE;
+ int override;
+
+ /* If several different ARP replies follow back-to-back,
+ use the FIRST one. This can happen if several proxy
+ agents are active. Taking the first reply prevents
+ ARP thrashing and chooses the fastest router.
+ */
+ override = time_after(jiffies, n->updated + n->parms->locktime);
+
+ /* Broadcast replies and request packets
+ do not assert neighbour reachability.
+ */
+ if (arp->ar_op != htons(ARPOP_REPLY) ||
+ skb->pkt_type != PACKET_HOST)
+ state = NUD_STALE;
+ neigh_update(n, sha, state, override ? NEIGH_UPDATE_F_OVERRIDE : 0);
+ neigh_release(n);
+ }
+
+out:
+ if (in_dev)
+ in_dev_put(in_dev);
+ kfree_skb(skb);
+ return 0;
+}
+
+
+/*
+ * Receive an arp request from the device layer.
+ */
+
+int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+{
+ struct arphdr *arp;
+
+ /* ARP header, plus 2 device addresses, plus 2 IP addresses. */
+ if (!pskb_may_pull(skb, (sizeof(struct arphdr) +
+ (2 * dev->addr_len) +
+ (2 * sizeof(u32)))))
+ goto freeskb;
+
+ arp = skb->nh.arph;
+ if (arp->ar_hln != dev->addr_len ||
+ dev->flags & IFF_NOARP ||
+ skb->pkt_type == PACKET_OTHERHOST ||
+ skb->pkt_type == PACKET_LOOPBACK ||
+ arp->ar_pln != 4)
+ goto freeskb;
+
+ if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+ goto out_of_mem;
+
+ return NF_HOOK(NF_ARP, NF_ARP_IN, skb, dev, NULL, arp_process);
+
+freeskb:
+ kfree_skb(skb);
+out_of_mem:
+ return 0;
+}
+
+/*
+ * User level interface (ioctl)
+ */
+
+/*
+ * Set (create) an ARP cache entry.
+ */
+
+static int arp_req_set(struct arpreq *r, struct net_device * dev)
+{
+ u32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
+ struct neighbour *neigh;
+ int err;
+
+ if (r->arp_flags&ATF_PUBL) {
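+ /* ATF_PUBL: an all-ones (or omitted) netmask creates a per-host proxy
+ * ARP entry; a zero netmask instead enables proxy_arp on the given
+ * device, or globally when no device is specified.
+ */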
+ u32 mask = ((struct sockaddr_in *) &r->arp_netmask)->sin_addr.s_addr;
+ if (mask && mask != 0xFFFFFFFF)
+ return -EINVAL;
+ if (!dev && (r->arp_flags & ATF_COM)) {
+ dev = dev_getbyhwaddr(r->arp_ha.sa_family, r->arp_ha.sa_data);
+ if (!dev)
+ return -ENODEV;
+ }
+ if (mask) {
+ if (pneigh_lookup(&arp_tbl, &ip, dev, 1) == NULL)
+ return -ENOBUFS;
+ return 0;
+ }
+ if (dev == NULL) {
+ ipv4_devconf.proxy_arp = 1;
+ return 0;
+ }
+ if (__in_dev_get(dev)) {
+ __in_dev_get(dev)->cnf.proxy_arp = 1;
+ return 0;
+ }
+ return -ENXIO;
+ }
+
+ if (r->arp_flags & ATF_PERM)
+ r->arp_flags |= ATF_COM;
+ if (dev == NULL) {
+ struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip,
+ .tos = RTO_ONLINK } } };
+ struct rtable * rt;
+ if ((err = ip_route_output_key(&rt, &fl)) != 0)
+ return err;
+ dev = rt->u.dst.dev;
+ ip_rt_put(rt);
+ if (!dev)
+ return -EINVAL;
+ }
+ switch (dev->type) {
+#ifdef CONFIG_FDDI
+ case ARPHRD_FDDI:
+ /*
+ * According to RFC 1390, FDDI devices should accept ARP
+ * hardware types of 1 (Ethernet). However, to be more
+ * robust, we'll accept hardware types of either 1 (Ethernet)
+ * or 6 (IEEE 802.2).
+ */
+ if (r->arp_ha.sa_family != ARPHRD_FDDI &&
+ r->arp_ha.sa_family != ARPHRD_ETHER &&
+ r->arp_ha.sa_family != ARPHRD_IEEE802)
+ return -EINVAL;
+ break;
+#endif
+ default:
+ if (r->arp_ha.sa_family != dev->type)
+ return -EINVAL;
+ break;
+ }
+
+ neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev);
+ err = PTR_ERR(neigh);
+ if (!IS_ERR(neigh)) {
+ unsigned state = NUD_STALE;
+ if (r->arp_flags & ATF_PERM)
+ state = NUD_PERMANENT;
+ err = neigh_update(neigh, (r->arp_flags&ATF_COM) ?
+ r->arp_ha.sa_data : NULL, state,
+ NEIGH_UPDATE_F_OVERRIDE|
+ NEIGH_UPDATE_F_ADMIN);
+ neigh_release(neigh);
+ }
+ return err;
+}
+
+static unsigned arp_state_to_flags(struct neighbour *neigh)
+{
+ unsigned flags = 0;
+ if (neigh->nud_state&NUD_PERMANENT)
+ flags = ATF_PERM|ATF_COM;
+ else if (neigh->nud_state&NUD_VALID)
+ flags = ATF_COM;
+ return flags;
+}
+
+/*
+ * Get an ARP cache entry.
+ */
+
+static int arp_req_get(struct arpreq *r, struct net_device *dev)
+{
+ u32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
+ struct neighbour *neigh;
+ int err = -ENXIO;
+
+ neigh = neigh_lookup(&arp_tbl, &ip, dev);
+ if (neigh) {
+ read_lock_bh(&neigh->lock);
+ memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len);
+ r->arp_flags = arp_state_to_flags(neigh);
+ read_unlock_bh(&neigh->lock);
+ r->arp_ha.sa_family = dev->type;
+ strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev));
+ neigh_release(neigh);
+ err = 0;
+ }
+ return err;
+}
+
+static int arp_req_delete(struct arpreq *r, struct net_device * dev)
+{
+ int err;
+ u32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+ struct neighbour *neigh;
+
+ if (r->arp_flags & ATF_PUBL) {
+ u32 mask =
+ ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr;
+ if (mask == 0xFFFFFFFF)
+ return pneigh_delete(&arp_tbl, &ip, dev);
+ if (mask == 0) {
+ if (dev == NULL) {
+ ipv4_devconf.proxy_arp = 0;
+ return 0;
+ }
+ if (__in_dev_get(dev)) {
+ __in_dev_get(dev)->cnf.proxy_arp = 0;
+ return 0;
+ }
+ return -ENXIO;
+ }
+ return -EINVAL;
+ }
+
+ if (dev == NULL) {
+ struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip,
+ .tos = RTO_ONLINK } } };
+ struct rtable * rt;
+ if ((err = ip_route_output_key(&rt, &fl)) != 0)
+ return err;
+ dev = rt->u.dst.dev;
+ ip_rt_put(rt);
+ if (!dev)
+ return -EINVAL;
+ }
+ err = -ENXIO;
+ neigh = neigh_lookup(&arp_tbl, &ip, dev);
+ if (neigh) {
+ if (neigh->nud_state&~NUD_NOARP)
+ err = neigh_update(neigh, NULL, NUD_FAILED,
+ NEIGH_UPDATE_F_OVERRIDE|
+ NEIGH_UPDATE_F_ADMIN);
+ neigh_release(neigh);
+ }
+ return err;
+}
+
+/*
+ * Handle an ARP layer I/O control request.
+ */
+
+int arp_ioctl(unsigned int cmd, void __user *arg)
+{
+ int err;
+ struct arpreq r;
+ struct net_device *dev = NULL;
+
+ switch (cmd) {
+ case SIOCDARP:
+ case SIOCSARP:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ case SIOCGARP:
+ err = copy_from_user(&r, arg, sizeof(struct arpreq));
+ if (err)
+ return -EFAULT;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (r.arp_pa.sa_family != AF_INET)
+ return -EPFNOSUPPORT;
+
+ if (!(r.arp_flags & ATF_PUBL) &&
+ (r.arp_flags & (ATF_NETMASK|ATF_DONTPUB)))
+ return -EINVAL;
+ if (!(r.arp_flags & ATF_NETMASK))
+ ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr =
+ htonl(0xFFFFFFFFUL);
+ rtnl_lock();
+ if (r.arp_dev[0]) {
+ err = -ENODEV;
+ if ((dev = __dev_get_by_name(r.arp_dev)) == NULL)
+ goto out;
+
+ /* Mmmm... It is wrong... ARPHRD_NETROM==0 */
+ if (!r.arp_ha.sa_family)
+ r.arp_ha.sa_family = dev->type;
+ err = -EINVAL;
+ if ((r.arp_flags & ATF_COM) && r.arp_ha.sa_family != dev->type)
+ goto out;
+ } else if (cmd == SIOCGARP) {
+ err = -ENODEV;
+ goto out;
+ }
+
+ switch(cmd) {
+ case SIOCDARP:
+ err = arp_req_delete(&r, dev);
+ break;
+ case SIOCSARP:
+ err = arp_req_set(&r, dev);
+ break;
+ case SIOCGARP:
+ err = arp_req_get(&r, dev);
+ if (!err && copy_to_user(arg, &r, sizeof(r)))
+ err = -EFAULT;
+ break;
+ }
+out:
+ rtnl_unlock();
+ return err;
+}
+
+static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct net_device *dev = ptr;
+
+ switch (event) {
+ case NETDEV_CHANGEADDR:
+ neigh_changeaddr(&arp_tbl, dev);
+ rt_cache_flush(0);
+ break;
+ default:
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block arp_netdev_notifier = {
+ .notifier_call = arp_netdev_event,
+};
+
+/* Note that this is not on the notifier chain.
+ It is necessary that this routine be called after the route cache has been
+ flushed.
+ */
+void arp_ifdown(struct net_device *dev)
+{
+ neigh_ifdown(&arp_tbl, dev);
+}
+
+
+/*
+ * Called once on startup.
+ */
+
+static struct packet_type arp_packet_type = {
+ .type = __constant_htons(ETH_P_ARP),
+ .func = arp_rcv,
+};
+
+static int arp_proc_init(void);
+
+void __init arp_init(void)
+{
+ neigh_table_init(&arp_tbl);
+
+ dev_add_pack(&arp_packet_type);
+ arp_proc_init();
+#ifdef CONFIG_SYSCTL
+ neigh_sysctl_register(NULL, &arp_tbl.parms, NET_IPV4,
+ NET_IPV4_NEIGH, "ipv4", NULL, NULL);
+#endif
+ register_netdevice_notifier(&arp_netdev_notifier);
+}
+
+#ifdef CONFIG_PROC_FS
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+
+/* ------------------------------------------------------------------------ */
+/*
+ * ax25 -> ASCII conversion
+ */
+static char *ax2asc2(ax25_address *a, char *buf)
+{
+ char c, *s;
+ int n;
+
+ for (n = 0, s = buf; n < 6; n++) {
+ c = (a->ax25_call[n] >> 1) & 0x7F;
+
+ if (c != ' ') *s++ = c;
+ }
+
+ *s++ = '-';
+
+ if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) {
+ *s++ = '1';
+ n -= 10;
+ }
+
+ *s++ = n + '0';
+ *s++ = '\0';
+
+ if (*buf == '\0' || *buf == '-')
+ return "*";
+
+ return buf;
+
+}
+#endif /* CONFIG_AX25 */
+
+#define HBUFFERLEN 30
+
+static void arp_format_neigh_entry(struct seq_file *seq,
+ struct neighbour *n)
+{
+ char hbuffer[HBUFFERLEN];
+ const char hexbuf[] = "0123456789ABCDEF";
+ int k, j;
+ char tbuf[16];
+ struct net_device *dev = n->dev;
+ int hatype = dev->type;
+
+ read_lock(&n->lock);
+ /* Convert hardware address to XX:XX:XX:XX ... form. */
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+ if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM)
+ ax2asc2((ax25_address *)n->ha, hbuffer);
+ else {
+#endif
+ for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) {
+ hbuffer[k++] = hexbuf[(n->ha[j] >> 4) & 15];
+ hbuffer[k++] = hexbuf[n->ha[j] & 15];
+ hbuffer[k++] = ':';
+ }
+ hbuffer[--k] = 0;
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+ }
+#endif
+ sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(*(u32*)n->primary_key));
+ seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n",
+ tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name);
+ read_unlock(&n->lock);
+}
+
+static void arp_format_pneigh_entry(struct seq_file *seq,
+ struct pneigh_entry *n)
+{
+ struct net_device *dev = n->dev;
+ int hatype = dev ? dev->type : 0;
+ char tbuf[16];
+
+ sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(*(u32*)n->key));
+ seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n",
+ tbuf, hatype, ATF_PUBL | ATF_PERM, "00:00:00:00:00:00",
+ dev ? dev->name : "*");
+}
+
+static int arp_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq, "IP address HW type Flags "
+ "HW address Mask Device\n");
+ } else {
+ struct neigh_seq_state *state = seq->private;
+
+ if (state->flags & NEIGH_SEQ_IS_PNEIGH)
+ arp_format_pneigh_entry(seq, v);
+ else
+ arp_format_neigh_entry(seq, v);
+ }
+
+ return 0;
+}
+
+static void *arp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ /* Don't want to confuse "arp -a" w/ magic entries,
+ * so we tell the generic iterator to skip NUD_NOARP.
+ */
+ return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_SKIP_NOARP);
+}
+
+/* ------------------------------------------------------------------------ */
+
+static struct seq_operations arp_seq_ops = {
+ .start = arp_seq_start,
+ .next = neigh_seq_next,
+ .stop = neigh_seq_stop,
+ .show = arp_seq_show,
+};
+
+static int arp_seq_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int rc = -ENOMEM;
+ struct neigh_seq_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+ if (!s)
+ goto out;
+
+ memset(s, 0, sizeof(*s));
+ rc = seq_open(file, &arp_seq_ops);
+ if (rc)
+ goto out_kfree;
+
+ seq = file->private_data;
+ seq->private = s;
+out:
+ return rc;
+out_kfree:
+ kfree(s);
+ goto out;
+}
+
+static struct file_operations arp_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = arp_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+static int __init arp_proc_init(void)
+{
+ if (!proc_net_fops_create("arp", S_IRUGO, &arp_seq_fops))
+ return -ENOMEM;
+ return 0;
+}
+
+#else /* CONFIG_PROC_FS */
+
+static int __init arp_proc_init(void)
+{
+ return 0;
+}
+
+#endif /* CONFIG_PROC_FS */
+
+EXPORT_SYMBOL(arp_broken_ops);
+EXPORT_SYMBOL(arp_find);
+EXPORT_SYMBOL(arp_rcv);
+EXPORT_SYMBOL(arp_create);
+EXPORT_SYMBOL(arp_xmit);
+EXPORT_SYMBOL(arp_send);
+EXPORT_SYMBOL(arp_tbl);
+
+#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
+EXPORT_SYMBOL(clip_tbl_hook);
+#endif
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
new file mode 100644
index 00000000000..b1db561f254
--- /dev/null
+++ b/net/ipv4/datagram.c
@@ -0,0 +1,73 @@
+/*
+ * common UDP/RAW code
+ * Linux INET implementation
+ *
+ * Authors:
+ * Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/route.h>
+
+int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
+ struct rtable *rt;
+ u32 saddr;
+ int oif;
+ int err;
+
+
+ if (addr_len < sizeof(*usin))
+ return -EINVAL;
+
+ if (usin->sin_family != AF_INET)
+ return -EAFNOSUPPORT;
+
+ sk_dst_reset(sk);
+
+ oif = sk->sk_bound_dev_if;
+ saddr = inet->saddr;
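+ /* For multicast destinations fall back to the interface and source
+ * address previously selected with IP_MULTICAST_IF.
+ */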
+ if (MULTICAST(usin->sin_addr.s_addr)) {
+ if (!oif)
+ oif = inet->mc_index;
+ if (!saddr)
+ saddr = inet->mc_addr;
+ }
+ err = ip_route_connect(&rt, usin->sin_addr.s_addr, saddr,
+ RT_CONN_FLAGS(sk), oif,
+ sk->sk_protocol,
+ inet->sport, usin->sin_port, sk);
+ if (err)
+ return err;
+ if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) {
+ ip_rt_put(rt);
+ return -EACCES;
+ }
+ if (!inet->saddr)
+ inet->saddr = rt->rt_src; /* Update source address */
+ if (!inet->rcv_saddr)
+ inet->rcv_saddr = rt->rt_src;
+ inet->daddr = rt->rt_dst;
+ inet->dport = usin->sin_port;
+ sk->sk_state = TCP_ESTABLISHED;
+ inet->id = jiffies;
+
+ sk_dst_set(sk, &rt->u.dst);
+ return(0);
+}
+
+EXPORT_SYMBOL(ip4_datagram_connect);
+
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
new file mode 100644
index 00000000000..eea7ef01077
--- /dev/null
+++ b/net/ipv4/devinet.c
@@ -0,0 +1,1508 @@
+/*
+ * NET3 IP device support routines.
+ *
+ * Version: $Id: devinet.c,v 1.44 2001/10/31 21:55:54 davem Exp $
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Derived from the IP parts of dev.c 1.0.19
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ *
+ * Additional Authors:
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Changes:
+ * Alexey Kuznetsov: pa_* fields are replaced with ifaddr
+ * lists.
+ * Cyrus Durgin: updated for kmod
+ * Matthias Andree: in devinet_ioctl, compare label and
+ * address (4.4BSD alias style support),
+ * fall back to comparing just the label
+ * if no match found.
+ */
+
+#include <linux/config.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/init.h>
+#include <linux/notifier.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+#include <linux/kmod.h>
+
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/ip_fib.h>
+
+struct ipv4_devconf ipv4_devconf = {
+ .accept_redirects = 1,
+ .send_redirects = 1,
+ .secure_redirects = 1,
+ .shared_media = 1,
+};
+
+static struct ipv4_devconf ipv4_devconf_dflt = {
+ .accept_redirects = 1,
+ .send_redirects = 1,
+ .secure_redirects = 1,
+ .shared_media = 1,
+ .accept_source_route = 1,
+};
+
+static void rtmsg_ifa(int event, struct in_ifaddr *);
+
+static struct notifier_block *inetaddr_chain;
+static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
+ int destroy);
+#ifdef CONFIG_SYSCTL
+static void devinet_sysctl_register(struct in_device *in_dev,
+ struct ipv4_devconf *p);
+static void devinet_sysctl_unregister(struct ipv4_devconf *p);
+#endif
+
+/* Locks all the inet devices. */
+
+static struct in_ifaddr *inet_alloc_ifa(void)
+{
+ struct in_ifaddr *ifa = kmalloc(sizeof(*ifa), GFP_KERNEL);
+
+ if (ifa) {
+ memset(ifa, 0, sizeof(*ifa));
+ INIT_RCU_HEAD(&ifa->rcu_head);
+ }
+
+ return ifa;
+}
+
+static void inet_rcu_free_ifa(struct rcu_head *head)
+{
+ struct in_ifaddr *ifa = container_of(head, struct in_ifaddr, rcu_head);
+ if (ifa->ifa_dev)
+ in_dev_put(ifa->ifa_dev);
+ kfree(ifa);
+}
+
+static inline void inet_free_ifa(struct in_ifaddr *ifa)
+{
+ call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
+}
+
+void in_dev_finish_destroy(struct in_device *idev)
+{
+ struct net_device *dev = idev->dev;
+
+ BUG_TRAP(!idev->ifa_list);
+ BUG_TRAP(!idev->mc_list);
+#ifdef NET_REFCNT_DEBUG
+ printk(KERN_DEBUG "in_dev_finish_destroy: %p=%s\n",
+ idev, dev ? dev->name : "NIL");
+#endif
+ dev_put(dev);
+ if (!idev->dead)
+ printk("Freeing alive in_device %p\n", idev);
+ else {
+ kfree(idev);
+ }
+}
+
+struct in_device *inetdev_init(struct net_device *dev)
+{
+ struct in_device *in_dev;
+
+ ASSERT_RTNL();
+
+ in_dev = kmalloc(sizeof(*in_dev), GFP_KERNEL);
+ if (!in_dev)
+ goto out;
+ memset(in_dev, 0, sizeof(*in_dev));
+ INIT_RCU_HEAD(&in_dev->rcu_head);
+ memcpy(&in_dev->cnf, &ipv4_devconf_dflt, sizeof(in_dev->cnf));
+ in_dev->cnf.sysctl = NULL;
+ in_dev->dev = dev;
+ if ((in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl)) == NULL)
+ goto out_kfree;
+ /* Reference in_dev->dev */
+ dev_hold(dev);
+#ifdef CONFIG_SYSCTL
+ neigh_sysctl_register(dev, in_dev->arp_parms, NET_IPV4,
+ NET_IPV4_NEIGH, "ipv4", NULL, NULL);
+#endif
+
+ /* Account for reference dev->ip_ptr */
+ in_dev_hold(in_dev);
+ rcu_assign_pointer(dev->ip_ptr, in_dev);
+
+#ifdef CONFIG_SYSCTL
+ devinet_sysctl_register(in_dev, &in_dev->cnf);
+#endif
+ ip_mc_init_dev(in_dev);
+ if (dev->flags & IFF_UP)
+ ip_mc_up(in_dev);
+out:
+ return in_dev;
+out_kfree:
+ kfree(in_dev);
+ in_dev = NULL;
+ goto out;
+}
+
+static void in_dev_rcu_put(struct rcu_head *head)
+{
+ struct in_device *idev = container_of(head, struct in_device, rcu_head);
+ in_dev_put(idev);
+}
+
+static void inetdev_destroy(struct in_device *in_dev)
+{
+ struct in_ifaddr *ifa;
+ struct net_device *dev;
+
+ ASSERT_RTNL();
+
+ dev = in_dev->dev;
+ if (dev == &loopback_dev)
+ return;
+
+ in_dev->dead = 1;
+
+ ip_mc_destroy_dev(in_dev);
+
+ while ((ifa = in_dev->ifa_list) != NULL) {
+ inet_del_ifa(in_dev, &in_dev->ifa_list, 0);
+ inet_free_ifa(ifa);
+ }
+
+#ifdef CONFIG_SYSCTL
+ devinet_sysctl_unregister(&in_dev->cnf);
+#endif
+
+ dev->ip_ptr = NULL;
+
+#ifdef CONFIG_SYSCTL
+ neigh_sysctl_unregister(in_dev->arp_parms);
+#endif
+ neigh_parms_release(&arp_tbl, in_dev->arp_parms);
+ arp_ifdown(dev);
+
+ call_rcu(&in_dev->rcu_head, in_dev_rcu_put);
+}
+
+int inet_addr_onlink(struct in_device *in_dev, u32 a, u32 b)
+{
+ rcu_read_lock();
+ for_primary_ifa(in_dev) {
+ if (inet_ifa_match(a, ifa)) {
+ if (!b || inet_ifa_match(b, ifa)) {
+ rcu_read_unlock();
+ return 1;
+ }
+ }
+ } endfor_ifa(in_dev);
+ rcu_read_unlock();
+ return 0;
+}
+
+static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
+ int destroy)
+{
+ struct in_ifaddr *ifa1 = *ifap;
+
+ ASSERT_RTNL();
+
+ /* 1. Deleting a primary ifaddr forces deletion of all secondaries */
+
+ if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) {
+ struct in_ifaddr *ifa;
+ struct in_ifaddr **ifap1 = &ifa1->ifa_next;
+
+ while ((ifa = *ifap1) != NULL) {
+ if (!(ifa->ifa_flags & IFA_F_SECONDARY) ||
+ ifa1->ifa_mask != ifa->ifa_mask ||
+ !inet_ifa_match(ifa1->ifa_address, ifa)) {
+ ifap1 = &ifa->ifa_next;
+ continue;
+ }
+
+ *ifap1 = ifa->ifa_next;
+
+ rtmsg_ifa(RTM_DELADDR, ifa);
+ notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa);
+ inet_free_ifa(ifa);
+ }
+ }
+
+ /* 2. Unlink it */
+
+ *ifap = ifa1->ifa_next;
+
+ /* 3. Announce address deletion */
+
+ /* Send the message first, then call the notifier.
+ At first sight, the FIB update triggered by the notifier
+ will refer to an already deleted ifaddr, which could confuse
+ netlink listeners. That is not true: if gated sees
+ the route deleted while it still thinks the ifaddr
+ is valid, it will try to restore the deleted routes... Grr.
+ So this order is correct.
+ */
+ rtmsg_ifa(RTM_DELADDR, ifa1);
+ notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
+ if (destroy) {
+ inet_free_ifa(ifa1);
+
+ if (!in_dev->ifa_list)
+ inetdev_destroy(in_dev);
+ }
+}
+
+static int inet_insert_ifa(struct in_ifaddr *ifa)
+{
+ struct in_device *in_dev = ifa->ifa_dev;
+ struct in_ifaddr *ifa1, **ifap, **last_primary;
+
+ ASSERT_RTNL();
+
+ if (!ifa->ifa_local) {
+ inet_free_ifa(ifa);
+ return 0;
+ }
+
+ ifa->ifa_flags &= ~IFA_F_SECONDARY;
+ last_primary = &in_dev->ifa_list;
+
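+ /* Scan the existing list: track where a new primary of this scope would
+ * be inserted, and make the new address a secondary if an address on the
+ * same subnet already exists. Exact duplicates and scope mismatches are
+ * rejected below.
+ */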
+ for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL;
+ ifap = &ifa1->ifa_next) {
+ if (!(ifa1->ifa_flags & IFA_F_SECONDARY) &&
+ ifa->ifa_scope <= ifa1->ifa_scope)
+ last_primary = &ifa1->ifa_next;
+ if (ifa1->ifa_mask == ifa->ifa_mask &&
+ inet_ifa_match(ifa1->ifa_address, ifa)) {
+ if (ifa1->ifa_local == ifa->ifa_local) {
+ inet_free_ifa(ifa);
+ return -EEXIST;
+ }
+ if (ifa1->ifa_scope != ifa->ifa_scope) {
+ inet_free_ifa(ifa);
+ return -EINVAL;
+ }
+ ifa->ifa_flags |= IFA_F_SECONDARY;
+ }
+ }
+
+ if (!(ifa->ifa_flags & IFA_F_SECONDARY)) {
+ net_srandom(ifa->ifa_local);
+ ifap = last_primary;
+ }
+
+ ifa->ifa_next = *ifap;
+ *ifap = ifa;
+
+ /* Send the message first, then call the notifier.
+ The notifier will trigger a FIB update, so that
+ netlink listeners will know about the new ifaddr */
+ rtmsg_ifa(RTM_NEWADDR, ifa);
+ notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
+
+ return 0;
+}
+
+static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
+{
+ struct in_device *in_dev = __in_dev_get(dev);
+
+ ASSERT_RTNL();
+
+ if (!in_dev) {
+ in_dev = inetdev_init(dev);
+ if (!in_dev) {
+ inet_free_ifa(ifa);
+ return -ENOBUFS;
+ }
+ }
+ if (ifa->ifa_dev != in_dev) {
+ BUG_TRAP(!ifa->ifa_dev);
+ in_dev_hold(in_dev);
+ ifa->ifa_dev = in_dev;
+ }
+ if (LOOPBACK(ifa->ifa_local))
+ ifa->ifa_scope = RT_SCOPE_HOST;
+ return inet_insert_ifa(ifa);
+}
+
+struct in_device *inetdev_by_index(int ifindex)
+{
+ struct net_device *dev;
+ struct in_device *in_dev = NULL;
+ read_lock(&dev_base_lock);
+ dev = __dev_get_by_index(ifindex);
+ if (dev)
+ in_dev = in_dev_get(dev);
+ read_unlock(&dev_base_lock);
+ return in_dev;
+}
+
+/* Called only from RTNL semaphored context. No locks. */
+
+struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, u32 prefix,
+ u32 mask)
+{
+ ASSERT_RTNL();
+
+ for_primary_ifa(in_dev) {
+ if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa))
+ return ifa;
+ } endfor_ifa(in_dev);
+ return NULL;
+}
+
+static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+ struct rtattr **rta = arg;
+ struct in_device *in_dev;
+ struct ifaddrmsg *ifm = NLMSG_DATA(nlh);
+ struct in_ifaddr *ifa, **ifap;
+
+ ASSERT_RTNL();
+
+ if ((in_dev = inetdev_by_index(ifm->ifa_index)) == NULL)
+ goto out;
+ __in_dev_put(in_dev);
+
+ for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+ ifap = &ifa->ifa_next) {
+ if ((rta[IFA_LOCAL - 1] &&
+ memcmp(RTA_DATA(rta[IFA_LOCAL - 1]),
+ &ifa->ifa_local, 4)) ||
+ (rta[IFA_LABEL - 1] &&
+ rtattr_strcmp(rta[IFA_LABEL - 1], ifa->ifa_label)) ||
+ (rta[IFA_ADDRESS - 1] &&
+ (ifm->ifa_prefixlen != ifa->ifa_prefixlen ||
+ !inet_ifa_match(*(u32*)RTA_DATA(rta[IFA_ADDRESS - 1]),
+ ifa))))
+ continue;
+ inet_del_ifa(in_dev, ifap, 1);
+ return 0;
+ }
+out:
+ return -EADDRNOTAVAIL;
+}
+
+static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+ struct rtattr **rta = arg;
+ struct net_device *dev;
+ struct in_device *in_dev;
+ struct ifaddrmsg *ifm = NLMSG_DATA(nlh);
+ struct in_ifaddr *ifa;
+ int rc = -EINVAL;
+
+ ASSERT_RTNL();
+
+ if (ifm->ifa_prefixlen > 32 || !rta[IFA_LOCAL - 1])
+ goto out;
+
+ rc = -ENODEV;
+ if ((dev = __dev_get_by_index(ifm->ifa_index)) == NULL)
+ goto out;
+
+ rc = -ENOBUFS;
+ if ((in_dev = __in_dev_get(dev)) == NULL) {
+ in_dev = inetdev_init(dev);
+ if (!in_dev)
+ goto out;
+ }
+
+ if ((ifa = inet_alloc_ifa()) == NULL)
+ goto out;
+
+ if (!rta[IFA_ADDRESS - 1])
+ rta[IFA_ADDRESS - 1] = rta[IFA_LOCAL - 1];
+ memcpy(&ifa->ifa_local, RTA_DATA(rta[IFA_LOCAL - 1]), 4);
+ memcpy(&ifa->ifa_address, RTA_DATA(rta[IFA_ADDRESS - 1]), 4);
+ ifa->ifa_prefixlen = ifm->ifa_prefixlen;
+ ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen);
+ if (rta[IFA_BROADCAST - 1])
+ memcpy(&ifa->ifa_broadcast,
+ RTA_DATA(rta[IFA_BROADCAST - 1]), 4);
+ if (rta[IFA_ANYCAST - 1])
+ memcpy(&ifa->ifa_anycast, RTA_DATA(rta[IFA_ANYCAST - 1]), 4);
+ ifa->ifa_flags = ifm->ifa_flags;
+ ifa->ifa_scope = ifm->ifa_scope;
+ in_dev_hold(in_dev);
+ ifa->ifa_dev = in_dev;
+ if (rta[IFA_LABEL - 1])
+ rtattr_strlcpy(ifa->ifa_label, rta[IFA_LABEL - 1], IFNAMSIZ);
+ else
+ memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+
+ rc = inet_insert_ifa(ifa);
+out:
+ return rc;
+}
+
+/*
+ * Determine a default network mask, based on the IP address.
+ */
+
+static __inline__ int inet_abc_len(u32 addr)
+{
+ int rc = -1; /* Something else, probably a multicast. */
+
+ if (ZERONET(addr))
+ rc = 0;
+ else {
+ addr = ntohl(addr);
+
+ if (IN_CLASSA(addr))
+ rc = 8;
+ else if (IN_CLASSB(addr))
+ rc = 16;
+ else if (IN_CLASSC(addr))
+ rc = 24;
+ }
+
+ return rc;
+}
+
+
+int devinet_ioctl(unsigned int cmd, void __user *arg)
+{
+ struct ifreq ifr;
+ struct sockaddr_in sin_orig;
+ struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;
+ struct in_device *in_dev;
+ struct in_ifaddr **ifap = NULL;
+ struct in_ifaddr *ifa = NULL;
+ struct net_device *dev;
+ char *colon;
+ int ret = -EFAULT;
+ int tryaddrmatch = 0;
+
+ /*
+ * Fetch the caller's info block into kernel space
+ */
+
+ if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
+ goto out;
+ ifr.ifr_name[IFNAMSIZ - 1] = 0;
+
+ /* save original address for comparison */
+ memcpy(&sin_orig, sin, sizeof(*sin));
+
+ colon = strchr(ifr.ifr_name, ':');
+ if (colon)
+ *colon = 0;
+
+#ifdef CONFIG_KMOD
+ dev_load(ifr.ifr_name);
+#endif
+
+ switch(cmd) {
+ case SIOCGIFADDR: /* Get interface address */
+ case SIOCGIFBRDADDR: /* Get the broadcast address */
+ case SIOCGIFDSTADDR: /* Get the destination address */
+ case SIOCGIFNETMASK: /* Get the netmask for the interface */
+ /* Note that these ioctls will not sleep,
+ so that we do not impose a lock.
+ One day we will be forced to put shlock here (I mean SMP)
+ */
+ tryaddrmatch = (sin_orig.sin_family == AF_INET);
+ memset(sin, 0, sizeof(*sin));
+ sin->sin_family = AF_INET;
+ break;
+
+ case SIOCSIFFLAGS:
+ ret = -EACCES;
+ if (!capable(CAP_NET_ADMIN))
+ goto out;
+ break;
+ case SIOCSIFADDR: /* Set interface address (and family) */
+ case SIOCSIFBRDADDR: /* Set the broadcast address */
+ case SIOCSIFDSTADDR: /* Set the destination address */
+ case SIOCSIFNETMASK: /* Set the netmask for the interface */
+ ret = -EACCES;
+ if (!capable(CAP_NET_ADMIN))
+ goto out;
+ ret = -EINVAL;
+ if (sin->sin_family != AF_INET)
+ goto out;
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+
+ rtnl_lock();
+
+ ret = -ENODEV;
+ if ((dev = __dev_get_by_name(ifr.ifr_name)) == NULL)
+ goto done;
+
+ if (colon)
+ *colon = ':';
+
+ if ((in_dev = __in_dev_get(dev)) != NULL) {
+ if (tryaddrmatch) {
+ /* Matthias Andree */
+ /* compare label and address (4.4BSD style) */
+ /* note: we only do this for a limited set of ioctls
+ and only if the original address family was AF_INET.
+ This is checked above. */
+ for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+ ifap = &ifa->ifa_next) {
+ if (!strcmp(ifr.ifr_name, ifa->ifa_label) &&
+ sin_orig.sin_addr.s_addr ==
+ ifa->ifa_address) {
+ break; /* found */
+ }
+ }
+ }
+ /* we didn't get a match; maybe the application is
+ 4.3BSD-style and passed in junk, so we fall back to
+ comparing just the label */
+ if (!ifa) {
+ for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+ ifap = &ifa->ifa_next)
+ if (!strcmp(ifr.ifr_name, ifa->ifa_label))
+ break;
+ }
+ }
+
+ ret = -EADDRNOTAVAIL;
+ if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS)
+ goto done;
+
+ switch(cmd) {
+ case SIOCGIFADDR: /* Get interface address */
+ sin->sin_addr.s_addr = ifa->ifa_local;
+ goto rarok;
+
+ case SIOCGIFBRDADDR: /* Get the broadcast address */
+ sin->sin_addr.s_addr = ifa->ifa_broadcast;
+ goto rarok;
+
+ case SIOCGIFDSTADDR: /* Get the destination address */
+ sin->sin_addr.s_addr = ifa->ifa_address;
+ goto rarok;
+
+ case SIOCGIFNETMASK: /* Get the netmask for the interface */
+ sin->sin_addr.s_addr = ifa->ifa_mask;
+ goto rarok;
+
+ case SIOCSIFFLAGS:
+ if (colon) {
+ ret = -EADDRNOTAVAIL;
+ if (!ifa)
+ break;
+ ret = 0;
+ if (!(ifr.ifr_flags & IFF_UP))
+ inet_del_ifa(in_dev, ifap, 1);
+ break;
+ }
+ ret = dev_change_flags(dev, ifr.ifr_flags);
+ break;
+
+ case SIOCSIFADDR: /* Set interface address (and family) */
+ ret = -EINVAL;
+ if (inet_abc_len(sin->sin_addr.s_addr) < 0)
+ break;
+
+ if (!ifa) {
+ ret = -ENOBUFS;
+ if ((ifa = inet_alloc_ifa()) == NULL)
+ break;
+ if (colon)
+ memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ);
+ else
+ memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+ } else {
+ ret = 0;
+ if (ifa->ifa_local == sin->sin_addr.s_addr)
+ break;
+ inet_del_ifa(in_dev, ifap, 0);
+ ifa->ifa_broadcast = 0;
+ ifa->ifa_anycast = 0;
+ }
+
+ ifa->ifa_address = ifa->ifa_local = sin->sin_addr.s_addr;
+
+ if (!(dev->flags & IFF_POINTOPOINT)) {
+ ifa->ifa_prefixlen = inet_abc_len(ifa->ifa_address);
+ ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen);
+ if ((dev->flags & IFF_BROADCAST) &&
+ ifa->ifa_prefixlen < 31)
+ ifa->ifa_broadcast = ifa->ifa_address |
+ ~ifa->ifa_mask;
+ } else {
+ ifa->ifa_prefixlen = 32;
+ ifa->ifa_mask = inet_make_mask(32);
+ }
+ ret = inet_set_ifa(dev, ifa);
+ break;
+
+ case SIOCSIFBRDADDR: /* Set the broadcast address */
+ ret = 0;
+ if (ifa->ifa_broadcast != sin->sin_addr.s_addr) {
+ inet_del_ifa(in_dev, ifap, 0);
+ ifa->ifa_broadcast = sin->sin_addr.s_addr;
+ inet_insert_ifa(ifa);
+ }
+ break;
+
+ case SIOCSIFDSTADDR: /* Set the destination address */
+ ret = 0;
+ if (ifa->ifa_address == sin->sin_addr.s_addr)
+ break;
+ ret = -EINVAL;
+ if (inet_abc_len(sin->sin_addr.s_addr) < 0)
+ break;
+ ret = 0;
+ inet_del_ifa(in_dev, ifap, 0);
+ ifa->ifa_address = sin->sin_addr.s_addr;
+ inet_insert_ifa(ifa);
+ break;
+
+ case SIOCSIFNETMASK: /* Set the netmask for the interface */
+
+ /*
+ * The mask we set must be legal.
+ */
+ ret = -EINVAL;
+ if (bad_mask(sin->sin_addr.s_addr, 0))
+ break;
+ ret = 0;
+ if (ifa->ifa_mask != sin->sin_addr.s_addr) {
+ inet_del_ifa(in_dev, ifap, 0);
+ ifa->ifa_mask = sin->sin_addr.s_addr;
+ ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask);
+
+ /* If the current broadcast address matches
+ * the current netmask, recalculate the
+ * broadcast address. Otherwise it's an
+ * unusual address, so don't touch it since
+ * the user seems to know what (s)he's doing...
+ */
+ if ((dev->flags & IFF_BROADCAST) &&
+ (ifa->ifa_prefixlen < 31) &&
+ (ifa->ifa_broadcast ==
+ (ifa->ifa_local|~ifa->ifa_mask))) {
+ ifa->ifa_broadcast = (ifa->ifa_local |
+ ~sin->sin_addr.s_addr);
+ }
+ inet_insert_ifa(ifa);
+ }
+ break;
+ }
+done:
+ rtnl_unlock();
+out:
+ return ret;
+rarok:
+ rtnl_unlock();
+ ret = copy_to_user(arg, &ifr, sizeof(struct ifreq)) ? -EFAULT : 0;
+ goto out;
+}
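
From userspace these requests arrive through ioctl() on an AF_INET socket; a minimal SIOCGIFADDR sketch (illustrative only; the interface name "eth0" is an assumption):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int main(void)
{
        struct ifreq ifr;
        struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0)
                return 1;

        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);  /* assumed interface name */
        sin->sin_family = AF_INET;  /* lets devinet_ioctl() try the 4.4BSD-style match */

        if (ioctl(fd, SIOCGIFADDR, &ifr) == 0)
                printf("eth0: %s\n", inet_ntoa(sin->sin_addr));

        close(fd);
        return 0;
}
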
+
+static int inet_gifconf(struct net_device *dev, char __user *buf, int len)
+{
+ struct in_device *in_dev = __in_dev_get(dev);
+ struct in_ifaddr *ifa;
+ struct ifreq ifr;
+ int done = 0;
+
+ if (!in_dev || (ifa = in_dev->ifa_list) == NULL)
+ goto out;
+
+ for (; ifa; ifa = ifa->ifa_next) {
+ if (!buf) {
+ done += sizeof(ifr);
+ continue;
+ }
+ if (len < (int) sizeof(ifr))
+ break;
+ memset(&ifr, 0, sizeof(struct ifreq));
+ if (ifa->ifa_label)
+ strcpy(ifr.ifr_name, ifa->ifa_label);
+ else
+ strcpy(ifr.ifr_name, dev->name);
+
+ (*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET;
+ (*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr =
+ ifa->ifa_local;
+
+ if (copy_to_user(buf, &ifr, sizeof(struct ifreq))) {
+ done = -EFAULT;
+ break;
+ }
+ buf += sizeof(struct ifreq);
+ len -= sizeof(struct ifreq);
+ done += sizeof(struct ifreq);
+ }
+out:
+ return done;
+}
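
inet_gifconf() is the PF_INET backend registered for SIOCGIFCONF (see devinet_init() below); a userspace sketch that walks the returned list (illustrative only):

#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int main(void)
{
        struct ifreq reqs[32];  /* arbitrary fixed-size buffer for the sketch */
        struct ifconf ifc;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);
        int i, n;

        if (fd < 0)
                return 1;

        ifc.ifc_len = sizeof(reqs);
        ifc.ifc_req = reqs;
        if (ioctl(fd, SIOCGIFCONF, &ifc) < 0)
                return 1;

        n = ifc.ifc_len / sizeof(struct ifreq);
        for (i = 0; i < n; i++) {
                struct sockaddr_in *sin = (struct sockaddr_in *)&reqs[i].ifr_addr;
                printf("%-8s %s\n", reqs[i].ifr_name, inet_ntoa(sin->sin_addr));
        }
        close(fd);
        return 0;
}
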
+
+u32 inet_select_addr(const struct net_device *dev, u32 dst, int scope)
+{
+ u32 addr = 0;
+ struct in_device *in_dev;
+
+ rcu_read_lock();
+ in_dev = __in_dev_get(dev);
+ if (!in_dev)
+ goto no_in_dev;
+
+ for_primary_ifa(in_dev) {
+ if (ifa->ifa_scope > scope)
+ continue;
+ if (!dst || inet_ifa_match(dst, ifa)) {
+ addr = ifa->ifa_local;
+ break;
+ }
+ if (!addr)
+ addr = ifa->ifa_local;
+ } endfor_ifa(in_dev);
+no_in_dev:
+ rcu_read_unlock();
+
+ if (addr)
+ goto out;
+
+ /* Non-loopback addresses on loopback should be preferred
+ in this case. It is important that lo is the first interface
+ in the dev_base list.
+ */
+ read_lock(&dev_base_lock);
+ rcu_read_lock();
+ for (dev = dev_base; dev; dev = dev->next) {
+ if ((in_dev = __in_dev_get(dev)) == NULL)
+ continue;
+
+ for_primary_ifa(in_dev) {
+ if (ifa->ifa_scope != RT_SCOPE_LINK &&
+ ifa->ifa_scope <= scope) {
+ addr = ifa->ifa_local;
+ goto out_unlock_both;
+ }
+ } endfor_ifa(in_dev);
+ }
+out_unlock_both:
+ read_unlock(&dev_base_lock);
+ rcu_read_unlock();
+out:
+ return addr;
+}
+
+static u32 confirm_addr_indev(struct in_device *in_dev, u32 dst,
+ u32 local, int scope)
+{
+ int same = 0;
+ u32 addr = 0;
+
+ for_ifa(in_dev) {
+ if (!addr &&
+ (local == ifa->ifa_local || !local) &&
+ ifa->ifa_scope <= scope) {
+ addr = ifa->ifa_local;
+ if (same)
+ break;
+ }
+ if (!same) {
+ same = (!local || inet_ifa_match(local, ifa)) &&
+ (!dst || inet_ifa_match(dst, ifa));
+ if (same && addr) {
+ if (local || !dst)
+ break;
+ /* Is the selected addr in the dst subnet? */
+ if (inet_ifa_match(addr, ifa))
+ break;
+ /* If not, can we use the new local src? */
+ if (ifa->ifa_scope <= scope) {
+ addr = ifa->ifa_local;
+ break;
+ }
+ /* keep searching for a larger dst subnet for addr */
+ same = 0;
+ }
+ }
+ } endfor_ifa(in_dev);
+
+ return same? addr : 0;
+}
+
+/*
+ * Confirm that local IP address exists using wildcards:
+ * - dev: only on this interface, 0=any interface
+ * - dst: only in the same subnet as dst, 0=any dst
+ * - local: address, 0=autoselect the local address
+ * - scope: maximum allowed scope value for the local address
+ */
+u32 inet_confirm_addr(const struct net_device *dev, u32 dst, u32 local, int scope)
+{
+ u32 addr = 0;
+ struct in_device *in_dev;
+
+ if (dev) {
+ rcu_read_lock();
+ if ((in_dev = __in_dev_get(dev)))
+ addr = confirm_addr_indev(in_dev, dst, local, scope);
+ rcu_read_unlock();
+
+ return addr;
+ }
+
+ read_lock(&dev_base_lock);
+ rcu_read_lock();
+ for (dev = dev_base; dev; dev = dev->next) {
+ if ((in_dev = __in_dev_get(dev))) {
+ addr = confirm_addr_indev(in_dev, dst, local, scope);
+ if (addr)
+ break;
+ }
+ }
+ rcu_read_unlock();
+ read_unlock(&dev_base_lock);
+
+ return addr;
+}
+
+/*
+ * Device notifier
+ */
+
+int register_inetaddr_notifier(struct notifier_block *nb)
+{
+ return notifier_chain_register(&inetaddr_chain, nb);
+}
+
+int unregister_inetaddr_notifier(struct notifier_block *nb)
+{
+ return notifier_chain_unregister(&inetaddr_chain, nb);
+}
+
+/* Rename ifa_labels for a device name change. Make some effort to preserve
+ * existing alias numbering and to create unique labels if possible.
+ */
+static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
+{
+ struct in_ifaddr *ifa;
+ int named = 0;
+
+ for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+ char old[IFNAMSIZ], *dot;
+
+ memcpy(old, ifa->ifa_label, IFNAMSIZ);
+ memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+ if (named++ == 0)
+ continue;
+ dot = strchr(ifa->ifa_label, ':');
+ if (dot == NULL) {
+ sprintf(old, ":%d", named);
+ dot = old;
+ }
+ if (strlen(dot) + strlen(dev->name) < IFNAMSIZ) {
+ strcat(ifa->ifa_label, dot);
+ } else {
+ strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot);
+ }
+ }
+}
+
+/* Called only under RTNL semaphore */
+
+static int inetdev_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = ptr;
+ struct in_device *in_dev = __in_dev_get(dev);
+
+ ASSERT_RTNL();
+
+ if (!in_dev) {
+ if (event == NETDEV_REGISTER && dev == &loopback_dev) {
+ in_dev = inetdev_init(dev);
+ if (!in_dev)
+ panic("devinet: Failed to create loopback\n");
+ in_dev->cnf.no_xfrm = 1;
+ in_dev->cnf.no_policy = 1;
+ }
+ goto out;
+ }
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ printk(KERN_DEBUG "inetdev_event: bug\n");
+ dev->ip_ptr = NULL;
+ break;
+ case NETDEV_UP:
+ if (dev->mtu < 68)
+ break;
+ if (dev == &loopback_dev) {
+ struct in_ifaddr *ifa;
+ if ((ifa = inet_alloc_ifa()) != NULL) {
+ ifa->ifa_local =
+ ifa->ifa_address = htonl(INADDR_LOOPBACK);
+ ifa->ifa_prefixlen = 8;
+ ifa->ifa_mask = inet_make_mask(8);
+ in_dev_hold(in_dev);
+ ifa->ifa_dev = in_dev;
+ ifa->ifa_scope = RT_SCOPE_HOST;
+ memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+ inet_insert_ifa(ifa);
+ }
+ }
+ ip_mc_up(in_dev);
+ break;
+ case NETDEV_DOWN:
+ ip_mc_down(in_dev);
+ break;
+ case NETDEV_CHANGEMTU:
+ if (dev->mtu >= 68)
+ break;
+ /* MTU fell below 68, disable IP */
+ case NETDEV_UNREGISTER:
+ inetdev_destroy(in_dev);
+ break;
+ case NETDEV_CHANGENAME:
+ /* Do not notify about label change, this event is
+ * not interesting to applications using netlink.
+ */
+ inetdev_changename(dev, in_dev);
+
+#ifdef CONFIG_SYSCTL
+ devinet_sysctl_unregister(&in_dev->cnf);
+ neigh_sysctl_unregister(in_dev->arp_parms);
+ neigh_sysctl_register(dev, in_dev->arp_parms, NET_IPV4,
+ NET_IPV4_NEIGH, "ipv4", NULL, NULL);
+ devinet_sysctl_register(in_dev, &in_dev->cnf);
+#endif
+ break;
+ }
+out:
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block ip_netdev_notifier = {
+ .notifier_call =inetdev_event,
+};
+
+static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
+ u32 pid, u32 seq, int event)
+{
+ struct ifaddrmsg *ifm;
+ struct nlmsghdr *nlh;
+ unsigned char *b = skb->tail;
+
+ nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm));
+ if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
+ ifm = NLMSG_DATA(nlh);
+ ifm->ifa_family = AF_INET;
+ ifm->ifa_prefixlen = ifa->ifa_prefixlen;
+ ifm->ifa_flags = ifa->ifa_flags|IFA_F_PERMANENT;
+ ifm->ifa_scope = ifa->ifa_scope;
+ ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
+ if (ifa->ifa_address)
+ RTA_PUT(skb, IFA_ADDRESS, 4, &ifa->ifa_address);
+ if (ifa->ifa_local)
+ RTA_PUT(skb, IFA_LOCAL, 4, &ifa->ifa_local);
+ if (ifa->ifa_broadcast)
+ RTA_PUT(skb, IFA_BROADCAST, 4, &ifa->ifa_broadcast);
+ if (ifa->ifa_anycast)
+ RTA_PUT(skb, IFA_ANYCAST, 4, &ifa->ifa_anycast);
+ if (ifa->ifa_label[0])
+ RTA_PUT(skb, IFA_LABEL, IFNAMSIZ, &ifa->ifa_label);
+ nlh->nlmsg_len = skb->tail - b;
+ return skb->len;
+
+nlmsg_failure:
+rtattr_failure:
+ skb_trim(skb, b - skb->data);
+ return -1;
+}
+
+static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int idx, ip_idx;
+ struct net_device *dev;
+ struct in_device *in_dev;
+ struct in_ifaddr *ifa;
+ int s_ip_idx, s_idx = cb->args[0];
+
+ s_ip_idx = ip_idx = cb->args[1];
+ read_lock(&dev_base_lock);
+ for (dev = dev_base, idx = 0; dev; dev = dev->next, idx++) {
+ if (idx < s_idx)
+ continue;
+ if (idx > s_idx)
+ s_ip_idx = 0;
+ rcu_read_lock();
+ if ((in_dev = __in_dev_get(dev)) == NULL) {
+ rcu_read_unlock();
+ continue;
+ }
+
+ for (ifa = in_dev->ifa_list, ip_idx = 0; ifa;
+ ifa = ifa->ifa_next, ip_idx++) {
+ if (ip_idx < s_ip_idx)
+ continue;
+ if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWADDR) <= 0) {
+ rcu_read_unlock();
+ goto done;
+ }
+ }
+ rcu_read_unlock();
+ }
+
+done:
+ read_unlock(&dev_base_lock);
+ cb->args[0] = idx;
+ cb->args[1] = ip_idx;
+
+ return skb->len;
+}
+
+static void rtmsg_ifa(int event, struct in_ifaddr* ifa)
+{
+ int size = NLMSG_SPACE(sizeof(struct ifaddrmsg) + 128);
+ struct sk_buff *skb = alloc_skb(size, GFP_KERNEL);
+
+ if (!skb)
+ netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, ENOBUFS);
+ else if (inet_fill_ifaddr(skb, ifa, 0, 0, event) < 0) {
+ kfree_skb(skb);
+ netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, EINVAL);
+ } else {
+ NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_IFADDR;
+ netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV4_IFADDR, GFP_KERNEL);
+ }
+}
+
+static struct rtnetlink_link inet_rtnetlink_table[RTM_MAX - RTM_BASE + 1] = {
+ [4] = { .doit = inet_rtm_newaddr, },
+ [5] = { .doit = inet_rtm_deladdr, },
+ [6] = { .dumpit = inet_dump_ifaddr, },
+ [8] = { .doit = inet_rtm_newroute, },
+ [9] = { .doit = inet_rtm_delroute, },
+ [10] = { .doit = inet_rtm_getroute, .dumpit = inet_dump_fib, },
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ [16] = { .doit = inet_rtm_newrule, },
+ [17] = { .doit = inet_rtm_delrule, },
+ [18] = { .dumpit = inet_dump_rules, },
+#endif
+};
+
+#ifdef CONFIG_SYSCTL
+
+void inet_forward_change(void)
+{
+ struct net_device *dev;
+ int on = ipv4_devconf.forwarding;
+
+ ipv4_devconf.accept_redirects = !on;
+ ipv4_devconf_dflt.forwarding = on;
+
+ read_lock(&dev_base_lock);
+ for (dev = dev_base; dev; dev = dev->next) {
+ struct in_device *in_dev;
+ rcu_read_lock();
+ in_dev = __in_dev_get(dev);
+ if (in_dev)
+ in_dev->cnf.forwarding = on;
+ rcu_read_unlock();
+ }
+ read_unlock(&dev_base_lock);
+
+ rt_cache_flush(0);
+}
+
+static int devinet_sysctl_forward(ctl_table *ctl, int write,
+ struct file* filp, void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int *valp = ctl->data;
+ int val = *valp;
+ int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+
+ if (write && *valp != val) {
+ if (valp == &ipv4_devconf.forwarding)
+ inet_forward_change();
+ else if (valp != &ipv4_devconf_dflt.forwarding)
+ rt_cache_flush(0);
+ }
+
+ return ret;
+}
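
The forwarding knob is exercised through the sysctl tree registered below; a minimal sketch of flipping it from userspace (illustrative only; requires root, and the path assumes procfs sysctls are mounted):

#include <stdio.h>

int main(void)
{
        /* Writing "1" to conf/all/forwarding runs devinet_sysctl_forward(),
         * which in turn calls inet_forward_change() for every device.
         */
        FILE *f = fopen("/proc/sys/net/ipv4/conf/all/forwarding", "w");

        if (!f)
                return 1;
        fputs("1\n", f);
        return fclose(f) ? 1 : 0;
}
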
+
+int ipv4_doint_and_flush(ctl_table *ctl, int write,
+ struct file* filp, void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int *valp = ctl->data;
+ int val = *valp;
+ int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+
+ if (write && *valp != val)
+ rt_cache_flush(0);
+
+ return ret;
+}
+
+int ipv4_doint_and_flush_strategy(ctl_table *table, int __user *name, int nlen,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen,
+ void **context)
+{
+ int *valp = table->data;
+ int new;
+
+ if (!newval || !newlen)
+ return 0;
+
+ if (newlen != sizeof(int))
+ return -EINVAL;
+
+ if (get_user(new, (int __user *)newval))
+ return -EFAULT;
+
+ if (new == *valp)
+ return 0;
+
+ if (oldval && oldlenp) {
+ size_t len;
+
+ if (get_user(len, oldlenp))
+ return -EFAULT;
+
+ if (len) {
+ if (len > table->maxlen)
+ len = table->maxlen;
+ if (copy_to_user(oldval, valp, len))
+ return -EFAULT;
+ if (put_user(len, oldlenp))
+ return -EFAULT;
+ }
+ }
+
+ *valp = new;
+ rt_cache_flush(0);
+ return 1;
+}
+
+
+static struct devinet_sysctl_table {
+ struct ctl_table_header *sysctl_header;
+ ctl_table devinet_vars[__NET_IPV4_CONF_MAX];
+ ctl_table devinet_dev[2];
+ ctl_table devinet_conf_dir[2];
+ ctl_table devinet_proto_dir[2];
+ ctl_table devinet_root_dir[2];
+} devinet_sysctl = {
+ .devinet_vars = {
+ {
+ .ctl_name = NET_IPV4_CONF_FORWARDING,
+ .procname = "forwarding",
+ .data = &ipv4_devconf.forwarding,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &devinet_sysctl_forward,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_MC_FORWARDING,
+ .procname = "mc_forwarding",
+ .data = &ipv4_devconf.mc_forwarding,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_ACCEPT_REDIRECTS,
+ .procname = "accept_redirects",
+ .data = &ipv4_devconf.accept_redirects,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_SECURE_REDIRECTS,
+ .procname = "secure_redirects",
+ .data = &ipv4_devconf.secure_redirects,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_SHARED_MEDIA,
+ .procname = "shared_media",
+ .data = &ipv4_devconf.shared_media,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_RP_FILTER,
+ .procname = "rp_filter",
+ .data = &ipv4_devconf.rp_filter,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_SEND_REDIRECTS,
+ .procname = "send_redirects",
+ .data = &ipv4_devconf.send_redirects,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE,
+ .procname = "accept_source_route",
+ .data = &ipv4_devconf.accept_source_route,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_PROXY_ARP,
+ .procname = "proxy_arp",
+ .data = &ipv4_devconf.proxy_arp,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_MEDIUM_ID,
+ .procname = "medium_id",
+ .data = &ipv4_devconf.medium_id,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_BOOTP_RELAY,
+ .procname = "bootp_relay",
+ .data = &ipv4_devconf.bootp_relay,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_LOG_MARTIANS,
+ .procname = "log_martians",
+ .data = &ipv4_devconf.log_martians,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_TAG,
+ .procname = "tag",
+ .data = &ipv4_devconf.tag,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_ARPFILTER,
+ .procname = "arp_filter",
+ .data = &ipv4_devconf.arp_filter,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_ARP_ANNOUNCE,
+ .procname = "arp_announce",
+ .data = &ipv4_devconf.arp_announce,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_ARP_IGNORE,
+ .procname = "arp_ignore",
+ .data = &ipv4_devconf.arp_ignore,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_NOXFRM,
+ .procname = "disable_xfrm",
+ .data = &ipv4_devconf.no_xfrm,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &ipv4_doint_and_flush,
+ .strategy = &ipv4_doint_and_flush_strategy,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_NOPOLICY,
+ .procname = "disable_policy",
+ .data = &ipv4_devconf.no_policy,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &ipv4_doint_and_flush,
+ .strategy = &ipv4_doint_and_flush_strategy,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_FORCE_IGMP_VERSION,
+ .procname = "force_igmp_version",
+ .data = &ipv4_devconf.force_igmp_version,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &ipv4_doint_and_flush,
+ .strategy = &ipv4_doint_and_flush_strategy,
+ },
+ },
+ .devinet_dev = {
+ {
+ .ctl_name = NET_PROTO_CONF_ALL,
+ .procname = "all",
+ .mode = 0555,
+ .child = devinet_sysctl.devinet_vars,
+ },
+ },
+ .devinet_conf_dir = {
+ {
+ .ctl_name = NET_IPV4_CONF,
+ .procname = "conf",
+ .mode = 0555,
+ .child = devinet_sysctl.devinet_dev,
+ },
+ },
+ .devinet_proto_dir = {
+ {
+ .ctl_name = NET_IPV4,
+ .procname = "ipv4",
+ .mode = 0555,
+ .child = devinet_sysctl.devinet_conf_dir,
+ },
+ },
+ .devinet_root_dir = {
+ {
+ .ctl_name = CTL_NET,
+ .procname = "net",
+ .mode = 0555,
+ .child = devinet_sysctl.devinet_proto_dir,
+ },
+ },
+};
+
+static void devinet_sysctl_register(struct in_device *in_dev,
+ struct ipv4_devconf *p)
+{
+ int i;
+ struct net_device *dev = in_dev ? in_dev->dev : NULL;
+ struct devinet_sysctl_table *t = kmalloc(sizeof(*t), GFP_KERNEL);
+ char *dev_name = NULL;
+
+ if (!t)
+ return;
+ memcpy(t, &devinet_sysctl, sizeof(*t));
+ for (i = 0; i < ARRAY_SIZE(t->devinet_vars) - 1; i++) {
+ t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf;
+ t->devinet_vars[i].de = NULL;
+ }
+
+ if (dev) {
+ dev_name = dev->name;
+ t->devinet_dev[0].ctl_name = dev->ifindex;
+ } else {
+ dev_name = "default";
+ t->devinet_dev[0].ctl_name = NET_PROTO_CONF_DEFAULT;
+ }
+
+ /*
+ * Make a copy of dev_name, because '.procname' is regarded as const
+ * by sysctl and we wouldn't want anyone to change it under our feet
+ * (see SIOCSIFNAME).
+ */
+ dev_name = net_sysctl_strdup(dev_name);
+ if (!dev_name)
+ goto free;
+
+ t->devinet_dev[0].procname = dev_name;
+ t->devinet_dev[0].child = t->devinet_vars;
+ t->devinet_dev[0].de = NULL;
+ t->devinet_conf_dir[0].child = t->devinet_dev;
+ t->devinet_conf_dir[0].de = NULL;
+ t->devinet_proto_dir[0].child = t->devinet_conf_dir;
+ t->devinet_proto_dir[0].de = NULL;
+ t->devinet_root_dir[0].child = t->devinet_proto_dir;
+ t->devinet_root_dir[0].de = NULL;
+
+ t->sysctl_header = register_sysctl_table(t->devinet_root_dir, 0);
+ if (!t->sysctl_header)
+ goto free_procname;
+
+ p->sysctl = t;
+ return;
+
+ /* error path */
+ free_procname:
+ kfree(dev_name);
+ free:
+ kfree(t);
+ return;
+}
+
+static void devinet_sysctl_unregister(struct ipv4_devconf *p)
+{
+ if (p->sysctl) {
+ struct devinet_sysctl_table *t = p->sysctl;
+ p->sysctl = NULL;
+ unregister_sysctl_table(t->sysctl_header);
+ kfree(t->devinet_dev[0].procname);
+ kfree(t);
+ }
+}
+#endif
+
+void __init devinet_init(void)
+{
+ register_gifconf(PF_INET, inet_gifconf);
+ register_netdevice_notifier(&ip_netdev_notifier);
+ rtnetlink_links[PF_INET] = inet_rtnetlink_table;
+#ifdef CONFIG_SYSCTL
+ devinet_sysctl.sysctl_header =
+ register_sysctl_table(devinet_sysctl.devinet_root_dir, 0);
+ devinet_sysctl_register(NULL, &ipv4_devconf_dflt);
+#endif
+}
+
+EXPORT_SYMBOL(devinet_ioctl);
+EXPORT_SYMBOL(in_dev_finish_destroy);
+EXPORT_SYMBOL(inet_select_addr);
+EXPORT_SYMBOL(inetdev_by_index);
+EXPORT_SYMBOL(register_inetaddr_notifier);
+EXPORT_SYMBOL(unregister_inetaddr_notifier);
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
new file mode 100644
index 00000000000..053a883247b
--- /dev/null
+++ b/net/ipv4/esp4.c
@@ -0,0 +1,510 @@
+#include <linux/config.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/esp.h>
+#include <asm/scatterlist.h>
+#include <linux/crypto.h>
+#include <linux/pfkeyv2.h>
+#include <linux/random.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+
+/* decapsulation data for use when post-processing */
+struct esp_decap_data {
+ xfrm_address_t saddr;
+ __u16 sport;
+ __u8 proto;
+};
+
+static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err;
+ struct iphdr *top_iph;
+ struct ip_esp_hdr *esph;
+ struct crypto_tfm *tfm;
+ struct esp_data *esp;
+ struct sk_buff *trailer;
+ int blksize;
+ int clen;
+ int alen;
+ int nfrags;
+
+ /* Strip IP+ESP header. */
+ __skb_pull(skb, skb->h.raw - skb->data);
+ /* Now skb is pure payload to encrypt */
+
+ err = -ENOMEM;
+
+ /* Round to block size */
+ clen = skb->len;
+
+ esp = x->data;
+ alen = esp->auth.icv_trunc_len;
+ tfm = esp->conf.tfm;
+ blksize = (crypto_tfm_alg_blocksize(tfm) + 3) & ~3;
+ clen = (clen + 2 + blksize-1)&~(blksize-1);
+ if (esp->conf.padlen)
+ clen = (clen + esp->conf.padlen-1)&~(esp->conf.padlen-1);
+
+ if ((nfrags = skb_cow_data(skb, clen-skb->len+alen, &trailer)) < 0)
+ goto error;
+
+ /* Fill padding... */
+ do {
+ int i;
+ for (i=0; i<clen-skb->len - 2; i++)
+ *(u8*)(trailer->tail + i) = i+1;
+ } while (0);
+ *(u8*)(trailer->tail + clen-skb->len - 2) = (clen - skb->len)-2;
+ pskb_put(skb, trailer, clen - skb->len);
+
+ __skb_push(skb, skb->data - skb->nh.raw);
+ top_iph = skb->nh.iph;
+ esph = (struct ip_esp_hdr *)(skb->nh.raw + top_iph->ihl*4);
+ top_iph->tot_len = htons(skb->len + alen);
+ *(u8*)(trailer->tail - 1) = top_iph->protocol;
+
+ /* this is non-NULL only with UDP Encapsulation */
+ if (x->encap) {
+ struct xfrm_encap_tmpl *encap = x->encap;
+ struct udphdr *uh;
+ u32 *udpdata32;
+
+ uh = (struct udphdr *)esph;
+ uh->source = encap->encap_sport;
+ uh->dest = encap->encap_dport;
+ uh->len = htons(skb->len + alen - top_iph->ihl*4);
+ uh->check = 0;
+
+ switch (encap->encap_type) {
+ default:
+ case UDP_ENCAP_ESPINUDP:
+ esph = (struct ip_esp_hdr *)(uh + 1);
+ break;
+ case UDP_ENCAP_ESPINUDP_NON_IKE:
+ udpdata32 = (u32 *)(uh + 1);
+ udpdata32[0] = udpdata32[1] = 0;
+ esph = (struct ip_esp_hdr *)(udpdata32 + 2);
+ break;
+ }
+
+ top_iph->protocol = IPPROTO_UDP;
+ } else
+ top_iph->protocol = IPPROTO_ESP;
+
+ esph->spi = x->id.spi;
+ esph->seq_no = htonl(++x->replay.oseq);
+
+ if (esp->conf.ivlen)
+ crypto_cipher_set_iv(tfm, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm));
+
+ do {
+ struct scatterlist *sg = &esp->sgbuf[0];
+
+ if (unlikely(nfrags > ESP_NUM_FAST_SG)) {
+ sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC);
+ if (!sg)
+ goto error;
+ }
+ skb_to_sgvec(skb, sg, esph->enc_data+esp->conf.ivlen-skb->data, clen);
+ crypto_cipher_encrypt(tfm, sg, sg, clen);
+ if (unlikely(sg != &esp->sgbuf[0]))
+ kfree(sg);
+ } while (0);
+
+ if (esp->conf.ivlen) {
+ memcpy(esph->enc_data, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm));
+ crypto_cipher_get_iv(tfm, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm));
+ }
+
+ if (esp->auth.icv_full_len) {
+ esp->auth.icv(esp, skb, (u8*)esph-skb->data,
+ sizeof(struct ip_esp_hdr) + esp->conf.ivlen+clen, trailer->tail);
+ pskb_put(skb, trailer, alen);
+ }
+
+ ip_send_check(top_iph);
+
+ err = 0;
+
+error:
+ return err;
+}
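
A standalone sketch of the length arithmetic used above (illustrative only): the payload plus the two trailer bytes (pad length, next header) is rounded up to the cipher block size, which is itself first rounded up to a multiple of four.

#include <stdio.h>

static int esp_clen(int payload_len, int cipher_blksize)
{
        int blksize = (cipher_blksize + 3) & ~3;  /* as in esp_output() */

        return (payload_len + 2 + blksize - 1) & ~(blksize - 1);
}

int main(void)
{
        /* 1400-byte payload, 16-byte blocks (e.g. AES): padded to 1408 */
        printf("%d\n", esp_clen(1400, 16));
        /* 53-byte payload, 8-byte blocks (e.g. 3DES): padded to 56 */
        printf("%d\n", esp_clen(53, 8));
        return 0;
}
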
+
+/*
+ * Note: detecting truncated vs. non-truncated authentication data is very
+ * expensive, so we only support truncated data, which is the recommended
+ * and common case.
+ */
+static int esp_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb)
+{
+ struct iphdr *iph;
+ struct ip_esp_hdr *esph;
+ struct esp_data *esp = x->data;
+ struct sk_buff *trailer;
+ int blksize = crypto_tfm_alg_blocksize(esp->conf.tfm);
+ int alen = esp->auth.icv_trunc_len;
+ int elen = skb->len - sizeof(struct ip_esp_hdr) - esp->conf.ivlen - alen;
+ int nfrags;
+ int encap_len = 0;
+
+ if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr)))
+ goto out;
+
+ if (elen <= 0 || (elen & (blksize-1)))
+ goto out;
+
+ /* If integrity check is required, do this. */
+ if (esp->auth.icv_full_len) {
+ u8 sum[esp->auth.icv_full_len];
+ u8 sum1[alen];
+
+ esp->auth.icv(esp, skb, 0, skb->len-alen, sum);
+
+ if (skb_copy_bits(skb, skb->len-alen, sum1, alen))
+ BUG();
+
+ if (unlikely(memcmp(sum, sum1, alen))) {
+ x->stats.integrity_failed++;
+ goto out;
+ }
+ }
+
+ if ((nfrags = skb_cow_data(skb, 0, &trailer)) < 0)
+ goto out;
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ esph = (struct ip_esp_hdr*)skb->data;
+ iph = skb->nh.iph;
+
+ /* Get ivec. This can be wrong; check against other implementations. */
+ if (esp->conf.ivlen)
+ crypto_cipher_set_iv(esp->conf.tfm, esph->enc_data, crypto_tfm_alg_ivsize(esp->conf.tfm));
+
+ {
+ u8 nexthdr[2];
+ struct scatterlist *sg = &esp->sgbuf[0];
+ u8 workbuf[60];
+ int padlen;
+
+ if (unlikely(nfrags > ESP_NUM_FAST_SG)) {
+ sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC);
+ if (!sg)
+ goto out;
+ }
+ skb_to_sgvec(skb, sg, sizeof(struct ip_esp_hdr) + esp->conf.ivlen, elen);
+ crypto_cipher_decrypt(esp->conf.tfm, sg, sg, elen);
+ if (unlikely(sg != &esp->sgbuf[0]))
+ kfree(sg);
+
+ if (skb_copy_bits(skb, skb->len-alen-2, nexthdr, 2))
+ BUG();
+
+ padlen = nexthdr[0];
+ if (padlen+2 >= elen)
+ goto out;
+
+ /* ... check padding bits here. Silly. :-) */
+
+ if (x->encap && decap && decap->decap_type) {
+ struct esp_decap_data *encap_data;
+ struct udphdr *uh = (struct udphdr *) (iph+1);
+
+ encap_data = (struct esp_decap_data *) (decap->decap_data);
+ encap_data->proto = 0;
+
+ switch (decap->decap_type) {
+ case UDP_ENCAP_ESPINUDP:
+ case UDP_ENCAP_ESPINUDP_NON_IKE:
+ encap_data->proto = AF_INET;
+ encap_data->saddr.a4 = iph->saddr;
+ encap_data->sport = uh->source;
+ encap_len = (void*)esph - (void*)uh;
+ break;
+
+ default:
+ goto out;
+ }
+ }
+
+ iph->protocol = nexthdr[1];
+ pskb_trim(skb, skb->len - alen - padlen - 2);
+ memcpy(workbuf, skb->nh.raw, iph->ihl*4);
+ skb->h.raw = skb_pull(skb, sizeof(struct ip_esp_hdr) + esp->conf.ivlen);
+ skb->nh.raw += encap_len + sizeof(struct ip_esp_hdr) + esp->conf.ivlen;
+ memcpy(skb->nh.raw, workbuf, iph->ihl*4);
+ skb->nh.iph->tot_len = htons(skb->len);
+ }
+
+ return 0;
+
+out:
+ return -EINVAL;
+}
+
+static int esp_post_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb)
+{
+
+ if (x->encap) {
+ struct xfrm_encap_tmpl *encap;
+ struct esp_decap_data *decap_data;
+
+ encap = x->encap;
+ decap_data = (struct esp_decap_data *)(decap->decap_data);
+
+ /* first, make sure that the decap type == the encap type */
+ if (encap->encap_type != decap->decap_type)
+ return -EINVAL;
+
+ switch (encap->encap_type) {
+ default:
+ case UDP_ENCAP_ESPINUDP:
+ case UDP_ENCAP_ESPINUDP_NON_IKE:
+ /*
+ * 1) if the NAT-T peer's IP or port changed then
+ * advertise the change to the keying daemon.
+ * This is an inbound SA, so just compare
+ * SRC ports.
+ */
+ if (decap_data->proto == AF_INET &&
+ (decap_data->saddr.a4 != x->props.saddr.a4 ||
+ decap_data->sport != encap->encap_sport)) {
+ xfrm_address_t ipaddr;
+
+ ipaddr.a4 = decap_data->saddr.a4;
+ km_new_mapping(x, &ipaddr, decap_data->sport);
+
+ /* XXX: perhaps add an extra
+ * policy check here, to see
+ * if we should allow or
+ * reject a packet from a
+ * different source
+ * address/port.
+ */
+ }
+
+ /*
+ * 2) ignore UDP/TCP checksums in case
+ * of NAT-T in Transport Mode, or
+ * perform other post-processing fixes
+ * as per draft-ietf-ipsec-udp-encaps-06,
+ * section 3.1.2
+ */
+ if (!x->props.mode)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+ break;
+ }
+ }
+ return 0;
+}
+
+static u32 esp4_get_max_size(struct xfrm_state *x, int mtu)
+{
+ struct esp_data *esp = x->data;
+ u32 blksize = crypto_tfm_alg_blocksize(esp->conf.tfm);
+
+ if (x->props.mode) {
+ mtu = (mtu + 2 + blksize-1)&~(blksize-1);
+ } else {
+ /* The worst case. */
+ mtu += 2 + blksize;
+ }
+ if (esp->conf.padlen)
+ mtu = (mtu + esp->conf.padlen-1)&~(esp->conf.padlen-1);
+
+ return mtu + x->props.header_len + esp->auth.icv_trunc_len;
+}
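
A worked example of the tunnel-mode branch above (illustrative only; the 44-byte header_len assumes an 8-byte ESP header, a 16-byte IV and a 20-byte outer IPv4 header, with a 12-byte truncated ICV and no extra padlen):

#include <stdio.h>

static unsigned int esp_max_size_tunnel(int mtu, int blksize,
                                        int header_len, int icv_trunc_len)
{
        mtu = (mtu + 2 + blksize - 1) & ~(blksize - 1);
        return mtu + header_len + icv_trunc_len;
}

int main(void)
{
        /* 1400 -> 1408 after block rounding, plus 44 + 12 = 1464 */
        printf("%u\n", esp_max_size_tunnel(1400, 16, 44, 12));
        return 0;
}
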
+
+static void esp4_err(struct sk_buff *skb, u32 info)
+{
+ struct iphdr *iph = (struct iphdr*)skb->data;
+ struct ip_esp_hdr *esph = (struct ip_esp_hdr*)(skb->data+(iph->ihl<<2));
+ struct xfrm_state *x;
+
+ if (skb->h.icmph->type != ICMP_DEST_UNREACH ||
+ skb->h.icmph->code != ICMP_FRAG_NEEDED)
+ return;
+
+ x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET);
+ if (!x)
+ return;
+ NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
+ ntohl(esph->spi), ntohl(iph->daddr)));
+ xfrm_state_put(x);
+}
+
+static void esp_destroy(struct xfrm_state *x)
+{
+ struct esp_data *esp = x->data;
+
+ if (!esp)
+ return;
+
+ if (esp->conf.tfm) {
+ crypto_free_tfm(esp->conf.tfm);
+ esp->conf.tfm = NULL;
+ }
+ if (esp->conf.ivec) {
+ kfree(esp->conf.ivec);
+ esp->conf.ivec = NULL;
+ }
+ if (esp->auth.tfm) {
+ crypto_free_tfm(esp->auth.tfm);
+ esp->auth.tfm = NULL;
+ }
+ if (esp->auth.work_icv) {
+ kfree(esp->auth.work_icv);
+ esp->auth.work_icv = NULL;
+ }
+ kfree(esp);
+}
+
+static int esp_init_state(struct xfrm_state *x, void *args)
+{
+ struct esp_data *esp = NULL;
+
+ /* null auth and encryption can have zero length keys */
+ if (x->aalg) {
+ if (x->aalg->alg_key_len > 512)
+ goto error;
+ }
+ if (x->ealg == NULL)
+ goto error;
+
+ esp = kmalloc(sizeof(*esp), GFP_KERNEL);
+ if (esp == NULL)
+ return -ENOMEM;
+
+ memset(esp, 0, sizeof(*esp));
+
+ if (x->aalg) {
+ struct xfrm_algo_desc *aalg_desc;
+
+ esp->auth.key = x->aalg->alg_key;
+ esp->auth.key_len = (x->aalg->alg_key_len+7)/8;
+ esp->auth.tfm = crypto_alloc_tfm(x->aalg->alg_name, 0);
+ if (esp->auth.tfm == NULL)
+ goto error;
+ esp->auth.icv = esp_hmac_digest;
+
+ aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
+ BUG_ON(!aalg_desc);
+
+ if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
+ crypto_tfm_alg_digestsize(esp->auth.tfm)) {
+ NETDEBUG(printk(KERN_INFO "ESP: %s digestsize %u != %hu\n",
+ x->aalg->alg_name,
+ crypto_tfm_alg_digestsize(esp->auth.tfm),
+ aalg_desc->uinfo.auth.icv_fullbits/8));
+ goto error;
+ }
+
+ esp->auth.icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
+ esp->auth.icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8;
+
+ esp->auth.work_icv = kmalloc(esp->auth.icv_full_len, GFP_KERNEL);
+ if (!esp->auth.work_icv)
+ goto error;
+ }
+ esp->conf.key = x->ealg->alg_key;
+ esp->conf.key_len = (x->ealg->alg_key_len+7)/8;
+ if (x->props.ealgo == SADB_EALG_NULL)
+ esp->conf.tfm = crypto_alloc_tfm(x->ealg->alg_name, CRYPTO_TFM_MODE_ECB);
+ else
+ esp->conf.tfm = crypto_alloc_tfm(x->ealg->alg_name, CRYPTO_TFM_MODE_CBC);
+ if (esp->conf.tfm == NULL)
+ goto error;
+ esp->conf.ivlen = crypto_tfm_alg_ivsize(esp->conf.tfm);
+ esp->conf.padlen = 0;
+ if (esp->conf.ivlen) {
+ esp->conf.ivec = kmalloc(esp->conf.ivlen, GFP_KERNEL);
+ if (unlikely(esp->conf.ivec == NULL))
+ goto error;
+ get_random_bytes(esp->conf.ivec, esp->conf.ivlen);
+ }
+ if (crypto_cipher_setkey(esp->conf.tfm, esp->conf.key, esp->conf.key_len))
+ goto error;
+ x->props.header_len = sizeof(struct ip_esp_hdr) + esp->conf.ivlen;
+ if (x->props.mode)
+ x->props.header_len += sizeof(struct iphdr);
+ if (x->encap) {
+ struct xfrm_encap_tmpl *encap = x->encap;
+
+ switch (encap->encap_type) {
+ default:
+ goto error;
+ case UDP_ENCAP_ESPINUDP:
+ x->props.header_len += sizeof(struct udphdr);
+ break;
+ case UDP_ENCAP_ESPINUDP_NON_IKE:
+ x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32);
+ break;
+ }
+ }
+ x->data = esp;
+ x->props.trailer_len = esp4_get_max_size(x, 0) - x->props.header_len;
+ return 0;
+
+error:
+ x->data = esp;
+ esp_destroy(x);
+ x->data = NULL;
+ return -EINVAL;
+}
+
+static struct xfrm_type esp_type =
+{
+ .description = "ESP4",
+ .owner = THIS_MODULE,
+ .proto = IPPROTO_ESP,
+ .init_state = esp_init_state,
+ .destructor = esp_destroy,
+ .get_max_size = esp4_get_max_size,
+ .input = esp_input,
+ .post_input = esp_post_input,
+ .output = esp_output
+};
+
+static struct net_protocol esp4_protocol = {
+ .handler = xfrm4_rcv,
+ .err_handler = esp4_err,
+ .no_policy = 1,
+};
+
+static int __init esp4_init(void)
+{
+ struct xfrm_decap_state decap;
+
+ if (sizeof(struct esp_decap_data) <
+ sizeof(decap.decap_data)) {
+ extern void decap_data_too_small(void);
+
+ decap_data_too_small();
+ }
+
+ if (xfrm_register_type(&esp_type, AF_INET) < 0) {
+ printk(KERN_INFO "ip esp init: can't add xfrm type\n");
+ return -EAGAIN;
+ }
+ if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) < 0) {
+ printk(KERN_INFO "ip esp init: can't add protocol\n");
+ xfrm_unregister_type(&esp_type, AF_INET);
+ return -EAGAIN;
+ }
+ return 0;
+}
+
+static void __exit esp4_fini(void)
+{
+ if (inet_del_protocol(&esp4_protocol, IPPROTO_ESP) < 0)
+ printk(KERN_INFO "ip esp close: can't remove protocol\n");
+ if (xfrm_unregister_type(&esp_type, AF_INET) < 0)
+ printk(KERN_INFO "ip esp close: can't remove xfrm type\n");
+}
+
+module_init(esp4_init);
+module_exit(esp4_fini);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
new file mode 100644
index 00000000000..563e7d61270
--- /dev/null
+++ b/net/ipv4/fib_frontend.c
@@ -0,0 +1,611 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * IPv4 Forwarding Information Base: FIB frontend.
+ *
+ * Version: $Id: fib_frontend.c,v 1.26 2001/10/31 21:55:54 davem Exp $
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/init.h>
+
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/icmp.h>
+#include <net/arp.h>
+#include <net/ip_fib.h>
+
+#define FFprint(a...) printk(KERN_DEBUG a)
+
+#ifndef CONFIG_IP_MULTIPLE_TABLES
+
+#define RT_TABLE_MIN RT_TABLE_MAIN
+
+struct fib_table *ip_fib_local_table;
+struct fib_table *ip_fib_main_table;
+
+#else
+
+#define RT_TABLE_MIN 1
+
+struct fib_table *fib_tables[RT_TABLE_MAX+1];
+
+struct fib_table *__fib_new_table(int id)
+{
+ struct fib_table *tb;
+
+ tb = fib_hash_init(id);
+ if (!tb)
+ return NULL;
+ fib_tables[id] = tb;
+ return tb;
+}
+
+
+#endif /* CONFIG_IP_MULTIPLE_TABLES */
+
+
+static void fib_flush(void)
+{
+ int flushed = 0;
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ struct fib_table *tb;
+ int id;
+
+ for (id = RT_TABLE_MAX; id>0; id--) {
+ if ((tb = fib_get_table(id))==NULL)
+ continue;
+ flushed += tb->tb_flush(tb);
+ }
+#else /* CONFIG_IP_MULTIPLE_TABLES */
+ flushed += ip_fib_main_table->tb_flush(ip_fib_main_table);
+ flushed += ip_fib_local_table->tb_flush(ip_fib_local_table);
+#endif /* CONFIG_IP_MULTIPLE_TABLES */
+
+ if (flushed)
+ rt_cache_flush(-1);
+}
+
+/*
+ * Find the first device with a given source address.
+ */
+
+struct net_device * ip_dev_find(u32 addr)
+{
+ struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
+ struct fib_result res;
+ struct net_device *dev = NULL;
+
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ res.r = NULL;
+#endif
+
+ if (!ip_fib_local_table ||
+ ip_fib_local_table->tb_lookup(ip_fib_local_table, &fl, &res))
+ return NULL;
+ if (res.type != RTN_LOCAL)
+ goto out;
+ dev = FIB_RES_DEV(res);
+
+ if (dev)
+ dev_hold(dev);
+out:
+ fib_res_put(&res);
+ return dev;
+}
+
+unsigned inet_addr_type(u32 addr)
+{
+ struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
+ struct fib_result res;
+ unsigned ret = RTN_BROADCAST;
+
+ if (ZERONET(addr) || BADCLASS(addr))
+ return RTN_BROADCAST;
+ if (MULTICAST(addr))
+ return RTN_MULTICAST;
+
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ res.r = NULL;
+#endif
+
+ if (ip_fib_local_table) {
+ ret = RTN_UNICAST;
+ if (!ip_fib_local_table->tb_lookup(ip_fib_local_table,
+ &fl, &res)) {
+ ret = res.type;
+ fib_res_put(&res);
+ }
+ }
+ return ret;
+}
+
+/* Given (packet source, input interface) and optional (dst, oif, tos):
+ - (main) check that the source is valid, i.e. not a broadcast or one of
+ our local addresses.
+ - figure out which "logical" interface this packet arrived on
+ and calculate the "specific destination" address.
+ - check that the packet arrived from the expected physical interface.
+ */
+
+int fib_validate_source(u32 src, u32 dst, u8 tos, int oif,
+ struct net_device *dev, u32 *spec_dst, u32 *itag)
+{
+ struct in_device *in_dev;
+ struct flowi fl = { .nl_u = { .ip4_u =
+ { .daddr = src,
+ .saddr = dst,
+ .tos = tos } },
+ .iif = oif };
+ struct fib_result res;
+ int no_addr, rpf;
+ int ret;
+
+ no_addr = rpf = 0;
+ rcu_read_lock();
+ in_dev = __in_dev_get(dev);
+ if (in_dev) {
+ no_addr = in_dev->ifa_list == NULL;
+ rpf = IN_DEV_RPFILTER(in_dev);
+ }
+ rcu_read_unlock();
+
+ if (in_dev == NULL)
+ goto e_inval;
+
+ if (fib_lookup(&fl, &res))
+ goto last_resort;
+ if (res.type != RTN_UNICAST)
+ goto e_inval_res;
+ *spec_dst = FIB_RES_PREFSRC(res);
+ fib_combine_itag(itag, &res);
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1)
+#else
+ if (FIB_RES_DEV(res) == dev)
+#endif
+ {
+ ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
+ fib_res_put(&res);
+ return ret;
+ }
+ fib_res_put(&res);
+ if (no_addr)
+ goto last_resort;
+ if (rpf)
+ goto e_inval;
+ fl.oif = dev->ifindex;
+
+ ret = 0;
+ if (fib_lookup(&fl, &res) == 0) {
+ if (res.type == RTN_UNICAST) {
+ *spec_dst = FIB_RES_PREFSRC(res);
+ ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
+ }
+ fib_res_put(&res);
+ }
+ return ret;
+
+last_resort:
+ if (rpf)
+ goto e_inval;
+ *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
+ *itag = 0;
+ return 0;
+
+e_inval_res:
+ fib_res_put(&res);
+e_inval:
+ return -EINVAL;
+}
+
+#ifndef CONFIG_IP_NOSIOCRT
+
+/*
+ * Handle IP routing ioctl calls. These are used to manipulate the routing tables
+ */
+
+int ip_rt_ioctl(unsigned int cmd, void __user *arg)
+{
+ int err;
+ struct kern_rta rta;
+ struct rtentry r;
+ struct {
+ struct nlmsghdr nlh;
+ struct rtmsg rtm;
+ } req;
+
+ switch (cmd) {
+ case SIOCADDRT: /* Add a route */
+ case SIOCDELRT: /* Delete a route */
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ if (copy_from_user(&r, arg, sizeof(struct rtentry)))
+ return -EFAULT;
+ rtnl_lock();
+ err = fib_convert_rtentry(cmd, &req.nlh, &req.rtm, &rta, &r);
+ if (err == 0) {
+ if (cmd == SIOCDELRT) {
+ struct fib_table *tb = fib_get_table(req.rtm.rtm_table);
+ err = -ESRCH;
+ if (tb)
+ err = tb->tb_delete(tb, &req.rtm, &rta, &req.nlh, NULL);
+ } else {
+ struct fib_table *tb = fib_new_table(req.rtm.rtm_table);
+ err = -ENOBUFS;
+ if (tb)
+ err = tb->tb_insert(tb, &req.rtm, &rta, &req.nlh, NULL);
+ }
+ if (rta.rta_mx)
+ kfree(rta.rta_mx);
+ }
+ rtnl_unlock();
+ return err;
+ }
+ return -EINVAL;
+}
+
+#else
+
+int ip_rt_ioctl(unsigned int cmd, void *arg)
+{
+ return -EINVAL;
+}
+
+#endif
+
+static int inet_check_attr(struct rtmsg *r, struct rtattr **rta)
+{
+ int i;
+
+ for (i=1; i<=RTA_MAX; i++) {
+ struct rtattr *attr = rta[i-1];
+ if (attr) {
+ if (RTA_PAYLOAD(attr) < 4)
+ return -EINVAL;
+ if (i != RTA_MULTIPATH && i != RTA_METRICS)
+ rta[i-1] = (struct rtattr*)RTA_DATA(attr);
+ }
+ }
+ return 0;
+}
+
+int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+{
+ struct fib_table * tb;
+ struct rtattr **rta = arg;
+ struct rtmsg *r = NLMSG_DATA(nlh);
+
+ if (inet_check_attr(r, rta))
+ return -EINVAL;
+
+ tb = fib_get_table(r->rtm_table);
+ if (tb)
+ return tb->tb_delete(tb, r, (struct kern_rta*)rta, nlh, &NETLINK_CB(skb));
+ return -ESRCH;
+}
+
+int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+{
+ struct fib_table * tb;
+ struct rtattr **rta = arg;
+ struct rtmsg *r = NLMSG_DATA(nlh);
+
+ if (inet_check_attr(r, rta))
+ return -EINVAL;
+
+ tb = fib_new_table(r->rtm_table);
+ if (tb)
+ return tb->tb_insert(tb, r, (struct kern_rta*)rta, nlh, &NETLINK_CB(skb));
+ return -ENOBUFS;
+}
+
+int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int t;
+ int s_t;
+ struct fib_table *tb;
+
+ if (NLMSG_PAYLOAD(cb->nlh, 0) >= sizeof(struct rtmsg) &&
+ ((struct rtmsg*)NLMSG_DATA(cb->nlh))->rtm_flags&RTM_F_CLONED)
+ return ip_rt_dump(skb, cb);
+
+ s_t = cb->args[0];
+ if (s_t == 0)
+ s_t = cb->args[0] = RT_TABLE_MIN;
+
+ for (t=s_t; t<=RT_TABLE_MAX; t++) {
+ if (t < s_t) continue;
+ if (t > s_t)
+ memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
+ if ((tb = fib_get_table(t))==NULL)
+ continue;
+ if (tb->tb_dump(tb, skb, cb) < 0)
+ break;
+ }
+
+ cb->args[0] = t;
+
+ return skb->len;
+}
+
+/* Prepare and feed intra-kernel routing request.
+ Really, this should be a netlink message, but :-( netlink
+ may not be configured, so we feed the request directly
+ to the fib engine. This is legal because all events occur
+ only while netlink is already locked.
+ */
+
+static void fib_magic(int cmd, int type, u32 dst, int dst_len, struct in_ifaddr *ifa)
+{
+ struct fib_table * tb;
+ struct {
+ struct nlmsghdr nlh;
+ struct rtmsg rtm;
+ } req;
+ struct kern_rta rta;
+
+ memset(&req.rtm, 0, sizeof(req.rtm));
+ memset(&rta, 0, sizeof(rta));
+
+ if (type == RTN_UNICAST)
+ tb = fib_new_table(RT_TABLE_MAIN);
+ else
+ tb = fib_new_table(RT_TABLE_LOCAL);
+
+ if (tb == NULL)
+ return;
+
+ req.nlh.nlmsg_len = sizeof(req);
+ req.nlh.nlmsg_type = cmd;
+ req.nlh.nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE|NLM_F_APPEND;
+ req.nlh.nlmsg_pid = 0;
+ req.nlh.nlmsg_seq = 0;
+
+ req.rtm.rtm_dst_len = dst_len;
+ req.rtm.rtm_table = tb->tb_id;
+ req.rtm.rtm_protocol = RTPROT_KERNEL;
+ req.rtm.rtm_scope = (type != RTN_LOCAL ? RT_SCOPE_LINK : RT_SCOPE_HOST);
+ req.rtm.rtm_type = type;
+
+ rta.rta_dst = &dst;
+ rta.rta_prefsrc = &ifa->ifa_local;
+ rta.rta_oif = &ifa->ifa_dev->dev->ifindex;
+
+ if (cmd == RTM_NEWROUTE)
+ tb->tb_insert(tb, &req.rtm, &rta, &req.nlh, NULL);
+ else
+ tb->tb_delete(tb, &req.rtm, &rta, &req.nlh, NULL);
+}
+
+static void fib_add_ifaddr(struct in_ifaddr *ifa)
+{
+ struct in_device *in_dev = ifa->ifa_dev;
+ struct net_device *dev = in_dev->dev;
+ struct in_ifaddr *prim = ifa;
+ u32 mask = ifa->ifa_mask;
+ u32 addr = ifa->ifa_local;
+ u32 prefix = ifa->ifa_address&mask;
+
+ if (ifa->ifa_flags&IFA_F_SECONDARY) {
+ prim = inet_ifa_byprefix(in_dev, prefix, mask);
+ if (prim == NULL) {
+ printk(KERN_DEBUG "fib_add_ifaddr: bug: prim == NULL\n");
+ return;
+ }
+ }
+
+ fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
+
+ if (!(dev->flags&IFF_UP))
+ return;
+
+ /* Add broadcast address, if it is explicitly assigned. */
+ if (ifa->ifa_broadcast && ifa->ifa_broadcast != 0xFFFFFFFF)
+ fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
+
+ if (!ZERONET(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) &&
+ (prefix != addr || ifa->ifa_prefixlen < 32)) {
+ fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
+ RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim);
+
+ /* Add network-specific broadcasts, when it makes sense */
+ if (ifa->ifa_prefixlen < 31) {
+ fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
+ fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix|~mask, 32, prim);
+ }
+ }
+}
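
A standalone sketch of the address arithmetic fib_add_ifaddr() performs for a primary address; 192.168.1.10/24 is an assumed example, and the printed lines correspond to the RTN_LOCAL, RTN_UNICAST and RTN_BROADCAST entries added above:

#include <stdio.h>
#include <arpa/inet.h>

int main(void)
{
        struct in_addr a;
        unsigned int addr = inet_addr("192.168.1.10");   /* network byte order */
        unsigned int mask = inet_addr("255.255.255.0");
        unsigned int prefix = addr & mask;
        unsigned int brd = prefix | ~mask;

        a.s_addr = addr;   printf("local     %s/32\n", inet_ntoa(a));
        a.s_addr = prefix; printf("unicast   %s/24\n", inet_ntoa(a));
        a.s_addr = prefix; printf("broadcast %s/32\n", inet_ntoa(a));
        a.s_addr = brd;    printf("broadcast %s/32\n", inet_ntoa(a));
        return 0;
}
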
+
+static void fib_del_ifaddr(struct in_ifaddr *ifa)
+{
+ struct in_device *in_dev = ifa->ifa_dev;
+ struct net_device *dev = in_dev->dev;
+ struct in_ifaddr *ifa1;
+ struct in_ifaddr *prim = ifa;
+ u32 brd = ifa->ifa_address|~ifa->ifa_mask;
+ u32 any = ifa->ifa_address&ifa->ifa_mask;
+#define LOCAL_OK 1
+#define BRD_OK 2
+#define BRD0_OK 4
+#define BRD1_OK 8
+ unsigned ok = 0;
+
+ if (!(ifa->ifa_flags&IFA_F_SECONDARY))
+ fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
+ RTN_UNICAST, any, ifa->ifa_prefixlen, prim);
+ else {
+ prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
+ if (prim == NULL) {
+ printk(KERN_DEBUG "fib_del_ifaddr: bug: prim == NULL\n");
+ return;
+ }
+ }
+
+ /* Deletion is more complicated than addition.
+ We must take care not to delete too much :-)
+
+ Scan the address list to be sure the addresses are really gone.
+ */
+
+ for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
+ if (ifa->ifa_local == ifa1->ifa_local)
+ ok |= LOCAL_OK;
+ if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
+ ok |= BRD_OK;
+ if (brd == ifa1->ifa_broadcast)
+ ok |= BRD1_OK;
+ if (any == ifa1->ifa_broadcast)
+ ok |= BRD0_OK;
+ }
+
+ if (!(ok&BRD_OK))
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
+ if (!(ok&BRD1_OK))
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
+ if (!(ok&BRD0_OK))
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
+ if (!(ok&LOCAL_OK)) {
+ fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
+
+ /* Check, that this local address finally disappeared. */
+ if (inet_addr_type(ifa->ifa_local) != RTN_LOCAL) {
+ /* And last, but not least:
+ we must flush stray FIB entries.
+
+ First we scan the fib_info list for stray
+ nexthop entries, then trigger fib_flush.
+ */
+ if (fib_sync_down(ifa->ifa_local, NULL, 0))
+ fib_flush();
+ }
+ }
+#undef LOCAL_OK
+#undef BRD_OK
+#undef BRD0_OK
+#undef BRD1_OK
+}
+
+static void fib_disable_ip(struct net_device *dev, int force)
+{
+ if (fib_sync_down(0, dev, force))
+ fib_flush();
+ rt_cache_flush(0);
+ arp_ifdown(dev);
+}
+
+static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct in_ifaddr *ifa = (struct in_ifaddr*)ptr;
+
+ switch (event) {
+ case NETDEV_UP:
+ fib_add_ifaddr(ifa);
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ fib_sync_up(ifa->ifa_dev->dev);
+#endif
+ rt_cache_flush(-1);
+ break;
+ case NETDEV_DOWN:
+ fib_del_ifaddr(ifa);
+ if (ifa->ifa_dev && ifa->ifa_dev->ifa_list == NULL) {
+ /* Last address was deleted from this interface.
+ Disable IP.
+ */
+ fib_disable_ip(ifa->ifa_dev->dev, 1);
+ } else {
+ rt_cache_flush(-1);
+ }
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct net_device *dev = ptr;
+ struct in_device *in_dev = __in_dev_get(dev);
+
+ if (event == NETDEV_UNREGISTER) {
+ fib_disable_ip(dev, 2);
+ return NOTIFY_DONE;
+ }
+
+ if (!in_dev)
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_UP:
+ for_ifa(in_dev) {
+ fib_add_ifaddr(ifa);
+ } endfor_ifa(in_dev);
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ fib_sync_up(dev);
+#endif
+ rt_cache_flush(-1);
+ break;
+ case NETDEV_DOWN:
+ fib_disable_ip(dev, 0);
+ break;
+ case NETDEV_CHANGEMTU:
+ case NETDEV_CHANGE:
+ rt_cache_flush(0);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block fib_inetaddr_notifier = {
+ .notifier_call =fib_inetaddr_event,
+};
+
+static struct notifier_block fib_netdev_notifier = {
+ .notifier_call =fib_netdev_event,
+};
+
+void __init ip_fib_init(void)
+{
+#ifndef CONFIG_IP_MULTIPLE_TABLES
+ ip_fib_local_table = fib_hash_init(RT_TABLE_LOCAL);
+ ip_fib_main_table = fib_hash_init(RT_TABLE_MAIN);
+#else
+ fib_rules_init();
+#endif
+
+ register_netdevice_notifier(&fib_netdev_notifier);
+ register_inetaddr_notifier(&fib_inetaddr_notifier);
+}
+
+EXPORT_SYMBOL(inet_addr_type);
+EXPORT_SYMBOL(ip_dev_find);
+EXPORT_SYMBOL(ip_rt_ioctl);
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
new file mode 100644
index 00000000000..6506dcc01b4
--- /dev/null
+++ b/net/ipv4/fib_hash.c
@@ -0,0 +1,1086 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * IPv4 FIB: lookup engine and maintenance routines.
+ *
+ * Version: $Id: fib_hash.c,v 1.13 2001/10/31 21:55:54 davem Exp $
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/init.h>
+
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/ip_fib.h>
+
+#include "fib_lookup.h"
+
+static kmem_cache_t *fn_hash_kmem;
+static kmem_cache_t *fn_alias_kmem;
+
+struct fib_node {
+ struct hlist_node fn_hash;
+ struct list_head fn_alias;
+ u32 fn_key;
+};
+
+struct fn_zone {
+ struct fn_zone *fz_next; /* Next non-empty zone */
+ struct hlist_head *fz_hash; /* Hash table pointer */
+ int fz_nent; /* Number of entries */
+
+ int fz_divisor; /* Hash divisor */
+ u32 fz_hashmask; /* (fz_divisor - 1) */
+#define FZ_HASHMASK(fz) ((fz)->fz_hashmask)
+
+ int fz_order; /* Zone order */
+ u32 fz_mask;
+#define FZ_MASK(fz) ((fz)->fz_mask)
+};
+
+/* NOTE. On fast computers evaluation of fz_hashmask and fz_mask
+ * can be cheaper than memory lookup, so that FZ_* macros are used.
+ */
+
+struct fn_hash {
+ struct fn_zone *fn_zones[33];
+ struct fn_zone *fn_zone_list;
+};
+
+static inline u32 fn_hash(u32 key, struct fn_zone *fz)
+{
+ u32 h = ntohl(key)>>(32 - fz->fz_order);
+ h ^= (h>>20);
+ h ^= (h>>10);
+ h ^= (h>>5);
+ h &= FZ_HASHMASK(fz);
+ return h;
+}
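
A standalone copy of the fold above for a quick check (illustrative only): hash the /24 key 192.168.1.0 into the initial 16-bucket table of a 24-bit zone.

#include <stdio.h>
#include <arpa/inet.h>

static unsigned int fold_hash(unsigned int key, int fz_order, unsigned int divisor)
{
        unsigned int h = ntohl(key) >> (32 - fz_order);  /* key is in network order */

        h ^= (h >> 20);
        h ^= (h >> 10);
        h ^= (h >> 5);
        return h & (divisor - 1);                        /* FZ_HASHMASK(fz) */
}

int main(void)
{
        printf("bucket %u\n", fold_hash(inet_addr("192.168.1.0"), 24, 16));
        return 0;
}
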
+
+static inline u32 fz_key(u32 dst, struct fn_zone *fz)
+{
+ return dst & FZ_MASK(fz);
+}
+
+static DEFINE_RWLOCK(fib_hash_lock);
+static unsigned int fib_hash_genid;
+
+#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head))
+
+static struct hlist_head *fz_hash_alloc(int divisor)
+{
+ unsigned long size = divisor * sizeof(struct hlist_head);
+
+ if (size <= PAGE_SIZE) {
+ return kmalloc(size, GFP_KERNEL);
+ } else {
+ return (struct hlist_head *)
+ __get_free_pages(GFP_KERNEL, get_order(size));
+ }
+}
+
+/* The fib hash lock must be held when this is called. */
+static inline void fn_rebuild_zone(struct fn_zone *fz,
+ struct hlist_head *old_ht,
+ int old_divisor)
+{
+ int i;
+
+ for (i = 0; i < old_divisor; i++) {
+ struct hlist_node *node, *n;
+ struct fib_node *f;
+
+ hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) {
+ struct hlist_head *new_head;
+
+ hlist_del(&f->fn_hash);
+
+ new_head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
+ hlist_add_head(&f->fn_hash, new_head);
+ }
+ }
+}
+
+static void fz_hash_free(struct hlist_head *hash, int divisor)
+{
+ unsigned long size = divisor * sizeof(struct hlist_head);
+
+ if (size <= PAGE_SIZE)
+ kfree(hash);
+ else
+ free_pages((unsigned long)hash, get_order(size));
+}
+
+static void fn_rehash_zone(struct fn_zone *fz)
+{
+ struct hlist_head *ht, *old_ht;
+ int old_divisor, new_divisor;
+ u32 new_hashmask;
+
+ old_divisor = fz->fz_divisor;
+
+ switch (old_divisor) {
+ case 16:
+ new_divisor = 256;
+ break;
+ case 256:
+ new_divisor = 1024;
+ break;
+ default:
+ if ((old_divisor << 1) > FZ_MAX_DIVISOR) {
+ printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor);
+ return;
+ }
+ new_divisor = (old_divisor << 1);
+ break;
+ }
+
+ new_hashmask = (new_divisor - 1);
+
+#if RT_CACHE_DEBUG >= 2
+ printk("fn_rehash_zone: hash for zone %d grows from %d\n", fz->fz_order, old_divisor);
+#endif
+
+ ht = fz_hash_alloc(new_divisor);
+
+ if (ht) {
+ memset(ht, 0, new_divisor * sizeof(struct hlist_head));
+
+ write_lock_bh(&fib_hash_lock);
+ old_ht = fz->fz_hash;
+ fz->fz_hash = ht;
+ fz->fz_hashmask = new_hashmask;
+ fz->fz_divisor = new_divisor;
+ fn_rebuild_zone(fz, old_ht, old_divisor);
+ fib_hash_genid++;
+ write_unlock_bh(&fib_hash_lock);
+
+ fz_hash_free(old_ht, old_divisor);
+ }
+}
+
+static inline void fn_free_node(struct fib_node * f)
+{
+ kmem_cache_free(fn_hash_kmem, f);
+}
+
+static inline void fn_free_alias(struct fib_alias *fa)
+{
+ fib_release_info(fa->fa_info);
+ kmem_cache_free(fn_alias_kmem, fa);
+}
+
+static struct fn_zone *
+fn_new_zone(struct fn_hash *table, int z)
+{
+ int i;
+ struct fn_zone *fz = kmalloc(sizeof(struct fn_zone), GFP_KERNEL);
+ if (!fz)
+ return NULL;
+
+ memset(fz, 0, sizeof(struct fn_zone));
+ if (z) {
+ fz->fz_divisor = 16;
+ } else {
+ fz->fz_divisor = 1;
+ }
+ fz->fz_hashmask = (fz->fz_divisor - 1);
+ fz->fz_hash = fz_hash_alloc(fz->fz_divisor);
+ if (!fz->fz_hash) {
+ kfree(fz);
+ return NULL;
+ }
+ memset(fz->fz_hash, 0, fz->fz_divisor * sizeof(struct hlist_head *));
+ fz->fz_order = z;
+ fz->fz_mask = inet_make_mask(z);
+
+ /* Find the first non-empty zone with a more specific mask */
+ for (i=z+1; i<=32; i++)
+ if (table->fn_zones[i])
+ break;
+ write_lock_bh(&fib_hash_lock);
+ if (i>32) {
+ /* No more specific masks, we are the first. */
+ fz->fz_next = table->fn_zone_list;
+ table->fn_zone_list = fz;
+ } else {
+ fz->fz_next = table->fn_zones[i]->fz_next;
+ table->fn_zones[i]->fz_next = fz;
+ }
+ table->fn_zones[z] = fz;
+ fib_hash_genid++;
+ write_unlock_bh(&fib_hash_lock);
+ return fz;
+}
+
+static int
+fn_hash_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
+{
+ int err;
+ struct fn_zone *fz;
+ struct fn_hash *t = (struct fn_hash*)tb->tb_data;
+
+ read_lock(&fib_hash_lock);
+ for (fz = t->fn_zone_list; fz; fz = fz->fz_next) {
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct fib_node *f;
+ u32 k = fz_key(flp->fl4_dst, fz);
+
+ head = &fz->fz_hash[fn_hash(k, fz)];
+ hlist_for_each_entry(f, node, head, fn_hash) {
+ if (f->fn_key != k)
+ continue;
+
+ err = fib_semantic_match(&f->fn_alias,
+ flp, res,
+ f->fn_key, fz->fz_mask,
+ fz->fz_order);
+ if (err <= 0)
+ goto out;
+ }
+ }
+ err = 1;
+out:
+ read_unlock(&fib_hash_lock);
+ return err;
+}
+
+static int fn_hash_last_dflt=-1;
+
+static void
+fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
+{
+ int order, last_idx;
+ struct hlist_node *node;
+ struct fib_node *f;
+ struct fib_info *fi = NULL;
+ struct fib_info *last_resort;
+ struct fn_hash *t = (struct fn_hash*)tb->tb_data;
+ struct fn_zone *fz = t->fn_zones[0];
+
+ if (fz == NULL)
+ return;
+
+ last_idx = -1;
+ last_resort = NULL;
+ order = -1;
+
+ read_lock(&fib_hash_lock);
+ hlist_for_each_entry(f, node, &fz->fz_hash[0], fn_hash) {
+ struct fib_alias *fa;
+
+ list_for_each_entry(fa, &f->fn_alias, fa_list) {
+ struct fib_info *next_fi = fa->fa_info;
+
+ if (fa->fa_scope != res->scope ||
+ fa->fa_type != RTN_UNICAST)
+ continue;
+
+ if (next_fi->fib_priority > res->fi->fib_priority)
+ break;
+ if (!next_fi->fib_nh[0].nh_gw ||
+ next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
+ continue;
+ fa->fa_state |= FA_S_ACCESSED;
+
+ if (fi == NULL) {
+ if (next_fi != res->fi)
+ break;
+ } else if (!fib_detect_death(fi, order, &last_resort,
+ &last_idx, &fn_hash_last_dflt)) {
+ if (res->fi)
+ fib_info_put(res->fi);
+ res->fi = fi;
+ atomic_inc(&fi->fib_clntref);
+ fn_hash_last_dflt = order;
+ goto out;
+ }
+ fi = next_fi;
+ order++;
+ }
+ }
+
+ if (order <= 0 || fi == NULL) {
+ fn_hash_last_dflt = -1;
+ goto out;
+ }
+
+ if (!fib_detect_death(fi, order, &last_resort, &last_idx, &fn_hash_last_dflt)) {
+ if (res->fi)
+ fib_info_put(res->fi);
+ res->fi = fi;
+ atomic_inc(&fi->fib_clntref);
+ fn_hash_last_dflt = order;
+ goto out;
+ }
+
+ if (last_idx >= 0) {
+ if (res->fi)
+ fib_info_put(res->fi);
+ res->fi = last_resort;
+ if (last_resort)
+ atomic_inc(&last_resort->fib_clntref);
+ }
+ fn_hash_last_dflt = last_idx;
+out:
+ read_unlock(&fib_hash_lock);
+}
+
+/* Insert node F into FZ. */
+static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
+{
+ struct hlist_head *head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
+
+ hlist_add_head(&f->fn_hash, head);
+}
+
+/* Return the node in FZ matching KEY. */
+static struct fib_node *fib_find_node(struct fn_zone *fz, u32 key)
+{
+ struct hlist_head *head = &fz->fz_hash[fn_hash(key, fz)];
+ struct hlist_node *node;
+ struct fib_node *f;
+
+ hlist_for_each_entry(f, node, head, fn_hash) {
+ if (f->fn_key == key)
+ return f;
+ }
+
+ return NULL;
+}
+
+static int
+fn_hash_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
+ struct nlmsghdr *n, struct netlink_skb_parms *req)
+{
+ struct fn_hash *table = (struct fn_hash *) tb->tb_data;
+ struct fib_node *new_f, *f;
+ struct fib_alias *fa, *new_fa;
+ struct fn_zone *fz;
+ struct fib_info *fi;
+ int z = r->rtm_dst_len;
+ int type = r->rtm_type;
+ u8 tos = r->rtm_tos;
+ u32 key;
+ int err;
+
+ if (z > 32)
+ return -EINVAL;
+ fz = table->fn_zones[z];
+ if (!fz && !(fz = fn_new_zone(table, z)))
+ return -ENOBUFS;
+
+ key = 0;
+ if (rta->rta_dst) {
+ u32 dst;
+ memcpy(&dst, rta->rta_dst, 4);
+ if (dst & ~FZ_MASK(fz))
+ return -EINVAL;
+ key = fz_key(dst, fz);
+ }
+
+ if ((fi = fib_create_info(r, rta, n, &err)) == NULL)
+ return err;
+
+ if (fz->fz_nent > (fz->fz_divisor<<1) &&
+ fz->fz_divisor < FZ_MAX_DIVISOR &&
+ (z==32 || (1<<z) > fz->fz_divisor))
+ fn_rehash_zone(fz);
+
+ f = fib_find_node(fz, key);
+
+ if (!f)
+ fa = NULL;
+ else
+ fa = fib_find_alias(&f->fn_alias, tos, fi->fib_priority);
+
+ /* Now fa, if non-NULL, points either to the first fib alias
+ * with the same keys [prefix,tos,priority], if such a key already
+ * exists, or to the node before which we will insert the new one.
+ *
+ * If fa is NULL, we will need to allocate a new alias and
+ * insert it at the head of f.
+ *
+ * If f is NULL, no fib node matched the destination key
+ * and we need to allocate a new one of those as well.
+ */
+
+ if (fa && fa->fa_tos == tos &&
+ fa->fa_info->fib_priority == fi->fib_priority) {
+ struct fib_alias *fa_orig;
+
+ err = -EEXIST;
+ if (n->nlmsg_flags & NLM_F_EXCL)
+ goto out;
+
+ if (n->nlmsg_flags & NLM_F_REPLACE) {
+ struct fib_info *fi_drop;
+ u8 state;
+
+ write_lock_bh(&fib_hash_lock);
+ fi_drop = fa->fa_info;
+ fa->fa_info = fi;
+ fa->fa_type = type;
+ fa->fa_scope = r->rtm_scope;
+ state = fa->fa_state;
+ fa->fa_state &= ~FA_S_ACCESSED;
+ fib_hash_genid++;
+ write_unlock_bh(&fib_hash_lock);
+
+ fib_release_info(fi_drop);
+ if (state & FA_S_ACCESSED)
+ rt_cache_flush(-1);
+ return 0;
+ }
+
+ /* Error if we find a perfect match which
+ * uses the same scope, type, and nexthop
+ * information.
+ */
+ fa_orig = fa;
+ fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
+ list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
+ if (fa->fa_tos != tos)
+ break;
+ if (fa->fa_info->fib_priority != fi->fib_priority)
+ break;
+ if (fa->fa_type == type &&
+ fa->fa_scope == r->rtm_scope &&
+ fa->fa_info == fi)
+ goto out;
+ }
+ if (!(n->nlmsg_flags & NLM_F_APPEND))
+ fa = fa_orig;
+ }
+
+ err = -ENOENT;
+ if (!(n->nlmsg_flags&NLM_F_CREATE))
+ goto out;
+
+ err = -ENOBUFS;
+ new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
+ if (new_fa == NULL)
+ goto out;
+
+ new_f = NULL;
+ if (!f) {
+ new_f = kmem_cache_alloc(fn_hash_kmem, SLAB_KERNEL);
+ if (new_f == NULL)
+ goto out_free_new_fa;
+
+ INIT_HLIST_NODE(&new_f->fn_hash);
+ INIT_LIST_HEAD(&new_f->fn_alias);
+ new_f->fn_key = key;
+ f = new_f;
+ }
+
+ new_fa->fa_info = fi;
+ new_fa->fa_tos = tos;
+ new_fa->fa_type = type;
+ new_fa->fa_scope = r->rtm_scope;
+ new_fa->fa_state = 0;
+
+ /*
+ * Insert the new entry into the list.
+ */
+
+ write_lock_bh(&fib_hash_lock);
+ if (new_f)
+ fib_insert_node(fz, new_f);
+ list_add_tail(&new_fa->fa_list,
+ (fa ? &fa->fa_list : &f->fn_alias));
+ fib_hash_genid++;
+ write_unlock_bh(&fib_hash_lock);
+
+ if (new_f)
+ fz->fz_nent++;
+ rt_cache_flush(-1);
+
+ rtmsg_fib(RTM_NEWROUTE, key, new_fa, z, tb->tb_id, n, req);
+ return 0;
+
+out_free_new_fa:
+ kmem_cache_free(fn_alias_kmem, new_fa);
+out:
+ fib_release_info(fi);
+ return err;
+}
+
+
+static int
+fn_hash_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
+ struct nlmsghdr *n, struct netlink_skb_parms *req)
+{
+ struct fn_hash *table = (struct fn_hash*)tb->tb_data;
+ struct fib_node *f;
+ struct fib_alias *fa, *fa_to_delete;
+ int z = r->rtm_dst_len;
+ struct fn_zone *fz;
+ u32 key;
+ u8 tos = r->rtm_tos;
+
+ if (z > 32)
+ return -EINVAL;
+ if ((fz = table->fn_zones[z]) == NULL)
+ return -ESRCH;
+
+ key = 0;
+ if (rta->rta_dst) {
+ u32 dst;
+ memcpy(&dst, rta->rta_dst, 4);
+ if (dst & ~FZ_MASK(fz))
+ return -EINVAL;
+ key = fz_key(dst, fz);
+ }
+
+ f = fib_find_node(fz, key);
+
+ if (!f)
+ fa = NULL;
+ else
+ fa = fib_find_alias(&f->fn_alias, tos, 0);
+ if (!fa)
+ return -ESRCH;
+
+ fa_to_delete = NULL;
+ fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
+ list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+
+ if (fa->fa_tos != tos)
+ break;
+
+ if ((!r->rtm_type ||
+ fa->fa_type == r->rtm_type) &&
+ (r->rtm_scope == RT_SCOPE_NOWHERE ||
+ fa->fa_scope == r->rtm_scope) &&
+ (!r->rtm_protocol ||
+ fi->fib_protocol == r->rtm_protocol) &&
+ fib_nh_match(r, n, rta, fi) == 0) {
+ fa_to_delete = fa;
+ break;
+ }
+ }
+
+ if (fa_to_delete) {
+ int kill_fn;
+
+ fa = fa_to_delete;
+ rtmsg_fib(RTM_DELROUTE, key, fa, z, tb->tb_id, n, req);
+
+ kill_fn = 0;
+ write_lock_bh(&fib_hash_lock);
+ list_del(&fa->fa_list);
+ if (list_empty(&f->fn_alias)) {
+ hlist_del(&f->fn_hash);
+ kill_fn = 1;
+ }
+ fib_hash_genid++;
+ write_unlock_bh(&fib_hash_lock);
+
+ if (fa->fa_state & FA_S_ACCESSED)
+ rt_cache_flush(-1);
+ fn_free_alias(fa);
+ if (kill_fn) {
+ fn_free_node(f);
+ fz->fz_nent--;
+ }
+
+ return 0;
+ }
+ return -ESRCH;
+}
+
+static int fn_flush_list(struct fn_zone *fz, int idx)
+{
+ struct hlist_head *head = &fz->fz_hash[idx];
+ struct hlist_node *node, *n;
+ struct fib_node *f;
+ int found = 0;
+
+ hlist_for_each_entry_safe(f, node, n, head, fn_hash) {
+ struct fib_alias *fa, *fa_node;
+ int kill_f;
+
+ kill_f = 0;
+ list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+
+ if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
+ write_lock_bh(&fib_hash_lock);
+ list_del(&fa->fa_list);
+ if (list_empty(&f->fn_alias)) {
+ hlist_del(&f->fn_hash);
+ kill_f = 1;
+ }
+ fib_hash_genid++;
+ write_unlock_bh(&fib_hash_lock);
+
+ fn_free_alias(fa);
+ found++;
+ }
+ }
+ if (kill_f) {
+ fn_free_node(f);
+ fz->fz_nent--;
+ }
+ }
+ return found;
+}
+
+static int fn_hash_flush(struct fib_table *tb)
+{
+ struct fn_hash *table = (struct fn_hash *) tb->tb_data;
+ struct fn_zone *fz;
+ int found = 0;
+
+ for (fz = table->fn_zone_list; fz; fz = fz->fz_next) {
+ int i;
+
+ for (i = fz->fz_divisor - 1; i >= 0; i--)
+ found += fn_flush_list(fz, i);
+ }
+ return found;
+}
+
+
+static inline int
+fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
+ struct fib_table *tb,
+ struct fn_zone *fz,
+ struct hlist_head *head)
+{
+ struct hlist_node *node;
+ struct fib_node *f;
+ int i, s_i;
+
+ s_i = cb->args[3];
+ i = 0;
+ hlist_for_each_entry(f, node, head, fn_hash) {
+ struct fib_alias *fa;
+
+ list_for_each_entry(fa, &f->fn_alias, fa_list) {
+ if (i < s_i)
+ goto next;
+
+ if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWROUTE,
+ tb->tb_id,
+ fa->fa_type,
+ fa->fa_scope,
+ &f->fn_key,
+ fz->fz_order,
+ fa->fa_tos,
+ fa->fa_info) < 0) {
+ cb->args[3] = i;
+ return -1;
+ }
+ next:
+ i++;
+ }
+ }
+ cb->args[3] = i;
+ return skb->len;
+}
+
+static inline int
+fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
+ struct fib_table *tb,
+ struct fn_zone *fz)
+{
+ int h, s_h;
+
+ s_h = cb->args[2];
+ for (h=0; h < fz->fz_divisor; h++) {
+ if (h < s_h) continue;
+ if (h > s_h)
+ memset(&cb->args[3], 0,
+ sizeof(cb->args) - 3*sizeof(cb->args[0]));
+ if (fz->fz_hash == NULL ||
+ hlist_empty(&fz->fz_hash[h]))
+ continue;
+ if (fn_hash_dump_bucket(skb, cb, tb, fz, &fz->fz_hash[h])<0) {
+ cb->args[2] = h;
+ return -1;
+ }
+ }
+ cb->args[2] = h;
+ return skb->len;
+}
+
+static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int m, s_m;
+ struct fn_zone *fz;
+ struct fn_hash *table = (struct fn_hash*)tb->tb_data;
+
+ s_m = cb->args[1];
+ read_lock(&fib_hash_lock);
+ for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) {
+ if (m < s_m) continue;
+ if (m > s_m)
+ memset(&cb->args[2], 0,
+ sizeof(cb->args) - 2*sizeof(cb->args[0]));
+ if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
+ cb->args[1] = m;
+ read_unlock(&fib_hash_lock);
+ return -1;
+ }
+ }
+ read_unlock(&fib_hash_lock);
+ cb->args[1] = m;
+ return skb->len;
+}
+
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+struct fib_table * fib_hash_init(int id)
+#else
+struct fib_table * __init fib_hash_init(int id)
+#endif
+{
+ struct fib_table *tb;
+
+ if (fn_hash_kmem == NULL)
+ fn_hash_kmem = kmem_cache_create("ip_fib_hash",
+ sizeof(struct fib_node),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+
+ if (fn_alias_kmem == NULL)
+ fn_alias_kmem = kmem_cache_create("ip_fib_alias",
+ sizeof(struct fib_alias),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+
+ tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash),
+ GFP_KERNEL);
+ if (tb == NULL)
+ return NULL;
+
+ tb->tb_id = id;
+ tb->tb_lookup = fn_hash_lookup;
+ tb->tb_insert = fn_hash_insert;
+ tb->tb_delete = fn_hash_delete;
+ tb->tb_flush = fn_hash_flush;
+ tb->tb_select_default = fn_hash_select_default;
+ tb->tb_dump = fn_hash_dump;
+ memset(tb->tb_data, 0, sizeof(struct fn_hash));
+ return tb;
+}
+
+/* ------------------------------------------------------------------------ */
+#ifdef CONFIG_PROC_FS
+
+struct fib_iter_state {
+ struct fn_zone *zone;
+ int bucket;
+ struct hlist_head *hash_head;
+ struct fib_node *fn;
+ struct fib_alias *fa;
+ loff_t pos;
+ unsigned int genid;
+ int valid;
+};
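+/* pos and genid let fib_get_idx() resume a sequential read from the cached
+ * alias as long as fib_hash_genid has not changed, instead of rescanning
+ * the whole table for every chunk of the seq_file read.
+ */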
+
+static struct fib_alias *fib_get_first(struct seq_file *seq)
+{
+ struct fib_iter_state *iter = seq->private;
+ struct fn_hash *table = (struct fn_hash *) ip_fib_main_table->tb_data;
+
+ iter->bucket = 0;
+ iter->hash_head = NULL;
+ iter->fn = NULL;
+ iter->fa = NULL;
+ iter->pos = 0;
+ iter->genid = fib_hash_genid;
+ iter->valid = 1;
+
+ for (iter->zone = table->fn_zone_list; iter->zone;
+ iter->zone = iter->zone->fz_next) {
+ int maxslot;
+
+ if (!iter->zone->fz_nent)
+ continue;
+
+ iter->hash_head = iter->zone->fz_hash;
+ maxslot = iter->zone->fz_divisor;
+
+ for (iter->bucket = 0; iter->bucket < maxslot;
+ ++iter->bucket, ++iter->hash_head) {
+ struct hlist_node *node;
+ struct fib_node *fn;
+
+ hlist_for_each_entry(fn,node,iter->hash_head,fn_hash) {
+ struct fib_alias *fa;
+
+ list_for_each_entry(fa,&fn->fn_alias,fa_list) {
+ iter->fn = fn;
+ iter->fa = fa;
+ goto out;
+ }
+ }
+ }
+ }
+out:
+ return iter->fa;
+}
+
+static struct fib_alias *fib_get_next(struct seq_file *seq)
+{
+ struct fib_iter_state *iter = seq->private;
+ struct fib_node *fn;
+ struct fib_alias *fa;
+
+ /* Advance FA, if any. */
+ fn = iter->fn;
+ fa = iter->fa;
+ if (fa) {
+ BUG_ON(!fn);
+ list_for_each_entry_continue(fa, &fn->fn_alias, fa_list) {
+ iter->fa = fa;
+ goto out;
+ }
+ }
+
+ fa = iter->fa = NULL;
+
+ /* Advance FN. */
+ if (fn) {
+ struct hlist_node *node = &fn->fn_hash;
+ hlist_for_each_entry_continue(fn, node, fn_hash) {
+ iter->fn = fn;
+
+ list_for_each_entry(fa, &fn->fn_alias, fa_list) {
+ iter->fa = fa;
+ goto out;
+ }
+ }
+ }
+
+ fn = iter->fn = NULL;
+
+ /* Advance hash chain. */
+ if (!iter->zone)
+ goto out;
+
+ for (;;) {
+ struct hlist_node *node;
+ int maxslot;
+
+ maxslot = iter->zone->fz_divisor;
+
+ while (++iter->bucket < maxslot) {
+ iter->hash_head++;
+
+ hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
+ list_for_each_entry(fa, &fn->fn_alias, fa_list) {
+ iter->fn = fn;
+ iter->fa = fa;
+ goto out;
+ }
+ }
+ }
+
+ iter->zone = iter->zone->fz_next;
+
+ if (!iter->zone)
+ goto out;
+
+ iter->bucket = 0;
+ iter->hash_head = iter->zone->fz_hash;
+
+ hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
+ list_for_each_entry(fa, &fn->fn_alias, fa_list) {
+ iter->fn = fn;
+ iter->fa = fa;
+ goto out;
+ }
+ }
+ }
+out:
+ iter->pos++;
+ return fa;
+}
+
+static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct fib_iter_state *iter = seq->private;
+ struct fib_alias *fa;
+
+ if (iter->valid && pos >= iter->pos && iter->genid == fib_hash_genid) {
+ fa = iter->fa;
+ pos -= iter->pos;
+ } else
+ fa = fib_get_first(seq);
+
+ if (fa)
+ while (pos && (fa = fib_get_next(seq)))
+ --pos;
+ return pos ? NULL : fa;
+}
+
+static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ void *v = NULL;
+
+ read_lock(&fib_hash_lock);
+ if (ip_fib_main_table)
+ v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+ return v;
+}
+
+static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+ return v == SEQ_START_TOKEN ? fib_get_first(seq) : fib_get_next(seq);
+}
+
+static void fib_seq_stop(struct seq_file *seq, void *v)
+{
+ read_unlock(&fib_hash_lock);
+}
+
+static unsigned fib_flag_trans(int type, u32 mask, struct fib_info *fi)
+{
+ static unsigned type2flags[RTN_MAX + 1] = {
+ [7] = RTF_REJECT, [8] = RTF_REJECT,
+ };
+ unsigned flags = type2flags[type];
+
+ if (fi && fi->fib_nh->nh_gw)
+ flags |= RTF_GATEWAY;
+ if (mask == 0xFFFFFFFF)
+ flags |= RTF_HOST;
+ flags |= RTF_UP;
+ return flags;
+}
+
+/*
+ * This outputs /proc/net/route.
+ *
+ * It always works in backward-compatibility mode.
+ * The format of the file is not supposed to change.
+ */
+static int fib_seq_show(struct seq_file *seq, void *v)
+{
+ struct fib_iter_state *iter;
+ char bf[128];
+ u32 prefix, mask;
+ unsigned flags;
+ struct fib_node *f;
+ struct fib_alias *fa;
+ struct fib_info *fi;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
+ "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
+ "\tWindow\tIRTT");
+ goto out;
+ }
+
+ iter = seq->private;
+ f = iter->fn;
+ fa = iter->fa;
+ fi = fa->fa_info;
+ prefix = f->fn_key;
+ mask = FZ_MASK(iter->zone);
+ flags = fib_flag_trans(fa->fa_type, mask, fi);
+ if (fi)
+ snprintf(bf, sizeof(bf),
+ "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
+ fi->fib_dev ? fi->fib_dev->name : "*", prefix,
+ fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority,
+ mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
+ fi->fib_window,
+ fi->fib_rtt >> 3);
+ else
+ snprintf(bf, sizeof(bf),
+ "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
+ prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0);
+ seq_printf(seq, "%-127s\n", bf);
+out:
+ return 0;
+}
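+
+/* Illustrative output of fib_seq_show() above (hypothetical routes, as a
+ * little-endian host would render them; addresses, masks and gateways are
+ * dumped as raw hex words by the %08X formats):
+ *
+ *	Iface	Destination	Gateway 	Flags	RefCnt	Use	Metric	Mask		MTU	Window	IRTT
+ *	eth0	00000000	0101A8C0	0003	0	0	0	00000000	0	0	0
+ *	eth0	0001A8C0	00000000	0001	0	0	0	00FFFFFF	0	0	0
+ *
+ * The first entry is a default route via 192.168.1.1 (RTF_UP|RTF_GATEWAY,
+ * hence flags 0003), the second the connected 192.168.1.0/24 prefix
+ * (RTF_UP only).
+ */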
+
+static struct seq_operations fib_seq_ops = {
+ .start = fib_seq_start,
+ .next = fib_seq_next,
+ .stop = fib_seq_stop,
+ .show = fib_seq_show,
+};
+
+static int fib_seq_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int rc = -ENOMEM;
+ struct fib_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+ if (!s)
+ goto out;
+
+ rc = seq_open(file, &fib_seq_ops);
+ if (rc)
+ goto out_kfree;
+
+ seq = file->private_data;
+ seq->private = s;
+ memset(s, 0, sizeof(*s));
+out:
+ return rc;
+out_kfree:
+ kfree(s);
+ goto out;
+}
+
+static struct file_operations fib_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = fib_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+int __init fib_proc_init(void)
+{
+ if (!proc_net_fops_create("route", S_IRUGO, &fib_seq_fops))
+ return -ENOMEM;
+ return 0;
+}
+
+void __init fib_proc_exit(void)
+{
+ proc_net_remove("route");
+}
+#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
new file mode 100644
index 00000000000..ac4485f75e9
--- /dev/null
+++ b/net/ipv4/fib_lookup.h
@@ -0,0 +1,43 @@
+#ifndef _FIB_LOOKUP_H
+#define _FIB_LOOKUP_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <net/ip_fib.h>
+
+struct fib_alias {
+ struct list_head fa_list;
+ struct fib_info *fa_info;
+ u8 fa_tos;
+ u8 fa_type;
+ u8 fa_scope;
+ u8 fa_state;
+};
+
+#define FA_S_ACCESSED 0x01
+
+/* Exported by fib_semantics.c */
+extern int fib_semantic_match(struct list_head *head,
+ const struct flowi *flp,
+ struct fib_result *res, __u32 zone, __u32 mask,
+ int prefixlen);
+extern void fib_release_info(struct fib_info *);
+extern struct fib_info *fib_create_info(const struct rtmsg *r,
+ struct kern_rta *rta,
+ const struct nlmsghdr *,
+ int *err);
+extern int fib_nh_match(struct rtmsg *r, struct nlmsghdr *,
+ struct kern_rta *rta, struct fib_info *fi);
+extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
+ u8 tb_id, u8 type, u8 scope, void *dst,
+ int dst_len, u8 tos, struct fib_info *fi);
+extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
+ int z, int tb_id,
+ struct nlmsghdr *n, struct netlink_skb_parms *req);
+extern struct fib_alias *fib_find_alias(struct list_head *fah,
+ u8 tos, u32 prio);
+extern int fib_detect_death(struct fib_info *fi, int order,
+ struct fib_info **last_resort,
+ int *last_idx, int *dflt);
+
+#endif /* _FIB_LOOKUP_H */
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
new file mode 100644
index 00000000000..39d0aadb9a2
--- /dev/null
+++ b/net/ipv4/fib_rules.c
@@ -0,0 +1,437 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * IPv4 Forwarding Information Base: policy rules.
+ *
+ * Version: $Id: fib_rules.c,v 1.17 2001/10/31 21:55:54 davem Exp $
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ * Rani Assaf : local_rule cannot be deleted
+ * Marc Boucher : routing by fwmark
+ */
+
+#include <linux/config.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/init.h>
+
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/ip_fib.h>
+
+#define FRprintk(a...)
+
+struct fib_rule
+{
+ struct fib_rule *r_next;
+ atomic_t r_clntref;
+ u32 r_preference;
+ unsigned char r_table;
+ unsigned char r_action;
+ unsigned char r_dst_len;
+ unsigned char r_src_len;
+ u32 r_src;
+ u32 r_srcmask;
+ u32 r_dst;
+ u32 r_dstmask;
+ u32 r_srcmap;
+ u8 r_flags;
+ u8 r_tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ u32 r_fwmark;
+#endif
+ int r_ifindex;
+#ifdef CONFIG_NET_CLS_ROUTE
+ __u32 r_tclassid;
+#endif
+ char r_ifname[IFNAMSIZ];
+ int r_dead;
+};
+
+static struct fib_rule default_rule = {
+ .r_clntref = ATOMIC_INIT(2),
+ .r_preference = 0x7FFF,
+ .r_table = RT_TABLE_DEFAULT,
+ .r_action = RTN_UNICAST,
+};
+
+static struct fib_rule main_rule = {
+ .r_next = &default_rule,
+ .r_clntref = ATOMIC_INIT(2),
+ .r_preference = 0x7FFE,
+ .r_table = RT_TABLE_MAIN,
+ .r_action = RTN_UNICAST,
+};
+
+static struct fib_rule local_rule = {
+ .r_next = &main_rule,
+ .r_clntref = ATOMIC_INIT(2),
+ .r_table = RT_TABLE_LOCAL,
+ .r_action = RTN_UNICAST,
+};
+
+static struct fib_rule *fib_rules = &local_rule;
+static DEFINE_RWLOCK(fib_rules_lock);
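+
+/* The three static rules above form the initial policy database; they are
+ * what iproute2's "ip rule show" typically prints on a box with no extra
+ * rules configured (preferences shown in decimal: 0x7FFE == 32766,
+ * 0x7FFF == 32767, and local_rule has preference 0):
+ *
+ *	0:	from all lookup local
+ *	32766:	from all lookup main
+ *	32767:	from all lookup default
+ */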
+
+int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+{
+ struct rtattr **rta = arg;
+ struct rtmsg *rtm = NLMSG_DATA(nlh);
+ struct fib_rule *r, **rp;
+ int err = -ESRCH;
+
+ for (rp=&fib_rules; (r=*rp) != NULL; rp=&r->r_next) {
+ if ((!rta[RTA_SRC-1] || memcmp(RTA_DATA(rta[RTA_SRC-1]), &r->r_src, 4) == 0) &&
+ rtm->rtm_src_len == r->r_src_len &&
+ rtm->rtm_dst_len == r->r_dst_len &&
+ (!rta[RTA_DST-1] || memcmp(RTA_DATA(rta[RTA_DST-1]), &r->r_dst, 4) == 0) &&
+ rtm->rtm_tos == r->r_tos &&
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ (!rta[RTA_PROTOINFO-1] || memcmp(RTA_DATA(rta[RTA_PROTOINFO-1]), &r->r_fwmark, 4) == 0) &&
+#endif
+ (!rtm->rtm_type || rtm->rtm_type == r->r_action) &&
+ (!rta[RTA_PRIORITY-1] || memcmp(RTA_DATA(rta[RTA_PRIORITY-1]), &r->r_preference, 4) == 0) &&
+ (!rta[RTA_IIF-1] || rtattr_strcmp(rta[RTA_IIF-1], r->r_ifname) == 0) &&
+ (!rtm->rtm_table || (r && rtm->rtm_table == r->r_table))) {
+ err = -EPERM;
+ if (r == &local_rule)
+ break;
+
+ write_lock_bh(&fib_rules_lock);
+ *rp = r->r_next;
+ r->r_dead = 1;
+ write_unlock_bh(&fib_rules_lock);
+ fib_rule_put(r);
+ err = 0;
+ break;
+ }
+ }
+ return err;
+}
+
+/* Allocate a new table with an unused table id */
+
+static struct fib_table *fib_empty_table(void)
+{
+ int id;
+
+ for (id = 1; id <= RT_TABLE_MAX; id++)
+ if (fib_tables[id] == NULL)
+ return __fib_new_table(id);
+ return NULL;
+}
+
+void fib_rule_put(struct fib_rule *r)
+{
+ if (atomic_dec_and_test(&r->r_clntref)) {
+ if (r->r_dead)
+ kfree(r);
+ else
+ printk("Freeing alive rule %p\n", r);
+ }
+}
+
+int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+{
+ struct rtattr **rta = arg;
+ struct rtmsg *rtm = NLMSG_DATA(nlh);
+ struct fib_rule *r, *new_r, **rp;
+ unsigned char table_id;
+
+ if (rtm->rtm_src_len > 32 || rtm->rtm_dst_len > 32 ||
+ (rtm->rtm_tos & ~IPTOS_TOS_MASK))
+ return -EINVAL;
+
+ if (rta[RTA_IIF-1] && RTA_PAYLOAD(rta[RTA_IIF-1]) > IFNAMSIZ)
+ return -EINVAL;
+
+ table_id = rtm->rtm_table;
+ if (table_id == RT_TABLE_UNSPEC) {
+ struct fib_table *table;
+ if (rtm->rtm_type == RTN_UNICAST) {
+ if ((table = fib_empty_table()) == NULL)
+ return -ENOBUFS;
+ table_id = table->tb_id;
+ }
+ }
+
+ new_r = kmalloc(sizeof(*new_r), GFP_KERNEL);
+ if (!new_r)
+ return -ENOMEM;
+ memset(new_r, 0, sizeof(*new_r));
+ if (rta[RTA_SRC-1])
+ memcpy(&new_r->r_src, RTA_DATA(rta[RTA_SRC-1]), 4);
+ if (rta[RTA_DST-1])
+ memcpy(&new_r->r_dst, RTA_DATA(rta[RTA_DST-1]), 4);
+ if (rta[RTA_GATEWAY-1])
+ memcpy(&new_r->r_srcmap, RTA_DATA(rta[RTA_GATEWAY-1]), 4);
+ new_r->r_src_len = rtm->rtm_src_len;
+ new_r->r_dst_len = rtm->rtm_dst_len;
+ new_r->r_srcmask = inet_make_mask(rtm->rtm_src_len);
+ new_r->r_dstmask = inet_make_mask(rtm->rtm_dst_len);
+ new_r->r_tos = rtm->rtm_tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ if (rta[RTA_PROTOINFO-1])
+ memcpy(&new_r->r_fwmark, RTA_DATA(rta[RTA_PROTOINFO-1]), 4);
+#endif
+ new_r->r_action = rtm->rtm_type;
+ new_r->r_flags = rtm->rtm_flags;
+ if (rta[RTA_PRIORITY-1])
+ memcpy(&new_r->r_preference, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
+ new_r->r_table = table_id;
+ if (rta[RTA_IIF-1]) {
+ struct net_device *dev;
+ rtattr_strlcpy(new_r->r_ifname, rta[RTA_IIF-1], IFNAMSIZ);
+ new_r->r_ifindex = -1;
+ dev = __dev_get_by_name(new_r->r_ifname);
+ if (dev)
+ new_r->r_ifindex = dev->ifindex;
+ }
+#ifdef CONFIG_NET_CLS_ROUTE
+ if (rta[RTA_FLOW-1])
+ memcpy(&new_r->r_tclassid, RTA_DATA(rta[RTA_FLOW-1]), 4);
+#endif
+
+ rp = &fib_rules;
+ if (!new_r->r_preference) {
+ r = fib_rules;
+ if (r && (r = r->r_next) != NULL) {
+ rp = &fib_rules->r_next;
+ if (r->r_preference)
+ new_r->r_preference = r->r_preference - 1;
+ }
+ }
+
+ while ( (r = *rp) != NULL ) {
+ if (r->r_preference > new_r->r_preference)
+ break;
+ rp = &r->r_next;
+ }
+
+ new_r->r_next = r;
+ atomic_inc(&new_r->r_clntref);
+ write_lock_bh(&fib_rules_lock);
+ *rp = new_r;
+ write_unlock_bh(&fib_rules_lock);
+ return 0;
+}
+
+#ifdef CONFIG_NET_CLS_ROUTE
+u32 fib_rules_tclass(struct fib_result *res)
+{
+ if (res->r)
+ return res->r->r_tclassid;
+ return 0;
+}
+#endif
+
+
+static void fib_rules_detach(struct net_device *dev)
+{
+ struct fib_rule *r;
+
+ for (r=fib_rules; r; r=r->r_next) {
+ if (r->r_ifindex == dev->ifindex) {
+ write_lock_bh(&fib_rules_lock);
+ r->r_ifindex = -1;
+ write_unlock_bh(&fib_rules_lock);
+ }
+ }
+}
+
+static void fib_rules_attach(struct net_device *dev)
+{
+ struct fib_rule *r;
+
+ for (r=fib_rules; r; r=r->r_next) {
+ if (r->r_ifindex == -1 && strcmp(dev->name, r->r_ifname) == 0) {
+ write_lock_bh(&fib_rules_lock);
+ r->r_ifindex = dev->ifindex;
+ write_unlock_bh(&fib_rules_lock);
+ }
+ }
+}
+
+int fib_lookup(const struct flowi *flp, struct fib_result *res)
+{
+ int err;
+ struct fib_rule *r, *policy;
+ struct fib_table *tb;
+
+ u32 daddr = flp->fl4_dst;
+ u32 saddr = flp->fl4_src;
+
+FRprintk("Lookup: %u.%u.%u.%u <- %u.%u.%u.%u ",
+ NIPQUAD(flp->fl4_dst), NIPQUAD(flp->fl4_src));
+ read_lock(&fib_rules_lock);
+ for (r = fib_rules; r; r=r->r_next) {
+ if (((saddr^r->r_src) & r->r_srcmask) ||
+ ((daddr^r->r_dst) & r->r_dstmask) ||
+ (r->r_tos && r->r_tos != flp->fl4_tos) ||
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ (r->r_fwmark && r->r_fwmark != flp->fl4_fwmark) ||
+#endif
+ (r->r_ifindex && r->r_ifindex != flp->iif))
+ continue;
+
+FRprintk("tb %d r %d ", r->r_table, r->r_action);
+ switch (r->r_action) {
+ case RTN_UNICAST:
+ policy = r;
+ break;
+ case RTN_UNREACHABLE:
+ read_unlock(&fib_rules_lock);
+ return -ENETUNREACH;
+ default:
+ case RTN_BLACKHOLE:
+ read_unlock(&fib_rules_lock);
+ return -EINVAL;
+ case RTN_PROHIBIT:
+ read_unlock(&fib_rules_lock);
+ return -EACCES;
+ }
+
+ if ((tb = fib_get_table(r->r_table)) == NULL)
+ continue;
+ err = tb->tb_lookup(tb, flp, res);
+ if (err == 0) {
+ res->r = policy;
+ if (policy)
+ atomic_inc(&policy->r_clntref);
+ read_unlock(&fib_rules_lock);
+ return 0;
+ }
+ if (err < 0 && err != -EAGAIN) {
+ read_unlock(&fib_rules_lock);
+ return err;
+ }
+ }
+FRprintk("FAILURE\n");
+ read_unlock(&fib_rules_lock);
+ return -ENETUNREACH;
+}
+
+void fib_select_default(const struct flowi *flp, struct fib_result *res)
+{
+ if (res->r && res->r->r_action == RTN_UNICAST &&
+ FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) {
+ struct fib_table *tb;
+ if ((tb = fib_get_table(res->r->r_table)) != NULL)
+ tb->tb_select_default(tb, flp, res);
+ }
+}
+
+static int fib_rules_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct net_device *dev = ptr;
+
+ if (event == NETDEV_UNREGISTER)
+ fib_rules_detach(dev);
+ else if (event == NETDEV_REGISTER)
+ fib_rules_attach(dev);
+ return NOTIFY_DONE;
+}
+
+
+static struct notifier_block fib_rules_notifier = {
+ .notifier_call =fib_rules_event,
+};
+
+static __inline__ int inet_fill_rule(struct sk_buff *skb,
+ struct fib_rule *r,
+ struct netlink_callback *cb)
+{
+ struct rtmsg *rtm;
+ struct nlmsghdr *nlh;
+ unsigned char *b = skb->tail;
+
+ nlh = NLMSG_PUT(skb, NETLINK_CREDS(cb->skb)->pid, cb->nlh->nlmsg_seq, RTM_NEWRULE, sizeof(*rtm));
+ rtm = NLMSG_DATA(nlh);
+ rtm->rtm_family = AF_INET;
+ rtm->rtm_dst_len = r->r_dst_len;
+ rtm->rtm_src_len = r->r_src_len;
+ rtm->rtm_tos = r->r_tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ if (r->r_fwmark)
+ RTA_PUT(skb, RTA_PROTOINFO, 4, &r->r_fwmark);
+#endif
+ rtm->rtm_table = r->r_table;
+ rtm->rtm_protocol = 0;
+ rtm->rtm_scope = 0;
+ rtm->rtm_type = r->r_action;
+ rtm->rtm_flags = r->r_flags;
+
+ if (r->r_dst_len)
+ RTA_PUT(skb, RTA_DST, 4, &r->r_dst);
+ if (r->r_src_len)
+ RTA_PUT(skb, RTA_SRC, 4, &r->r_src);
+ if (r->r_ifname[0])
+ RTA_PUT(skb, RTA_IIF, IFNAMSIZ, &r->r_ifname);
+ if (r->r_preference)
+ RTA_PUT(skb, RTA_PRIORITY, 4, &r->r_preference);
+ if (r->r_srcmap)
+ RTA_PUT(skb, RTA_GATEWAY, 4, &r->r_srcmap);
+#ifdef CONFIG_NET_CLS_ROUTE
+ if (r->r_tclassid)
+ RTA_PUT(skb, RTA_FLOW, 4, &r->r_tclassid);
+#endif
+ nlh->nlmsg_len = skb->tail - b;
+ return skb->len;
+
+nlmsg_failure:
+rtattr_failure:
+ skb_trim(skb, b - skb->data);
+ return -1;
+}
+
+int inet_dump_rules(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int idx;
+ int s_idx = cb->args[0];
+ struct fib_rule *r;
+
+ read_lock(&fib_rules_lock);
+ for (r=fib_rules, idx=0; r; r = r->r_next, idx++) {
+ if (idx < s_idx)
+ continue;
+ if (inet_fill_rule(skb, r, cb) < 0)
+ break;
+ }
+ read_unlock(&fib_rules_lock);
+ cb->args[0] = idx;
+
+ return skb->len;
+}
+
+void __init fib_rules_init(void)
+{
+ register_netdevice_notifier(&fib_rules_notifier);
+}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
new file mode 100644
index 00000000000..029362d6613
--- /dev/null
+++ b/net/ipv4/fib_semantics.c
@@ -0,0 +1,1332 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * IPv4 Forwarding Information Base: semantics.
+ *
+ * Version: $Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/init.h>
+
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/ip_fib.h>
+#include <net/ip_mp_alg.h>
+
+#include "fib_lookup.h"
+
+#define FSprintk(a...)
+
+static DEFINE_RWLOCK(fib_info_lock);
+static struct hlist_head *fib_info_hash;
+static struct hlist_head *fib_info_laddrhash;
+static unsigned int fib_hash_size;
+static unsigned int fib_info_cnt;
+
+#define DEVINDEX_HASHBITS 8
+#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
+static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+static DEFINE_SPINLOCK(fib_multipath_lock);
+
+#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
+for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
+
+#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
+for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)
+
+#else /* CONFIG_IP_ROUTE_MULTIPATH */
+
+/* Hope that gcc will optimize this to get rid of the dummy loop */
+
+#define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
+for (nhsel=0; nhsel < 1; nhsel++)
+
+#define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \
+for (nhsel=0; nhsel < 1; nhsel++)
+
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+
+#define endfor_nexthops(fi) }
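+
+/* Both macros open a block and hide the loop header, so every use must be
+ * closed with endfor_nexthops().  A minimal usage sketch (the real callers
+ * below, e.g. fib_sync_down(), follow the same pattern):
+ *
+ *	change_nexthops(fi) {
+ *		if (nh->nh_dev == dev)
+ *			nh->nh_flags |= RTNH_F_DEAD;
+ *	} endfor_nexthops(fi)
+ */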
+
+
+static struct
+{
+ int error;
+ u8 scope;
+} fib_props[RTA_MAX + 1] = {
+ {
+ .error = 0,
+ .scope = RT_SCOPE_NOWHERE,
+ }, /* RTN_UNSPEC */
+ {
+ .error = 0,
+ .scope = RT_SCOPE_UNIVERSE,
+ }, /* RTN_UNICAST */
+ {
+ .error = 0,
+ .scope = RT_SCOPE_HOST,
+ }, /* RTN_LOCAL */
+ {
+ .error = 0,
+ .scope = RT_SCOPE_LINK,
+ }, /* RTN_BROADCAST */
+ {
+ .error = 0,
+ .scope = RT_SCOPE_LINK,
+ }, /* RTN_ANYCAST */
+ {
+ .error = 0,
+ .scope = RT_SCOPE_UNIVERSE,
+ }, /* RTN_MULTICAST */
+ {
+ .error = -EINVAL,
+ .scope = RT_SCOPE_UNIVERSE,
+ }, /* RTN_BLACKHOLE */
+ {
+ .error = -EHOSTUNREACH,
+ .scope = RT_SCOPE_UNIVERSE,
+ }, /* RTN_UNREACHABLE */
+ {
+ .error = -EACCES,
+ .scope = RT_SCOPE_UNIVERSE,
+ }, /* RTN_PROHIBIT */
+ {
+ .error = -EAGAIN,
+ .scope = RT_SCOPE_UNIVERSE,
+ }, /* RTN_THROW */
+ {
+ .error = -EINVAL,
+ .scope = RT_SCOPE_NOWHERE,
+ }, /* RTN_NAT */
+ {
+ .error = -EINVAL,
+ .scope = RT_SCOPE_NOWHERE,
+ }, /* RTN_XRESOLVE */
+};
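+/* fib_props[] is indexed by the RTN_* route type: .error is what
+ * fib_semantic_match() hands back for reject-style routes (e.g.
+ * -EHOSTUNREACH for RTN_UNREACHABLE), and .scope is the widest scope
+ * fib_create_info() accepts for routes of that type.
+ */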
+
+
+/* Release a nexthop info record */
+
+void free_fib_info(struct fib_info *fi)
+{
+ if (fi->fib_dead == 0) {
+ printk("Freeing alive fib_info %p\n", fi);
+ return;
+ }
+ change_nexthops(fi) {
+ if (nh->nh_dev)
+ dev_put(nh->nh_dev);
+ nh->nh_dev = NULL;
+ } endfor_nexthops(fi);
+ fib_info_cnt--;
+ kfree(fi);
+}
+
+void fib_release_info(struct fib_info *fi)
+{
+ write_lock(&fib_info_lock);
+ if (fi && --fi->fib_treeref == 0) {
+ hlist_del(&fi->fib_hash);
+ if (fi->fib_prefsrc)
+ hlist_del(&fi->fib_lhash);
+ change_nexthops(fi) {
+ if (!nh->nh_dev)
+ continue;
+ hlist_del(&nh->nh_hash);
+ } endfor_nexthops(fi)
+ fi->fib_dead = 1;
+ fib_info_put(fi);
+ }
+ write_unlock(&fib_info_lock);
+}
+
+static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
+{
+ const struct fib_nh *onh = ofi->fib_nh;
+
+ for_nexthops(fi) {
+ if (nh->nh_oif != onh->nh_oif ||
+ nh->nh_gw != onh->nh_gw ||
+ nh->nh_scope != onh->nh_scope ||
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ nh->nh_weight != onh->nh_weight ||
+#endif
+#ifdef CONFIG_NET_CLS_ROUTE
+ nh->nh_tclassid != onh->nh_tclassid ||
+#endif
+ ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
+ return -1;
+ onh++;
+ } endfor_nexthops(fi);
+ return 0;
+}
+
+static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
+{
+ unsigned int mask = (fib_hash_size - 1);
+ unsigned int val = fi->fib_nhs;
+
+ val ^= fi->fib_protocol;
+ val ^= fi->fib_prefsrc;
+ val ^= fi->fib_priority;
+
+ return (val ^ (val >> 7) ^ (val >> 12)) & mask;
+}
+
+static struct fib_info *fib_find_info(const struct fib_info *nfi)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct fib_info *fi;
+ unsigned int hash;
+
+ hash = fib_info_hashfn(nfi);
+ head = &fib_info_hash[hash];
+
+ hlist_for_each_entry(fi, node, head, fib_hash) {
+ if (fi->fib_nhs != nfi->fib_nhs)
+ continue;
+ if (nfi->fib_protocol == fi->fib_protocol &&
+ nfi->fib_prefsrc == fi->fib_prefsrc &&
+ nfi->fib_priority == fi->fib_priority &&
+ memcmp(nfi->fib_metrics, fi->fib_metrics,
+ sizeof(fi->fib_metrics)) == 0 &&
+ ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
+ (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
+ return fi;
+ }
+
+ return NULL;
+}
+
+static inline unsigned int fib_devindex_hashfn(unsigned int val)
+{
+ unsigned int mask = DEVINDEX_HASHSIZE - 1;
+
+ return (val ^
+ (val >> DEVINDEX_HASHBITS) ^
+ (val >> (DEVINDEX_HASHBITS * 2))) & mask;
+}
+
+/* Check that the gateway is already configured.
+ Used only by the redirect accept routine.
+ */
+
+int ip_fib_check_default(u32 gw, struct net_device *dev)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct fib_nh *nh;
+ unsigned int hash;
+
+ read_lock(&fib_info_lock);
+
+ hash = fib_devindex_hashfn(dev->ifindex);
+ head = &fib_info_devhash[hash];
+ hlist_for_each_entry(nh, node, head, nh_hash) {
+ if (nh->nh_dev == dev &&
+ nh->nh_gw == gw &&
+ !(nh->nh_flags&RTNH_F_DEAD)) {
+ read_unlock(&fib_info_lock);
+ return 0;
+ }
+ }
+
+ read_unlock(&fib_info_lock);
+
+ return -1;
+}
+
+void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
+ int z, int tb_id,
+ struct nlmsghdr *n, struct netlink_skb_parms *req)
+{
+ struct sk_buff *skb;
+ u32 pid = req ? req->pid : 0;
+ int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
+
+ skb = alloc_skb(size, GFP_KERNEL);
+ if (!skb)
+ return;
+
+ if (fib_dump_info(skb, pid, n->nlmsg_seq, event, tb_id,
+ fa->fa_type, fa->fa_scope, &key, z,
+ fa->fa_tos,
+ fa->fa_info) < 0) {
+ kfree_skb(skb);
+ return;
+ }
+ NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_ROUTE;
+ if (n->nlmsg_flags&NLM_F_ECHO)
+ atomic_inc(&skb->users);
+ netlink_broadcast(rtnl, skb, pid, RTMGRP_IPV4_ROUTE, GFP_KERNEL);
+ if (n->nlmsg_flags&NLM_F_ECHO)
+ netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
+}
+
+/* Return the first fib alias matching TOS with
+ * priority less than or equal to PRIO.
+ */
+struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
+{
+ if (fah) {
+ struct fib_alias *fa;
+ list_for_each_entry(fa, fah, fa_list) {
+ if (fa->fa_tos > tos)
+ continue;
+ if (fa->fa_info->fib_priority >= prio ||
+ fa->fa_tos < tos)
+ return fa;
+ }
+ }
+ return NULL;
+}
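+
+/* Together with the insertion done in fn_hash_insert(), this keeps each
+ * fn_alias list sorted by decreasing fa_tos and, within one TOS value, by
+ * increasing fib_priority, e.g. (tos 16, prio 0) before (tos 0, prio 0)
+ * before (tos 0, prio 10).
+ */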
+
+int fib_detect_death(struct fib_info *fi, int order,
+ struct fib_info **last_resort, int *last_idx, int *dflt)
+{
+ struct neighbour *n;
+ int state = NUD_NONE;
+
+ n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
+ if (n) {
+ state = n->nud_state;
+ neigh_release(n);
+ }
+ if (state==NUD_REACHABLE)
+ return 0;
+ if ((state&NUD_VALID) && order != *dflt)
+ return 0;
+ if ((state&NUD_VALID) ||
+ (*last_idx<0 && order > *dflt)) {
+ *last_resort = fi;
+ *last_idx = order;
+ }
+ return 1;
+}
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+static u32 fib_get_attr32(struct rtattr *attr, int attrlen, int type)
+{
+ while (RTA_OK(attr,attrlen)) {
+ if (attr->rta_type == type)
+ return *(u32*)RTA_DATA(attr);
+ attr = RTA_NEXT(attr, attrlen);
+ }
+ return 0;
+}
+
+static int
+fib_count_nexthops(struct rtattr *rta)
+{
+ int nhs = 0;
+ struct rtnexthop *nhp = RTA_DATA(rta);
+ int nhlen = RTA_PAYLOAD(rta);
+
+ while (nhlen >= (int)sizeof(struct rtnexthop)) {
+ if ((nhlen -= nhp->rtnh_len) < 0)
+ return 0;
+ nhs++;
+ nhp = RTNH_NEXT(nhp);
+ };
+ return nhs;
+}
+
+static int
+fib_get_nhs(struct fib_info *fi, const struct rtattr *rta, const struct rtmsg *r)
+{
+ struct rtnexthop *nhp = RTA_DATA(rta);
+ int nhlen = RTA_PAYLOAD(rta);
+
+ change_nexthops(fi) {
+ int attrlen = nhlen - sizeof(struct rtnexthop);
+ if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
+ return -EINVAL;
+ nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags;
+ nh->nh_oif = nhp->rtnh_ifindex;
+ nh->nh_weight = nhp->rtnh_hops + 1;
+ if (attrlen) {
+ nh->nh_gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
+#ifdef CONFIG_NET_CLS_ROUTE
+ nh->nh_tclassid = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
+#endif
+ }
+ nhp = RTNH_NEXT(nhp);
+ } endfor_nexthops(fi);
+ return 0;
+}
+
+#endif
+
+int fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct kern_rta *rta,
+ struct fib_info *fi)
+{
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ struct rtnexthop *nhp;
+ int nhlen;
+#endif
+
+ if (rta->rta_priority &&
+ *rta->rta_priority != fi->fib_priority)
+ return 1;
+
+ if (rta->rta_oif || rta->rta_gw) {
+ if ((!rta->rta_oif || *rta->rta_oif == fi->fib_nh->nh_oif) &&
+ (!rta->rta_gw || memcmp(rta->rta_gw, &fi->fib_nh->nh_gw, 4) == 0))
+ return 0;
+ return 1;
+ }
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (rta->rta_mp == NULL)
+ return 0;
+ nhp = RTA_DATA(rta->rta_mp);
+ nhlen = RTA_PAYLOAD(rta->rta_mp);
+
+ for_nexthops(fi) {
+ int attrlen = nhlen - sizeof(struct rtnexthop);
+ u32 gw;
+
+ if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
+ return -EINVAL;
+ if (nhp->rtnh_ifindex && nhp->rtnh_ifindex != nh->nh_oif)
+ return 1;
+ if (attrlen) {
+ gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
+ if (gw && gw != nh->nh_gw)
+ return 1;
+#ifdef CONFIG_NET_CLS_ROUTE
+ gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
+ if (gw && gw != nh->nh_tclassid)
+ return 1;
+#endif
+ }
+ nhp = RTNH_NEXT(nhp);
+ } endfor_nexthops(fi);
+#endif
+ return 0;
+}
+
+
+/*
+ Picture
+ -------
+
+ The semantics of nexthops are very messy for historical reasons.
+ We have to take into account that:
+ a) the gateway can actually be a local interface address,
+ so that the gatewayed route is direct.
+ b) the gateway must be an on-link address, possibly
+ described not by an ifaddr but by a direct route.
+ c) if both gateway and interface are specified, they must not
+ contradict each other.
+ d) if we use tunnel routes, the gateway may not be on-link.
+
+ Attempting to reconcile all of these (alas, self-contradictory)
+ conditions results in pretty ugly and hairy code with obscure logic.
+
+ I chose to generalize it instead, so that the size
+ of the code barely increases, but it becomes
+ much more general.
+ Every prefix is assigned a "scope" value: "host" is a local address,
+ "link" is a direct route,
+ [ ... "site" ... "interior" ... ]
+ and "universe" is a true gateway route with global meaning.
+
+ Every prefix refers to a set of "nexthop"s (gw, oif),
+ where gw must have a narrower scope. This recursion stops
+ when gw has LOCAL scope or when the "nexthop" is declared ONLINK,
+ which forces gw to be on-link.
+
+ The code is still hairy, but now it is apparently logically
+ consistent and very flexible. E.g. as a by-product it allows
+ independent exterior and interior routing processes to
+ coexist in peace.
+
+ Normally it looks like the following:
+
+ {universe prefix} -> (gw, oif) [scope link]
+                       |
+                       |-> {link prefix} -> (gw, oif) [scope local]
+                                             |
+                                             |-> {local prefix} (terminal node)
+ */
+
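+/* A concrete instance of the picture above (hypothetical addresses): the
+ * universe-scope route "default via 10.0.0.1 dev eth0" has a gateway that
+ * fib_check_nh() below resolves through the link-scope prefix
+ * 10.0.0.0/24 on eth0, which in turn terminates at the host-scope local
+ * address 10.0.0.2 on eth0.
+ */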
+static int fib_check_nh(const struct rtmsg *r, struct fib_info *fi, struct fib_nh *nh)
+{
+ int err;
+
+ if (nh->nh_gw) {
+ struct fib_result res;
+
+#ifdef CONFIG_IP_ROUTE_PERVASIVE
+ if (nh->nh_flags&RTNH_F_PERVASIVE)
+ return 0;
+#endif
+ if (nh->nh_flags&RTNH_F_ONLINK) {
+ struct net_device *dev;
+
+ if (r->rtm_scope >= RT_SCOPE_LINK)
+ return -EINVAL;
+ if (inet_addr_type(nh->nh_gw) != RTN_UNICAST)
+ return -EINVAL;
+ if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL)
+ return -ENODEV;
+ if (!(dev->flags&IFF_UP))
+ return -ENETDOWN;
+ nh->nh_dev = dev;
+ dev_hold(dev);
+ nh->nh_scope = RT_SCOPE_LINK;
+ return 0;
+ }
+ {
+ struct flowi fl = { .nl_u = { .ip4_u =
+ { .daddr = nh->nh_gw,
+ .scope = r->rtm_scope + 1 } },
+ .oif = nh->nh_oif };
+
+ /* It is not necessary, but requires a bit of thinking */
+ if (fl.fl4_scope < RT_SCOPE_LINK)
+ fl.fl4_scope = RT_SCOPE_LINK;
+ if ((err = fib_lookup(&fl, &res)) != 0)
+ return err;
+ }
+ err = -EINVAL;
+ if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
+ goto out;
+ nh->nh_scope = res.scope;
+ nh->nh_oif = FIB_RES_OIF(res);
+ if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
+ goto out;
+ dev_hold(nh->nh_dev);
+ err = -ENETDOWN;
+ if (!(nh->nh_dev->flags & IFF_UP))
+ goto out;
+ err = 0;
+out:
+ fib_res_put(&res);
+ return err;
+ } else {
+ struct in_device *in_dev;
+
+ if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
+ return -EINVAL;
+
+ in_dev = inetdev_by_index(nh->nh_oif);
+ if (in_dev == NULL)
+ return -ENODEV;
+ if (!(in_dev->dev->flags&IFF_UP)) {
+ in_dev_put(in_dev);
+ return -ENETDOWN;
+ }
+ nh->nh_dev = in_dev->dev;
+ dev_hold(nh->nh_dev);
+ nh->nh_scope = RT_SCOPE_HOST;
+ in_dev_put(in_dev);
+ }
+ return 0;
+}
+
+static inline unsigned int fib_laddr_hashfn(u32 val)
+{
+ unsigned int mask = (fib_hash_size - 1);
+
+ return (val ^ (val >> 7) ^ (val >> 14)) & mask;
+}
+
+static struct hlist_head *fib_hash_alloc(int bytes)
+{
+ if (bytes <= PAGE_SIZE)
+ return kmalloc(bytes, GFP_KERNEL);
+ else
+ return (struct hlist_head *)
+ __get_free_pages(GFP_KERNEL, get_order(bytes));
+}
+
+static void fib_hash_free(struct hlist_head *hash, int bytes)
+{
+ if (!hash)
+ return;
+
+ if (bytes <= PAGE_SIZE)
+ kfree(hash);
+ else
+ free_pages((unsigned long) hash, get_order(bytes));
+}
+
+static void fib_hash_move(struct hlist_head *new_info_hash,
+ struct hlist_head *new_laddrhash,
+ unsigned int new_size)
+{
+ unsigned int old_size = fib_hash_size;
+ unsigned int i;
+
+ write_lock(&fib_info_lock);
+ fib_hash_size = new_size;
+
+ for (i = 0; i < old_size; i++) {
+ struct hlist_head *head = &fib_info_hash[i];
+ struct hlist_node *node, *n;
+ struct fib_info *fi;
+
+ hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
+ struct hlist_head *dest;
+ unsigned int new_hash;
+
+ hlist_del(&fi->fib_hash);
+
+ new_hash = fib_info_hashfn(fi);
+ dest = &new_info_hash[new_hash];
+ hlist_add_head(&fi->fib_hash, dest);
+ }
+ }
+ fib_info_hash = new_info_hash;
+
+ for (i = 0; i < old_size; i++) {
+ struct hlist_head *lhead = &fib_info_laddrhash[i];
+ struct hlist_node *node, *n;
+ struct fib_info *fi;
+
+ hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
+ struct hlist_head *ldest;
+ unsigned int new_hash;
+
+ hlist_del(&fi->fib_lhash);
+
+ new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
+ ldest = &new_laddrhash[new_hash];
+ hlist_add_head(&fi->fib_lhash, ldest);
+ }
+ }
+ fib_info_laddrhash = new_laddrhash;
+
+ write_unlock(&fib_info_lock);
+}
+
+struct fib_info *
+fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
+ const struct nlmsghdr *nlh, int *errp)
+{
+ int err;
+ struct fib_info *fi = NULL;
+ struct fib_info *ofi;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ int nhs = 1;
+#else
+ const int nhs = 1;
+#endif
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+ u32 mp_alg = IP_MP_ALG_NONE;
+#endif
+
+ /* Fast check to catch the weirdest cases */
+ if (fib_props[r->rtm_type].scope > r->rtm_scope)
+ goto err_inval;
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (rta->rta_mp) {
+ nhs = fib_count_nexthops(rta->rta_mp);
+ if (nhs == 0)
+ goto err_inval;
+ }
+#endif
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+ if (rta->rta_mp_alg) {
+ mp_alg = *rta->rta_mp_alg;
+
+ if (mp_alg < IP_MP_ALG_NONE ||
+ mp_alg > IP_MP_ALG_MAX)
+ goto err_inval;
+ }
+#endif
+
+ err = -ENOBUFS;
+ if (fib_info_cnt >= fib_hash_size) {
+ unsigned int new_size = fib_hash_size << 1;
+ struct hlist_head *new_info_hash;
+ struct hlist_head *new_laddrhash;
+ unsigned int bytes;
+
+ if (!new_size)
+ new_size = 1;
+ bytes = new_size * sizeof(struct hlist_head *);
+ new_info_hash = fib_hash_alloc(bytes);
+ new_laddrhash = fib_hash_alloc(bytes);
+ if (!new_info_hash || !new_laddrhash) {
+ fib_hash_free(new_info_hash, bytes);
+ fib_hash_free(new_laddrhash, bytes);
+ } else {
+ memset(new_info_hash, 0, bytes);
+ memset(new_laddrhash, 0, bytes);
+
+ fib_hash_move(new_info_hash, new_laddrhash, new_size);
+ }
+
+ if (!fib_hash_size)
+ goto failure;
+ }
+
+ fi = kmalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
+ if (fi == NULL)
+ goto failure;
+ fib_info_cnt++;
+ memset(fi, 0, sizeof(*fi)+nhs*sizeof(struct fib_nh));
+
+ fi->fib_protocol = r->rtm_protocol;
+
+ fi->fib_nhs = nhs;
+ change_nexthops(fi) {
+ nh->nh_parent = fi;
+ } endfor_nexthops(fi)
+
+ fi->fib_flags = r->rtm_flags;
+ if (rta->rta_priority)
+ fi->fib_priority = *rta->rta_priority;
+ if (rta->rta_mx) {
+ int attrlen = RTA_PAYLOAD(rta->rta_mx);
+ struct rtattr *attr = RTA_DATA(rta->rta_mx);
+
+ while (RTA_OK(attr, attrlen)) {
+ unsigned flavor = attr->rta_type;
+ if (flavor) {
+ if (flavor > RTAX_MAX)
+ goto err_inval;
+ fi->fib_metrics[flavor-1] = *(unsigned*)RTA_DATA(attr);
+ }
+ attr = RTA_NEXT(attr, attrlen);
+ }
+ }
+ if (rta->rta_prefsrc)
+ memcpy(&fi->fib_prefsrc, rta->rta_prefsrc, 4);
+
+ if (rta->rta_mp) {
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if ((err = fib_get_nhs(fi, rta->rta_mp, r)) != 0)
+ goto failure;
+ if (rta->rta_oif && fi->fib_nh->nh_oif != *rta->rta_oif)
+ goto err_inval;
+ if (rta->rta_gw && memcmp(&fi->fib_nh->nh_gw, rta->rta_gw, 4))
+ goto err_inval;
+#ifdef CONFIG_NET_CLS_ROUTE
+ if (rta->rta_flow && memcmp(&fi->fib_nh->nh_tclassid, rta->rta_flow, 4))
+ goto err_inval;
+#endif
+#else
+ goto err_inval;
+#endif
+ } else {
+ struct fib_nh *nh = fi->fib_nh;
+ if (rta->rta_oif)
+ nh->nh_oif = *rta->rta_oif;
+ if (rta->rta_gw)
+ memcpy(&nh->nh_gw, rta->rta_gw, 4);
+#ifdef CONFIG_NET_CLS_ROUTE
+ if (rta->rta_flow)
+ memcpy(&nh->nh_tclassid, rta->rta_flow, 4);
+#endif
+ nh->nh_flags = r->rtm_flags;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ nh->nh_weight = 1;
+#endif
+ }
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+ fi->fib_mp_alg = mp_alg;
+#endif
+
+ if (fib_props[r->rtm_type].error) {
+ if (rta->rta_gw || rta->rta_oif || rta->rta_mp)
+ goto err_inval;
+ goto link_it;
+ }
+
+ if (r->rtm_scope > RT_SCOPE_HOST)
+ goto err_inval;
+
+ if (r->rtm_scope == RT_SCOPE_HOST) {
+ struct fib_nh *nh = fi->fib_nh;
+
+ /* Local address is added. */
+ if (nhs != 1 || nh->nh_gw)
+ goto err_inval;
+ nh->nh_scope = RT_SCOPE_NOWHERE;
+ nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif);
+ err = -ENODEV;
+ if (nh->nh_dev == NULL)
+ goto failure;
+ } else {
+ change_nexthops(fi) {
+ if ((err = fib_check_nh(r, fi, nh)) != 0)
+ goto failure;
+ } endfor_nexthops(fi)
+ }
+
+ if (fi->fib_prefsrc) {
+ if (r->rtm_type != RTN_LOCAL || rta->rta_dst == NULL ||
+ memcmp(&fi->fib_prefsrc, rta->rta_dst, 4))
+ if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
+ goto err_inval;
+ }
+
+link_it:
+ if ((ofi = fib_find_info(fi)) != NULL) {
+ fi->fib_dead = 1;
+ free_fib_info(fi);
+ ofi->fib_treeref++;
+ return ofi;
+ }
+
+ fi->fib_treeref++;
+ atomic_inc(&fi->fib_clntref);
+ write_lock(&fib_info_lock);
+ hlist_add_head(&fi->fib_hash,
+ &fib_info_hash[fib_info_hashfn(fi)]);
+ if (fi->fib_prefsrc) {
+ struct hlist_head *head;
+
+ head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
+ hlist_add_head(&fi->fib_lhash, head);
+ }
+ change_nexthops(fi) {
+ struct hlist_head *head;
+ unsigned int hash;
+
+ if (!nh->nh_dev)
+ continue;
+ hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
+ head = &fib_info_devhash[hash];
+ hlist_add_head(&nh->nh_hash, head);
+ } endfor_nexthops(fi)
+ write_unlock(&fib_info_lock);
+ return fi;
+
+err_inval:
+ err = -EINVAL;
+
+failure:
+ *errp = err;
+ if (fi) {
+ fi->fib_dead = 1;
+ free_fib_info(fi);
+ }
+ return NULL;
+}
+
+int fib_semantic_match(struct list_head *head, const struct flowi *flp,
+ struct fib_result *res, __u32 zone, __u32 mask,
+ int prefixlen)
+{
+ struct fib_alias *fa;
+ int nh_sel = 0;
+
+ list_for_each_entry(fa, head, fa_list) {
+ int err;
+
+ if (fa->fa_tos &&
+ fa->fa_tos != flp->fl4_tos)
+ continue;
+
+ if (fa->fa_scope < flp->fl4_scope)
+ continue;
+
+ fa->fa_state |= FA_S_ACCESSED;
+
+ err = fib_props[fa->fa_type].error;
+ if (err == 0) {
+ struct fib_info *fi = fa->fa_info;
+
+ if (fi->fib_flags & RTNH_F_DEAD)
+ continue;
+
+ switch (fa->fa_type) {
+ case RTN_UNICAST:
+ case RTN_LOCAL:
+ case RTN_BROADCAST:
+ case RTN_ANYCAST:
+ case RTN_MULTICAST:
+ for_nexthops(fi) {
+ if (nh->nh_flags&RTNH_F_DEAD)
+ continue;
+ if (!flp->oif || flp->oif == nh->nh_oif)
+ break;
+ }
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (nhsel < fi->fib_nhs) {
+ nh_sel = nhsel;
+ goto out_fill_res;
+ }
+#else
+ if (nhsel < 1) {
+ goto out_fill_res;
+ }
+#endif
+ endfor_nexthops(fi);
+ continue;
+
+ default:
+ printk(KERN_DEBUG "impossible 102\n");
+ return -EINVAL;
+ };
+ }
+ return err;
+ }
+ return 1;
+
+out_fill_res:
+ res->prefixlen = prefixlen;
+ res->nh_sel = nh_sel;
+ res->type = fa->fa_type;
+ res->scope = fa->fa_scope;
+ res->fi = fa->fa_info;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+ res->netmask = mask;
+ res->network = zone &
+ (0xFFFFFFFF >> (32 - prefixlen));
+#endif
+ atomic_inc(&res->fi->fib_clntref);
+ return 0;
+}
+
+/* Find the appropriate source address for this destination */
+
+u32 __fib_res_prefsrc(struct fib_result *res)
+{
+ return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
+}
+
+int
+fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
+ u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos,
+ struct fib_info *fi)
+{
+ struct rtmsg *rtm;
+ struct nlmsghdr *nlh;
+ unsigned char *b = skb->tail;
+
+ nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*rtm));
+ rtm = NLMSG_DATA(nlh);
+ rtm->rtm_family = AF_INET;
+ rtm->rtm_dst_len = dst_len;
+ rtm->rtm_src_len = 0;
+ rtm->rtm_tos = tos;
+ rtm->rtm_table = tb_id;
+ rtm->rtm_type = type;
+ rtm->rtm_flags = fi->fib_flags;
+ rtm->rtm_scope = scope;
+ if (rtm->rtm_dst_len)
+ RTA_PUT(skb, RTA_DST, 4, dst);
+ rtm->rtm_protocol = fi->fib_protocol;
+ if (fi->fib_priority)
+ RTA_PUT(skb, RTA_PRIORITY, 4, &fi->fib_priority);
+#ifdef CONFIG_NET_CLS_ROUTE
+ if (fi->fib_nh[0].nh_tclassid)
+ RTA_PUT(skb, RTA_FLOW, 4, &fi->fib_nh[0].nh_tclassid);
+#endif
+ if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
+ goto rtattr_failure;
+ if (fi->fib_prefsrc)
+ RTA_PUT(skb, RTA_PREFSRC, 4, &fi->fib_prefsrc);
+ if (fi->fib_nhs == 1) {
+ if (fi->fib_nh->nh_gw)
+ RTA_PUT(skb, RTA_GATEWAY, 4, &fi->fib_nh->nh_gw);
+ if (fi->fib_nh->nh_oif)
+ RTA_PUT(skb, RTA_OIF, sizeof(int), &fi->fib_nh->nh_oif);
+ }
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (fi->fib_nhs > 1) {
+ struct rtnexthop *nhp;
+ struct rtattr *mp_head;
+ if (skb_tailroom(skb) <= RTA_SPACE(0))
+ goto rtattr_failure;
+ mp_head = (struct rtattr*)skb_put(skb, RTA_SPACE(0));
+
+ for_nexthops(fi) {
+ if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
+ goto rtattr_failure;
+ nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
+ nhp->rtnh_flags = nh->nh_flags & 0xFF;
+ nhp->rtnh_hops = nh->nh_weight-1;
+ nhp->rtnh_ifindex = nh->nh_oif;
+ if (nh->nh_gw)
+ RTA_PUT(skb, RTA_GATEWAY, 4, &nh->nh_gw);
+ nhp->rtnh_len = skb->tail - (unsigned char*)nhp;
+ } endfor_nexthops(fi);
+ mp_head->rta_type = RTA_MULTIPATH;
+ mp_head->rta_len = skb->tail - (u8*)mp_head;
+ }
+#endif
+ nlh->nlmsg_len = skb->tail - b;
+ return skb->len;
+
+nlmsg_failure:
+rtattr_failure:
+ skb_trim(skb, b - skb->data);
+ return -1;
+}
+
+#ifndef CONFIG_IP_NOSIOCRT
+
+int
+fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
+ struct kern_rta *rta, struct rtentry *r)
+{
+ int plen;
+ u32 *ptr;
+
+ memset(rtm, 0, sizeof(*rtm));
+ memset(rta, 0, sizeof(*rta));
+
+ if (r->rt_dst.sa_family != AF_INET)
+ return -EAFNOSUPPORT;
+
+ /* Check the mask for validity:
+ a) it must be contiguous.
+ b) the destination must have all host bits clear.
+ c) if the application forgot to set the correct family (AF_INET),
+ reject the request unless it is absolutely clear, i.e.
+ both family and mask are zero.
+ */
+ plen = 32;
+ ptr = &((struct sockaddr_in*)&r->rt_dst)->sin_addr.s_addr;
+ if (!(r->rt_flags&RTF_HOST)) {
+ u32 mask = ((struct sockaddr_in*)&r->rt_genmask)->sin_addr.s_addr;
+ if (r->rt_genmask.sa_family != AF_INET) {
+ if (mask || r->rt_genmask.sa_family)
+ return -EAFNOSUPPORT;
+ }
+ if (bad_mask(mask, *ptr))
+ return -EINVAL;
+ plen = inet_mask_len(mask);
+ }
+
+ nl->nlmsg_flags = NLM_F_REQUEST;
+ nl->nlmsg_pid = 0;
+ nl->nlmsg_seq = 0;
+ nl->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm));
+ if (cmd == SIOCDELRT) {
+ nl->nlmsg_type = RTM_DELROUTE;
+ nl->nlmsg_flags = 0;
+ } else {
+ nl->nlmsg_type = RTM_NEWROUTE;
+ nl->nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE;
+ rtm->rtm_protocol = RTPROT_BOOT;
+ }
+
+ rtm->rtm_dst_len = plen;
+ rta->rta_dst = ptr;
+
+ if (r->rt_metric) {
+ *(u32*)&r->rt_pad3 = r->rt_metric - 1;
+ rta->rta_priority = (u32*)&r->rt_pad3;
+ }
+ if (r->rt_flags&RTF_REJECT) {
+ rtm->rtm_scope = RT_SCOPE_HOST;
+ rtm->rtm_type = RTN_UNREACHABLE;
+ return 0;
+ }
+ rtm->rtm_scope = RT_SCOPE_NOWHERE;
+ rtm->rtm_type = RTN_UNICAST;
+
+ if (r->rt_dev) {
+ char *colon;
+ struct net_device *dev;
+ char devname[IFNAMSIZ];
+
+ if (copy_from_user(devname, r->rt_dev, IFNAMSIZ-1))
+ return -EFAULT;
+ devname[IFNAMSIZ-1] = 0;
+ colon = strchr(devname, ':');
+ if (colon)
+ *colon = 0;
+ dev = __dev_get_by_name(devname);
+ if (!dev)
+ return -ENODEV;
+ rta->rta_oif = &dev->ifindex;
+ if (colon) {
+ struct in_ifaddr *ifa;
+ struct in_device *in_dev = __in_dev_get(dev);
+ if (!in_dev)
+ return -ENODEV;
+ *colon = ':';
+ for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
+ if (strcmp(ifa->ifa_label, devname) == 0)
+ break;
+ if (ifa == NULL)
+ return -ENODEV;
+ rta->rta_prefsrc = &ifa->ifa_local;
+ }
+ }
+
+ ptr = &((struct sockaddr_in*)&r->rt_gateway)->sin_addr.s_addr;
+ if (r->rt_gateway.sa_family == AF_INET && *ptr) {
+ rta->rta_gw = ptr;
+ if (r->rt_flags&RTF_GATEWAY && inet_addr_type(*ptr) == RTN_UNICAST)
+ rtm->rtm_scope = RT_SCOPE_UNIVERSE;
+ }
+
+ if (cmd == SIOCDELRT)
+ return 0;
+
+ if (r->rt_flags&RTF_GATEWAY && rta->rta_gw == NULL)
+ return -EINVAL;
+
+ if (rtm->rtm_scope == RT_SCOPE_NOWHERE)
+ rtm->rtm_scope = RT_SCOPE_LINK;
+
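+	/* Translate the legacy rtentry metrics into RTAX_* attributes:
+	 * RTF_MTU becomes an advertised MSS (the MTU less 40 bytes of
+	 * IP+TCP header), RTF_WINDOW maps to RTAX_WINDOW and RTF_IRTT
+	 * is scaled (<<3) into RTAX_RTT.
+	 */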
+ if (r->rt_flags&(RTF_MTU|RTF_WINDOW|RTF_IRTT)) {
+ struct rtattr *rec;
+ struct rtattr *mx = kmalloc(RTA_LENGTH(3*RTA_LENGTH(4)), GFP_KERNEL);
+ if (mx == NULL)
+ return -ENOMEM;
+ rta->rta_mx = mx;
+ mx->rta_type = RTA_METRICS;
+ mx->rta_len = RTA_LENGTH(0);
+ if (r->rt_flags&RTF_MTU) {
+ rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
+ rec->rta_type = RTAX_ADVMSS;
+ rec->rta_len = RTA_LENGTH(4);
+ mx->rta_len += RTA_LENGTH(4);
+ *(u32*)RTA_DATA(rec) = r->rt_mtu - 40;
+ }
+ if (r->rt_flags&RTF_WINDOW) {
+ rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
+ rec->rta_type = RTAX_WINDOW;
+ rec->rta_len = RTA_LENGTH(4);
+ mx->rta_len += RTA_LENGTH(4);
+ *(u32*)RTA_DATA(rec) = r->rt_window;
+ }
+ if (r->rt_flags&RTF_IRTT) {
+ rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
+ rec->rta_type = RTAX_RTT;
+ rec->rta_len = RTA_LENGTH(4);
+ mx->rta_len += RTA_LENGTH(4);
+ *(u32*)RTA_DATA(rec) = r->rt_irtt<<3;
+ }
+ }
+ return 0;
+}
+
+#endif
+
+/*
+ Update FIB if:
+ - local address disappeared -> we must delete all the entries
+ referring to it.
+   - device went down -> we must shut down all nexthops going via it.
+ */
+
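+/* Returns the number of fib_info entries that were marked RTNH_F_DEAD. */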
+int fib_sync_down(u32 local, struct net_device *dev, int force)
+{
+ int ret = 0;
+ int scope = RT_SCOPE_NOWHERE;
+
+ if (force)
+ scope = -1;
+
+ if (local && fib_info_laddrhash) {
+ unsigned int hash = fib_laddr_hashfn(local);
+ struct hlist_head *head = &fib_info_laddrhash[hash];
+ struct hlist_node *node;
+ struct fib_info *fi;
+
+ hlist_for_each_entry(fi, node, head, fib_lhash) {
+ if (fi->fib_prefsrc == local) {
+ fi->fib_flags |= RTNH_F_DEAD;
+ ret++;
+ }
+ }
+ }
+
+ if (dev) {
+ struct fib_info *prev_fi = NULL;
+ unsigned int hash = fib_devindex_hashfn(dev->ifindex);
+ struct hlist_head *head = &fib_info_devhash[hash];
+ struct hlist_node *node;
+ struct fib_nh *nh;
+
+ hlist_for_each_entry(nh, node, head, nh_hash) {
+ struct fib_info *fi = nh->nh_parent;
+ int dead;
+
+ BUG_ON(!fi->fib_nhs);
+ if (nh->nh_dev != dev || fi == prev_fi)
+ continue;
+ prev_fi = fi;
+ dead = 0;
+ change_nexthops(fi) {
+ if (nh->nh_flags&RTNH_F_DEAD)
+ dead++;
+ else if (nh->nh_dev == dev &&
+ nh->nh_scope != scope) {
+ nh->nh_flags |= RTNH_F_DEAD;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ spin_lock_bh(&fib_multipath_lock);
+ fi->fib_power -= nh->nh_power;
+ nh->nh_power = 0;
+ spin_unlock_bh(&fib_multipath_lock);
+#endif
+ dead++;
+ }
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (force > 1 && nh->nh_dev == dev) {
+ dead = fi->fib_nhs;
+ break;
+ }
+#endif
+ } endfor_nexthops(fi)
+ if (dead == fi->fib_nhs) {
+ fi->fib_flags |= RTNH_F_DEAD;
+ ret++;
+ }
+ }
+ }
+
+ return ret;
+}
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+/*
+ Dead device goes up. We wake up dead nexthops.
+   It makes sense only on multipath routes.
+ */
+
+int fib_sync_up(struct net_device *dev)
+{
+ struct fib_info *prev_fi;
+ unsigned int hash;
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct fib_nh *nh;
+ int ret;
+
+ if (!(dev->flags&IFF_UP))
+ return 0;
+
+ prev_fi = NULL;
+ hash = fib_devindex_hashfn(dev->ifindex);
+ head = &fib_info_devhash[hash];
+ ret = 0;
+
+ hlist_for_each_entry(nh, node, head, nh_hash) {
+ struct fib_info *fi = nh->nh_parent;
+ int alive;
+
+ BUG_ON(!fi->fib_nhs);
+ if (nh->nh_dev != dev || fi == prev_fi)
+ continue;
+
+ prev_fi = fi;
+ alive = 0;
+ change_nexthops(fi) {
+ if (!(nh->nh_flags&RTNH_F_DEAD)) {
+ alive++;
+ continue;
+ }
+ if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
+ continue;
+ if (nh->nh_dev != dev || __in_dev_get(dev) == NULL)
+ continue;
+ alive++;
+ spin_lock_bh(&fib_multipath_lock);
+ nh->nh_power = 0;
+ nh->nh_flags &= ~RTNH_F_DEAD;
+ spin_unlock_bh(&fib_multipath_lock);
+ } endfor_nexthops(fi)
+
+ if (alive > 0) {
+ fi->fib_flags &= ~RTNH_F_DEAD;
+ ret++;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ The algorithm is suboptimal, but it provides really
+ fair weighted route distribution.
+ */
+
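+/* Each alive nexthop holds nh_weight tokens in nh_power and fi->fib_power
+ * is the sum of the remaining tokens. Every selection below spends one
+ * token of the chosen nexthop; once the pool runs out it is refilled from
+ * the weights, so over a refill cycle each nexthop is used in proportion
+ * to its weight.
+ */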
+void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
+{
+ struct fib_info *fi = res->fi;
+ int w;
+
+ spin_lock_bh(&fib_multipath_lock);
+ if (fi->fib_power <= 0) {
+ int power = 0;
+ change_nexthops(fi) {
+ if (!(nh->nh_flags&RTNH_F_DEAD)) {
+ power += nh->nh_weight;
+ nh->nh_power = nh->nh_weight;
+ }
+ } endfor_nexthops(fi);
+ fi->fib_power = power;
+ if (power <= 0) {
+ spin_unlock_bh(&fib_multipath_lock);
+ /* Race condition: route has just become dead. */
+ res->nh_sel = 0;
+ return;
+ }
+ }
+
+
+	/* w should be a random number in [0..fi->fib_power-1];
+	   jiffies is a pretty bad approximation of one.
+	 */
+
+ w = jiffies % fi->fib_power;
+
+ change_nexthops(fi) {
+ if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
+ if ((w -= nh->nh_power) <= 0) {
+ nh->nh_power--;
+ fi->fib_power--;
+ res->nh_sel = nhsel;
+ spin_unlock_bh(&fib_multipath_lock);
+ return;
+ }
+ }
+ } endfor_nexthops(fi);
+
+ /* Race condition: route has just become dead. */
+ res->nh_sel = 0;
+ spin_unlock_bh(&fib_multipath_lock);
+}
+#endif
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
new file mode 100644
index 00000000000..85bf0d3e294
--- /dev/null
+++ b/net/ipv4/icmp.c
@@ -0,0 +1,1143 @@
+/*
+ * NET3: Implementation of the ICMP protocol layer.
+ *
+ * Alan Cox, <alan@redhat.com>
+ *
+ * Version: $Id: icmp.c,v 1.85 2002/02/01 22:01:03 davem Exp $
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Some of the function names and the icmp unreach table for this
+ * module were derived from [icmp.c 1.0.11 06/02/93] by
+ * Ross Biro, Fred N. van Kempen, Mark Evans, Alan Cox, Gerhard Koerting.
+ * Other than that this module is a complete rewrite.
+ *
+ * Fixes:
+ * Clemens Fruhwirth : introduce global icmp rate limiting
+ * with icmp type masking ability instead
+ * of broken per type icmp timeouts.
+ * Mike Shaver : RFC1122 checks.
+ * Alan Cox : Multicast ping reply as self.
+ * Alan Cox : Fix atomicity lockup in ip_build_xmit
+ * call.
+ * Alan Cox : Added 216,128 byte paths to the MTU
+ * code.
+ * Martin Mares : RFC1812 checks.
+ * Martin Mares : Can be configured to follow redirects
+ * if acting as a router _without_ a
+ * routing protocol (RFC 1812).
+ * Martin Mares : Echo requests may be configured to
+ * be ignored (RFC 1812).
+ * Martin Mares : Limitation of ICMP error message
+ * transmit rate (RFC 1812).
+ * Martin Mares : TOS and Precedence set correctly
+ * (RFC 1812).
+ * Martin Mares : Now copying as much data from the
+ * original packet as we can without
+ * exceeding 576 bytes (RFC 1812).
+ * Willy Konynenberg : Transparent proxying support.
+ * Keith Owens : RFC1191 correction for 4.2BSD based
+ * path MTU bug.
+ * Thomas Quinot : ICMP Dest Unreach codes up to 15 are
+ * valid (RFC 1812).
+ * Andi Kleen : Check all packet lengths properly
+ * and moved all kfree_skb() up to
+ * icmp_rcv.
+ * Andi Kleen : Move the rate limit bookkeeping
+ * into the dest entry and use a token
+ * bucket filter (thanks to ANK). Make
+ * the rates sysctl configurable.
+ * Yu Tianli : Fixed two ugly bugs in icmp_send
+ * - IP option length was accounted wrongly
+ * - ICMP header length was not accounted
+ * at all.
+ * Tristan Greaves : Added sysctl option to ignore bogus
+ * broadcast responses from broken routers.
+ *
+ * To Fix:
+ *
+ * - Should use skb_pull() instead of all the manual checking.
+ *	This would also greatly simplify some upper layer error handlers. --AK
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/fcntl.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/string.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/protocol.h>
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/init.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <net/checksum.h>
+
+/*
+ * Build xmit assembly blocks
+ */
+
+struct icmp_bxm {
+ struct sk_buff *skb;
+ int offset;
+ int data_len;
+
+ struct {
+ struct icmphdr icmph;
+ __u32 times[3];
+ } data;
+ int head_len;
+ struct ip_options replyopts;
+ unsigned char optbuf[40];
+};
+
+/*
+ * Statistics
+ */
+DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics);
+
+/* An array of errno for error messages from dest unreach. */
+/* RFC 1122: 3.2.2.1 states that NET_UNREACH, HOST_UNREACH and SR_FAILED
+ * MUST be considered 'transient errs'.
+ */
+
+struct icmp_err icmp_err_convert[] = {
+ {
+ .errno = ENETUNREACH, /* ICMP_NET_UNREACH */
+ .fatal = 0,
+ },
+ {
+ .errno = EHOSTUNREACH, /* ICMP_HOST_UNREACH */
+ .fatal = 0,
+ },
+ {
+ .errno = ENOPROTOOPT /* ICMP_PROT_UNREACH */,
+ .fatal = 1,
+ },
+ {
+ .errno = ECONNREFUSED, /* ICMP_PORT_UNREACH */
+ .fatal = 1,
+ },
+ {
+ .errno = EMSGSIZE, /* ICMP_FRAG_NEEDED */
+ .fatal = 0,
+ },
+ {
+ .errno = EOPNOTSUPP, /* ICMP_SR_FAILED */
+ .fatal = 0,
+ },
+ {
+ .errno = ENETUNREACH, /* ICMP_NET_UNKNOWN */
+ .fatal = 1,
+ },
+ {
+ .errno = EHOSTDOWN, /* ICMP_HOST_UNKNOWN */
+ .fatal = 1,
+ },
+ {
+ .errno = ENONET, /* ICMP_HOST_ISOLATED */
+ .fatal = 1,
+ },
+ {
+ .errno = ENETUNREACH, /* ICMP_NET_ANO */
+ .fatal = 1,
+ },
+ {
+ .errno = EHOSTUNREACH, /* ICMP_HOST_ANO */
+ .fatal = 1,
+ },
+ {
+ .errno = ENETUNREACH, /* ICMP_NET_UNR_TOS */
+ .fatal = 0,
+ },
+ {
+ .errno = EHOSTUNREACH, /* ICMP_HOST_UNR_TOS */
+ .fatal = 0,
+ },
+ {
+ .errno = EHOSTUNREACH, /* ICMP_PKT_FILTERED */
+ .fatal = 1,
+ },
+ {
+ .errno = EHOSTUNREACH, /* ICMP_PREC_VIOLATION */
+ .fatal = 1,
+ },
+ {
+ .errno = EHOSTUNREACH, /* ICMP_PREC_CUTOFF */
+ .fatal = 1,
+ },
+};
+
+/* Control parameters for ECHO replies. */
+int sysctl_icmp_echo_ignore_all;
+int sysctl_icmp_echo_ignore_broadcasts;
+
+/* Control parameter - ignore bogus broadcast responses? */
+int sysctl_icmp_ignore_bogus_error_responses;
+
+/*
+ * Configurable global rate limit.
+ *
+ *	ratelimit defines tokens/packet consumed for the dst->rate_tokens bucket
+ *	ratemask defines which icmp types are ratelimited by setting
+ * 	the corresponding bit position.
+ *
+ * default:
+ * dest unreachable (3), source quench (4),
+ * time exceeded (11), parameter problem (12)
+ */
+
+int sysctl_icmp_ratelimit = 1 * HZ;
+int sysctl_icmp_ratemask = 0x1818;
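+/* 0x1818 == (1<<ICMP_DEST_UNREACH) | (1<<ICMP_SOURCE_QUENCH) |
+ *	     (1<<ICMP_TIME_EXCEEDED) | (1<<ICMP_PARAMETERPROB)
+ *	  == (1<<3) | (1<<4) | (1<<11) | (1<<12)
+ */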
+
+/*
+ * ICMP control array. This specifies what to do with each ICMP.
+ */
+
+struct icmp_control {
+ int output_entry; /* Field for increment on output */
+ int input_entry; /* Field for increment on input */
+ void (*handler)(struct sk_buff *skb);
+ short error; /* This ICMP is classed as an error message */
+};
+
+static struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
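+/* Forward declaration; the table itself is filled in near the end of this file. */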
+
+/*
+ * The ICMP socket(s). This is the most convenient way to flow control
+ * our ICMP output as well as maintain a clean interface throughout
+ * all layers. All Socketless IP sends will soon be gone.
+ *
+ * On SMP we have one ICMP socket per-cpu.
+ */
+static DEFINE_PER_CPU(struct socket *, __icmp_socket) = NULL;
+#define icmp_socket __get_cpu_var(__icmp_socket)
+
+static __inline__ int icmp_xmit_lock(void)
+{
+ local_bh_disable();
+
+ if (unlikely(!spin_trylock(&icmp_socket->sk->sk_lock.slock))) {
+ /* This can happen if the output path signals a
+ * dst_link_failure() for an outgoing ICMP packet.
+ */
+ local_bh_enable();
+ return 1;
+ }
+ return 0;
+}
+
+static void icmp_xmit_unlock(void)
+{
+ spin_unlock_bh(&icmp_socket->sk->sk_lock.slock);
+}
+
+/*
+ * Send an ICMP frame.
+ */
+
+/*
+ * Check transmit rate limitation for given message.
+ * The rate information is held in the destination cache now.
+ * This function is generic and could be used for other purposes
+ * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
+ *
+ * Note that the same dst_entry fields are modified by functions in
+ * route.c too, but these work for packet destinations while xrlim_allow
+ * works for icmp destinations. This means the rate limiting information
+ * for one "ip object" is shared - and these ICMPs are twice limited:
+ * by source and by destination.
+ *
+ * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
+ * SHOULD allow setting of rate limits
+ *
+ * Shared between ICMPv4 and ICMPv6.
+ */
+#define XRLIM_BURST_FACTOR 6
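+/* The bucket gains one token per jiffy of elapsed time, capped at
+ * XRLIM_BURST_FACTOR * timeout tokens.  Each message allowed through costs
+ * 'timeout' tokens, i.e. a steady rate of one message per 'timeout' jiffies
+ * with bursts of up to XRLIM_BURST_FACTOR messages.
+ */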
+int xrlim_allow(struct dst_entry *dst, int timeout)
+{
+ unsigned long now;
+ int rc = 0;
+
+ now = jiffies;
+ dst->rate_tokens += now - dst->rate_last;
+ dst->rate_last = now;
+ if (dst->rate_tokens > XRLIM_BURST_FACTOR * timeout)
+ dst->rate_tokens = XRLIM_BURST_FACTOR * timeout;
+ if (dst->rate_tokens >= timeout) {
+ dst->rate_tokens -= timeout;
+ rc = 1;
+ }
+ return rc;
+}
+
+static inline int icmpv4_xrlim_allow(struct rtable *rt, int type, int code)
+{
+ struct dst_entry *dst = &rt->u.dst;
+ int rc = 1;
+
+ if (type > NR_ICMP_TYPES)
+ goto out;
+
+ /* Don't limit PMTU discovery. */
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+ goto out;
+
+ /* No rate limit on loopback */
+ if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
+ goto out;
+
+ /* Limit if icmp type is enabled in ratemask. */
+ if ((1 << type) & sysctl_icmp_ratemask)
+ rc = xrlim_allow(dst, sysctl_icmp_ratelimit);
+out:
+ return rc;
+}
+
+/*
+ * Maintain the counters used in the SNMP statistics for outgoing ICMP
+ */
+static void icmp_out_count(int type)
+{
+ if (type <= NR_ICMP_TYPES) {
+ ICMP_INC_STATS(icmp_pointers[type].output_entry);
+ ICMP_INC_STATS(ICMP_MIB_OUTMSGS);
+ }
+}
+
+/*
+ * Checksum each fragment, and on the first include the headers and final
+ * checksum.
+ */
+static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
+ struct sk_buff *skb)
+{
+ struct icmp_bxm *icmp_param = (struct icmp_bxm *)from;
+ unsigned int csum;
+
+ csum = skb_copy_and_csum_bits(icmp_param->skb,
+ icmp_param->offset + offset,
+ to, len, 0);
+
+ skb->csum = csum_block_add(skb->csum, csum, odd);
+ if (icmp_pointers[icmp_param->data.icmph.type].error)
+ nf_ct_attach(skb, icmp_param->skb);
+ return 0;
+}
+
+static void icmp_push_reply(struct icmp_bxm *icmp_param,
+ struct ipcm_cookie *ipc, struct rtable *rt)
+{
+ struct sk_buff *skb;
+
+ ip_append_data(icmp_socket->sk, icmp_glue_bits, icmp_param,
+ icmp_param->data_len+icmp_param->head_len,
+ icmp_param->head_len,
+ ipc, rt, MSG_DONTWAIT);
+
+ if ((skb = skb_peek(&icmp_socket->sk->sk_write_queue)) != NULL) {
+ struct icmphdr *icmph = skb->h.icmph;
+ unsigned int csum = 0;
+ struct sk_buff *skb1;
+
+ skb_queue_walk(&icmp_socket->sk->sk_write_queue, skb1) {
+ csum = csum_add(csum, skb1->csum);
+ }
+ csum = csum_partial_copy_nocheck((void *)&icmp_param->data,
+ (char *)icmph,
+ icmp_param->head_len, csum);
+ icmph->checksum = csum_fold(csum);
+ skb->ip_summed = CHECKSUM_NONE;
+ ip_push_pending_frames(icmp_socket->sk);
+ }
+}
+
+/*
+ * Driving logic for building and sending ICMP messages.
+ */
+
+static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
+{
+ struct sock *sk = icmp_socket->sk;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ipcm_cookie ipc;
+ struct rtable *rt = (struct rtable *)skb->dst;
+ u32 daddr;
+
+ if (ip_options_echo(&icmp_param->replyopts, skb))
+ goto out;
+
+ if (icmp_xmit_lock())
+ return;
+
+ icmp_param->data.icmph.checksum = 0;
+ icmp_out_count(icmp_param->data.icmph.type);
+
+ inet->tos = skb->nh.iph->tos;
+ daddr = ipc.addr = rt->rt_src;
+ ipc.opt = NULL;
+ if (icmp_param->replyopts.optlen) {
+ ipc.opt = &icmp_param->replyopts;
+ if (ipc.opt->srr)
+ daddr = icmp_param->replyopts.faddr;
+ }
+ {
+ struct flowi fl = { .nl_u = { .ip4_u =
+ { .daddr = daddr,
+ .saddr = rt->rt_spec_dst,
+ .tos = RT_TOS(skb->nh.iph->tos) } },
+ .proto = IPPROTO_ICMP };
+ if (ip_route_output_key(&rt, &fl))
+ goto out_unlock;
+ }
+ if (icmpv4_xrlim_allow(rt, icmp_param->data.icmph.type,
+ icmp_param->data.icmph.code))
+ icmp_push_reply(icmp_param, &ipc, rt);
+ ip_rt_put(rt);
+out_unlock:
+ icmp_xmit_unlock();
+out:;
+}
+
+
+/*
+ * Send an ICMP message in response to a situation
+ *
+ * RFC 1122: 3.2.2 MUST send at least the IP header and 8 bytes of header.
+ * MAY send more (we do).
+ * MUST NOT change this header information.
+ * MUST NOT reply to a multicast/broadcast IP address.
+ * MUST NOT reply to a multicast/broadcast MAC address.
+ * MUST reply to only the first fragment.
+ */
+
+void icmp_send(struct sk_buff *skb_in, int type, int code, u32 info)
+{
+ struct iphdr *iph;
+ int room;
+ struct icmp_bxm icmp_param;
+ struct rtable *rt = (struct rtable *)skb_in->dst;
+ struct ipcm_cookie ipc;
+ u32 saddr;
+ u8 tos;
+
+ if (!rt)
+ goto out;
+
+ /*
+ * Find the original header. It is expected to be valid, of course.
+	 *	Check this; icmp_send is sometimes called from the most
+	 *	obscure devices.
+ */
+ iph = skb_in->nh.iph;
+
+ if ((u8 *)iph < skb_in->head || (u8 *)(iph + 1) > skb_in->tail)
+ goto out;
+
+ /*
+ * No replies to physical multicast/broadcast
+ */
+ if (skb_in->pkt_type != PACKET_HOST)
+ goto out;
+
+ /*
+ * Now check at the protocol level
+ */
+ if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
+ goto out;
+
+ /*
+ * Only reply to fragment 0. We byte re-order the constant
+ * mask for efficiency.
+ */
+ if (iph->frag_off & htons(IP_OFFSET))
+ goto out;
+
+ /*
+	 *	If we send an ICMP error in response to an ICMP error, a mess would result.
+ */
+ if (icmp_pointers[type].error) {
+ /*
+ * We are an error, check if we are replying to an
+ * ICMP error
+ */
+ if (iph->protocol == IPPROTO_ICMP) {
+ u8 _inner_type, *itp;
+
+ itp = skb_header_pointer(skb_in,
+ skb_in->nh.raw +
+ (iph->ihl << 2) +
+ offsetof(struct icmphdr,
+ type) -
+ skb_in->data,
+ sizeof(_inner_type),
+ &_inner_type);
+ if (itp == NULL)
+ goto out;
+
+ /*
+ * Assume any unknown ICMP type is an error. This
+ * isn't specified by the RFC, but think about it..
+ */
+ if (*itp > NR_ICMP_TYPES ||
+ icmp_pointers[*itp].error)
+ goto out;
+ }
+ }
+
+ if (icmp_xmit_lock())
+ return;
+
+ /*
+ * Construct source address and options.
+ */
+
+ saddr = iph->daddr;
+ if (!(rt->rt_flags & RTCF_LOCAL))
+ saddr = 0;
+
+ tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |
+ IPTOS_PREC_INTERNETCONTROL) :
+ iph->tos;
+
+ if (ip_options_echo(&icmp_param.replyopts, skb_in))
+ goto ende;
+
+
+ /*
+ * Prepare data for ICMP header.
+ */
+
+ icmp_param.data.icmph.type = type;
+ icmp_param.data.icmph.code = code;
+ icmp_param.data.icmph.un.gateway = info;
+ icmp_param.data.icmph.checksum = 0;
+ icmp_param.skb = skb_in;
+ icmp_param.offset = skb_in->nh.raw - skb_in->data;
+ icmp_out_count(icmp_param.data.icmph.type);
+ inet_sk(icmp_socket->sk)->tos = tos;
+ ipc.addr = iph->saddr;
+ ipc.opt = &icmp_param.replyopts;
+
+ {
+ struct flowi fl = {
+ .nl_u = {
+ .ip4_u = {
+ .daddr = icmp_param.replyopts.srr ?
+ icmp_param.replyopts.faddr :
+ iph->saddr,
+ .saddr = saddr,
+ .tos = RT_TOS(tos)
+ }
+ },
+ .proto = IPPROTO_ICMP,
+ .uli_u = {
+ .icmpt = {
+ .type = type,
+ .code = code
+ }
+ }
+ };
+ if (ip_route_output_key(&rt, &fl))
+ goto out_unlock;
+ }
+
+ if (!icmpv4_xrlim_allow(rt, type, code))
+ goto ende;
+
+ /* RFC says return as much as we can without exceeding 576 bytes. */
+
+ room = dst_mtu(&rt->u.dst);
+ if (room > 576)
+ room = 576;
+ room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen;
+ room -= sizeof(struct icmphdr);
+
+ icmp_param.data_len = skb_in->len - icmp_param.offset;
+ if (icmp_param.data_len > room)
+ icmp_param.data_len = room;
+ icmp_param.head_len = sizeof(struct icmphdr);
+
+ icmp_push_reply(&icmp_param, &ipc, rt);
+ende:
+ ip_rt_put(rt);
+out_unlock:
+ icmp_xmit_unlock();
+out:;
+}
+
+
+/*
+ * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH.
+ */
+
+static void icmp_unreach(struct sk_buff *skb)
+{
+ struct iphdr *iph;
+ struct icmphdr *icmph;
+ int hash, protocol;
+ struct net_protocol *ipprot;
+ struct sock *raw_sk;
+ u32 info = 0;
+
+ /*
+ * Incomplete header ?
+	 *	Incomplete header?
+	 * 	Only checks for the IP header; there should be an
+	 * 	additional check for longer headers at upper levels.
+
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ goto out_err;
+
+ icmph = skb->h.icmph;
+ iph = (struct iphdr *)skb->data;
+
+ if (iph->ihl < 5) /* Mangled header, drop. */
+ goto out_err;
+
+ if (icmph->type == ICMP_DEST_UNREACH) {
+ switch (icmph->code & 15) {
+ case ICMP_NET_UNREACH:
+ case ICMP_HOST_UNREACH:
+ case ICMP_PROT_UNREACH:
+ case ICMP_PORT_UNREACH:
+ break;
+ case ICMP_FRAG_NEEDED:
+ if (ipv4_config.no_pmtu_disc) {
+ LIMIT_NETDEBUG(
+ printk(KERN_INFO "ICMP: %u.%u.%u.%u: "
+ "fragmentation needed "
+ "and DF set.\n",
+ NIPQUAD(iph->daddr)));
+ } else {
+ info = ip_rt_frag_needed(iph,
+ ntohs(icmph->un.frag.mtu));
+ if (!info)
+ goto out;
+ }
+ break;
+ case ICMP_SR_FAILED:
+ LIMIT_NETDEBUG(
+ printk(KERN_INFO "ICMP: %u.%u.%u.%u: Source "
+ "Route Failed.\n",
+ NIPQUAD(iph->daddr)));
+ break;
+ default:
+ break;
+ }
+ if (icmph->code > NR_ICMP_UNREACH)
+ goto out;
+ } else if (icmph->type == ICMP_PARAMETERPROB)
+ info = ntohl(icmph->un.gateway) >> 24;
+
+ /*
+ * Throw it at our lower layers
+ *
+ * RFC 1122: 3.2.2 MUST extract the protocol ID from the passed
+ * header.
+ * RFC 1122: 3.2.2.1 MUST pass ICMP unreach messages to the
+ * transport layer.
+ * RFC 1122: 3.2.2.2 MUST pass ICMP time expired messages to
+ * transport layer.
+ */
+
+ /*
+	 *	Check the other end isn't violating RFC 1122. Some routers send
+	 *	bogus responses to broadcast frames. If you see this message,
+	 *	first check that your netmask matches at both ends; if it does,
+	 *	then get the other vendor to fix their kit.
+ */
+
+ if (!sysctl_icmp_ignore_bogus_error_responses &&
+ inet_addr_type(iph->daddr) == RTN_BROADCAST) {
+ if (net_ratelimit())
+ printk(KERN_WARNING "%u.%u.%u.%u sent an invalid ICMP "
+ "type %u, code %u "
+ "error to a broadcast: %u.%u.%u.%u on %s\n",
+ NIPQUAD(skb->nh.iph->saddr),
+ icmph->type, icmph->code,
+ NIPQUAD(iph->daddr),
+ skb->dev->name);
+ goto out;
+ }
+
+	/* Check in the full IP header plus 8 bytes of protocol data to
+	 * avoid additional coding at the protocol handlers.
+ */
+ if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
+ goto out;
+
+ iph = (struct iphdr *)skb->data;
+ protocol = iph->protocol;
+
+ /*
+ * Deliver ICMP message to raw sockets. Pretty useless feature?
+ */
+
+ /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */
+ hash = protocol & (MAX_INET_PROTOS - 1);
+ read_lock(&raw_v4_lock);
+ if ((raw_sk = sk_head(&raw_v4_htable[hash])) != NULL) {
+ while ((raw_sk = __raw_v4_lookup(raw_sk, protocol, iph->daddr,
+ iph->saddr,
+ skb->dev->ifindex)) != NULL) {
+ raw_err(raw_sk, skb, info);
+ raw_sk = sk_next(raw_sk);
+ iph = (struct iphdr *)skb->data;
+ }
+ }
+ read_unlock(&raw_v4_lock);
+
+ rcu_read_lock();
+ ipprot = rcu_dereference(inet_protos[hash]);
+ if (ipprot && ipprot->err_handler)
+ ipprot->err_handler(skb, info);
+ rcu_read_unlock();
+
+out:
+ return;
+out_err:
+ ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+ goto out;
+}
+
+
+/*
+ * Handle ICMP_REDIRECT.
+ */
+
+static void icmp_redirect(struct sk_buff *skb)
+{
+ struct iphdr *iph;
+ unsigned long ip;
+
+ if (skb->len < sizeof(struct iphdr))
+ goto out_err;
+
+ /*
+ * Get the copied header of the packet that caused the redirect
+ */
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ goto out;
+
+ iph = (struct iphdr *)skb->data;
+ ip = iph->daddr;
+
+ switch (skb->h.icmph->code & 7) {
+ case ICMP_REDIR_NET:
+ case ICMP_REDIR_NETTOS:
+ /*
+ * As per RFC recommendations now handle it as a host redirect.
+ */
+ case ICMP_REDIR_HOST:
+ case ICMP_REDIR_HOSTTOS:
+ ip_rt_redirect(skb->nh.iph->saddr, ip, skb->h.icmph->un.gateway,
+ iph->saddr, iph->tos, skb->dev);
+ break;
+ }
+out:
+ return;
+out_err:
+ ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+ goto out;
+}
+
+/*
+ * Handle ICMP_ECHO ("ping") requests.
+ *
+ * RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo
+ * requests.
+ * RFC 1122: 3.2.2.6 Data received in the ICMP_ECHO request MUST be
+ * included in the reply.
+ * RFC 1812: 4.3.3.6 SHOULD have a config option for silently ignoring
+ * echo requests, MUST have default=NOT.
+ * See also WRT handling of options once they are done and working.
+ */
+
+static void icmp_echo(struct sk_buff *skb)
+{
+ if (!sysctl_icmp_echo_ignore_all) {
+ struct icmp_bxm icmp_param;
+
+ icmp_param.data.icmph = *skb->h.icmph;
+ icmp_param.data.icmph.type = ICMP_ECHOREPLY;
+ icmp_param.skb = skb;
+ icmp_param.offset = 0;
+ icmp_param.data_len = skb->len;
+ icmp_param.head_len = sizeof(struct icmphdr);
+ icmp_reply(&icmp_param, skb);
+ }
+}
+
+/*
+ * Handle ICMP Timestamp requests.
+ * RFC 1122: 3.2.2.8 MAY implement ICMP timestamp requests.
+ * SHOULD be in the kernel for minimum random latency.
+ * MUST be accurate to a few minutes.
+ * MUST be updated at least at 15Hz.
+ */
+static void icmp_timestamp(struct sk_buff *skb)
+{
+ struct timeval tv;
+ struct icmp_bxm icmp_param;
+ /*
+ * Too short.
+ */
+ if (skb->len < 4)
+ goto out_err;
+
+ /*
+ * Fill in the current time as ms since midnight UT:
+ */
+ do_gettimeofday(&tv);
+ icmp_param.data.times[1] = htonl((tv.tv_sec % 86400) * 1000 +
+ tv.tv_usec / 1000);
+ icmp_param.data.times[2] = icmp_param.data.times[1];
+ if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4))
+ BUG();
+ icmp_param.data.icmph = *skb->h.icmph;
+ icmp_param.data.icmph.type = ICMP_TIMESTAMPREPLY;
+ icmp_param.data.icmph.code = 0;
+ icmp_param.skb = skb;
+ icmp_param.offset = 0;
+ icmp_param.data_len = 0;
+ icmp_param.head_len = sizeof(struct icmphdr) + 12;
+ icmp_reply(&icmp_param, skb);
+out:
+ return;
+out_err:
+ ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+ goto out;
+}
+
+
+/*
+ * Handle ICMP_ADDRESS_MASK requests. (RFC950)
+ *
+ * RFC1122 (3.2.2.9). A host MUST only send replies to
+ * ADDRESS_MASK requests if it's been configured as an address mask
+ * agent. Receiving a request doesn't constitute implicit permission to
+ * act as one. Of course, implementing this correctly requires (SHOULD)
+ * a way to turn the functionality on and off. Another one for sysctl(),
+ * I guess. -- MS
+ *
+ * RFC1812 (4.3.3.9). A router MUST implement it.
+ *	A router SHOULD have a switch for turning it on/off.
+ * This switch MUST be ON by default.
+ *
+ * Gratuitous replies and zero-source replies are not implemented,
+ * which complies with the RFC. DO NOT implement them!!! The whole idea
+ * of broadcast addrmask replies as specified in RFC950 is broken.
+ * The problem is that it is not uncommon to have several prefixes
+ * on one physical interface. Moreover, an addrmask agent may not even
+ * be aware of the other existing prefixes.
+ * If the source is zero, the addrmask agent cannot choose the correct prefix.
+ * Gratuitous mask announcements suffer from the same problem.
+ * RFC1812 explains this, but still allows the use of ADDRMASK,
+ * which is pretty silly. --ANK
+ *
+ * All these rules are so bizarre that I removed kernel addrmask
+ * support entirely. It is wrong, it is obsolete, and nobody uses it
+ * in any case. --ANK
+ *
+ * Furthermore you can do it with a usermode address agent program
+ * anyway...
+ */
+
+static void icmp_address(struct sk_buff *skb)
+{
+#if 0
+ if (net_ratelimit())
+ printk(KERN_DEBUG "a guy asks for address mask. Who is it?\n");
+#endif
+}
+
+/*
+ *	RFC1812 (4.3.3.9).	A router SHOULD listen to all replies, and complain
+ * loudly if an inconsistency is found.
+ */
+
+static void icmp_address_reply(struct sk_buff *skb)
+{
+ struct rtable *rt = (struct rtable *)skb->dst;
+ struct net_device *dev = skb->dev;
+ struct in_device *in_dev;
+ struct in_ifaddr *ifa;
+
+ if (skb->len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC))
+ goto out;
+
+ in_dev = in_dev_get(dev);
+ if (!in_dev)
+ goto out;
+ rcu_read_lock();
+ if (in_dev->ifa_list &&
+ IN_DEV_LOG_MARTIANS(in_dev) &&
+ IN_DEV_FORWARD(in_dev)) {
+ u32 _mask, *mp;
+
+ mp = skb_header_pointer(skb, 0, sizeof(_mask), &_mask);
+ if (mp == NULL)
+ BUG();
+ for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+ if (*mp == ifa->ifa_mask &&
+ inet_ifa_match(rt->rt_src, ifa))
+ break;
+ }
+ if (!ifa && net_ratelimit()) {
+ printk(KERN_INFO "Wrong address mask %u.%u.%u.%u from "
+ "%s/%u.%u.%u.%u\n",
+ NIPQUAD(*mp), dev->name, NIPQUAD(rt->rt_src));
+ }
+ }
+ rcu_read_unlock();
+ in_dev_put(in_dev);
+out:;
+}
+
+static void icmp_discard(struct sk_buff *skb)
+{
+}
+
+/*
+ * Deal with incoming ICMP packets.
+ */
+int icmp_rcv(struct sk_buff *skb)
+{
+ struct icmphdr *icmph;
+ struct rtable *rt = (struct rtable *)skb->dst;
+
+ ICMP_INC_STATS_BH(ICMP_MIB_INMSGS);
+
+ switch (skb->ip_summed) {
+ case CHECKSUM_HW:
+ if (!(u16)csum_fold(skb->csum))
+ break;
+ NETDEBUG(if (net_ratelimit())
+ printk(KERN_DEBUG "icmp v4 hw csum failure\n"));
+ case CHECKSUM_NONE:
+ if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0)))
+ goto error;
+ default:;
+ }
+
+ if (!pskb_pull(skb, sizeof(struct icmphdr)))
+ goto error;
+
+ icmph = skb->h.icmph;
+
+ /*
+ * 18 is the highest 'known' ICMP type. Anything else is a mystery
+ *
+	 *	RFC 1122: 3.2.2  Unknown ICMP message types MUST be silently
+ * discarded.
+ */
+ if (icmph->type > NR_ICMP_TYPES)
+ goto error;
+
+
+ /*
+ * Parse the ICMP message
+ */
+
+ if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
+ /*
+ * RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be
+ * silently ignored (we let user decide with a sysctl).
+ * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently
+ * discarded if to broadcast/multicast.
+ */
+ if (icmph->type == ICMP_ECHO &&
+ sysctl_icmp_echo_ignore_broadcasts) {
+ goto error;
+ }
+ if (icmph->type != ICMP_ECHO &&
+ icmph->type != ICMP_TIMESTAMP &&
+ icmph->type != ICMP_ADDRESS &&
+ icmph->type != ICMP_ADDRESSREPLY) {
+ goto error;
+ }
+ }
+
+ ICMP_INC_STATS_BH(icmp_pointers[icmph->type].input_entry);
+ icmp_pointers[icmph->type].handler(skb);
+
+drop:
+ kfree_skb(skb);
+ return 0;
+error:
+ ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+ goto drop;
+}
+
+/*
+ * This table is the definition of how we handle ICMP.
+ */
+static struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
+ [ICMP_ECHOREPLY] = {
+ .output_entry = ICMP_MIB_OUTECHOREPS,
+ .input_entry = ICMP_MIB_INECHOREPS,
+ .handler = icmp_discard,
+ },
+ [1] = {
+ .output_entry = ICMP_MIB_DUMMY,
+ .input_entry = ICMP_MIB_INERRORS,
+ .handler = icmp_discard,
+ .error = 1,
+ },
+ [2] = {
+ .output_entry = ICMP_MIB_DUMMY,
+ .input_entry = ICMP_MIB_INERRORS,
+ .handler = icmp_discard,
+ .error = 1,
+ },
+ [ICMP_DEST_UNREACH] = {
+ .output_entry = ICMP_MIB_OUTDESTUNREACHS,
+ .input_entry = ICMP_MIB_INDESTUNREACHS,
+ .handler = icmp_unreach,
+ .error = 1,
+ },
+ [ICMP_SOURCE_QUENCH] = {
+ .output_entry = ICMP_MIB_OUTSRCQUENCHS,
+ .input_entry = ICMP_MIB_INSRCQUENCHS,
+ .handler = icmp_unreach,
+ .error = 1,
+ },
+ [ICMP_REDIRECT] = {
+ .output_entry = ICMP_MIB_OUTREDIRECTS,
+ .input_entry = ICMP_MIB_INREDIRECTS,
+ .handler = icmp_redirect,
+ .error = 1,
+ },
+ [6] = {
+ .output_entry = ICMP_MIB_DUMMY,
+ .input_entry = ICMP_MIB_INERRORS,
+ .handler = icmp_discard,
+ .error = 1,
+ },
+ [7] = {
+ .output_entry = ICMP_MIB_DUMMY,
+ .input_entry = ICMP_MIB_INERRORS,
+ .handler = icmp_discard,
+ .error = 1,
+ },
+ [ICMP_ECHO] = {
+ .output_entry = ICMP_MIB_OUTECHOS,
+ .input_entry = ICMP_MIB_INECHOS,
+ .handler = icmp_echo,
+ },
+ [9] = {
+ .output_entry = ICMP_MIB_DUMMY,
+ .input_entry = ICMP_MIB_INERRORS,
+ .handler = icmp_discard,
+ .error = 1,
+ },
+ [10] = {
+ .output_entry = ICMP_MIB_DUMMY,
+ .input_entry = ICMP_MIB_INERRORS,
+ .handler = icmp_discard,
+ .error = 1,
+ },
+ [ICMP_TIME_EXCEEDED] = {
+ .output_entry = ICMP_MIB_OUTTIMEEXCDS,
+ .input_entry = ICMP_MIB_INTIMEEXCDS,
+ .handler = icmp_unreach,
+ .error = 1,
+ },
+ [ICMP_PARAMETERPROB] = {
+ .output_entry = ICMP_MIB_OUTPARMPROBS,
+ .input_entry = ICMP_MIB_INPARMPROBS,
+ .handler = icmp_unreach,
+ .error = 1,
+ },
+ [ICMP_TIMESTAMP] = {
+ .output_entry = ICMP_MIB_OUTTIMESTAMPS,
+ .input_entry = ICMP_MIB_INTIMESTAMPS,
+ .handler = icmp_timestamp,
+ },
+ [ICMP_TIMESTAMPREPLY] = {
+ .output_entry = ICMP_MIB_OUTTIMESTAMPREPS,
+ .input_entry = ICMP_MIB_INTIMESTAMPREPS,
+ .handler = icmp_discard,
+ },
+ [ICMP_INFO_REQUEST] = {
+ .output_entry = ICMP_MIB_DUMMY,
+ .input_entry = ICMP_MIB_DUMMY,
+ .handler = icmp_discard,
+ },
+ [ICMP_INFO_REPLY] = {
+ .output_entry = ICMP_MIB_DUMMY,
+ .input_entry = ICMP_MIB_DUMMY,
+ .handler = icmp_discard,
+ },
+ [ICMP_ADDRESS] = {
+ .output_entry = ICMP_MIB_OUTADDRMASKS,
+ .input_entry = ICMP_MIB_INADDRMASKS,
+ .handler = icmp_address,
+ },
+ [ICMP_ADDRESSREPLY] = {
+ .output_entry = ICMP_MIB_OUTADDRMASKREPS,
+ .input_entry = ICMP_MIB_INADDRMASKREPS,
+ .handler = icmp_address_reply,
+ },
+};
+
+void __init icmp_init(struct net_proto_family *ops)
+{
+ struct inet_sock *inet;
+ int i;
+
+ for (i = 0; i < NR_CPUS; i++) {
+ int err;
+
+ if (!cpu_possible(i))
+ continue;
+
+ err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_ICMP,
+ &per_cpu(__icmp_socket, i));
+
+ if (err < 0)
+ panic("Failed to create the ICMP control socket.\n");
+
+ per_cpu(__icmp_socket, i)->sk->sk_allocation = GFP_ATOMIC;
+
+ /* Enough space for 2 64K ICMP packets, including
+ * sk_buff struct overhead.
+ */
+ per_cpu(__icmp_socket, i)->sk->sk_sndbuf =
+ (2 * ((64 * 1024) + sizeof(struct sk_buff)));
+
+ inet = inet_sk(per_cpu(__icmp_socket, i)->sk);
+ inet->uc_ttl = -1;
+ inet->pmtudisc = IP_PMTUDISC_DONT;
+
+ /* Unhash it so that IP input processing does not even
+ * see it, we do not wish this socket to see incoming
+ * packets.
+ */
+ per_cpu(__icmp_socket, i)->sk->sk_prot->unhash(per_cpu(__icmp_socket, i)->sk);
+ }
+}
+
+EXPORT_SYMBOL(icmp_err_convert);
+EXPORT_SYMBOL(icmp_send);
+EXPORT_SYMBOL(icmp_statistics);
+EXPORT_SYMBOL(xrlim_allow);
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
new file mode 100644
index 00000000000..1f3183168a9
--- /dev/null
+++ b/net/ipv4/igmp.c
@@ -0,0 +1,2473 @@
+/*
+ * Linux NET3: Internet Group Management Protocol [IGMP]
+ *
+ *	This code implements the IGMP protocol as defined in RFC1112. There
+ *	has since been a further revision of this protocol, which is now
+ *	supported as well.
+ *
+ *	If you have trouble with this module, be careful which gcc you have
+ *	used; the older version didn't come out right using gcc 2.5.8, and
+ *	the newer one seems to fall out with gcc 2.6.2.
+ *
+ * Version: $Id: igmp.c,v 1.47 2002/02/01 22:01:03 davem Exp $
+ *
+ * Authors:
+ * Alan Cox <Alan.Cox@linux.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ *
+ * Alan Cox : Added lots of __inline__ to optimise
+ * the memory usage of all the tiny little
+ * functions.
+ * Alan Cox : Dumped the header building experiment.
+ * Alan Cox : Minor tweaks ready for multicast routing
+ * and extended IGMP protocol.
+ * Alan Cox : Removed a load of inline directives. Gcc 2.5.8
+ * writes utterly bogus code otherwise (sigh)
+ * fixed IGMP loopback to behave in the manner
+ * desired by mrouted, fixed the fact it has been
+ * broken since 1.3.6 and cleaned up a few minor
+ * points.
+ *
+ * Chih-Jen Chang : Tried to revise IGMP to Version 2
+ * Tsu-Sheng Tsao E-mail: chihjenc@scf.usc.edu and tsusheng@scf.usc.edu
+ * The enhancements are mainly based on Steve Deering's
+ * ipmulti-3.5 source code.
+ * Chih-Jen Chang : Added the igmp_get_mrouter_info and
+ * Tsu-Sheng Tsao igmp_set_mrouter_info to keep track of
+ * the mrouted version on that device.
+ * Chih-Jen Chang : Added the max_resp_time parameter to
+ * Tsu-Sheng Tsao igmp_heard_query(). Using this parameter
+ * to identify the multicast router version
+ * and do what the IGMP version 2 specified.
+ * Chih-Jen Chang : Added a timer to revert to IGMP V2 router
+ * Tsu-Sheng Tsao if the specified time expired.
+ * Alan Cox : Stop IGMP from 0.0.0.0 being accepted.
+ * Alan Cox : Use GFP_ATOMIC in the right places.
+ * Christian Daudt : igmp timer wasn't set for local group
+ * memberships but was being deleted,
+ * which caused a "del_timer() called
+ * from %p with timer not initialized\n"
+ * message (960131).
+ * Christian Daudt : removed del_timer from
+ * igmp_timer_expire function (960205).
+ * Christian Daudt : igmp_heard_report now only calls
+ * igmp_timer_expire if tm->running is
+ * true (960216).
+ * Malcolm Beattie : ttl comparison wrong in igmp_rcv made
+ * igmp_heard_query never trigger. Expiry
+ * miscalculation fixed in igmp_heard_query
+ * and random() made to return unsigned to
+ * prevent negative expiry times.
+ * Alexey Kuznetsov: Wrong group leaving behaviour, backport
+ * fix from pending 2.1.x patches.
+ * Alan Cox: Forget to enable FDDI support earlier.
+ * Alexey Kuznetsov: Fixed leaving groups on device down.
+ * Alexey Kuznetsov: Accordance to igmp-v2-06 draft.
+ * David L Stevens: IGMPv3 support, with help from
+ * Vinay Kulkarni
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/if_arp.h>
+#include <linux/rtnetlink.h>
+#include <linux/times.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/sock.h>
+#include <net/checksum.h>
+#include <linux/netfilter_ipv4.h>
+#ifdef CONFIG_IP_MROUTE
+#include <linux/mroute.h>
+#endif
+#ifdef CONFIG_PROC_FS
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#endif
+
+#define IP_MAX_MEMBERSHIPS 20
+#define IP_MAX_MSF 10
+
+#ifdef CONFIG_IP_MULTICAST
+/* Parameter names and values are taken from igmp-v2-06 draft */
+
+#define IGMP_V1_Router_Present_Timeout (400*HZ)
+#define IGMP_V2_Router_Present_Timeout (400*HZ)
+#define IGMP_Unsolicited_Report_Interval (10*HZ)
+#define IGMP_Query_Response_Interval (10*HZ)
+#define IGMP_Unsolicited_Report_Count 2
+
+
+#define IGMP_Initial_Report_Delay (1)
+
+/* IGMP_Initial_Report_Delay is not from IGMP specs!
+ * IGMP specs require reporting membership immediately after
+ * joining a group, but we delay the first report by a
+ * small interval. It seems more natural and still does not
+ * contradict the specs, provided this delay is small enough.
+ */
+
+#define IGMP_V1_SEEN(in_dev) (ipv4_devconf.force_igmp_version == 1 || \
+ (in_dev)->cnf.force_igmp_version == 1 || \
+ ((in_dev)->mr_v1_seen && \
+ time_before(jiffies, (in_dev)->mr_v1_seen)))
+#define IGMP_V2_SEEN(in_dev) (ipv4_devconf.force_igmp_version == 2 || \
+ (in_dev)->cnf.force_igmp_version == 2 || \
+ ((in_dev)->mr_v2_seen && \
+ time_before(jiffies, (in_dev)->mr_v2_seen)))
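+/* The host keeps answering in v1/v2 style for as long as a v1/v2 querier
+ * has been seen recently (mr_v1_seen/mr_v2_seen has not yet expired) or
+ * while that version is forced via the force_igmp_version sysctl.
+ */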
+
+static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im);
+static void igmpv3_del_delrec(struct in_device *in_dev, __u32 multiaddr);
+static void igmpv3_clear_delrec(struct in_device *in_dev);
+static int sf_setstate(struct ip_mc_list *pmc);
+static void sf_markstate(struct ip_mc_list *pmc);
+#endif
+static void ip_mc_clear_src(struct ip_mc_list *pmc);
+static int ip_mc_add_src(struct in_device *in_dev, __u32 *pmca, int sfmode,
+ int sfcount, __u32 *psfsrc, int delta);
+
+static void ip_ma_put(struct ip_mc_list *im)
+{
+ if (atomic_dec_and_test(&im->refcnt)) {
+ in_dev_put(im->interface);
+ kfree(im);
+ }
+}
+
+#ifdef CONFIG_IP_MULTICAST
+
+/*
+ * Timer management
+ */
+
+static __inline__ void igmp_stop_timer(struct ip_mc_list *im)
+{
+ spin_lock_bh(&im->lock);
+ if (del_timer(&im->timer))
+ atomic_dec(&im->refcnt);
+ im->tm_running=0;
+ im->reporter = 0;
+ im->unsolicit_count = 0;
+ spin_unlock_bh(&im->lock);
+}
+
+/* It must be called with im->lock held */
+static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
+{
+ int tv=net_random() % max_delay;
+
+ im->tm_running=1;
+ if (!mod_timer(&im->timer, jiffies+tv+2))
+ atomic_inc(&im->refcnt);
+}
+
+static void igmp_gq_start_timer(struct in_device *in_dev)
+{
+ int tv = net_random() % in_dev->mr_maxdelay;
+
+ in_dev->mr_gq_running = 1;
+ if (!mod_timer(&in_dev->mr_gq_timer, jiffies+tv+2))
+ in_dev_hold(in_dev);
+}
+
+static void igmp_ifc_start_timer(struct in_device *in_dev, int delay)
+{
+ int tv = net_random() % delay;
+
+ if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2))
+ in_dev_hold(in_dev);
+}
+
+static void igmp_mod_timer(struct ip_mc_list *im, int max_delay)
+{
+ spin_lock_bh(&im->lock);
+ im->unsolicit_count = 0;
+ if (del_timer(&im->timer)) {
+ if ((long)(im->timer.expires-jiffies) < max_delay) {
+ add_timer(&im->timer);
+ im->tm_running=1;
+ spin_unlock_bh(&im->lock);
+ return;
+ }
+ atomic_dec(&im->refcnt);
+ }
+ igmp_start_timer(im, max_delay);
+ spin_unlock_bh(&im->lock);
+}
+
+
+/*
+ * Send an IGMP report.
+ */
+
+#define IGMP_SIZE (sizeof(struct igmphdr)+sizeof(struct iphdr)+4)
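+/* IP header + 4 bytes of Router Alert option + IGMP header. */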
+
+
+static int is_in(struct ip_mc_list *pmc, struct ip_sf_list *psf, int type,
+ int gdeleted, int sdeleted)
+{
+ switch (type) {
+ case IGMPV3_MODE_IS_INCLUDE:
+ case IGMPV3_MODE_IS_EXCLUDE:
+ if (gdeleted || sdeleted)
+ return 0;
+ return !(pmc->gsquery && !psf->sf_gsresp);
+ case IGMPV3_CHANGE_TO_INCLUDE:
+ if (gdeleted || sdeleted)
+ return 0;
+ return psf->sf_count[MCAST_INCLUDE] != 0;
+ case IGMPV3_CHANGE_TO_EXCLUDE:
+ if (gdeleted || sdeleted)
+ return 0;
+ if (pmc->sfcount[MCAST_EXCLUDE] == 0 ||
+ psf->sf_count[MCAST_INCLUDE])
+ return 0;
+ return pmc->sfcount[MCAST_EXCLUDE] ==
+ psf->sf_count[MCAST_EXCLUDE];
+ case IGMPV3_ALLOW_NEW_SOURCES:
+ if (gdeleted || !psf->sf_crcount)
+ return 0;
+ return (pmc->sfmode == MCAST_INCLUDE) ^ sdeleted;
+ case IGMPV3_BLOCK_OLD_SOURCES:
+ if (pmc->sfmode == MCAST_INCLUDE)
+ return gdeleted || (psf->sf_crcount && sdeleted);
+ return psf->sf_crcount && !gdeleted && !sdeleted;
+ }
+ return 0;
+}
+
+static int
+igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted)
+{
+ struct ip_sf_list *psf;
+ int scount = 0;
+
+ for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ if (!is_in(pmc, psf, type, gdeleted, sdeleted))
+ continue;
+ scount++;
+ }
+ return scount;
+}
+
+static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
+{
+ struct sk_buff *skb;
+ struct rtable *rt;
+ struct iphdr *pip;
+ struct igmpv3_report *pig;
+
+ skb = alloc_skb(size + LL_RESERVED_SPACE(dev), GFP_ATOMIC);
+ if (skb == NULL)
+ return NULL;
+
+ {
+ struct flowi fl = { .oif = dev->ifindex,
+ .nl_u = { .ip4_u = {
+ .daddr = IGMPV3_ALL_MCR } },
+ .proto = IPPROTO_IGMP };
+ if (ip_route_output_key(&rt, &fl)) {
+ kfree_skb(skb);
+ return NULL;
+ }
+ }
+ if (rt->rt_src == 0) {
+ kfree_skb(skb);
+ ip_rt_put(rt);
+ return NULL;
+ }
+
+ skb->dst = &rt->u.dst;
+ skb->dev = dev;
+
+ skb_reserve(skb, LL_RESERVED_SPACE(dev));
+
+ skb->nh.iph = pip =(struct iphdr *)skb_put(skb, sizeof(struct iphdr)+4);
+
+ pip->version = 4;
+ pip->ihl = (sizeof(struct iphdr)+4)>>2;
+ pip->tos = 0xc0;
+ pip->frag_off = htons(IP_DF);
+ pip->ttl = 1;
+ pip->daddr = rt->rt_dst;
+ pip->saddr = rt->rt_src;
+ pip->protocol = IPPROTO_IGMP;
+ pip->tot_len = 0; /* filled in later */
+ ip_select_ident(pip, &rt->u.dst, NULL);
+ ((u8*)&pip[1])[0] = IPOPT_RA;
+ ((u8*)&pip[1])[1] = 4;
+ ((u8*)&pip[1])[2] = 0;
+ ((u8*)&pip[1])[3] = 0;
+
+ pig =(struct igmpv3_report *)skb_put(skb, sizeof(*pig));
+ skb->h.igmph = (struct igmphdr *)pig;
+ pig->type = IGMPV3_HOST_MEMBERSHIP_REPORT;
+ pig->resv1 = 0;
+ pig->csum = 0;
+ pig->resv2 = 0;
+ pig->ngrec = 0;
+ return skb;
+}
+
+static int igmpv3_sendpack(struct sk_buff *skb)
+{
+ struct iphdr *pip = skb->nh.iph;
+ struct igmphdr *pig = skb->h.igmph;
+ int iplen, igmplen;
+
+ iplen = skb->tail - (unsigned char *)skb->nh.iph;
+ pip->tot_len = htons(iplen);
+ ip_send_check(pip);
+
+ igmplen = skb->tail - (unsigned char *)skb->h.igmph;
+ pig->csum = ip_compute_csum((void *)skb->h.igmph, igmplen);
+
+ return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb->dev,
+ dst_output);
+}
+
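+/* A group record occupies the fixed igmpv3_grec header plus 4 bytes for
+ * every source that is_in() reports for this record type.
+ */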
+static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
+{
+ return sizeof(struct igmpv3_grec) + 4*igmp_scount(pmc,type,gdel,sdel);
+}
+
+static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
+ int type, struct igmpv3_grec **ppgr)
+{
+ struct net_device *dev = pmc->interface->dev;
+ struct igmpv3_report *pih;
+ struct igmpv3_grec *pgr;
+
+ if (!skb)
+ skb = igmpv3_newpack(dev, dev->mtu);
+ if (!skb)
+ return NULL;
+ pgr = (struct igmpv3_grec *)skb_put(skb, sizeof(struct igmpv3_grec));
+ pgr->grec_type = type;
+ pgr->grec_auxwords = 0;
+ pgr->grec_nsrcs = 0;
+ pgr->grec_mca = pmc->multiaddr;
+ pih = (struct igmpv3_report *)skb->h.igmph;
+ pih->ngrec = htons(ntohs(pih->ngrec)+1);
+ *ppgr = pgr;
+ return skb;
+}
+
+#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? (skb)->dev->mtu - (skb)->len : \
+ skb_tailroom(skb)) : 0)
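+/* Bytes still usable in the current report skb: bounded by the device MTU
+ * once the skb has been given a device, otherwise by its tailroom.
+ */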
+
+static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
+ int type, int gdeleted, int sdeleted)
+{
+ struct net_device *dev = pmc->interface->dev;
+ struct igmpv3_report *pih;
+ struct igmpv3_grec *pgr = NULL;
+ struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list;
+ int scount, first, isquery, truncate;
+
+ if (pmc->multiaddr == IGMP_ALL_HOSTS)
+ return skb;
+
+ isquery = type == IGMPV3_MODE_IS_INCLUDE ||
+ type == IGMPV3_MODE_IS_EXCLUDE;
+ truncate = type == IGMPV3_MODE_IS_EXCLUDE ||
+ type == IGMPV3_CHANGE_TO_EXCLUDE;
+
+ psf_list = sdeleted ? &pmc->tomb : &pmc->sources;
+
+ if (!*psf_list) {
+ if (type == IGMPV3_ALLOW_NEW_SOURCES ||
+ type == IGMPV3_BLOCK_OLD_SOURCES)
+ return skb;
+ if (pmc->crcount || isquery) {
+ /* make sure we have room for group header and at
+ * least one source.
+ */
+ if (skb && AVAILABLE(skb) < sizeof(struct igmpv3_grec)+
+ sizeof(__u32)) {
+ igmpv3_sendpack(skb);
+ skb = NULL; /* add_grhead will get a new one */
+ }
+ skb = add_grhead(skb, pmc, type, &pgr);
+ }
+ return skb;
+ }
+ pih = skb ? (struct igmpv3_report *)skb->h.igmph : NULL;
+
+ /* EX and TO_EX get a fresh packet, if needed */
+ if (truncate) {
+ if (pih && pih->ngrec &&
+ AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) {
+ if (skb)
+ igmpv3_sendpack(skb);
+ skb = igmpv3_newpack(dev, dev->mtu);
+ }
+ }
+ first = 1;
+ scount = 0;
+ psf_prev = NULL;
+ for (psf=*psf_list; psf; psf=psf_next) {
+ u32 *psrc;
+
+ psf_next = psf->sf_next;
+
+ if (!is_in(pmc, psf, type, gdeleted, sdeleted)) {
+ psf_prev = psf;
+ continue;
+ }
+
+ /* clear marks on query responses */
+ if (isquery)
+ psf->sf_gsresp = 0;
+
+ if (AVAILABLE(skb) < sizeof(u32) +
+ first*sizeof(struct igmpv3_grec)) {
+ if (truncate && !first)
+ break; /* truncate these */
+ if (pgr)
+ pgr->grec_nsrcs = htons(scount);
+ if (skb)
+ igmpv3_sendpack(skb);
+ skb = igmpv3_newpack(dev, dev->mtu);
+ first = 1;
+ scount = 0;
+ }
+ if (first) {
+ skb = add_grhead(skb, pmc, type, &pgr);
+ first = 0;
+ }
+ psrc = (u32 *)skb_put(skb, sizeof(u32));
+ *psrc = psf->sf_inaddr;
+ scount++;
+ if ((type == IGMPV3_ALLOW_NEW_SOURCES ||
+ type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) {
+ psf->sf_crcount--;
+ if ((sdeleted || gdeleted) && psf->sf_crcount == 0) {
+ if (psf_prev)
+ psf_prev->sf_next = psf->sf_next;
+ else
+ *psf_list = psf->sf_next;
+ kfree(psf);
+ continue;
+ }
+ }
+ psf_prev = psf;
+ }
+ if (pgr)
+ pgr->grec_nsrcs = htons(scount);
+
+ if (isquery)
+ pmc->gsquery = 0; /* clear query state on report */
+ return skb;
+}
+
+static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
+{
+ struct sk_buff *skb = NULL;
+ int type;
+
+ if (!pmc) {
+ read_lock(&in_dev->mc_list_lock);
+ for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
+ if (pmc->multiaddr == IGMP_ALL_HOSTS)
+ continue;
+ spin_lock_bh(&pmc->lock);
+ if (pmc->sfcount[MCAST_EXCLUDE])
+ type = IGMPV3_MODE_IS_EXCLUDE;
+ else
+ type = IGMPV3_MODE_IS_INCLUDE;
+ skb = add_grec(skb, pmc, type, 0, 0);
+ spin_unlock_bh(&pmc->lock);
+ }
+ read_unlock(&in_dev->mc_list_lock);
+ } else {
+ spin_lock_bh(&pmc->lock);
+ if (pmc->sfcount[MCAST_EXCLUDE])
+ type = IGMPV3_MODE_IS_EXCLUDE;
+ else
+ type = IGMPV3_MODE_IS_INCLUDE;
+ skb = add_grec(skb, pmc, type, 0, 0);
+ spin_unlock_bh(&pmc->lock);
+ }
+ if (!skb)
+ return 0;
+ return igmpv3_sendpack(skb);
+}
+
+/*
+ * remove zero-count source records from a source filter list
+ */
+static void igmpv3_clear_zeros(struct ip_sf_list **ppsf)
+{
+ struct ip_sf_list *psf_prev, *psf_next, *psf;
+
+ psf_prev = NULL;
+ for (psf=*ppsf; psf; psf = psf_next) {
+ psf_next = psf->sf_next;
+ if (psf->sf_crcount == 0) {
+ if (psf_prev)
+ psf_prev->sf_next = psf->sf_next;
+ else
+ *ppsf = psf->sf_next;
+ kfree(psf);
+ } else
+ psf_prev = psf;
+ }
+}
+
+static void igmpv3_send_cr(struct in_device *in_dev)
+{
+ struct ip_mc_list *pmc, *pmc_prev, *pmc_next;
+ struct sk_buff *skb = NULL;
+ int type, dtype;
+
+ read_lock(&in_dev->mc_list_lock);
+ spin_lock_bh(&in_dev->mc_tomb_lock);
+
+ /* deleted MCA's */
+ pmc_prev = NULL;
+ for (pmc=in_dev->mc_tomb; pmc; pmc=pmc_next) {
+ pmc_next = pmc->next;
+ if (pmc->sfmode == MCAST_INCLUDE) {
+ type = IGMPV3_BLOCK_OLD_SOURCES;
+ dtype = IGMPV3_BLOCK_OLD_SOURCES;
+ skb = add_grec(skb, pmc, type, 1, 0);
+ skb = add_grec(skb, pmc, dtype, 1, 1);
+ }
+ if (pmc->crcount) {
+ pmc->crcount--;
+ if (pmc->sfmode == MCAST_EXCLUDE) {
+ type = IGMPV3_CHANGE_TO_INCLUDE;
+ skb = add_grec(skb, pmc, type, 1, 0);
+ }
+ if (pmc->crcount == 0) {
+ igmpv3_clear_zeros(&pmc->tomb);
+ igmpv3_clear_zeros(&pmc->sources);
+ }
+ }
+ if (pmc->crcount == 0 && !pmc->tomb && !pmc->sources) {
+ if (pmc_prev)
+ pmc_prev->next = pmc_next;
+ else
+ in_dev->mc_tomb = pmc_next;
+ in_dev_put(pmc->interface);
+ kfree(pmc);
+ } else
+ pmc_prev = pmc;
+ }
+ spin_unlock_bh(&in_dev->mc_tomb_lock);
+
+ /* change recs */
+ for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
+ spin_lock_bh(&pmc->lock);
+ if (pmc->sfcount[MCAST_EXCLUDE]) {
+ type = IGMPV3_BLOCK_OLD_SOURCES;
+ dtype = IGMPV3_ALLOW_NEW_SOURCES;
+ } else {
+ type = IGMPV3_ALLOW_NEW_SOURCES;
+ dtype = IGMPV3_BLOCK_OLD_SOURCES;
+ }
+ skb = add_grec(skb, pmc, type, 0, 0);
+ skb = add_grec(skb, pmc, dtype, 0, 1); /* deleted sources */
+
+ /* filter mode changes */
+ if (pmc->crcount) {
+ pmc->crcount--;
+ if (pmc->sfmode == MCAST_EXCLUDE)
+ type = IGMPV3_CHANGE_TO_EXCLUDE;
+ else
+ type = IGMPV3_CHANGE_TO_INCLUDE;
+ skb = add_grec(skb, pmc, type, 0, 0);
+ }
+ spin_unlock_bh(&pmc->lock);
+ }
+ read_unlock(&in_dev->mc_list_lock);
+
+ if (!skb)
+ return;
+ (void) igmpv3_sendpack(skb);
+}
+
+static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
+ int type)
+{
+ struct sk_buff *skb;
+ struct iphdr *iph;
+ struct igmphdr *ih;
+ struct rtable *rt;
+ struct net_device *dev = in_dev->dev;
+ u32 group = pmc ? pmc->multiaddr : 0;
+ u32 dst;
+
+ if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)
+ return igmpv3_send_report(in_dev, pmc);
+ else if (type == IGMP_HOST_LEAVE_MESSAGE)
+ dst = IGMP_ALL_ROUTER;
+ else
+ dst = group;
+
+ {
+ struct flowi fl = { .oif = dev->ifindex,
+ .nl_u = { .ip4_u = { .daddr = dst } },
+ .proto = IPPROTO_IGMP };
+ if (ip_route_output_key(&rt, &fl))
+ return -1;
+ }
+ if (rt->rt_src == 0) {
+ ip_rt_put(rt);
+ return -1;
+ }
+
+ skb=alloc_skb(IGMP_SIZE+LL_RESERVED_SPACE(dev), GFP_ATOMIC);
+ if (skb == NULL) {
+ ip_rt_put(rt);
+ return -1;
+ }
+
+ skb->dst = &rt->u.dst;
+
+ skb_reserve(skb, LL_RESERVED_SPACE(dev));
+
+ skb->nh.iph = iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr)+4);
+
+ iph->version = 4;
+ iph->ihl = (sizeof(struct iphdr)+4)>>2;
+ iph->tos = 0xc0;
+ iph->frag_off = htons(IP_DF);
+ iph->ttl = 1;
+ iph->daddr = dst;
+ iph->saddr = rt->rt_src;
+ iph->protocol = IPPROTO_IGMP;
+ iph->tot_len = htons(IGMP_SIZE);
+ ip_select_ident(iph, &rt->u.dst, NULL);
+ ((u8*)&iph[1])[0] = IPOPT_RA;
+ ((u8*)&iph[1])[1] = 4;
+ ((u8*)&iph[1])[2] = 0;
+ ((u8*)&iph[1])[3] = 0;
+ ip_send_check(iph);
+
+ ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
+ ih->type=type;
+ ih->code=0;
+ ih->csum=0;
+ ih->group=group;
+ ih->csum=ip_compute_csum((void *)ih, sizeof(struct igmphdr));
+
+ return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
+ dst_output);
+}
+
+static void igmp_gq_timer_expire(unsigned long data)
+{
+ struct in_device *in_dev = (struct in_device *)data;
+
+ in_dev->mr_gq_running = 0;
+ igmpv3_send_report(in_dev, NULL);
+ __in_dev_put(in_dev);
+}
+
+static void igmp_ifc_timer_expire(unsigned long data)
+{
+ struct in_device *in_dev = (struct in_device *)data;
+
+ igmpv3_send_cr(in_dev);
+ if (in_dev->mr_ifc_count) {
+ in_dev->mr_ifc_count--;
+ igmp_ifc_start_timer(in_dev, IGMP_Unsolicited_Report_Interval);
+ }
+ __in_dev_put(in_dev);
+}
+
+static void igmp_ifc_event(struct in_device *in_dev)
+{
+ if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))
+ return;
+ in_dev->mr_ifc_count = in_dev->mr_qrv ? in_dev->mr_qrv :
+ IGMP_Unsolicited_Report_Count;
+ igmp_ifc_start_timer(in_dev, 1);
+}
+
+
+static void igmp_timer_expire(unsigned long data)
+{
+ struct ip_mc_list *im=(struct ip_mc_list *)data;
+ struct in_device *in_dev = im->interface;
+
+ spin_lock(&im->lock);
+ im->tm_running=0;
+
+ if (im->unsolicit_count) {
+ im->unsolicit_count--;
+ igmp_start_timer(im, IGMP_Unsolicited_Report_Interval);
+ }
+ im->reporter = 1;
+ spin_unlock(&im->lock);
+
+ if (IGMP_V1_SEEN(in_dev))
+ igmp_send_report(in_dev, im, IGMP_HOST_MEMBERSHIP_REPORT);
+ else if (IGMP_V2_SEEN(in_dev))
+ igmp_send_report(in_dev, im, IGMPV2_HOST_MEMBERSHIP_REPORT);
+ else
+ igmp_send_report(in_dev, im, IGMPV3_HOST_MEMBERSHIP_REPORT);
+
+ ip_ma_put(im);
+}
+
+static void igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __u32 *srcs)
+{
+ struct ip_sf_list *psf;
+ int i, scount;
+
+ scount = 0;
+ for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ if (scount == nsrcs)
+ break;
+ for (i=0; i<nsrcs; i++)
+ if (srcs[i] == psf->sf_inaddr) {
+ psf->sf_gsresp = 1;
+ scount++;
+ break;
+ }
+ }
+}
+
+static void igmp_heard_report(struct in_device *in_dev, u32 group)
+{
+ struct ip_mc_list *im;
+
+ /* Timers are only set for non-local groups */
+
+ if (group == IGMP_ALL_HOSTS)
+ return;
+
+ read_lock(&in_dev->mc_list_lock);
+ for (im=in_dev->mc_list; im!=NULL; im=im->next) {
+ if (im->multiaddr == group) {
+ igmp_stop_timer(im);
+ break;
+ }
+ }
+ read_unlock(&in_dev->mc_list_lock);
+}
+
+static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
+ int len)
+{
+ struct igmphdr *ih = skb->h.igmph;
+ struct igmpv3_query *ih3 = (struct igmpv3_query *)ih;
+ struct ip_mc_list *im;
+ u32 group = ih->group;
+ int max_delay;
+ int mark = 0;
+
+
+ if (len == 8) {
+ if (ih->code == 0) {
+			/* Alas, an old v1 router is present here. */
+
+ max_delay = IGMP_Query_Response_Interval;
+ in_dev->mr_v1_seen = jiffies +
+ IGMP_V1_Router_Present_Timeout;
+ group = 0;
+ } else {
+ /* v2 router present */
+ max_delay = ih->code*(HZ/IGMP_TIMER_SCALE);
+ in_dev->mr_v2_seen = jiffies +
+ IGMP_V2_Router_Present_Timeout;
+ }
+ /* cancel the interface change timer */
+ in_dev->mr_ifc_count = 0;
+ if (del_timer(&in_dev->mr_ifc_timer))
+ __in_dev_put(in_dev);
+ /* clear deleted report items */
+ igmpv3_clear_delrec(in_dev);
+ } else if (len < 12) {
+ return; /* ignore bogus packet; freed by caller */
+ } else { /* v3 */
+ if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
+ return;
+
+ ih3 = (struct igmpv3_query *) skb->h.raw;
+ if (ih3->nsrcs) {
+ if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)
+ + ntohs(ih3->nsrcs)*sizeof(__u32)))
+ return;
+ ih3 = (struct igmpv3_query *) skb->h.raw;
+ }
+
+ max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
+ if (!max_delay)
+ max_delay = 1; /* can't mod w/ 0 */
+ in_dev->mr_maxdelay = max_delay;
+ if (ih3->qrv)
+ in_dev->mr_qrv = ih3->qrv;
+ if (!group) { /* general query */
+ if (ih3->nsrcs)
+ return; /* no sources allowed */
+ igmp_gq_start_timer(in_dev);
+ return;
+ }
+ /* mark sources to include, if group & source-specific */
+ mark = ih3->nsrcs != 0;
+ }
+
+ /*
+ * - Start the timers in all of our membership records
+ * that the query applies to for the interface on
+ * which the query arrived excl. those that belong
+ * to a "local" group (224.0.0.X)
+ * - For timers already running check if they need to
+ * be reset.
+ * - Use the igmp->igmp_code field as the maximum
+ * delay possible
+ */
+ read_lock(&in_dev->mc_list_lock);
+ for (im=in_dev->mc_list; im!=NULL; im=im->next) {
+ if (group && group != im->multiaddr)
+ continue;
+ if (im->multiaddr == IGMP_ALL_HOSTS)
+ continue;
+ spin_lock_bh(&im->lock);
+ if (im->tm_running)
+ im->gsquery = im->gsquery && mark;
+ else
+ im->gsquery = mark;
+ if (im->gsquery)
+ igmp_marksources(im, ntohs(ih3->nsrcs), ih3->srcs);
+ spin_unlock_bh(&im->lock);
+ igmp_mod_timer(im, max_delay);
+ }
+ read_unlock(&in_dev->mc_list_lock);
+}
+
+int igmp_rcv(struct sk_buff *skb)
+{
+ /* This basically follows the spec line by line -- see RFC1112 */
+ struct igmphdr *ih;
+ struct in_device *in_dev = in_dev_get(skb->dev);
+ int len = skb->len;
+
+ if (in_dev==NULL) {
+ kfree_skb(skb);
+ return 0;
+ }
+
+ if (!pskb_may_pull(skb, sizeof(struct igmphdr)) ||
+ (u16)csum_fold(skb_checksum(skb, 0, len, 0))) {
+ in_dev_put(in_dev);
+ kfree_skb(skb);
+ return 0;
+ }
+
+ ih = skb->h.igmph;
+ switch (ih->type) {
+ case IGMP_HOST_MEMBERSHIP_QUERY:
+ igmp_heard_query(in_dev, skb, len);
+ break;
+ case IGMP_HOST_MEMBERSHIP_REPORT:
+ case IGMPV2_HOST_MEMBERSHIP_REPORT:
+ case IGMPV3_HOST_MEMBERSHIP_REPORT:
+ /* Is it our report looped back? */
+ if (((struct rtable*)skb->dst)->fl.iif == 0)
+ break;
+ igmp_heard_report(in_dev, ih->group);
+ break;
+ case IGMP_PIM:
+#ifdef CONFIG_IP_PIMSM_V1
+ in_dev_put(in_dev);
+ return pim_rcv_v1(skb);
+#endif
+ case IGMP_DVMRP:
+ case IGMP_TRACE:
+ case IGMP_HOST_LEAVE_MESSAGE:
+ case IGMP_MTRACE:
+ case IGMP_MTRACE_RESP:
+ break;
+ default:
+ NETDEBUG(printk(KERN_DEBUG "New IGMP type=%d, why do we not know about it?\n", ih->type));
+ }
+ in_dev_put(in_dev);
+ kfree_skb(skb);
+ return 0;
+}
+
+#endif
+
+
+/*
+ * Add a filter to a device
+ */
+
+static void ip_mc_filter_add(struct in_device *in_dev, u32 addr)
+{
+ char buf[MAX_ADDR_LEN];
+ struct net_device *dev = in_dev->dev;
+
+ /* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG.
+ We will get multicast token leakage when IFF_MULTICAST
+ is changed. This check should be done in the dev->set_multicast_list
+ routine. Something along the lines of:
+ if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; }
+ --ANK
+ */
+ if (arp_mc_map(addr, buf, dev, 0) == 0)
+ dev_mc_add(dev,buf,dev->addr_len,0);
+}
+
+/*
+ * Remove a filter from a device
+ */
+
+static void ip_mc_filter_del(struct in_device *in_dev, u32 addr)
+{
+ char buf[MAX_ADDR_LEN];
+ struct net_device *dev = in_dev->dev;
+
+ if (arp_mc_map(addr, buf, dev, 0) == 0)
+ dev_mc_delete(dev,buf,dev->addr_len,0);
+}
+
+#ifdef CONFIG_IP_MULTICAST
+/*
+ * deleted ip_mc_list manipulation
+ */
+static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
+{
+ struct ip_mc_list *pmc;
+
+ /* this is an "ip_mc_list" for convenience; only the fields below
+ * are actually used. In particular, the refcnt and users are not
+ * used for management of the delete list. Using the same structure
+ * for deleted items allows change reports to use common code with
+ * non-deleted or query-response MCA's.
+ */
+ pmc = (struct ip_mc_list *)kmalloc(sizeof(*pmc), GFP_KERNEL);
+ if (!pmc)
+ return;
+ memset(pmc, 0, sizeof(*pmc));
+ spin_lock_bh(&im->lock);
+ pmc->interface = im->interface;
+ in_dev_hold(in_dev);
+ pmc->multiaddr = im->multiaddr;
+ pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
+ IGMP_Unsolicited_Report_Count;
+ pmc->sfmode = im->sfmode;
+ if (pmc->sfmode == MCAST_INCLUDE) {
+ struct ip_sf_list *psf;
+
+ pmc->tomb = im->tomb;
+ pmc->sources = im->sources;
+ im->tomb = im->sources = NULL;
+ for (psf=pmc->sources; psf; psf=psf->sf_next)
+ psf->sf_crcount = pmc->crcount;
+ }
+ spin_unlock_bh(&im->lock);
+
+ spin_lock_bh(&in_dev->mc_tomb_lock);
+ pmc->next = in_dev->mc_tomb;
+ in_dev->mc_tomb = pmc;
+ spin_unlock_bh(&in_dev->mc_tomb_lock);
+}
+
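+/*
+ * Drop any pending "deleted" record for a group that is being added
+ * again, freeing the tombstoned sources attached to it.
+ */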
+static void igmpv3_del_delrec(struct in_device *in_dev, __u32 multiaddr)
+{
+ struct ip_mc_list *pmc, *pmc_prev;
+ struct ip_sf_list *psf, *psf_next;
+
+ spin_lock_bh(&in_dev->mc_tomb_lock);
+ pmc_prev = NULL;
+ for (pmc=in_dev->mc_tomb; pmc; pmc=pmc->next) {
+ if (pmc->multiaddr == multiaddr)
+ break;
+ pmc_prev = pmc;
+ }
+ if (pmc) {
+ if (pmc_prev)
+ pmc_prev->next = pmc->next;
+ else
+ in_dev->mc_tomb = pmc->next;
+ }
+ spin_unlock_bh(&in_dev->mc_tomb_lock);
+ if (pmc) {
+ for (psf=pmc->tomb; psf; psf=psf_next) {
+ psf_next = psf->sf_next;
+ kfree(psf);
+ }
+ in_dev_put(pmc->interface);
+ kfree(pmc);
+ }
+}
+
+static void igmpv3_clear_delrec(struct in_device *in_dev)
+{
+ struct ip_mc_list *pmc, *nextpmc;
+
+ spin_lock_bh(&in_dev->mc_tomb_lock);
+ pmc = in_dev->mc_tomb;
+ in_dev->mc_tomb = NULL;
+ spin_unlock_bh(&in_dev->mc_tomb_lock);
+
+ for (; pmc; pmc = nextpmc) {
+ nextpmc = pmc->next;
+ ip_mc_clear_src(pmc);
+ in_dev_put(pmc->interface);
+ kfree(pmc);
+ }
+ /* clear dead sources, too */
+ read_lock(&in_dev->mc_list_lock);
+ for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
+ struct ip_sf_list *psf, *psf_next;
+
+ spin_lock_bh(&pmc->lock);
+ psf = pmc->tomb;
+ pmc->tomb = NULL;
+ spin_unlock_bh(&pmc->lock);
+ for (; psf; psf=psf_next) {
+ psf_next = psf->sf_next;
+ kfree(psf);
+ }
+ }
+ read_unlock(&in_dev->mc_list_lock);
+}
+#endif
+
+static void igmp_group_dropped(struct ip_mc_list *im)
+{
+ struct in_device *in_dev = im->interface;
+#ifdef CONFIG_IP_MULTICAST
+ int reporter;
+#endif
+
+ if (im->loaded) {
+ im->loaded = 0;
+ ip_mc_filter_del(in_dev, im->multiaddr);
+ }
+
+#ifdef CONFIG_IP_MULTICAST
+ if (im->multiaddr == IGMP_ALL_HOSTS)
+ return;
+
+ reporter = im->reporter;
+ igmp_stop_timer(im);
+
+ if (!in_dev->dead) {
+ if (IGMP_V1_SEEN(in_dev))
+ goto done;
+ if (IGMP_V2_SEEN(in_dev)) {
+ if (reporter)
+ igmp_send_report(in_dev, im, IGMP_HOST_LEAVE_MESSAGE);
+ goto done;
+ }
+ /* IGMPv3 */
+ igmpv3_add_delrec(in_dev, im);
+
+ igmp_ifc_event(in_dev);
+ }
+done:
+#endif
+ ip_mc_clear_src(im);
+}
+
+static void igmp_group_added(struct ip_mc_list *im)
+{
+ struct in_device *in_dev = im->interface;
+
+ if (im->loaded == 0) {
+ im->loaded = 1;
+ ip_mc_filter_add(in_dev, im->multiaddr);
+ }
+
+#ifdef CONFIG_IP_MULTICAST
+ if (im->multiaddr == IGMP_ALL_HOSTS)
+ return;
+
+ if (in_dev->dead)
+ return;
+ if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) {
+ spin_lock_bh(&im->lock);
+ igmp_start_timer(im, IGMP_Initial_Report_Delay);
+ spin_unlock_bh(&im->lock);
+ return;
+ }
+ /* else, v3 */
+
+ im->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
+ IGMP_Unsolicited_Report_Count;
+ igmp_ifc_event(in_dev);
+#endif
+}
+
+
+/*
+ * Multicast list managers
+ */
+
+
+/*
+ * A socket has joined a multicast group on device dev.
+ */
+
+void ip_mc_inc_group(struct in_device *in_dev, u32 addr)
+{
+ struct ip_mc_list *im;
+
+ ASSERT_RTNL();
+
+ for (im=in_dev->mc_list; im; im=im->next) {
+ if (im->multiaddr == addr) {
+ im->users++;
+ ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0);
+ goto out;
+ }
+ }
+
+ im = (struct ip_mc_list *)kmalloc(sizeof(*im), GFP_KERNEL);
+ if (!im)
+ goto out;
+
+ im->users=1;
+ im->interface=in_dev;
+ in_dev_hold(in_dev);
+ im->multiaddr=addr;
+ /* initial mode is (EX, empty) */
+ im->sfmode = MCAST_EXCLUDE;
+ im->sfcount[MCAST_INCLUDE] = 0;
+ im->sfcount[MCAST_EXCLUDE] = 1;
+ im->sources = NULL;
+ im->tomb = NULL;
+ im->crcount = 0;
+ atomic_set(&im->refcnt, 1);
+ spin_lock_init(&im->lock);
+#ifdef CONFIG_IP_MULTICAST
+ im->tm_running=0;
+ init_timer(&im->timer);
+ im->timer.data=(unsigned long)im;
+ im->timer.function=&igmp_timer_expire;
+ im->unsolicit_count = IGMP_Unsolicited_Report_Count;
+ im->reporter = 0;
+ im->gsquery = 0;
+#endif
+ im->loaded = 0;
+ write_lock_bh(&in_dev->mc_list_lock);
+ im->next=in_dev->mc_list;
+ in_dev->mc_list=im;
+ write_unlock_bh(&in_dev->mc_list_lock);
+#ifdef CONFIG_IP_MULTICAST
+ igmpv3_del_delrec(in_dev, im->multiaddr);
+#endif
+ igmp_group_added(im);
+ if (!in_dev->dead)
+ ip_rt_multicast_event(in_dev);
+out:
+ return;
+}
+
+/*
+ * A socket has left a multicast group on device dev
+ */
+
+void ip_mc_dec_group(struct in_device *in_dev, u32 addr)
+{
+ struct ip_mc_list *i, **ip;
+
+ ASSERT_RTNL();
+
+ for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) {
+ if (i->multiaddr==addr) {
+ if (--i->users == 0) {
+ write_lock_bh(&in_dev->mc_list_lock);
+ *ip = i->next;
+ write_unlock_bh(&in_dev->mc_list_lock);
+ igmp_group_dropped(i);
+
+ if (!in_dev->dead)
+ ip_rt_multicast_event(in_dev);
+
+ ip_ma_put(i);
+ return;
+ }
+ break;
+ }
+ }
+}
+
+/* Device going down */
+
+void ip_mc_down(struct in_device *in_dev)
+{
+ struct ip_mc_list *i;
+
+ ASSERT_RTNL();
+
+ for (i=in_dev->mc_list; i; i=i->next)
+ igmp_group_dropped(i);
+
+#ifdef CONFIG_IP_MULTICAST
+ in_dev->mr_ifc_count = 0;
+ if (del_timer(&in_dev->mr_ifc_timer))
+ __in_dev_put(in_dev);
+ in_dev->mr_gq_running = 0;
+ if (del_timer(&in_dev->mr_gq_timer))
+ __in_dev_put(in_dev);
+ igmpv3_clear_delrec(in_dev);
+#endif
+
+ ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS);
+}
+
+void ip_mc_init_dev(struct in_device *in_dev)
+{
+ ASSERT_RTNL();
+
+ in_dev->mc_tomb = NULL;
+#ifdef CONFIG_IP_MULTICAST
+ in_dev->mr_gq_running = 0;
+ init_timer(&in_dev->mr_gq_timer);
+ in_dev->mr_gq_timer.data=(unsigned long) in_dev;
+ in_dev->mr_gq_timer.function=&igmp_gq_timer_expire;
+ in_dev->mr_ifc_count = 0;
+ init_timer(&in_dev->mr_ifc_timer);
+ in_dev->mr_ifc_timer.data=(unsigned long) in_dev;
+ in_dev->mr_ifc_timer.function=&igmp_ifc_timer_expire;
+ in_dev->mr_qrv = IGMP_Unsolicited_Report_Count;
+#endif
+
+ rwlock_init(&in_dev->mc_list_lock);
+ spin_lock_init(&in_dev->mc_tomb_lock);
+}
+
+/* Device going up */
+
+void ip_mc_up(struct in_device *in_dev)
+{
+ struct ip_mc_list *i;
+
+ ASSERT_RTNL();
+
+ ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
+
+ for (i=in_dev->mc_list; i; i=i->next)
+ igmp_group_added(i);
+}
+
+/*
+ * Device is about to be destroyed: clean up.
+ */
+
+void ip_mc_destroy_dev(struct in_device *in_dev)
+{
+ struct ip_mc_list *i;
+
+ ASSERT_RTNL();
+
+ /* Deactivate timers */
+ ip_mc_down(in_dev);
+
+ write_lock_bh(&in_dev->mc_list_lock);
+ while ((i = in_dev->mc_list) != NULL) {
+ in_dev->mc_list = i->next;
+ write_unlock_bh(&in_dev->mc_list_lock);
+
+ igmp_group_dropped(i);
+ ip_ma_put(i);
+
+ write_lock_bh(&in_dev->mc_list_lock);
+ }
+ write_unlock_bh(&in_dev->mc_list_lock);
+}
+
+static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr)
+{
+ struct flowi fl = { .nl_u = { .ip4_u =
+ { .daddr = imr->imr_multiaddr.s_addr } } };
+ struct rtable *rt;
+ struct net_device *dev = NULL;
+ struct in_device *idev = NULL;
+
+ if (imr->imr_ifindex) {
+ idev = inetdev_by_index(imr->imr_ifindex);
+ if (idev)
+ __in_dev_put(idev);
+ return idev;
+ }
+ if (imr->imr_address.s_addr) {
+ dev = ip_dev_find(imr->imr_address.s_addr);
+ if (!dev)
+ return NULL;
+ __dev_put(dev);
+ }
+
+ if (!dev && !ip_route_output_key(&rt, &fl)) {
+ dev = rt->u.dst.dev;
+ ip_rt_put(rt);
+ }
+ if (dev) {
+ imr->imr_ifindex = dev->ifindex;
+ idev = __in_dev_get(dev);
+ }
+ return idev;
+}
+
+/*
+ * Join a socket to a group
+ */
+int sysctl_igmp_max_memberships = IP_MAX_MEMBERSHIPS;
+int sysctl_igmp_max_msf = IP_MAX_MSF;
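+/* Both limits are tunable at run time via the igmp_max_memberships and
+ * igmp_max_msf sysctls under /proc/sys/net/ipv4/ (registered elsewhere,
+ * in sysctl_net_ipv4.c).
+ */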
+
+
+static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
+ __u32 *psfsrc)
+{
+ struct ip_sf_list *psf, *psf_prev;
+ int rv = 0;
+
+ psf_prev = NULL;
+ for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ if (psf->sf_inaddr == *psfsrc)
+ break;
+ psf_prev = psf;
+ }
+ if (!psf || psf->sf_count[sfmode] == 0) {
+ /* source filter not found, or count wrong => bug */
+ return -ESRCH;
+ }
+ psf->sf_count[sfmode]--;
+ if (psf->sf_count[sfmode] == 0) {
+ ip_rt_multicast_event(pmc->interface);
+ }
+ if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) {
+#ifdef CONFIG_IP_MULTICAST
+ struct in_device *in_dev = pmc->interface;
+#endif
+
+ /* no more filters for this source */
+ if (psf_prev)
+ psf_prev->sf_next = psf->sf_next;
+ else
+ pmc->sources = psf->sf_next;
+#ifdef CONFIG_IP_MULTICAST
+ if (psf->sf_oldin &&
+ !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) {
+ psf->sf_crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
+ IGMP_Unsolicited_Report_Count;
+ psf->sf_next = pmc->tomb;
+ pmc->tomb = psf;
+ rv = 1;
+ } else
+#endif
+ kfree(psf);
+ }
+ return rv;
+}
+
+#ifndef CONFIG_IP_MULTICAST
+#define igmp_ifc_event(x) do { } while (0)
+#endif
+
+static int ip_mc_del_src(struct in_device *in_dev, __u32 *pmca, int sfmode,
+ int sfcount, __u32 *psfsrc, int delta)
+{
+ struct ip_mc_list *pmc;
+ int changerec = 0;
+ int i, err;
+
+ if (!in_dev)
+ return -ENODEV;
+ read_lock(&in_dev->mc_list_lock);
+ for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
+ if (*pmca == pmc->multiaddr)
+ break;
+ }
+ if (!pmc) {
+ /* MCA not found?? bug */
+ read_unlock(&in_dev->mc_list_lock);
+ return -ESRCH;
+ }
+ spin_lock_bh(&pmc->lock);
+ read_unlock(&in_dev->mc_list_lock);
+#ifdef CONFIG_IP_MULTICAST
+ sf_markstate(pmc);
+#endif
+ if (!delta) {
+ err = -EINVAL;
+ if (!pmc->sfcount[sfmode])
+ goto out_unlock;
+ pmc->sfcount[sfmode]--;
+ }
+ err = 0;
+ for (i=0; i<sfcount; i++) {
+ int rv = ip_mc_del1_src(pmc, sfmode, &psfsrc[i]);
+
+ changerec |= rv > 0;
+ if (!err && rv < 0)
+ err = rv;
+ }
+ if (pmc->sfmode == MCAST_EXCLUDE &&
+ pmc->sfcount[MCAST_EXCLUDE] == 0 &&
+ pmc->sfcount[MCAST_INCLUDE]) {
+#ifdef CONFIG_IP_MULTICAST
+ struct ip_sf_list *psf;
+#endif
+
+ /* filter mode change */
+ pmc->sfmode = MCAST_INCLUDE;
+#ifdef CONFIG_IP_MULTICAST
+ pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
+ IGMP_Unsolicited_Report_Count;
+ in_dev->mr_ifc_count = pmc->crcount;
+ for (psf=pmc->sources; psf; psf = psf->sf_next)
+ psf->sf_crcount = 0;
+ igmp_ifc_event(pmc->interface);
+ } else if (sf_setstate(pmc) || changerec) {
+ igmp_ifc_event(pmc->interface);
+#endif
+ }
+out_unlock:
+ spin_unlock_bh(&pmc->lock);
+ return err;
+}
+
+/*
+ * Add multicast single-source filter to the interface list
+ */
+static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode,
+ __u32 *psfsrc, int delta)
+{
+ struct ip_sf_list *psf, *psf_prev;
+
+ psf_prev = NULL;
+ for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ if (psf->sf_inaddr == *psfsrc)
+ break;
+ psf_prev = psf;
+ }
+ if (!psf) {
+ psf = (struct ip_sf_list *)kmalloc(sizeof(*psf), GFP_ATOMIC);
+ if (!psf)
+ return -ENOBUFS;
+ memset(psf, 0, sizeof(*psf));
+ psf->sf_inaddr = *psfsrc;
+ if (psf_prev) {
+ psf_prev->sf_next = psf;
+ } else
+ pmc->sources = psf;
+ }
+ psf->sf_count[sfmode]++;
+ if (psf->sf_count[sfmode] == 1) {
+ ip_rt_multicast_event(pmc->interface);
+ }
+ return 0;
+}
+
+#ifdef CONFIG_IP_MULTICAST
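+/*
+ * sf_markstate() records, for each source, whether it is currently
+ * passed by the interface filter; sf_setstate() recomputes that after
+ * a change, arms per-source change-report counters (qrv) for every
+ * source whose state flipped, and returns how many changed.
+ */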
+static void sf_markstate(struct ip_mc_list *pmc)
+{
+ struct ip_sf_list *psf;
+ int mca_xcount = pmc->sfcount[MCAST_EXCLUDE];
+
+ for (psf=pmc->sources; psf; psf=psf->sf_next)
+ if (pmc->sfcount[MCAST_EXCLUDE]) {
+ psf->sf_oldin = mca_xcount ==
+ psf->sf_count[MCAST_EXCLUDE] &&
+ !psf->sf_count[MCAST_INCLUDE];
+ } else
+ psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0;
+}
+
+static int sf_setstate(struct ip_mc_list *pmc)
+{
+ struct ip_sf_list *psf;
+ int mca_xcount = pmc->sfcount[MCAST_EXCLUDE];
+ int qrv = pmc->interface->mr_qrv;
+ int new_in, rv;
+
+ rv = 0;
+ for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ if (pmc->sfcount[MCAST_EXCLUDE]) {
+ new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] &&
+ !psf->sf_count[MCAST_INCLUDE];
+ } else
+ new_in = psf->sf_count[MCAST_INCLUDE] != 0;
+ if (new_in != psf->sf_oldin) {
+ psf->sf_crcount = qrv;
+ rv++;
+ }
+ }
+ return rv;
+}
+#endif
+
+/*
+ * Add multicast source filter list to the interface list
+ */
+static int ip_mc_add_src(struct in_device *in_dev, __u32 *pmca, int sfmode,
+ int sfcount, __u32 *psfsrc, int delta)
+{
+ struct ip_mc_list *pmc;
+ int isexclude;
+ int i, err;
+
+ if (!in_dev)
+ return -ENODEV;
+ read_lock(&in_dev->mc_list_lock);
+ for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
+ if (*pmca == pmc->multiaddr)
+ break;
+ }
+ if (!pmc) {
+ /* MCA not found?? bug */
+ read_unlock(&in_dev->mc_list_lock);
+ return -ESRCH;
+ }
+ spin_lock_bh(&pmc->lock);
+ read_unlock(&in_dev->mc_list_lock);
+
+#ifdef CONFIG_IP_MULTICAST
+ sf_markstate(pmc);
+#endif
+ isexclude = pmc->sfmode == MCAST_EXCLUDE;
+ if (!delta)
+ pmc->sfcount[sfmode]++;
+ err = 0;
+ for (i=0; i<sfcount; i++) {
+ err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i], delta);
+ if (err)
+ break;
+ }
+ if (err) {
+ int j;
+
+ pmc->sfcount[sfmode]--;
+ for (j=0; j<i; j++)
+ (void) ip_mc_del1_src(pmc, sfmode, &psfsrc[i]);
+ } else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) {
+#ifdef CONFIG_IP_MULTICAST
+ struct in_device *in_dev = pmc->interface;
+ struct ip_sf_list *psf;
+#endif
+
+ /* filter mode change */
+ if (pmc->sfcount[MCAST_EXCLUDE])
+ pmc->sfmode = MCAST_EXCLUDE;
+ else if (pmc->sfcount[MCAST_INCLUDE])
+ pmc->sfmode = MCAST_INCLUDE;
+#ifdef CONFIG_IP_MULTICAST
+ /* else no filters; keep old mode for reports */
+
+ pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
+ IGMP_Unsolicited_Report_Count;
+ in_dev->mr_ifc_count = pmc->crcount;
+ for (psf=pmc->sources; psf; psf = psf->sf_next)
+ psf->sf_crcount = 0;
+ igmp_ifc_event(in_dev);
+ } else if (sf_setstate(pmc)) {
+ igmp_ifc_event(in_dev);
+#endif
+ }
+ spin_unlock_bh(&pmc->lock);
+ return err;
+}
+
+static void ip_mc_clear_src(struct ip_mc_list *pmc)
+{
+ struct ip_sf_list *psf, *nextpsf;
+
+ for (psf=pmc->tomb; psf; psf=nextpsf) {
+ nextpsf = psf->sf_next;
+ kfree(psf);
+ }
+ pmc->tomb = NULL;
+ for (psf=pmc->sources; psf; psf=nextpsf) {
+ nextpsf = psf->sf_next;
+ kfree(psf);
+ }
+ pmc->sources = NULL;
+ pmc->sfmode = MCAST_EXCLUDE;
+ pmc->sfcount[MCAST_INCLUDE] = 0;
+ pmc->sfcount[MCAST_EXCLUDE] = 1;
+}
+
+
+/*
+ * Join a multicast group
+ */
+int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
+{
+ int err;
+ u32 addr = imr->imr_multiaddr.s_addr;
+ struct ip_mc_socklist *iml, *i;
+ struct in_device *in_dev;
+ struct inet_sock *inet = inet_sk(sk);
+ int count = 0;
+
+ if (!MULTICAST(addr))
+ return -EINVAL;
+
+ rtnl_shlock();
+
+ in_dev = ip_mc_find_dev(imr);
+
+ if (!in_dev) {
+ iml = NULL;
+ err = -ENODEV;
+ goto done;
+ }
+
+ iml = (struct ip_mc_socklist *)sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
+
+ err = -EADDRINUSE;
+ for (i = inet->mc_list; i; i = i->next) {
+ if (memcmp(&i->multi, imr, sizeof(*imr)) == 0) {
+ /* New style additions are reference counted */
+ if (imr->imr_address.s_addr == 0) {
+ i->count++;
+ err = 0;
+ }
+ goto done;
+ }
+ count++;
+ }
+ err = -ENOBUFS;
+ if (iml == NULL || count >= sysctl_igmp_max_memberships)
+ goto done;
+ memcpy(&iml->multi, imr, sizeof(*imr));
+ iml->next = inet->mc_list;
+ iml->count = 1;
+ iml->sflist = NULL;
+ iml->sfmode = MCAST_EXCLUDE;
+ inet->mc_list = iml;
+ ip_mc_inc_group(in_dev, addr);
+ iml = NULL;
+ err = 0;
+
+done:
+ rtnl_shunlock();
+ if (iml)
+ sock_kfree_s(sk, iml, sizeof(*iml));
+ return err;
+}
+
+static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
+ struct in_device *in_dev)
+{
+ int err;
+
+ if (iml->sflist == NULL) {
+ /* any-source empty exclude case */
+ return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
+ iml->sfmode, 0, NULL, 0);
+ }
+ err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
+ iml->sfmode, iml->sflist->sl_count,
+ iml->sflist->sl_addr, 0);
+ sock_kfree_s(sk, iml->sflist, IP_SFLSIZE(iml->sflist->sl_max));
+ iml->sflist = NULL;
+ return err;
+}
+
+/*
+ * Ask a socket to leave a group.
+ */
+
+int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_mc_socklist *iml, **imlp;
+
+ rtnl_lock();
+ for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) {
+ if (iml->multi.imr_multiaddr.s_addr==imr->imr_multiaddr.s_addr &&
+ iml->multi.imr_address.s_addr==imr->imr_address.s_addr &&
+ (!imr->imr_ifindex || iml->multi.imr_ifindex==imr->imr_ifindex)) {
+ struct in_device *in_dev;
+
+ in_dev = inetdev_by_index(iml->multi.imr_ifindex);
+ if (in_dev)
+ (void) ip_mc_leave_src(sk, iml, in_dev);
+ if (--iml->count) {
+ rtnl_unlock();
+ if (in_dev)
+ in_dev_put(in_dev);
+ return 0;
+ }
+
+ *imlp = iml->next;
+
+ if (in_dev) {
+ ip_mc_dec_group(in_dev, imr->imr_multiaddr.s_addr);
+ in_dev_put(in_dev);
+ }
+ rtnl_unlock();
+ sock_kfree_s(sk, iml, sizeof(*iml));
+ return 0;
+ }
+ }
+ rtnl_unlock();
+ return -EADDRNOTAVAIL;
+}
+
+int ip_mc_source(int add, int omode, struct sock *sk,
+ struct ip_mreq_source *mreqs, int ifindex)
+{
+ int err;
+ struct ip_mreqn imr;
+ u32 addr = mreqs->imr_multiaddr;
+ struct ip_mc_socklist *pmc;
+ struct in_device *in_dev = NULL;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_sf_socklist *psl;
+ int i, j, rv;
+
+ if (!MULTICAST(addr))
+ return -EINVAL;
+
+ rtnl_shlock();
+
+ imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr;
+ imr.imr_address.s_addr = mreqs->imr_interface;
+ imr.imr_ifindex = ifindex;
+ in_dev = ip_mc_find_dev(&imr);
+
+ if (!in_dev) {
+ err = -ENODEV;
+ goto done;
+ }
+ err = -EADDRNOTAVAIL;
+
+ for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
+ if (memcmp(&pmc->multi, mreqs, 2*sizeof(__u32)) == 0)
+ break;
+ }
+ if (!pmc) /* must have a prior join */
+ goto done;
+ /* if a source filter was set, must be the same mode as before */
+ if (pmc->sflist) {
+ if (pmc->sfmode != omode)
+ goto done;
+ } else if (pmc->sfmode != omode) {
+ /* allow mode switches for empty-set filters */
+ ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0);
+ ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, pmc->sfmode, 0,
+ NULL, 0);
+ pmc->sfmode = omode;
+ }
+
+ psl = pmc->sflist;
+ if (!add) {
+ if (!psl)
+ goto done;
+ rv = !0;
+ for (i=0; i<psl->sl_count; i++) {
+ rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
+ sizeof(__u32));
+ if (rv == 0)
+ break;
+ }
+ if (rv) /* source not found */
+ goto done;
+
+ /* update the interface filter */
+ ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
+ &mreqs->imr_sourceaddr, 1);
+
+ for (j=i+1; j<psl->sl_count; j++)
+ psl->sl_addr[j-1] = psl->sl_addr[j];
+ psl->sl_count--;
+ err = 0;
+ goto done;
+ }
+ /* else, add a new source to the filter */
+
+ if (psl && psl->sl_count >= sysctl_igmp_max_msf) {
+ err = -ENOBUFS;
+ goto done;
+ }
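+ /* grow the per-socket source list in blocks of IP_SFBLOCK entries */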
+ if (!psl || psl->sl_count == psl->sl_max) {
+ struct ip_sf_socklist *newpsl;
+ int count = IP_SFBLOCK;
+
+ if (psl)
+ count += psl->sl_max;
+ newpsl = (struct ip_sf_socklist *)sock_kmalloc(sk,
+ IP_SFLSIZE(count), GFP_KERNEL);
+ if (!newpsl) {
+ err = -ENOBUFS;
+ goto done;
+ }
+ newpsl->sl_max = count;
+ newpsl->sl_count = count - IP_SFBLOCK;
+ if (psl) {
+ for (i=0; i<psl->sl_count; i++)
+ newpsl->sl_addr[i] = psl->sl_addr[i];
+ sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max));
+ }
+ pmc->sflist = psl = newpsl;
+ }
+ rv = 1; /* > 0 for insert logic below if sl_count is 0 */
+ for (i=0; i<psl->sl_count; i++) {
+ rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
+ sizeof(__u32));
+ if (rv == 0)
+ break;
+ }
+ if (rv == 0) /* address already there is an error */
+ goto done;
+ for (j=psl->sl_count-1; j>=i; j--)
+ psl->sl_addr[j+1] = psl->sl_addr[j];
+ psl->sl_addr[i] = mreqs->imr_sourceaddr;
+ psl->sl_count++;
+ err = 0;
+ /* update the interface list */
+ ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
+ &mreqs->imr_sourceaddr, 1);
+done:
+ rtnl_shunlock();
+ return err;
+}
+
+int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
+{
+ int err;
+ struct ip_mreqn imr;
+ u32 addr = msf->imsf_multiaddr;
+ struct ip_mc_socklist *pmc;
+ struct in_device *in_dev;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_sf_socklist *newpsl, *psl;
+
+ if (!MULTICAST(addr))
+ return -EINVAL;
+ if (msf->imsf_fmode != MCAST_INCLUDE &&
+ msf->imsf_fmode != MCAST_EXCLUDE)
+ return -EINVAL;
+
+ rtnl_shlock();
+
+ imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
+ imr.imr_address.s_addr = msf->imsf_interface;
+ imr.imr_ifindex = ifindex;
+ in_dev = ip_mc_find_dev(&imr);
+
+ if (!in_dev) {
+ err = -ENODEV;
+ goto done;
+ }
+ err = -EADDRNOTAVAIL;
+
+ for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
+ if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
+ pmc->multi.imr_ifindex == imr.imr_ifindex)
+ break;
+ }
+ if (!pmc) /* must have a prior join */
+ goto done;
+ if (msf->imsf_numsrc) {
+ newpsl = (struct ip_sf_socklist *)sock_kmalloc(sk,
+ IP_SFLSIZE(msf->imsf_numsrc), GFP_KERNEL);
+ if (!newpsl) {
+ err = -ENOBUFS;
+ goto done;
+ }
+ newpsl->sl_max = newpsl->sl_count = msf->imsf_numsrc;
+ memcpy(newpsl->sl_addr, msf->imsf_slist,
+ msf->imsf_numsrc * sizeof(msf->imsf_slist[0]));
+ err = ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
+ msf->imsf_fmode, newpsl->sl_count, newpsl->sl_addr, 0);
+ if (err) {
+ sock_kfree_s(sk, newpsl, IP_SFLSIZE(newpsl->sl_max));
+ goto done;
+ }
+ } else
+ newpsl = NULL;
+ psl = pmc->sflist;
+ if (psl) {
+ (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
+ psl->sl_count, psl->sl_addr, 0);
+ sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max));
+ } else
+ (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
+ 0, NULL, 0);
+ pmc->sflist = newpsl;
+ pmc->sfmode = msf->imsf_fmode;
+done:
+ rtnl_shunlock();
+ return err;
+}
+
+int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
+ struct ip_msfilter __user *optval, int __user *optlen)
+{
+ int err, len, count, copycount;
+ struct ip_mreqn imr;
+ u32 addr = msf->imsf_multiaddr;
+ struct ip_mc_socklist *pmc;
+ struct in_device *in_dev;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_sf_socklist *psl;
+
+ if (!MULTICAST(addr))
+ return -EINVAL;
+
+ rtnl_shlock();
+
+ imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
+ imr.imr_address.s_addr = msf->imsf_interface;
+ imr.imr_ifindex = 0;
+ in_dev = ip_mc_find_dev(&imr);
+
+ if (!in_dev) {
+ err = -ENODEV;
+ goto done;
+ }
+ err = -EADDRNOTAVAIL;
+
+ for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
+ if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
+ pmc->multi.imr_ifindex == imr.imr_ifindex)
+ break;
+ }
+ if (!pmc) /* must have a prior join */
+ goto done;
+ msf->imsf_fmode = pmc->sfmode;
+ psl = pmc->sflist;
+ rtnl_shunlock();
+ if (!psl) {
+ len = 0;
+ count = 0;
+ } else {
+ count = psl->sl_count;
+ }
+ copycount = count < msf->imsf_numsrc ? count : msf->imsf_numsrc;
+ len = copycount * sizeof(psl->sl_addr[0]);
+ msf->imsf_numsrc = count;
+ if (put_user(IP_MSFILTER_SIZE(copycount), optlen) ||
+ copy_to_user(optval, msf, IP_MSFILTER_SIZE(0))) {
+ return -EFAULT;
+ }
+ if (len &&
+ copy_to_user(&optval->imsf_slist[0], psl->sl_addr, len))
+ return -EFAULT;
+ return 0;
+done:
+ rtnl_shunlock();
+ return err;
+}
+
+int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
+ struct group_filter __user *optval, int __user *optlen)
+{
+ int err, i, count, copycount;
+ struct sockaddr_in *psin;
+ u32 addr;
+ struct ip_mc_socklist *pmc;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_sf_socklist *psl;
+
+ psin = (struct sockaddr_in *)&gsf->gf_group;
+ if (psin->sin_family != AF_INET)
+ return -EINVAL;
+ addr = psin->sin_addr.s_addr;
+ if (!MULTICAST(addr))
+ return -EINVAL;
+
+ rtnl_shlock();
+
+ err = -EADDRNOTAVAIL;
+
+ for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
+ if (pmc->multi.imr_multiaddr.s_addr == addr &&
+ pmc->multi.imr_ifindex == gsf->gf_interface)
+ break;
+ }
+ if (!pmc) /* must have a prior join */
+ goto done;
+ gsf->gf_fmode = pmc->sfmode;
+ psl = pmc->sflist;
+ rtnl_shunlock();
+ count = psl ? psl->sl_count : 0;
+ copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
+ gsf->gf_numsrc = count;
+ if (put_user(GROUP_FILTER_SIZE(copycount), optlen) ||
+ copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) {
+ return -EFAULT;
+ }
+ for (i=0; i<copycount; i++) {
+ struct sockaddr_in *psin;
+ struct sockaddr_storage ss;
+
+ psin = (struct sockaddr_in *)&ss;
+ memset(&ss, 0, sizeof(ss));
+ psin->sin_family = AF_INET;
+ psin->sin_addr.s_addr = psl->sl_addr[i];
+ if (copy_to_user(&optval->gf_slist[i], &ss, sizeof(ss)))
+ return -EFAULT;
+ }
+ return 0;
+done:
+ rtnl_shunlock();
+ return err;
+}
+
+/*
+ * check if a multicast source filter allows delivery for a given <src,dst,intf>
+ */
+int ip_mc_sf_allow(struct sock *sk, u32 loc_addr, u32 rmt_addr, int dif)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_mc_socklist *pmc;
+ struct ip_sf_socklist *psl;
+ int i;
+
+ if (!MULTICAST(loc_addr))
+ return 1;
+
+ for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
+ if (pmc->multi.imr_multiaddr.s_addr == loc_addr &&
+ pmc->multi.imr_ifindex == dif)
+ break;
+ }
+ if (!pmc)
+ return 1;
+ psl = pmc->sflist;
+ if (!psl)
+ return pmc->sfmode == MCAST_EXCLUDE;
+
+ for (i=0; i<psl->sl_count; i++) {
+ if (psl->sl_addr[i] == rmt_addr)
+ break;
+ }
+ if (pmc->sfmode == MCAST_INCLUDE && i >= psl->sl_count)
+ return 0;
+ if (pmc->sfmode == MCAST_EXCLUDE && i < psl->sl_count)
+ return 0;
+ return 1;
+}
+
+/*
+ * A socket is closing.
+ */
+
+void ip_mc_drop_socket(struct sock *sk)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_mc_socklist *iml;
+
+ if (inet->mc_list == NULL)
+ return;
+
+ rtnl_lock();
+ while ((iml = inet->mc_list) != NULL) {
+ struct in_device *in_dev;
+ inet->mc_list = iml->next;
+
+ if ((in_dev = inetdev_by_index(iml->multi.imr_ifindex)) != NULL) {
+ (void) ip_mc_leave_src(sk, iml, in_dev);
+ ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr);
+ in_dev_put(in_dev);
+ }
+ sock_kfree_s(sk, iml, sizeof(*iml));
+
+ }
+ rtnl_unlock();
+}
+
+int ip_check_mc(struct in_device *in_dev, u32 mc_addr, u32 src_addr, u16 proto)
+{
+ struct ip_mc_list *im;
+ struct ip_sf_list *psf;
+ int rv = 0;
+
+ read_lock(&in_dev->mc_list_lock);
+ for (im=in_dev->mc_list; im; im=im->next) {
+ if (im->multiaddr == mc_addr)
+ break;
+ }
+ if (im && proto == IPPROTO_IGMP) {
+ rv = 1;
+ } else if (im) {
+ if (src_addr) {
+ for (psf=im->sources; psf; psf=psf->sf_next) {
+ if (psf->sf_inaddr == src_addr)
+ break;
+ }
+ if (psf)
+ rv = psf->sf_count[MCAST_INCLUDE] ||
+ psf->sf_count[MCAST_EXCLUDE] !=
+ im->sfcount[MCAST_EXCLUDE];
+ else
+ rv = im->sfcount[MCAST_EXCLUDE] != 0;
+ } else
+ rv = 1; /* unspecified source; tentatively allow */
+ }
+ read_unlock(&in_dev->mc_list_lock);
+ return rv;
+}
+
+#if defined(CONFIG_PROC_FS)
+struct igmp_mc_iter_state {
+ struct net_device *dev;
+ struct in_device *in_dev;
+};
+
+#define igmp_mc_seq_private(seq) ((struct igmp_mc_iter_state *)(seq)->private)
+
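+/*
+ * Advance to the first device with a non-empty multicast list. The
+ * device's mc_list_lock is taken (and an in_device reference held)
+ * and kept until the iterator leaves that device or stops.
+ */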
+static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
+{
+ struct ip_mc_list *im = NULL;
+ struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
+
+ for (state->dev = dev_base, state->in_dev = NULL;
+ state->dev;
+ state->dev = state->dev->next) {
+ struct in_device *in_dev;
+ in_dev = in_dev_get(state->dev);
+ if (!in_dev)
+ continue;
+ read_lock(&in_dev->mc_list_lock);
+ im = in_dev->mc_list;
+ if (im) {
+ state->in_dev = in_dev;
+ break;
+ }
+ read_unlock(&in_dev->mc_list_lock);
+ in_dev_put(in_dev);
+ }
+ return im;
+}
+
+static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im)
+{
+ struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
+ im = im->next;
+ while (!im) {
+ if (likely(state->in_dev != NULL)) {
+ read_unlock(&state->in_dev->mc_list_lock);
+ in_dev_put(state->in_dev);
+ }
+ state->dev = state->dev->next;
+ if (!state->dev) {
+ state->in_dev = NULL;
+ break;
+ }
+ state->in_dev = in_dev_get(state->dev);
+ if (!state->in_dev)
+ continue;
+ read_lock(&state->in_dev->mc_list_lock);
+ im = state->in_dev->mc_list;
+ }
+ return im;
+}
+
+static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct ip_mc_list *im = igmp_mc_get_first(seq);
+ if (im)
+ while (pos && (im = igmp_mc_get_next(seq, im)) != NULL)
+ --pos;
+ return pos ? NULL : im;
+}
+
+static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ read_lock(&dev_base_lock);
+ return *pos ? igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct ip_mc_list *im;
+ if (v == SEQ_START_TOKEN)
+ im = igmp_mc_get_first(seq);
+ else
+ im = igmp_mc_get_next(seq, v);
+ ++*pos;
+ return im;
+}
+
+static void igmp_mc_seq_stop(struct seq_file *seq, void *v)
+{
+ struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
+ if (likely(state->in_dev != NULL)) {
+ read_unlock(&state->in_dev->mc_list_lock);
+ in_dev_put(state->in_dev);
+ state->in_dev = NULL;
+ }
+ state->dev = NULL;
+ read_unlock(&dev_base_lock);
+}
+
+static int igmp_mc_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq,
+ "Idx\tDevice : Count Querier\tGroup Users Timer\tReporter\n");
+ else {
+ struct ip_mc_list *im = (struct ip_mc_list *)v;
+ struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
+ char *querier;
+#ifdef CONFIG_IP_MULTICAST
+ querier = IGMP_V1_SEEN(state->in_dev) ? "V1" :
+ IGMP_V2_SEEN(state->in_dev) ? "V2" :
+ "V3";
+#else
+ querier = "NONE";
+#endif
+
+ if (state->in_dev->mc_list == im) {
+ seq_printf(seq, "%d\t%-10s: %5d %7s\n",
+ state->dev->ifindex, state->dev->name, state->dev->mc_count, querier);
+ }
+
+ seq_printf(seq,
+ "\t\t\t\t%08lX %5d %d:%08lX\t\t%d\n",
+ im->multiaddr, im->users,
+ im->tm_running, im->tm_running ?
+ jiffies_to_clock_t(im->timer.expires-jiffies) : 0,
+ im->reporter);
+ }
+ return 0;
+}
+
+static struct seq_operations igmp_mc_seq_ops = {
+ .start = igmp_mc_seq_start,
+ .next = igmp_mc_seq_next,
+ .stop = igmp_mc_seq_stop,
+ .show = igmp_mc_seq_show,
+};
+
+static int igmp_mc_seq_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int rc = -ENOMEM;
+ struct igmp_mc_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+ if (!s)
+ goto out;
+ rc = seq_open(file, &igmp_mc_seq_ops);
+ if (rc)
+ goto out_kfree;
+
+ seq = file->private_data;
+ seq->private = s;
+ memset(s, 0, sizeof(*s));
+out:
+ return rc;
+out_kfree:
+ kfree(s);
+ goto out;
+}
+
+static struct file_operations igmp_mc_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = igmp_mc_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+struct igmp_mcf_iter_state {
+ struct net_device *dev;
+ struct in_device *idev;
+ struct ip_mc_list *im;
+};
+
+#define igmp_mcf_seq_private(seq) ((struct igmp_mcf_iter_state *)(seq)->private)
+
+static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
+{
+ struct ip_sf_list *psf = NULL;
+ struct ip_mc_list *im = NULL;
+ struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
+
+ for (state->dev = dev_base, state->idev = NULL, state->im = NULL;
+ state->dev;
+ state->dev = state->dev->next) {
+ struct in_device *idev;
+ idev = in_dev_get(state->dev);
+ if (unlikely(idev == NULL))
+ continue;
+ read_lock(&idev->mc_list_lock);
+ im = idev->mc_list;
+ if (likely(im != NULL)) {
+ spin_lock_bh(&im->lock);
+ psf = im->sources;
+ if (likely(psf != NULL)) {
+ state->im = im;
+ state->idev = idev;
+ break;
+ }
+ spin_unlock_bh(&im->lock);
+ }
+ read_unlock(&idev->mc_list_lock);
+ in_dev_put(idev);
+ }
+ return psf;
+}
+
+static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_list *psf)
+{
+ struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
+
+ psf = psf->sf_next;
+ while (!psf) {
+ spin_unlock_bh(&state->im->lock);
+ state->im = state->im->next;
+ while (!state->im) {
+ if (likely(state->idev != NULL)) {
+ read_unlock(&state->idev->mc_list_lock);
+ in_dev_put(state->idev);
+ }
+ state->dev = state->dev->next;
+ if (!state->dev) {
+ state->idev = NULL;
+ goto out;
+ }
+ state->idev = in_dev_get(state->dev);
+ if (!state->idev)
+ continue;
+ read_lock(&state->idev->mc_list_lock);
+ state->im = state->idev->mc_list;
+ }
+ if (!state->im)
+ break;
+ spin_lock_bh(&state->im->lock);
+ psf = state->im->sources;
+ }
+out:
+ return psf;
+}
+
+static struct ip_sf_list *igmp_mcf_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct ip_sf_list *psf = igmp_mcf_get_first(seq);
+ if (psf)
+ while (pos && (psf = igmp_mcf_get_next(seq, psf)) != NULL)
+ --pos;
+ return pos ? NULL : psf;
+}
+
+static void *igmp_mcf_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ read_lock(&dev_base_lock);
+ return *pos ? igmp_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *igmp_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct ip_sf_list *psf;
+ if (v == SEQ_START_TOKEN)
+ psf = igmp_mcf_get_first(seq);
+ else
+ psf = igmp_mcf_get_next(seq, v);
+ ++*pos;
+ return psf;
+}
+
+static void igmp_mcf_seq_stop(struct seq_file *seq, void *v)
+{
+ struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
+ if (likely(state->im != NULL)) {
+ spin_unlock_bh(&state->im->lock);
+ state->im = NULL;
+ }
+ if (likely(state->idev != NULL)) {
+ read_unlock(&state->idev->mc_list_lock);
+ in_dev_put(state->idev);
+ state->idev = NULL;
+ }
+ state->dev = NULL;
+ read_unlock(&dev_base_lock);
+}
+
+static int igmp_mcf_seq_show(struct seq_file *seq, void *v)
+{
+ struct ip_sf_list *psf = (struct ip_sf_list *)v;
+ struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
+
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq,
+ "%3s %6s "
+ "%10s %10s %6s %6s\n", "Idx",
+ "Device", "MCA",
+ "SRC", "INC", "EXC");
+ } else {
+ seq_printf(seq,
+ "%3d %6.6s 0x%08x "
+ "0x%08x %6lu %6lu\n",
+ state->dev->ifindex, state->dev->name,
+ ntohl(state->im->multiaddr),
+ ntohl(psf->sf_inaddr),
+ psf->sf_count[MCAST_INCLUDE],
+ psf->sf_count[MCAST_EXCLUDE]);
+ }
+ return 0;
+}
+
+static struct seq_operations igmp_mcf_seq_ops = {
+ .start = igmp_mcf_seq_start,
+ .next = igmp_mcf_seq_next,
+ .stop = igmp_mcf_seq_stop,
+ .show = igmp_mcf_seq_show,
+};
+
+static int igmp_mcf_seq_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int rc = -ENOMEM;
+ struct igmp_mcf_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+ if (!s)
+ goto out;
+ rc = seq_open(file, &igmp_mcf_seq_ops);
+ if (rc)
+ goto out_kfree;
+
+ seq = file->private_data;
+ seq->private = s;
+ memset(s, 0, sizeof(*s));
+out:
+ return rc;
+out_kfree:
+ kfree(s);
+ goto out;
+}
+
+static struct file_operations igmp_mcf_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = igmp_mcf_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+int __init igmp_mc_proc_init(void)
+{
+ proc_net_fops_create("igmp", S_IRUGO, &igmp_mc_seq_fops);
+ proc_net_fops_create("mcfilter", S_IRUGO, &igmp_mcf_seq_fops);
+ return 0;
+}
+#endif
+
+EXPORT_SYMBOL(ip_mc_dec_group);
+EXPORT_SYMBOL(ip_mc_inc_group);
+EXPORT_SYMBOL(ip_mc_join_group);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
new file mode 100644
index 00000000000..95473953c40
--- /dev/null
+++ b/net/ipv4/inetpeer.c
@@ -0,0 +1,460 @@
+/*
+ * INETPEER - A storage for permanent information about peers
+ *
+ * This source is covered by the GNU GPL, the same as all kernel sources.
+ *
+ * Version: $Id: inetpeer.c,v 1.7 2001/09/20 21:22:50 davem Exp $
+ *
+ * Authors: Andrey V. Savochkin <saw@msu.ru>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/net.h>
+#include <net/inetpeer.h>
+
+/*
+ * Theory of operations.
+ * We keep one entry for each peer IP address. Each node contains long-living
+ * information about the peer which doesn't depend on routes.
+ * At the moment this information consists only of the ID field for the next
+ * outgoing IP packet. This field is incremented with each packet as encoded
+ * in the inet_getid() function (include/net/inetpeer.h).
+ * At the time of writing these notes, the identifier of IP packets is made
+ * unpredictable by this code only for packets that are subject
+ * (actually or potentially) to defragmentation. I.e. DF packets smaller than
+ * the PMTU use a constant ID and do not use this code (see
+ * ip_select_ident() in include/net/ip.h).
+ *
+ * Route cache entries hold references to our nodes.
+ * New cache entries get references via lookup by destination IP address in
+ * the AVL tree. The reference is grabbed only when it's needed, i.e. only
+ * when we try to output IP packet which needs an unpredictable ID (see
+ * __ip_select_ident() in net/ipv4/route.c).
+ * Nodes are removed only when their reference counter goes to 0.
+ * Once that happens, the node may be removed after a sufficient amount of
+ * time has passed since its last use. The less-recently-used entry can
+ * also be removed if the pool is overloaded, i.e. if the total number of
+ * entries is greater than or equal to the threshold.
+ *
+ * The node pool is organised as an AVL tree.
+ * Such an implementation has been chosen not just for fun. It's a way to
+ * prevent easy and efficient DoS attacks by creating hash collisions. A huge
+ * number of long-living nodes in a single hash slot would significantly delay
+ * lookups performed with BHs disabled.
+ *
+ * Serialisation issues.
+ * 1. Nodes may appear in the tree only with the pool write lock held.
+ * 2. Nodes may disappear from the tree only with the pool write lock held
+ * AND reference count being 0.
+ * 3. Nodes appear and disappear from the unused node list only under
+ * "inet_peer_unused_lock".
+ * 4. Global variable peer_total is modified under the pool lock.
+ * 5. struct inet_peer fields modification:
+ * avl_left, avl_right, avl_parent, avl_height: pool lock
+ * unused_next, unused_prevp: unused node list lock
+ * refcnt: atomically against modifications on other CPU;
+ * usually under some other lock to prevent node disappearing
+ * dtime: unused node list lock
+ * v4daddr: unchangeable
+ * ip_id_count: idlock
+ */
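+/*
+ * A minimal usage sketch (illustrative only, not part of this file):
+ * a caller that needs per-destination state looks the peer up, uses it,
+ * and drops the reference when done, e.g.
+ *
+ *	struct inet_peer *peer = inet_getpeer(iph->daddr, 1);
+ *	if (peer != NULL) {
+ *		... e.g. pick the next IP ID via inet_getid(peer) ...
+ *		inet_putpeer(peer);
+ *	}
+ */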
+
+/* Exported for inet_getid inline function. */
+DEFINE_SPINLOCK(inet_peer_idlock);
+
+static kmem_cache_t *peer_cachep;
+
+#define node_height(x) x->avl_height
+static struct inet_peer peer_fake_node = {
+ .avl_left = &peer_fake_node,
+ .avl_right = &peer_fake_node,
+ .avl_height = 0
+};
+#define peer_avl_empty (&peer_fake_node)
+static struct inet_peer *peer_root = peer_avl_empty;
+static DEFINE_RWLOCK(peer_pool_lock);
+#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
+
+static volatile int peer_total;
+/* Exported for sysctl_net_ipv4. */
+int inet_peer_threshold = 65536 + 128; /* start to throw entries more
+ * aggressively at this stage */
+int inet_peer_minttl = 120 * HZ; /* TTL under high load: 120 sec */
+int inet_peer_maxttl = 10 * 60 * HZ; /* usual time to live: 10 min */
+
+static struct inet_peer *inet_peer_unused_head;
+/* Exported for inet_putpeer inline function. */
+struct inet_peer **inet_peer_unused_tailp = &inet_peer_unused_head;
+DEFINE_SPINLOCK(inet_peer_unused_lock);
+#define PEER_MAX_CLEANUP_WORK 30
+
+static void peer_check_expire(unsigned long dummy);
+static struct timer_list peer_periodic_timer =
+ TIMER_INITIALIZER(peer_check_expire, 0, 0);
+
+/* Exported for sysctl_net_ipv4. */
+int inet_peer_gc_mintime = 10 * HZ,
+ inet_peer_gc_maxtime = 120 * HZ;
+
+/* Called from ip_output.c:ip_init */
+void __init inet_initpeers(void)
+{
+ struct sysinfo si;
+
+ /* Use the straight interface to information about memory. */
+ si_meminfo(&si);
+ /* The values below were suggested by Alexey Kuznetsov
+ * <kuznet@ms2.inr.ac.ru>. I don't have any opinion about the values
+ * myself. --SAW
+ */
+ if (si.totalram <= (32768*1024)/PAGE_SIZE)
+ inet_peer_threshold >>= 1; /* max pool size about 1MB on IA32 */
+ if (si.totalram <= (16384*1024)/PAGE_SIZE)
+ inet_peer_threshold >>= 1; /* about 512KB */
+ if (si.totalram <= (8192*1024)/PAGE_SIZE)
+ inet_peer_threshold >>= 2; /* about 128KB */
+
+ peer_cachep = kmem_cache_create("inet_peer_cache",
+ sizeof(struct inet_peer),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+
+ if (!peer_cachep)
+ panic("cannot create inet_peer_cache");
+
+ /* All the timers started at system startup tend
+ to synchronize. Perturb this one a bit.
+ */
+ peer_periodic_timer.expires = jiffies
+ + net_random() % inet_peer_gc_maxtime
+ + inet_peer_gc_maxtime;
+ add_timer(&peer_periodic_timer);
+}
+
+/* Called with or without local BH being disabled. */
+static void unlink_from_unused(struct inet_peer *p)
+{
+ spin_lock_bh(&inet_peer_unused_lock);
+ if (p->unused_prevp != NULL) {
+ /* On unused list. */
+ *p->unused_prevp = p->unused_next;
+ if (p->unused_next != NULL)
+ p->unused_next->unused_prevp = p->unused_prevp;
+ else
+ inet_peer_unused_tailp = p->unused_prevp;
+ p->unused_prevp = NULL; /* mark it as removed */
+ }
+ spin_unlock_bh(&inet_peer_unused_lock);
+}
+
+/* Called with local BH disabled and the pool lock held. */
+#define lookup(daddr) \
+({ \
+ struct inet_peer *u, **v; \
+ stackptr = stack; \
+ *stackptr++ = &peer_root; \
+ for (u = peer_root; u != peer_avl_empty; ) { \
+ if (daddr == u->v4daddr) \
+ break; \
+ if (daddr < u->v4daddr) \
+ v = &u->avl_left; \
+ else \
+ v = &u->avl_right; \
+ *stackptr++ = v; \
+ u = *v; \
+ } \
+ u; \
+})
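+/*
+ * The lookup above records, in "stack", a pointer to every child link
+ * followed from the root, so that link_to_pool() and
+ * peer_avl_rebalance() can rewrite those links without walking the
+ * tree again.
+ */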
+
+/* Called with local BH disabled and the pool write lock held. */
+#define lookup_rightempty(start) \
+({ \
+ struct inet_peer *u, **v; \
+ *stackptr++ = &start->avl_left; \
+ v = &start->avl_left; \
+ for (u = *v; u->avl_right != peer_avl_empty; ) { \
+ v = &u->avl_right; \
+ *stackptr++ = v; \
+ u = *v; \
+ } \
+ u; \
+})
+
+/* Called with local BH disabled and the pool write lock held.
+ * The variable names are the proof of the operation's correctness.
+ * Look into mm/map_avl.c for a more detailed description of the ideas. */
+static void peer_avl_rebalance(struct inet_peer **stack[],
+ struct inet_peer ***stackend)
+{
+ struct inet_peer **nodep, *node, *l, *r;
+ int lh, rh;
+
+ while (stackend > stack) {
+ nodep = *--stackend;
+ node = *nodep;
+ l = node->avl_left;
+ r = node->avl_right;
+ lh = node_height(l);
+ rh = node_height(r);
+ if (lh > rh + 1) { /* l: RH+2 */
+ struct inet_peer *ll, *lr, *lrl, *lrr;
+ int lrh;
+ ll = l->avl_left;
+ lr = l->avl_right;
+ lrh = node_height(lr);
+ if (lrh <= node_height(ll)) { /* ll: RH+1 */
+ node->avl_left = lr; /* lr: RH or RH+1 */
+ node->avl_right = r; /* r: RH */
+ node->avl_height = lrh + 1; /* RH+1 or RH+2 */
+ l->avl_left = ll; /* ll: RH+1 */
+ l->avl_right = node; /* node: RH+1 or RH+2 */
+ l->avl_height = node->avl_height + 1;
+ *nodep = l;
+ } else { /* ll: RH, lr: RH+1 */
+ lrl = lr->avl_left; /* lrl: RH or RH-1 */
+ lrr = lr->avl_right; /* lrr: RH or RH-1 */
+ node->avl_left = lrr; /* lrr: RH or RH-1 */
+ node->avl_right = r; /* r: RH */
+ node->avl_height = rh + 1; /* node: RH+1 */
+ l->avl_left = ll; /* ll: RH */
+ l->avl_right = lrl; /* lrl: RH or RH-1 */
+ l->avl_height = rh + 1; /* l: RH+1 */
+ lr->avl_left = l; /* l: RH+1 */
+ lr->avl_right = node; /* node: RH+1 */
+ lr->avl_height = rh + 2;
+ *nodep = lr;
+ }
+ } else if (rh > lh + 1) { /* r: LH+2 */
+ struct inet_peer *rr, *rl, *rlr, *rll;
+ int rlh;
+ rr = r->avl_right;
+ rl = r->avl_left;
+ rlh = node_height(rl);
+ if (rlh <= node_height(rr)) { /* rr: LH+1 */
+ node->avl_right = rl; /* rl: LH or LH+1 */
+ node->avl_left = l; /* l: LH */
+ node->avl_height = rlh + 1; /* LH+1 or LH+2 */
+ r->avl_right = rr; /* rr: LH+1 */
+ r->avl_left = node; /* node: LH+1 or LH+2 */
+ r->avl_height = node->avl_height + 1;
+ *nodep = r;
+ } else { /* rr: RH, rl: RH+1 */
+ rlr = rl->avl_right; /* rlr: LH or LH-1 */
+ rll = rl->avl_left; /* rll: LH or LH-1 */
+ node->avl_right = rll; /* rll: LH or LH-1 */
+ node->avl_left = l; /* l: LH */
+ node->avl_height = lh + 1; /* node: LH+1 */
+ r->avl_right = rr; /* rr: LH */
+ r->avl_left = rlr; /* rlr: LH or LH-1 */
+ r->avl_height = lh + 1; /* r: LH+1 */
+ rl->avl_right = r; /* r: LH+1 */
+ rl->avl_left = node; /* node: LH+1 */
+ rl->avl_height = lh + 2;
+ *nodep = rl;
+ }
+ } else {
+ node->avl_height = (lh > rh ? lh : rh) + 1;
+ }
+ }
+}
+
+/* Called with local BH disabled and the pool write lock held. */
+#define link_to_pool(n) \
+do { \
+ n->avl_height = 1; \
+ n->avl_left = peer_avl_empty; \
+ n->avl_right = peer_avl_empty; \
+ **--stackptr = n; \
+ peer_avl_rebalance(stack, stackptr); \
+} while(0)
+
+/* May be called with local BH enabled. */
+static void unlink_from_pool(struct inet_peer *p)
+{
+ int do_free;
+
+ do_free = 0;
+
+ write_lock_bh(&peer_pool_lock);
+ /* Check the reference counter. It was artificially incremented by 1
+ * in cleanup_once() to prevent the node from suddenly disappearing. If
+ * the reference count is still 1 then the node is referenced only as `p'
+ * here and from the pool. So under the exclusive pool lock it's safe
+ * to remove the node and free it later. */
+ if (atomic_read(&p->refcnt) == 1) {
+ struct inet_peer **stack[PEER_MAXDEPTH];
+ struct inet_peer ***stackptr, ***delp;
+ if (lookup(p->v4daddr) != p)
+ BUG();
+ delp = stackptr - 1; /* *delp[0] == p */
+ if (p->avl_left == peer_avl_empty) {
+ *delp[0] = p->avl_right;
+ --stackptr;
+ } else {
+ /* look for a node to insert instead of p */
+ struct inet_peer *t;
+ t = lookup_rightempty(p);
+ if (*stackptr[-1] != t)
+ BUG();
+ **--stackptr = t->avl_left;
+ /* t is removed, t->v4daddr > x->v4daddr for any
+ * x in p->avl_left subtree.
+ * Put t in the old place of p. */
+ *delp[0] = t;
+ t->avl_left = p->avl_left;
+ t->avl_right = p->avl_right;
+ t->avl_height = p->avl_height;
+ if (delp[1] != &p->avl_left)
+ BUG();
+ delp[1] = &t->avl_left; /* was &p->avl_left */
+ }
+ peer_avl_rebalance(stack, stackptr);
+ peer_total--;
+ do_free = 1;
+ }
+ write_unlock_bh(&peer_pool_lock);
+
+ if (do_free)
+ kmem_cache_free(peer_cachep, p);
+ else
+ /* The node is used again. Decrease the reference counter
+ * back. The loop "cleanup -> unlink_from_unused
+ * -> unlink_from_pool -> putpeer -> link_to_unused
+ * -> cleanup (for the same node)"
+ * doesn't really exist because the entry will have a
+ * recent deletion time and will not be cleaned again soon. */
+ inet_putpeer(p);
+}
+
+/* May be called with local BH enabled. */
+static int cleanup_once(unsigned long ttl)
+{
+ struct inet_peer *p;
+
+ /* Remove the first entry from the list of unused nodes. */
+ spin_lock_bh(&inet_peer_unused_lock);
+ p = inet_peer_unused_head;
+ if (p != NULL) {
+ if (time_after(p->dtime + ttl, jiffies)) {
+ /* Do not prune fresh entries. */
+ spin_unlock_bh(&inet_peer_unused_lock);
+ return -1;
+ }
+ inet_peer_unused_head = p->unused_next;
+ if (p->unused_next != NULL)
+ p->unused_next->unused_prevp = p->unused_prevp;
+ else
+ inet_peer_unused_tailp = p->unused_prevp;
+ p->unused_prevp = NULL; /* mark as not on the list */
+ /* Grab an extra reference to prevent node disappearing
+ * before unlink_from_pool() call. */
+ atomic_inc(&p->refcnt);
+ }
+ spin_unlock_bh(&inet_peer_unused_lock);
+
+ if (p == NULL)
+ /* It means that the total number of USED entries has
+ * grown over inet_peer_threshold. It shouldn't really
+ * happen because of entry limits in route cache. */
+ return -1;
+
+ unlink_from_pool(p);
+ return 0;
+}
+
+/* Called with or without local BH being disabled. */
+struct inet_peer *inet_getpeer(__u32 daddr, int create)
+{
+ struct inet_peer *p, *n;
+ struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr;
+
+ /* Look up the address quickly. */
+ read_lock_bh(&peer_pool_lock);
+ p = lookup(daddr);
+ if (p != peer_avl_empty)
+ atomic_inc(&p->refcnt);
+ read_unlock_bh(&peer_pool_lock);
+
+ if (p != peer_avl_empty) {
+ /* The existing node has been found. */
+ /* Remove the entry from unused list if it was there. */
+ unlink_from_unused(p);
+ return p;
+ }
+
+ if (!create)
+ return NULL;
+
+ /* Allocate the space outside the locked region. */
+ n = kmem_cache_alloc(peer_cachep, GFP_ATOMIC);
+ if (n == NULL)
+ return NULL;
+ n->v4daddr = daddr;
+ atomic_set(&n->refcnt, 1);
+ n->ip_id_count = secure_ip_id(daddr);
+ n->tcp_ts_stamp = 0;
+
+ write_lock_bh(&peer_pool_lock);
+ /* Check if an entry has suddenly appeared. */
+ p = lookup(daddr);
+ if (p != peer_avl_empty)
+ goto out_free;
+
+ /* Link the node. */
+ link_to_pool(n);
+ n->unused_prevp = NULL; /* not on the list */
+ peer_total++;
+ write_unlock_bh(&peer_pool_lock);
+
+ if (peer_total >= inet_peer_threshold)
+ /* Remove one less-recently-used entry. */
+ cleanup_once(0);
+
+ return n;
+
+out_free:
+ /* The appropriate node is already in the pool. */
+ atomic_inc(&p->refcnt);
+ write_unlock_bh(&peer_pool_lock);
+ /* Remove the entry from unused list if it was there. */
+ unlink_from_unused(p);
+ /* Free the preallocated node. */
+ kmem_cache_free(peer_cachep, n);
+ return p;
+}
+
+/* Called with local BH disabled. */
+static void peer_check_expire(unsigned long dummy)
+{
+ int i;
+ int ttl;
+
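+ /* Scale the permitted idle time linearly between inet_peer_maxttl
+  * (near-empty pool) and inet_peer_minttl (pool at the threshold).
+  */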
+ if (peer_total >= inet_peer_threshold)
+ ttl = inet_peer_minttl;
+ else
+ ttl = inet_peer_maxttl
+ - (inet_peer_maxttl - inet_peer_minttl) / HZ *
+ peer_total / inet_peer_threshold * HZ;
+ for (i = 0; i < PEER_MAX_CLEANUP_WORK && !cleanup_once(ttl); i++);
+
+ /* Trigger the timer again after an inet_peer_gc_mintime .. inet_peer_gc_maxtime
+ * interval, depending on the total number of entries (the more entries,
+ * the shorter the interval). */
+ peer_periodic_timer.expires = jiffies
+ + inet_peer_gc_maxtime
+ - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
+ peer_total / inet_peer_threshold * HZ;
+ add_timer(&peer_periodic_timer);
+}
+
+EXPORT_SYMBOL(inet_peer_idlock);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
new file mode 100644
index 00000000000..77094aac6c2
--- /dev/null
+++ b/net/ipv4/ip_forward.c
@@ -0,0 +1,127 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The IP forwarding functionality.
+ *
+ * Version: $Id: ip_forward.c,v 1.48 2000/12/13 18:31:48 davem Exp $
+ *
+ * Authors: see ip.c
+ *
+ * Fixes:
+ * Many : Split from ip.c , see ip_input.c for
+ * history.
+ * Dave Gregorich : NULL ip_rt_put fix for multicast
+ * routing.
+ * Jos Vos : Add call_out_firewall before sending,
+ * use output device for accounting.
+ * Jos Vos : Call forward firewall after routing
+ * (always use output device).
+ * Mike McLagan : Routing by source
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netdevice.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/icmp.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/checksum.h>
+#include <linux/route.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+
+static inline int ip_forward_finish(struct sk_buff *skb)
+{
+ struct ip_options * opt = &(IPCB(skb)->opt);
+
+ IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
+
+ if (unlikely(opt->optlen))
+ ip_forward_options(skb);
+
+ return dst_output(skb);
+}
+
+int ip_forward(struct sk_buff *skb)
+{
+ struct iphdr *iph; /* Our header */
+ struct rtable *rt; /* Route we use */
+ struct ip_options * opt = &(IPCB(skb)->opt);
+
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb))
+ goto drop;
+
+ if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
+ return NET_RX_SUCCESS;
+
+ if (skb->pkt_type != PACKET_HOST)
+ goto drop;
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ /*
+ * According to the RFC, we must first decrease the TTL field. If
+ * that reaches zero, we must reply with an ICMP control message
+ * telling the sender that the packet's lifetime has expired.
+ */
+
+ iph = skb->nh.iph;
+
+ if (iph->ttl <= 1)
+ goto too_many_hops;
+
+ if (!xfrm4_route_forward(skb))
+ goto drop;
+
+ iph = skb->nh.iph;
+ rt = (struct rtable*)skb->dst;
+
+ if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
+ goto sr_failed;
+
+ /* We are about to mangle packet. Copy it! */
+ if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len))
+ goto drop;
+ iph = skb->nh.iph;
+
+ /* Decrease ttl after skb cow done */
+ ip_decrease_ttl(iph);
+
+ /*
+ * We now generate an ICMP HOST REDIRECT giving the route
+ * we calculated.
+ */
+ if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr)
+ ip_rt_send_redirect(skb);
+
+ skb->priority = rt_tos2priority(iph->tos);
+
+ return NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, rt->u.dst.dev,
+ ip_forward_finish);
+
+sr_failed:
+ /*
+ * Strict routing permits no gatewaying
+ */
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);
+ goto drop;
+
+too_many_hops:
+ /* Tell the sender its packet died... */
+ icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
+drop:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
new file mode 100644
index 00000000000..7f68e27eb4e
--- /dev/null
+++ b/net/ipv4/ip_fragment.c
@@ -0,0 +1,691 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The IP fragmentation functionality.
+ *
+ * Version: $Id: ip_fragment.c,v 1.59 2002/01/12 07:54:56 davem Exp $
+ *
+ * Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
+ * Alan Cox <Alan.Cox@linux.org>
+ *
+ * Fixes:
+ * Alan Cox : Split from ip.c , see ip_input.c for history.
+ * David S. Miller : Begin massive cleanup...
+ * Andi Kleen : Add sysctls.
+ * xxxx : Overlapfrag bug.
+ * Ultima : ip_expire() kernel panic.
+ * Bill Hawes : Frag accounting and evictor fixes.
+ * John McDonald : 0 length frag bug.
+ * Alexey Kuznetsov: SMP races, threading, cleanup.
+ * Patrick McHardy : LRU queue of frag heads for evictor.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/jiffies.h>
+#include <linux/skbuff.h>
+#include <linux/list.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netdevice.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/checksum.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/inet.h>
+#include <linux/netfilter_ipv4.h>
+
+/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
+ * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
+ * as well. Or notify me, at least. --ANK
+ */
+
+/* Fragment cache limits. We will commit 256K at one time. Should we
+ * cross that limit we will prune down to 192K. This should cope with
+ * even the most extreme cases without allowing an attacker to measurably
+ * harm machine performance.
+ */
+int sysctl_ipfrag_high_thresh = 256*1024;
+int sysctl_ipfrag_low_thresh = 192*1024;
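+/* Both thresholds are runtime tunables; they are exposed elsewhere in the
+ * tree as the net.ipv4.ipfrag_high_thresh and net.ipv4.ipfrag_low_thresh
+ * sysctls, so e.g. "sysctl -w net.ipv4.ipfrag_high_thresh=262144" restores
+ * the default shown above.
+ */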
+
+/* Important NOTE! Fragment queue must be destroyed before MSL expires.
+ * RFC791 is wrong in proposing to prolong the timer by the TTL on each
+ * fragment arrival.
+ */
+int sysctl_ipfrag_time = IP_FRAG_TIME;
+
+struct ipfrag_skb_cb
+{
+ struct inet_skb_parm h;
+ int offset;
+};
+
+#define FRAG_CB(skb) ((struct ipfrag_skb_cb*)((skb)->cb))
+
+/* Describe an entry in the "incomplete datagrams" queue. */
+struct ipq {
+ struct ipq *next; /* linked list pointers */
+ struct list_head lru_list; /* lru list member */
+ u32 user;
+ u32 saddr;
+ u32 daddr;
+ u16 id;
+ u8 protocol;
+ u8 last_in;
+#define COMPLETE 4
+#define FIRST_IN 2
+#define LAST_IN 1
+
+ struct sk_buff *fragments; /* linked list of received fragments */
+ int len; /* total length of original datagram */
+ int meat;
+ spinlock_t lock;
+ atomic_t refcnt;
+ struct timer_list timer; /* when will this queue expire? */
+ struct ipq **pprev;
+ int iif;
+ struct timeval stamp;
+};
+
+/* Hash table. */
+
+#define IPQ_HASHSZ 64
+
+/* Per-bucket lock is easy to add now. */
+static struct ipq *ipq_hash[IPQ_HASHSZ];
+static DEFINE_RWLOCK(ipfrag_lock);
+static u32 ipfrag_hash_rnd;
+static LIST_HEAD(ipq_lru_list);
+int ip_frag_nqueues = 0;
+
+static __inline__ void __ipq_unlink(struct ipq *qp)
+{
+ if(qp->next)
+ qp->next->pprev = qp->pprev;
+ *qp->pprev = qp->next;
+ list_del(&qp->lru_list);
+ ip_frag_nqueues--;
+}
+
+static __inline__ void ipq_unlink(struct ipq *ipq)
+{
+ write_lock(&ipfrag_lock);
+ __ipq_unlink(ipq);
+ write_unlock(&ipfrag_lock);
+}
+
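+/* Fold the 4-tuple identifying a fragment queue (IP id, source, destination,
+ * protocol) into one of IPQ_HASHSZ buckets.  ipfrag_hash_rnd is reseeded
+ * periodically by ipfrag_secret_rebuild() below, so attackers cannot easily
+ * predict which chain a crafted fragment will land on.
+ */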
+static unsigned int ipqhashfn(u16 id, u32 saddr, u32 daddr, u8 prot)
+{
+ return jhash_3words((u32)id << 16 | prot, saddr, daddr,
+ ipfrag_hash_rnd) & (IPQ_HASHSZ - 1);
+}
+
+static struct timer_list ipfrag_secret_timer;
+int sysctl_ipfrag_secret_interval = 10 * 60 * HZ;
+
+static void ipfrag_secret_rebuild(unsigned long dummy)
+{
+ unsigned long now = jiffies;
+ int i;
+
+ write_lock(&ipfrag_lock);
+ get_random_bytes(&ipfrag_hash_rnd, sizeof(u32));
+ for (i = 0; i < IPQ_HASHSZ; i++) {
+ struct ipq *q;
+
+ q = ipq_hash[i];
+ while (q) {
+ struct ipq *next = q->next;
+ unsigned int hval = ipqhashfn(q->id, q->saddr,
+ q->daddr, q->protocol);
+
+ if (hval != i) {
+ /* Unlink. */
+ if (q->next)
+ q->next->pprev = q->pprev;
+ *q->pprev = q->next;
+
+ /* Relink to new hash chain. */
+ if ((q->next = ipq_hash[hval]) != NULL)
+ q->next->pprev = &q->next;
+ ipq_hash[hval] = q;
+ q->pprev = &ipq_hash[hval];
+ }
+
+ q = next;
+ }
+ }
+ write_unlock(&ipfrag_lock);
+
+ mod_timer(&ipfrag_secret_timer, now + sysctl_ipfrag_secret_interval);
+}
+
+atomic_t ip_frag_mem = ATOMIC_INIT(0); /* Memory used for fragments */
+
+/* Memory Tracking Functions. */
+static __inline__ void frag_kfree_skb(struct sk_buff *skb, int *work)
+{
+ if (work)
+ *work -= skb->truesize;
+ atomic_sub(skb->truesize, &ip_frag_mem);
+ kfree_skb(skb);
+}
+
+static __inline__ void frag_free_queue(struct ipq *qp, int *work)
+{
+ if (work)
+ *work -= sizeof(struct ipq);
+ atomic_sub(sizeof(struct ipq), &ip_frag_mem);
+ kfree(qp);
+}
+
+static __inline__ struct ipq *frag_alloc_queue(void)
+{
+ struct ipq *qp = kmalloc(sizeof(struct ipq), GFP_ATOMIC);
+
+ if(!qp)
+ return NULL;
+ atomic_add(sizeof(struct ipq), &ip_frag_mem);
+ return qp;
+}
+
+
+/* Destruction primitives. */
+
+/* Complete destruction of ipq. */
+static void ip_frag_destroy(struct ipq *qp, int *work)
+{
+ struct sk_buff *fp;
+
+ BUG_TRAP(qp->last_in&COMPLETE);
+ BUG_TRAP(del_timer(&qp->timer) == 0);
+
+ /* Release all fragment data. */
+ fp = qp->fragments;
+ while (fp) {
+ struct sk_buff *xp = fp->next;
+
+ frag_kfree_skb(fp, work);
+ fp = xp;
+ }
+
+ /* Finally, release the queue descriptor itself. */
+ frag_free_queue(qp, work);
+}
+
+static __inline__ void ipq_put(struct ipq *ipq, int *work)
+{
+ if (atomic_dec_and_test(&ipq->refcnt))
+ ip_frag_destroy(ipq, work);
+}
+
+/* Kill ipq entry. It is not destroyed immediately,
+ * because the caller (and possibly others) still holds a reference count.
+ */
+static void ipq_kill(struct ipq *ipq)
+{
+ if (del_timer(&ipq->timer))
+ atomic_dec(&ipq->refcnt);
+
+ if (!(ipq->last_in & COMPLETE)) {
+ ipq_unlink(ipq);
+ atomic_dec(&ipq->refcnt);
+ ipq->last_in |= COMPLETE;
+ }
+}
+
+/* Memory limiting on fragments. The evictor trashes the oldest
+ * fragment queues until we are back under the low threshold.
+ */
+static void ip_evictor(void)
+{
+ struct ipq *qp;
+ struct list_head *tmp;
+ int work;
+
+ work = atomic_read(&ip_frag_mem) - sysctl_ipfrag_low_thresh;
+ if (work <= 0)
+ return;
+
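+ /* "work" is the number of bytes we still have to free; frag_kfree_skb()
+ * and frag_free_queue() decrement it as memory is released.
+ */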
+ while (work > 0) {
+ read_lock(&ipfrag_lock);
+ if (list_empty(&ipq_lru_list)) {
+ read_unlock(&ipfrag_lock);
+ return;
+ }
+ tmp = ipq_lru_list.next;
+ qp = list_entry(tmp, struct ipq, lru_list);
+ atomic_inc(&qp->refcnt);
+ read_unlock(&ipfrag_lock);
+
+ spin_lock(&qp->lock);
+ if (!(qp->last_in&COMPLETE))
+ ipq_kill(qp);
+ spin_unlock(&qp->lock);
+
+ ipq_put(qp, &work);
+ IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
+ }
+}
+
+/*
+ * Oops, a fragment queue timed out. Kill it and send an ICMP reply.
+ */
+static void ip_expire(unsigned long arg)
+{
+ struct ipq *qp = (struct ipq *) arg;
+
+ spin_lock(&qp->lock);
+
+ if (qp->last_in & COMPLETE)
+ goto out;
+
+ ipq_kill(qp);
+
+ IP_INC_STATS_BH(IPSTATS_MIB_REASMTIMEOUT);
+ IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
+
+ if ((qp->last_in&FIRST_IN) && qp->fragments != NULL) {
+ struct sk_buff *head = qp->fragments;
+ /* Send an ICMP "Fragment Reassembly Timeout" message. */
+ if ((head->dev = dev_get_by_index(qp->iif)) != NULL) {
+ icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
+ dev_put(head->dev);
+ }
+ }
+out:
+ spin_unlock(&qp->lock);
+ ipq_put(qp, NULL);
+}
+
+/* Creation primitives. */
+
+static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in)
+{
+ struct ipq *qp;
+
+ write_lock(&ipfrag_lock);
+#ifdef CONFIG_SMP
+ /* With an SMP race we have to recheck the hash table, because
+ * such an entry could have been created on another cpu while we
+ * were promoting the read lock to a write lock.
+ */
+ for(qp = ipq_hash[hash]; qp; qp = qp->next) {
+ if(qp->id == qp_in->id &&
+ qp->saddr == qp_in->saddr &&
+ qp->daddr == qp_in->daddr &&
+ qp->protocol == qp_in->protocol &&
+ qp->user == qp_in->user) {
+ atomic_inc(&qp->refcnt);
+ write_unlock(&ipfrag_lock);
+ qp_in->last_in |= COMPLETE;
+ ipq_put(qp_in, NULL);
+ return qp;
+ }
+ }
+#endif
+ qp = qp_in;
+
+ if (!mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time))
+ atomic_inc(&qp->refcnt);
+
+ atomic_inc(&qp->refcnt);
+ if((qp->next = ipq_hash[hash]) != NULL)
+ qp->next->pprev = &qp->next;
+ ipq_hash[hash] = qp;
+ qp->pprev = &ipq_hash[hash];
+ INIT_LIST_HEAD(&qp->lru_list);
+ list_add_tail(&qp->lru_list, &ipq_lru_list);
+ ip_frag_nqueues++;
+ write_unlock(&ipfrag_lock);
+ return qp;
+}
+
+/* Add an entry to the 'ipq' queue for a newly received IP datagram. */
+static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user)
+{
+ struct ipq *qp;
+
+ if ((qp = frag_alloc_queue()) == NULL)
+ goto out_nomem;
+
+ qp->protocol = iph->protocol;
+ qp->last_in = 0;
+ qp->id = iph->id;
+ qp->saddr = iph->saddr;
+ qp->daddr = iph->daddr;
+ qp->user = user;
+ qp->len = 0;
+ qp->meat = 0;
+ qp->fragments = NULL;
+ qp->iif = 0;
+
+ /* Initialize a timer for this entry. */
+ init_timer(&qp->timer);
+ qp->timer.data = (unsigned long) qp; /* pointer to queue */
+ qp->timer.function = ip_expire; /* expire function */
+ spin_lock_init(&qp->lock);
+ atomic_set(&qp->refcnt, 1);
+
+ return ip_frag_intern(hash, qp);
+
+out_nomem:
+ NETDEBUG(if (net_ratelimit()) printk(KERN_ERR "ip_frag_create: no memory left !\n"));
+ return NULL;
+}
+
+/* Find the correct entry in the "incomplete datagrams" queue for
+ * this IP datagram, and create a new one if nothing is found.
+ */
+static inline struct ipq *ip_find(struct iphdr *iph, u32 user)
+{
+ __u16 id = iph->id;
+ __u32 saddr = iph->saddr;
+ __u32 daddr = iph->daddr;
+ __u8 protocol = iph->protocol;
+ unsigned int hash = ipqhashfn(id, saddr, daddr, protocol);
+ struct ipq *qp;
+
+ read_lock(&ipfrag_lock);
+ for(qp = ipq_hash[hash]; qp; qp = qp->next) {
+ if(qp->id == id &&
+ qp->saddr == saddr &&
+ qp->daddr == daddr &&
+ qp->protocol == protocol &&
+ qp->user == user) {
+ atomic_inc(&qp->refcnt);
+ read_unlock(&ipfrag_lock);
+ return qp;
+ }
+ }
+ read_unlock(&ipfrag_lock);
+
+ return ip_frag_create(hash, iph, user);
+}
+
+/* Add new segment to existing queue. */
+static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
+{
+ struct sk_buff *prev, *next;
+ int flags, offset;
+ int ihl, end;
+
+ if (qp->last_in & COMPLETE)
+ goto err;
+
+ offset = ntohs(skb->nh.iph->frag_off);
+ flags = offset & ~IP_OFFSET;
+ offset &= IP_OFFSET;
+ offset <<= 3; /* offset is in 8-byte chunks */
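+ /* Example: a fragment-offset field of 185 means this fragment's payload
+ * starts at byte 185 * 8 = 1480 of the original datagram, i.e. right
+ * after a first fragment carrying 1480 bytes of data.
+ */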
+ ihl = skb->nh.iph->ihl * 4;
+
+ /* Determine the position of this fragment. */
+ end = offset + skb->len - ihl;
+
+ /* Is this the final fragment? */
+ if ((flags & IP_MF) == 0) {
+ /* If we already have some bits beyond end
+ * or have a different end, the segment is corrupted.
+ */
+ if (end < qp->len ||
+ ((qp->last_in & LAST_IN) && end != qp->len))
+ goto err;
+ qp->last_in |= LAST_IN;
+ qp->len = end;
+ } else {
+ if (end&7) {
+ end &= ~7;
+ if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+ skb->ip_summed = CHECKSUM_NONE;
+ }
+ if (end > qp->len) {
+ /* Some bits beyond end -> corruption. */
+ if (qp->last_in & LAST_IN)
+ goto err;
+ qp->len = end;
+ }
+ }
+ if (end == offset)
+ goto err;
+
+ if (pskb_pull(skb, ihl) == NULL)
+ goto err;
+ if (pskb_trim(skb, end-offset))
+ goto err;
+
+ /* Find out which fragments are in front and at the back of us
+ * in the chain of fragments so far. We must know where to put
+ * this fragment, right?
+ */
+ prev = NULL;
+ for(next = qp->fragments; next != NULL; next = next->next) {
+ if (FRAG_CB(next)->offset >= offset)
+ break; /* bingo! */
+ prev = next;
+ }
+
+ /* We found where to put this one. Check for overlap with
+ * preceding fragment, and, if needed, align things so that
+ * any overlaps are eliminated.
+ */
+ if (prev) {
+ int i = (FRAG_CB(prev)->offset + prev->len) - offset;
+
+ if (i > 0) {
+ offset += i;
+ if (end <= offset)
+ goto err;
+ if (!pskb_pull(skb, i))
+ goto err;
+ if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+ skb->ip_summed = CHECKSUM_NONE;
+ }
+ }
+
+ while (next && FRAG_CB(next)->offset < end) {
+ int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */
+
+ if (i < next->len) {
+ /* Eat head of the next overlapped fragment
+ * and leave the loop. The next ones cannot overlap.
+ */
+ if (!pskb_pull(next, i))
+ goto err;
+ FRAG_CB(next)->offset += i;
+ qp->meat -= i;
+ if (next->ip_summed != CHECKSUM_UNNECESSARY)
+ next->ip_summed = CHECKSUM_NONE;
+ break;
+ } else {
+ struct sk_buff *free_it = next;
+
+ /* Old fragment is completely overridden by the
+ * new one; drop it.
+ */
+ next = next->next;
+
+ if (prev)
+ prev->next = next;
+ else
+ qp->fragments = next;
+
+ qp->meat -= free_it->len;
+ frag_kfree_skb(free_it, NULL);
+ }
+ }
+
+ FRAG_CB(skb)->offset = offset;
+
+ /* Insert this fragment in the chain of fragments. */
+ skb->next = next;
+ if (prev)
+ prev->next = skb;
+ else
+ qp->fragments = skb;
+
+ if (skb->dev)
+ qp->iif = skb->dev->ifindex;
+ skb->dev = NULL;
+ qp->stamp = skb->stamp;
+ qp->meat += skb->len;
+ atomic_add(skb->truesize, &ip_frag_mem);
+ if (offset == 0)
+ qp->last_in |= FIRST_IN;
+
+ write_lock(&ipfrag_lock);
+ list_move_tail(&qp->lru_list, &ipq_lru_list);
+ write_unlock(&ipfrag_lock);
+
+ return;
+
+err:
+ kfree_skb(skb);
+}
+
+
+/* Build a new IP datagram from all its fragments. */
+
+static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev)
+{
+ struct iphdr *iph;
+ struct sk_buff *fp, *head = qp->fragments;
+ int len;
+ int ihlen;
+
+ ipq_kill(qp);
+
+ BUG_TRAP(head != NULL);
+ BUG_TRAP(FRAG_CB(head)->offset == 0);
+
+ /* Allocate a new buffer for the datagram. */
+ ihlen = head->nh.iph->ihl*4;
+ len = ihlen + qp->len;
+
+ if(len > 65535)
+ goto out_oversize;
+
+ /* Head of list must not be cloned. */
+ if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC))
+ goto out_nomem;
+
+ /* If the first fragment is fragmented itself, we split
+ * it into two chunks: the first with the data and paged part
+ * and the second holding only fragments. */
+ if (skb_shinfo(head)->frag_list) {
+ struct sk_buff *clone;
+ int i, plen = 0;
+
+ if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL)
+ goto out_nomem;
+ clone->next = head->next;
+ head->next = clone;
+ skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
+ skb_shinfo(head)->frag_list = NULL;
+ for (i=0; i<skb_shinfo(head)->nr_frags; i++)
+ plen += skb_shinfo(head)->frags[i].size;
+ clone->len = clone->data_len = head->data_len - plen;
+ head->data_len -= clone->len;
+ head->len -= clone->len;
+ clone->csum = 0;
+ clone->ip_summed = head->ip_summed;
+ atomic_add(clone->truesize, &ip_frag_mem);
+ }
+
+ skb_shinfo(head)->frag_list = head->next;
+ skb_push(head, head->data - head->nh.raw);
+ atomic_sub(head->truesize, &ip_frag_mem);
+
+ for (fp=head->next; fp; fp = fp->next) {
+ head->data_len += fp->len;
+ head->len += fp->len;
+ if (head->ip_summed != fp->ip_summed)
+ head->ip_summed = CHECKSUM_NONE;
+ else if (head->ip_summed == CHECKSUM_HW)
+ head->csum = csum_add(head->csum, fp->csum);
+ head->truesize += fp->truesize;
+ atomic_sub(fp->truesize, &ip_frag_mem);
+ }
+
+ head->next = NULL;
+ head->dev = dev;
+ head->stamp = qp->stamp;
+
+ iph = head->nh.iph;
+ iph->frag_off = 0;
+ iph->tot_len = htons(len);
+ IP_INC_STATS_BH(IPSTATS_MIB_REASMOKS);
+ qp->fragments = NULL;
+ return head;
+
+out_nomem:
+ NETDEBUG(if (net_ratelimit())
+ printk(KERN_ERR
+ "IP: queue_glue: no memory for gluing queue %p\n",
+ qp));
+ goto out_fail;
+out_oversize:
+ if (net_ratelimit())
+ printk(KERN_INFO
+ "Oversized IP packet from %d.%d.%d.%d.\n",
+ NIPQUAD(qp->saddr));
+out_fail:
+ IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
+ return NULL;
+}
+
+/* Process an incoming IP datagram fragment. */
+struct sk_buff *ip_defrag(struct sk_buff *skb, u32 user)
+{
+ struct iphdr *iph = skb->nh.iph;
+ struct ipq *qp;
+ struct net_device *dev;
+
+ IP_INC_STATS_BH(IPSTATS_MIB_REASMREQDS);
+
+ /* Start by cleaning up the memory. */
+ if (atomic_read(&ip_frag_mem) > sysctl_ipfrag_high_thresh)
+ ip_evictor();
+
+ dev = skb->dev;
+
+ /* Lookup (or create) queue header */
+ if ((qp = ip_find(iph, user)) != NULL) {
+ struct sk_buff *ret = NULL;
+
+ spin_lock(&qp->lock);
+
+ ip_frag_queue(qp, skb);
+
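+ /* Reassembly is complete once both the first (offset 0) and the last
+ * (no MF bit) fragments have been seen and the bytes accumulated in
+ * "meat" cover the whole original datagram length.
+ */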
+ if (qp->last_in == (FIRST_IN|LAST_IN) &&
+ qp->meat == qp->len)
+ ret = ip_frag_reasm(qp, dev);
+
+ spin_unlock(&qp->lock);
+ ipq_put(qp, NULL);
+ return ret;
+ }
+
+ IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
+ kfree_skb(skb);
+ return NULL;
+}
+
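+/* One-time initialisation: seed the fragment hash with a value derived from
+ * memory size and jiffies, then arm the timer that periodically re-keys it
+ * via ipfrag_secret_rebuild().
+ */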
+void ipfrag_init(void)
+{
+ ipfrag_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^
+ (jiffies ^ (jiffies >> 6)));
+
+ init_timer(&ipfrag_secret_timer);
+ ipfrag_secret_timer.function = ipfrag_secret_rebuild;
+ ipfrag_secret_timer.expires = jiffies + sysctl_ipfrag_secret_interval;
+ add_timer(&ipfrag_secret_timer);
+}
+
+EXPORT_SYMBOL(ip_defrag);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
new file mode 100644
index 00000000000..88483552222
--- /dev/null
+++ b/net/ipv4/ip_gre.c
@@ -0,0 +1,1290 @@
+/*
+ * Linux NET3: GRE over IP protocol decoder.
+ *
+ * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <asm/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <linux/in6.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/ipip.h>
+#include <net/arp.h>
+#include <net/checksum.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+
+#ifdef CONFIG_IPV6
+#include <net/ipv6.h>
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+#endif
+
+/*
+ Problems & solutions
+ --------------------
+
+ 1. The most important issue is detecting local dead loops.
+ They would cause complete host lockup in transmit, which
+ would be "resolved" by stack overflow or, if queueing is enabled,
+ with infinite looping in net_bh.
+
+ We cannot track such dead loops during route installation;
+ it is an infeasible task. The most general solution would be
+ to keep an skb->encapsulation counter (a sort of local ttl),
+ and silently drop the packet when it expires. It is the best
+ solution, but it supposes maintaining a new variable in ALL
+ skbs, even if no tunneling is used.
+
+ Current solution: the t->recursion lock breaks dead loops. It looks
+ like the dev->tbusy flag, but I preferred a new variable, because
+ the semantics are different. One day, when hard_start_xmit
+ becomes multithreaded, we will have to use skb->encapsulation.
+
+
+
+ 2. Networking dead loops would not kill routers, but would really
+ kill the network. The IP hop limit plays the role of "t->recursion" in this case,
+ if we copy it from the packet being encapsulated to the upper header.
+ It is a very good solution, but it introduces two problems:
+
+ - Routing protocols using packets with ttl=1 (OSPF, RIP2)
+ do not work over tunnels.
+ - traceroute does not work. I planned to relay ICMP from the tunnel,
+ so that this problem would be solved and traceroute output
+ would be even more informative. This idea appeared to be wrong:
+ only Linux complies with rfc1812 now (yes, guys, Linux is the only
+ true router now :-)), all routers (at least, in my neighbourhood)
+ return only 8 bytes of payload. It is the end.
+
+ Hence, if we want OSPF to work or traceroute to say something reasonable,
+ we should search for another solution.
+
+ One of them is to parse the packet, trying to detect inner encapsulation
+ made by our node. It is difficult or even impossible, especially
+ taking fragmentation into account. To be short, it is not a solution at all.
+
+ Current solution: The solution was UNEXPECTEDLY SIMPLE.
+ We force the DF flag on tunnels with a preconfigured hop limit,
+ that is ALL. :-) Well, it does not remove the problem completely,
+ but exponential growth of network traffic is changed to linear
+ (branches that exceed the pmtu are pruned) and the tunnel mtu
+ quickly degrades to a value <68, where looping stops.
+ Yes, it is not good if there exists a router in the loop
+ which does not force DF, even when encapsulating packets have DF set.
+ But it is not our problem! Nobody could accuse us, we did
+ all that we could. Even if it is your gated that injected the
+ fatal route into the network, even if it was you who configured the
+ fatal static route: you are innocent. :-)
+
+
+
+ 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
+ practically identical code. It would be good to glue them
+ together, but it is not very evident how to make them modular.
+ sit is an integral part of IPv6, while ipip and gre are naturally modular.
+ We could extract common parts (hash table, ioctl etc)
+ to a separate module (ip_tunnel.c).
+
+ Alexey Kuznetsov.
+ */
+
+static int ipgre_tunnel_init(struct net_device *dev);
+static void ipgre_tunnel_setup(struct net_device *dev);
+
+/* Fallback tunnel: no source, no destination, no key, no options */
+
+static int ipgre_fb_tunnel_init(struct net_device *dev);
+
+static struct net_device *ipgre_fb_tunnel_dev;
+
+/* Tunnel hash table */
+
+/*
+ 4 hash tables:
+
+ 3: (remote,local)
+ 2: (remote,*)
+ 1: (*,local)
+ 0: (*,*)
+
+ We require an exact key match, i.e. if a key is present in the packet
+ it will match only a tunnel with the same key; if it is not present,
+ it will match only a keyless tunnel.
+
+ All keyless packets that do not match a configured keyless tunnel
+ will match the fallback tunnel.
+ */
+
+#define HASH_SIZE 16
+#define HASH(addr) ((addr^(addr>>4))&0xF)
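+/* HASH() folds the two low-order nibbles of the 32-bit address or key into a
+ * 4-bit bucket index; ipgre_tunnel_lookup() below combines the remote-address
+ * and key hashes with XOR when both are significant.
+ */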
+
+static struct ip_tunnel *tunnels[4][HASH_SIZE];
+
+#define tunnels_r_l (tunnels[3])
+#define tunnels_r (tunnels[2])
+#define tunnels_l (tunnels[1])
+#define tunnels_wc (tunnels[0])
+
+static DEFINE_RWLOCK(ipgre_lock);
+
+/* Given src, dst and key, find appropriate for input tunnel. */
+
+static struct ip_tunnel * ipgre_tunnel_lookup(u32 remote, u32 local, u32 key)
+{
+ unsigned h0 = HASH(remote);
+ unsigned h1 = HASH(key);
+ struct ip_tunnel *t;
+
+ for (t = tunnels_r_l[h0^h1]; t; t = t->next) {
+ if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
+ if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
+ return t;
+ }
+ }
+ for (t = tunnels_r[h0^h1]; t; t = t->next) {
+ if (remote == t->parms.iph.daddr) {
+ if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
+ return t;
+ }
+ }
+ for (t = tunnels_l[h1]; t; t = t->next) {
+ if (local == t->parms.iph.saddr ||
+ (local == t->parms.iph.daddr && MULTICAST(local))) {
+ if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
+ return t;
+ }
+ }
+ for (t = tunnels_wc[h1]; t; t = t->next) {
+ if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
+ return t;
+ }
+
+ if (ipgre_fb_tunnel_dev->flags&IFF_UP)
+ return ipgre_fb_tunnel_dev->priv;
+ return NULL;
+}
+
+static struct ip_tunnel **ipgre_bucket(struct ip_tunnel *t)
+{
+ u32 remote = t->parms.iph.daddr;
+ u32 local = t->parms.iph.saddr;
+ u32 key = t->parms.i_key;
+ unsigned h = HASH(key);
+ int prio = 0;
+
+ if (local)
+ prio |= 1;
+ if (remote && !MULTICAST(remote)) {
+ prio |= 2;
+ h ^= HASH(remote);
+ }
+
+ return &tunnels[prio][h];
+}
+
+static void ipgre_tunnel_link(struct ip_tunnel *t)
+{
+ struct ip_tunnel **tp = ipgre_bucket(t);
+
+ t->next = *tp;
+ write_lock_bh(&ipgre_lock);
+ *tp = t;
+ write_unlock_bh(&ipgre_lock);
+}
+
+static void ipgre_tunnel_unlink(struct ip_tunnel *t)
+{
+ struct ip_tunnel **tp;
+
+ for (tp = ipgre_bucket(t); *tp; tp = &(*tp)->next) {
+ if (t == *tp) {
+ write_lock_bh(&ipgre_lock);
+ *tp = t->next;
+ write_unlock_bh(&ipgre_lock);
+ break;
+ }
+ }
+}
+
+static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int create)
+{
+ u32 remote = parms->iph.daddr;
+ u32 local = parms->iph.saddr;
+ u32 key = parms->i_key;
+ struct ip_tunnel *t, **tp, *nt;
+ struct net_device *dev;
+ unsigned h = HASH(key);
+ int prio = 0;
+ char name[IFNAMSIZ];
+
+ if (local)
+ prio |= 1;
+ if (remote && !MULTICAST(remote)) {
+ prio |= 2;
+ h ^= HASH(remote);
+ }
+ for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) {
+ if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
+ if (key == t->parms.i_key)
+ return t;
+ }
+ }
+ if (!create)
+ return NULL;
+
+ if (parms->name[0])
+ strlcpy(name, parms->name, IFNAMSIZ);
+ else {
+ int i;
+ for (i=1; i<100; i++) {
+ sprintf(name, "gre%d", i);
+ if (__dev_get_by_name(name) == NULL)
+ break;
+ }
+ if (i==100)
+ goto failed;
+ }
+
+ dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
+ if (!dev)
+ return NULL;
+
+ dev->init = ipgre_tunnel_init;
+ nt = dev->priv;
+ nt->parms = *parms;
+
+ if (register_netdevice(dev) < 0) {
+ free_netdev(dev);
+ goto failed;
+ }
+
+ nt = dev->priv;
+ nt->parms = *parms;
+
+ dev_hold(dev);
+ ipgre_tunnel_link(nt);
+ /* Do not decrement MOD_USE_COUNT here. */
+ return nt;
+
+failed:
+ return NULL;
+}
+
+static void ipgre_tunnel_uninit(struct net_device *dev)
+{
+ ipgre_tunnel_unlink((struct ip_tunnel*)dev->priv);
+ dev_put(dev);
+}
+
+
+static void ipgre_err(struct sk_buff *skb, u32 info)
+{
+#ifndef I_WISH_WORLD_WERE_PERFECT
+
+/* It is not :-( All the routers (except for Linux) return only
+ 8 bytes of packet payload. It means that precise relaying of
+ ICMP in the real Internet is absolutely infeasible.
+
+ Moreover, Cisco "wise men" put the GRE key in the third word
+ of the GRE header. It makes it impossible to maintain even soft state for keyed
+ GRE tunnels with checksum enabled. Tell them "thank you".
+
+ Well, I wonder, rfc1812 was written by a Cisco employee,
+ so what the hell are these idiots doing breaking standards established
+ by themselves???
+ */
+
+ struct iphdr *iph = (struct iphdr*)skb->data;
+ u16 *p = (u16*)(skb->data+(iph->ihl<<2));
+ int grehlen = (iph->ihl<<2) + 4;
+ int type = skb->h.icmph->type;
+ int code = skb->h.icmph->code;
+ struct ip_tunnel *t;
+ u16 flags;
+
+ flags = p[0];
+ if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
+ if (flags&(GRE_VERSION|GRE_ROUTING))
+ return;
+ if (flags&GRE_KEY) {
+ grehlen += 4;
+ if (flags&GRE_CSUM)
+ grehlen += 4;
+ }
+ }
+
+ /* If only 8 bytes were returned, keyed messages will be dropped here */
+ if (skb_headlen(skb) < grehlen)
+ return;
+
+ switch (type) {
+ default:
+ case ICMP_PARAMETERPROB:
+ return;
+
+ case ICMP_DEST_UNREACH:
+ switch (code) {
+ case ICMP_SR_FAILED:
+ case ICMP_PORT_UNREACH:
+ /* Impossible event. */
+ return;
+ case ICMP_FRAG_NEEDED:
+ /* Soft state for pmtu is maintained by IP core. */
+ return;
+ default:
+ /* All others are translated to HOST_UNREACH.
+ rfc2003 contains "deep thoughts" about NET_UNREACH,
+ I believe they are just ether pollution. --ANK
+ */
+ break;
+ }
+ break;
+ case ICMP_TIME_EXCEEDED:
+ if (code != ICMP_EXC_TTL)
+ return;
+ break;
+ }
+
+ read_lock(&ipgre_lock);
+ t = ipgre_tunnel_lookup(iph->daddr, iph->saddr, (flags&GRE_KEY) ? *(((u32*)p) + (grehlen>>2) - 1) : 0);
+ if (t == NULL || t->parms.iph.daddr == 0 || MULTICAST(t->parms.iph.daddr))
+ goto out;
+
+ if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
+ goto out;
+
+ if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
+ t->err_count++;
+ else
+ t->err_count = 1;
+ t->err_time = jiffies;
+out:
+ read_unlock(&ipgre_lock);
+ return;
+#else
+ struct iphdr *iph = (struct iphdr*)dp;
+ struct iphdr *eiph;
+ u16 *p = (u16*)(dp+(iph->ihl<<2));
+ int type = skb->h.icmph->type;
+ int code = skb->h.icmph->code;
+ int rel_type = 0;
+ int rel_code = 0;
+ int rel_info = 0;
+ u16 flags;
+ int grehlen = (iph->ihl<<2) + 4;
+ struct sk_buff *skb2;
+ struct flowi fl;
+ struct rtable *rt;
+
+ if (p[1] != htons(ETH_P_IP))
+ return;
+
+ flags = p[0];
+ if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
+ if (flags&(GRE_VERSION|GRE_ROUTING))
+ return;
+ if (flags&GRE_CSUM)
+ grehlen += 4;
+ if (flags&GRE_KEY)
+ grehlen += 4;
+ if (flags&GRE_SEQ)
+ grehlen += 4;
+ }
+ if (len < grehlen + sizeof(struct iphdr))
+ return;
+ eiph = (struct iphdr*)(dp + grehlen);
+
+ switch (type) {
+ default:
+ return;
+ case ICMP_PARAMETERPROB:
+ if (skb->h.icmph->un.gateway < (iph->ihl<<2))
+ return;
+
+ /* So... This guy found something strange INSIDE encapsulated
+ packet. Well, he is a fool, but what can we do?
+ */
+ rel_type = ICMP_PARAMETERPROB;
+ rel_info = skb->h.icmph->un.gateway - grehlen;
+ break;
+
+ case ICMP_DEST_UNREACH:
+ switch (code) {
+ case ICMP_SR_FAILED:
+ case ICMP_PORT_UNREACH:
+ /* Impossible event. */
+ return;
+ case ICMP_FRAG_NEEDED:
+ /* And it is the only really necessary thing :-) */
+ rel_info = ntohs(skb->h.icmph->un.frag.mtu);
+ if (rel_info < grehlen+68)
+ return;
+ rel_info -= grehlen;
+ /* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */
+ if (rel_info > ntohs(eiph->tot_len))
+ return;
+ break;
+ default:
+ /* All others are translated to HOST_UNREACH.
+ rfc2003 contains "deep thoughts" about NET_UNREACH,
+ I believe, it is just ether pollution. --ANK
+ */
+ rel_type = ICMP_DEST_UNREACH;
+ rel_code = ICMP_HOST_UNREACH;
+ break;
+ }
+ break;
+ case ICMP_TIME_EXCEEDED:
+ if (code != ICMP_EXC_TTL)
+ return;
+ break;
+ }
+
+ /* Prepare fake skb to feed it to icmp_send */
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (skb2 == NULL)
+ return;
+ dst_release(skb2->dst);
+ skb2->dst = NULL;
+ skb_pull(skb2, skb->data - (u8*)eiph);
+ skb2->nh.raw = skb2->data;
+
+ /* Try to guess incoming interface */
+ memset(&fl, 0, sizeof(fl));
+ fl.fl4_dst = eiph->saddr;
+ fl.fl4_tos = RT_TOS(eiph->tos);
+ fl.proto = IPPROTO_GRE;
+ if (ip_route_output_key(&rt, &fl)) {
+ kfree_skb(skb2);
+ return;
+ }
+ skb2->dev = rt->u.dst.dev;
+
+ /* route "incoming" packet */
+ if (rt->rt_flags&RTCF_LOCAL) {
+ ip_rt_put(rt);
+ rt = NULL;
+ fl.fl4_dst = eiph->daddr;
+ fl.fl4_src = eiph->saddr;
+ fl.fl4_tos = eiph->tos;
+ if (ip_route_output_key(&rt, &fl) ||
+ rt->u.dst.dev->type != ARPHRD_IPGRE) {
+ ip_rt_put(rt);
+ kfree_skb(skb2);
+ return;
+ }
+ } else {
+ ip_rt_put(rt);
+ if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
+ skb2->dst->dev->type != ARPHRD_IPGRE) {
+ kfree_skb(skb2);
+ return;
+ }
+ }
+
+ /* change mtu on this route */
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+ if (rel_info > dst_mtu(skb2->dst)) {
+ kfree_skb(skb2);
+ return;
+ }
+ skb2->dst->ops->update_pmtu(skb2->dst, rel_info);
+ rel_info = htonl(rel_info);
+ } else if (type == ICMP_TIME_EXCEEDED) {
+ struct ip_tunnel *t = (struct ip_tunnel*)skb2->dev->priv;
+ if (t->parms.iph.ttl) {
+ rel_type = ICMP_DEST_UNREACH;
+ rel_code = ICMP_HOST_UNREACH;
+ }
+ }
+
+ icmp_send(skb2, rel_type, rel_code, rel_info);
+ kfree_skb(skb2);
+#endif
+}
+
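+/* Propagate congestion marking on decapsulation: if the outer (tunnel) header
+ * carries the CE codepoint, set CE on the encapsulated IPv4 or IPv6 header as
+ * well (the usual ECN tunnelling behaviour).
+ */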
+static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
+{
+ if (INET_ECN_is_ce(iph->tos)) {
+ if (skb->protocol == htons(ETH_P_IP)) {
+ IP_ECN_set_ce(skb->nh.iph);
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ IP6_ECN_set_ce(skb->nh.ipv6h);
+ }
+ }
+}
+
+static inline u8
+ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
+{
+ u8 inner = 0;
+ if (skb->protocol == htons(ETH_P_IP))
+ inner = old_iph->tos;
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
+ return INET_ECN_encapsulate(tos, inner);
+}
+
+static int ipgre_rcv(struct sk_buff *skb)
+{
+ struct iphdr *iph;
+ u8 *h;
+ u16 flags;
+ u16 csum = 0;
+ u32 key = 0;
+ u32 seqno = 0;
+ struct ip_tunnel *tunnel;
+ int offset = 4;
+
+ if (!pskb_may_pull(skb, 16))
+ goto drop_nolock;
+
+ iph = skb->nh.iph;
+ h = skb->data;
+ flags = *(u16*)h;
+
+ if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
+ /* - Version must be 0.
+ - We do not support routing headers.
+ */
+ if (flags&(GRE_VERSION|GRE_ROUTING))
+ goto drop_nolock;
+
+ if (flags&GRE_CSUM) {
+ if (skb->ip_summed == CHECKSUM_HW) {
+ csum = (u16)csum_fold(skb->csum);
+ if (csum)
+ skb->ip_summed = CHECKSUM_NONE;
+ }
+ if (skb->ip_summed == CHECKSUM_NONE) {
+ skb->csum = skb_checksum(skb, 0, skb->len, 0);
+ skb->ip_summed = CHECKSUM_HW;
+ csum = (u16)csum_fold(skb->csum);
+ }
+ offset += 4;
+ }
+ if (flags&GRE_KEY) {
+ key = *(u32*)(h + offset);
+ offset += 4;
+ }
+ if (flags&GRE_SEQ) {
+ seqno = ntohl(*(u32*)(h + offset));
+ offset += 4;
+ }
+ }
+
+ read_lock(&ipgre_lock);
+ if ((tunnel = ipgre_tunnel_lookup(iph->saddr, iph->daddr, key)) != NULL) {
+ secpath_reset(skb);
+
+ skb->protocol = *(u16*)(h + 2);
+ /* WCCP version 1 and 2 protocol decoding.
+ * - Change protocol to IP
+ * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
+ */
+ if (flags == 0 &&
+ skb->protocol == __constant_htons(ETH_P_WCCP)) {
+ skb->protocol = __constant_htons(ETH_P_IP);
+ if ((*(h + offset) & 0xF0) != 0x40)
+ offset += 4;
+ }
+
+ skb->mac.raw = skb->nh.raw;
+ skb->nh.raw = __pskb_pull(skb, offset);
+ skb_postpull_rcsum(skb, skb->mac.raw, offset);
+ memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
+ skb->pkt_type = PACKET_HOST;
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+ if (MULTICAST(iph->daddr)) {
+ /* Looped back packet, drop it! */
+ if (((struct rtable*)skb->dst)->fl.iif == 0)
+ goto drop;
+ tunnel->stat.multicast++;
+ skb->pkt_type = PACKET_BROADCAST;
+ }
+#endif
+
+ if (((flags&GRE_CSUM) && csum) ||
+ (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
+ tunnel->stat.rx_crc_errors++;
+ tunnel->stat.rx_errors++;
+ goto drop;
+ }
+ if (tunnel->parms.i_flags&GRE_SEQ) {
+ if (!(flags&GRE_SEQ) ||
+ (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
+ tunnel->stat.rx_fifo_errors++;
+ tunnel->stat.rx_errors++;
+ goto drop;
+ }
+ tunnel->i_seqno = seqno + 1;
+ }
+ tunnel->stat.rx_packets++;
+ tunnel->stat.rx_bytes += skb->len;
+ skb->dev = tunnel->dev;
+ dst_release(skb->dst);
+ skb->dst = NULL;
+ nf_reset(skb);
+ ipgre_ecn_decapsulate(iph, skb);
+ netif_rx(skb);
+ read_unlock(&ipgre_lock);
+ return(0);
+ }
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0);
+
+drop:
+ read_unlock(&ipgre_lock);
+drop_nolock:
+ kfree_skb(skb);
+ return(0);
+}
+
+static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
+ struct net_device_stats *stats = &tunnel->stat;
+ struct iphdr *old_iph = skb->nh.iph;
+ struct iphdr *tiph;
+ u8 tos;
+ u16 df;
+ struct rtable *rt; /* Route to the other host */
+ struct net_device *tdev; /* Device to other host */
+ struct iphdr *iph; /* Our new IP header */
+ int max_headroom; /* The extra header space needed */
+ int gre_hlen;
+ u32 dst;
+ int mtu;
+
+ if (tunnel->recursion++) {
+ tunnel->stat.collisions++;
+ goto tx_error;
+ }
+
+ if (dev->hard_header) {
+ gre_hlen = 0;
+ tiph = (struct iphdr*)skb->data;
+ } else {
+ gre_hlen = tunnel->hlen;
+ tiph = &tunnel->parms.iph;
+ }
+
+ if ((dst = tiph->daddr) == 0) {
+ /* NBMA tunnel */
+
+ if (skb->dst == NULL) {
+ tunnel->stat.tx_fifo_errors++;
+ goto tx_error;
+ }
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ rt = (struct rtable*)skb->dst;
+ if ((dst = rt->rt_gateway) == 0)
+ goto tx_error_icmp;
+ }
+#ifdef CONFIG_IPV6
+ else if (skb->protocol == htons(ETH_P_IPV6)) {
+ struct in6_addr *addr6;
+ int addr_type;
+ struct neighbour *neigh = skb->dst->neighbour;
+
+ if (neigh == NULL)
+ goto tx_error;
+
+ addr6 = (struct in6_addr*)&neigh->primary_key;
+ addr_type = ipv6_addr_type(addr6);
+
+ if (addr_type == IPV6_ADDR_ANY) {
+ addr6 = &skb->nh.ipv6h->daddr;
+ addr_type = ipv6_addr_type(addr6);
+ }
+
+ if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
+ goto tx_error_icmp;
+
+ dst = addr6->s6_addr32[3];
+ }
+#endif
+ else
+ goto tx_error;
+ }
+
+ tos = tiph->tos;
+ if (tos&1) {
+ if (skb->protocol == htons(ETH_P_IP))
+ tos = old_iph->tos;
+ tos &= ~1;
+ }
+
+ {
+ struct flowi fl = { .oif = tunnel->parms.link,
+ .nl_u = { .ip4_u =
+ { .daddr = dst,
+ .saddr = tiph->saddr,
+ .tos = RT_TOS(tos) } },
+ .proto = IPPROTO_GRE };
+ if (ip_route_output_key(&rt, &fl)) {
+ tunnel->stat.tx_carrier_errors++;
+ goto tx_error;
+ }
+ }
+ tdev = rt->u.dst.dev;
+
+ if (tdev == dev) {
+ ip_rt_put(rt);
+ tunnel->stat.collisions++;
+ goto tx_error;
+ }
+
+ df = tiph->frag_off;
+ if (df)
+ mtu = dst_mtu(&rt->u.dst) - tunnel->hlen;
+ else
+ mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
+
+ if (skb->dst)
+ skb->dst->ops->update_pmtu(skb->dst, mtu);
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ df |= (old_iph->frag_off&htons(IP_DF));
+
+ if ((old_iph->frag_off&htons(IP_DF)) &&
+ mtu < ntohs(old_iph->tot_len)) {
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+ ip_rt_put(rt);
+ goto tx_error;
+ }
+ }
+#ifdef CONFIG_IPV6
+ else if (skb->protocol == htons(ETH_P_IPV6)) {
+ struct rt6_info *rt6 = (struct rt6_info*)skb->dst;
+
+ if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
+ if ((tunnel->parms.iph.daddr && !MULTICAST(tunnel->parms.iph.daddr)) ||
+ rt6->rt6i_dst.plen == 128) {
+ rt6->rt6i_flags |= RTF_MODIFIED;
+ skb->dst->metrics[RTAX_MTU-1] = mtu;
+ }
+ }
+
+ if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
+ ip_rt_put(rt);
+ goto tx_error;
+ }
+ }
+#endif
+
+ if (tunnel->err_count > 0) {
+ if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
+ tunnel->err_count--;
+
+ dst_link_failure(skb);
+ } else
+ tunnel->err_count = 0;
+ }
+
+ max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
+
+ if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) {
+ struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
+ if (!new_skb) {
+ ip_rt_put(rt);
+ stats->tx_dropped++;
+ dev_kfree_skb(skb);
+ tunnel->recursion--;
+ return 0;
+ }
+ if (skb->sk)
+ skb_set_owner_w(new_skb, skb->sk);
+ dev_kfree_skb(skb);
+ skb = new_skb;
+ old_iph = skb->nh.iph;
+ }
+
+ skb->h.raw = skb->nh.raw;
+ skb->nh.raw = skb_push(skb, gre_hlen);
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+ dst_release(skb->dst);
+ skb->dst = &rt->u.dst;
+
+ /*
+ * Push down and install the IPIP header.
+ */
+
+ iph = skb->nh.iph;
+ iph->version = 4;
+ iph->ihl = sizeof(struct iphdr) >> 2;
+ iph->frag_off = df;
+ iph->protocol = IPPROTO_GRE;
+ iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
+ iph->daddr = rt->rt_dst;
+ iph->saddr = rt->rt_src;
+
+ if ((iph->ttl = tiph->ttl) == 0) {
+ if (skb->protocol == htons(ETH_P_IP))
+ iph->ttl = old_iph->ttl;
+#ifdef CONFIG_IPV6
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
+#endif
+ else
+ iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
+ }
+
+ ((u16*)(iph+1))[0] = tunnel->parms.o_flags;
+ ((u16*)(iph+1))[1] = skb->protocol;
+
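+ /* Optional GRE fields follow the base header in the order checksum,
+ * key, sequence number.  ptr starts at the last 32-bit word of the
+ * precomputed header (tunnel->hlen) and walks backwards, so the
+ * sequence number is filled in first and the checksum last.
+ */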
+ if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
+ u32 *ptr = (u32*)(((u8*)iph) + tunnel->hlen - 4);
+
+ if (tunnel->parms.o_flags&GRE_SEQ) {
+ ++tunnel->o_seqno;
+ *ptr = htonl(tunnel->o_seqno);
+ ptr--;
+ }
+ if (tunnel->parms.o_flags&GRE_KEY) {
+ *ptr = tunnel->parms.o_key;
+ ptr--;
+ }
+ if (tunnel->parms.o_flags&GRE_CSUM) {
+ *ptr = 0;
+ *(__u16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
+ }
+ }
+
+ nf_reset(skb);
+
+ IPTUNNEL_XMIT();
+ tunnel->recursion--;
+ return 0;
+
+tx_error_icmp:
+ dst_link_failure(skb);
+
+tx_error:
+ stats->tx_errors++;
+ dev_kfree_skb(skb);
+ tunnel->recursion--;
+ return 0;
+}
+
+static int
+ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+ int err = 0;
+ struct ip_tunnel_parm p;
+ struct ip_tunnel *t;
+
+ switch (cmd) {
+ case SIOCGETTUNNEL:
+ t = NULL;
+ if (dev == ipgre_fb_tunnel_dev) {
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
+ err = -EFAULT;
+ break;
+ }
+ t = ipgre_tunnel_locate(&p, 0);
+ }
+ if (t == NULL)
+ t = (struct ip_tunnel*)dev->priv;
+ memcpy(&p, &t->parms, sizeof(p));
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ err = -EFAULT;
+ break;
+
+ case SIOCADDTUNNEL:
+ case SIOCCHGTUNNEL:
+ err = -EPERM;
+ if (!capable(CAP_NET_ADMIN))
+ goto done;
+
+ err = -EFAULT;
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ goto done;
+
+ err = -EINVAL;
+ if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
+ p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
+ ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
+ goto done;
+ if (p.iph.ttl)
+ p.iph.frag_off |= htons(IP_DF);
+
+ if (!(p.i_flags&GRE_KEY))
+ p.i_key = 0;
+ if (!(p.o_flags&GRE_KEY))
+ p.o_key = 0;
+
+ t = ipgre_tunnel_locate(&p, cmd == SIOCADDTUNNEL);
+
+ if (dev != ipgre_fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
+ if (t != NULL) {
+ if (t->dev != dev) {
+ err = -EEXIST;
+ break;
+ }
+ } else {
+ unsigned nflags=0;
+
+ t = (struct ip_tunnel*)dev->priv;
+
+ if (MULTICAST(p.iph.daddr))
+ nflags = IFF_BROADCAST;
+ else if (p.iph.daddr)
+ nflags = IFF_POINTOPOINT;
+
+ if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
+ err = -EINVAL;
+ break;
+ }
+ ipgre_tunnel_unlink(t);
+ t->parms.iph.saddr = p.iph.saddr;
+ t->parms.iph.daddr = p.iph.daddr;
+ t->parms.i_key = p.i_key;
+ t->parms.o_key = p.o_key;
+ memcpy(dev->dev_addr, &p.iph.saddr, 4);
+ memcpy(dev->broadcast, &p.iph.daddr, 4);
+ ipgre_tunnel_link(t);
+ netdev_state_change(dev);
+ }
+ }
+
+ if (t) {
+ err = 0;
+ if (cmd == SIOCCHGTUNNEL) {
+ t->parms.iph.ttl = p.iph.ttl;
+ t->parms.iph.tos = p.iph.tos;
+ t->parms.iph.frag_off = p.iph.frag_off;
+ }
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
+ err = -EFAULT;
+ } else
+ err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
+ break;
+
+ case SIOCDELTUNNEL:
+ err = -EPERM;
+ if (!capable(CAP_NET_ADMIN))
+ goto done;
+
+ if (dev == ipgre_fb_tunnel_dev) {
+ err = -EFAULT;
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ goto done;
+ err = -ENOENT;
+ if ((t = ipgre_tunnel_locate(&p, 0)) == NULL)
+ goto done;
+ err = -EPERM;
+ if (t == ipgre_fb_tunnel_dev->priv)
+ goto done;
+ dev = t->dev;
+ }
+ err = unregister_netdevice(dev);
+ break;
+
+ default:
+ err = -EINVAL;
+ }
+
+done:
+ return err;
+}
+
+static struct net_device_stats *ipgre_tunnel_get_stats(struct net_device *dev)
+{
+ return &(((struct ip_tunnel*)dev->priv)->stat);
+}
+
+static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
+{
+ struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
+ if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen)
+ return -EINVAL;
+ dev->mtu = new_mtu;
+ return 0;
+}
+
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+/* Nice toy. Unfortunately, useless in real life :-)
+ It allows constructing a virtual multiprotocol broadcast "LAN"
+ over the Internet, provided multicast routing is tuned.
+
+
+ I have no idea whether this bicycle was invented before me,
+ so I had to set ARPHRD_IPGRE to a random value.
+ I have the impression that Cisco could make something similar,
+ but this feature is apparently missing in IOS<=11.2(8).
+
+ I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
+ with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
+
+ ping -t 255 224.66.66.66
+
+ If nobody answers, mbone does not work.
+
+ ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
+ ip addr add 10.66.66.<somewhat>/24 dev Universe
+ ifconfig Universe up
+ ifconfig Universe add fe80::<Your_real_addr>/10
+ ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
+ ftp 10.66.66.66
+ ...
+ ftp fec0:6666:6666::193.233.7.65
+ ...
+
+ */
+
+static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
+ void *daddr, void *saddr, unsigned len)
+{
+ struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
+ struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
+ u16 *p = (u16*)(iph+1);
+
+ memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
+ p[0] = t->parms.o_flags;
+ p[1] = htons(type);
+
+ /*
+ * Set the source hardware address.
+ */
+
+ if (saddr)
+ memcpy(&iph->saddr, saddr, 4);
+
+ if (daddr) {
+ memcpy(&iph->daddr, daddr, 4);
+ return t->hlen;
+ }
+ if (iph->daddr && !MULTICAST(iph->daddr))
+ return t->hlen;
+
+ return -t->hlen;
+}
+
+static int ipgre_open(struct net_device *dev)
+{
+ struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
+
+ if (MULTICAST(t->parms.iph.daddr)) {
+ struct flowi fl = { .oif = t->parms.link,
+ .nl_u = { .ip4_u =
+ { .daddr = t->parms.iph.daddr,
+ .saddr = t->parms.iph.saddr,
+ .tos = RT_TOS(t->parms.iph.tos) } },
+ .proto = IPPROTO_GRE };
+ struct rtable *rt;
+ if (ip_route_output_key(&rt, &fl))
+ return -EADDRNOTAVAIL;
+ dev = rt->u.dst.dev;
+ ip_rt_put(rt);
+ if (__in_dev_get(dev) == NULL)
+ return -EADDRNOTAVAIL;
+ t->mlink = dev->ifindex;
+ ip_mc_inc_group(__in_dev_get(dev), t->parms.iph.daddr);
+ }
+ return 0;
+}
+
+static int ipgre_close(struct net_device *dev)
+{
+ struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
+ if (MULTICAST(t->parms.iph.daddr) && t->mlink) {
+ struct in_device *in_dev = inetdev_by_index(t->mlink);
+ if (in_dev) {
+ ip_mc_dec_group(in_dev, t->parms.iph.daddr);
+ in_dev_put(in_dev);
+ }
+ }
+ return 0;
+}
+
+#endif
+
+static void ipgre_tunnel_setup(struct net_device *dev)
+{
+ SET_MODULE_OWNER(dev);
+ dev->uninit = ipgre_tunnel_uninit;
+ dev->destructor = free_netdev;
+ dev->hard_start_xmit = ipgre_tunnel_xmit;
+ dev->get_stats = ipgre_tunnel_get_stats;
+ dev->do_ioctl = ipgre_tunnel_ioctl;
+ dev->change_mtu = ipgre_tunnel_change_mtu;
+
+ dev->type = ARPHRD_IPGRE;
+ dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
+ dev->mtu = 1500 - sizeof(struct iphdr) - 4;
+ dev->flags = IFF_NOARP;
+ dev->iflink = 0;
+ dev->addr_len = 4;
+}
+
+static int ipgre_tunnel_init(struct net_device *dev)
+{
+ struct net_device *tdev = NULL;
+ struct ip_tunnel *tunnel;
+ struct iphdr *iph;
+ int hlen = LL_MAX_HEADER;
+ int mtu = 1500;
+ int addend = sizeof(struct iphdr) + 4;
+
+ tunnel = (struct ip_tunnel*)dev->priv;
+ iph = &tunnel->parms.iph;
+
+ tunnel->dev = dev;
+ strcpy(tunnel->parms.name, dev->name);
+
+ memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
+ memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
+
+ /* Guess output device to choose reasonable mtu and hard_header_len */
+
+ if (iph->daddr) {
+ struct flowi fl = { .oif = tunnel->parms.link,
+ .nl_u = { .ip4_u =
+ { .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .tos = RT_TOS(iph->tos) } },
+ .proto = IPPROTO_GRE };
+ struct rtable *rt;
+ if (!ip_route_output_key(&rt, &fl)) {
+ tdev = rt->u.dst.dev;
+ ip_rt_put(rt);
+ }
+
+ dev->flags |= IFF_POINTOPOINT;
+
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+ if (MULTICAST(iph->daddr)) {
+ if (!iph->saddr)
+ return -EINVAL;
+ dev->flags = IFF_BROADCAST;
+ dev->hard_header = ipgre_header;
+ dev->open = ipgre_open;
+ dev->stop = ipgre_close;
+ }
+#endif
+ }
+
+ if (!tdev && tunnel->parms.link)
+ tdev = __dev_get_by_index(tunnel->parms.link);
+
+ if (tdev) {
+ hlen = tdev->hard_header_len;
+ mtu = tdev->mtu;
+ }
+ dev->iflink = tunnel->parms.link;
+
+ /* Precalculate GRE options length */
+ if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
+ if (tunnel->parms.o_flags&GRE_CSUM)
+ addend += 4;
+ if (tunnel->parms.o_flags&GRE_KEY)
+ addend += 4;
+ if (tunnel->parms.o_flags&GRE_SEQ)
+ addend += 4;
+ }
+ dev->hard_header_len = hlen + addend;
+ dev->mtu = mtu - addend;
+ tunnel->hlen = addend;
+ return 0;
+}
+
+int __init ipgre_fb_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
+ struct iphdr *iph = &tunnel->parms.iph;
+
+ tunnel->dev = dev;
+ strcpy(tunnel->parms.name, dev->name);
+
+ iph->version = 4;
+ iph->protocol = IPPROTO_GRE;
+ iph->ihl = 5;
+ tunnel->hlen = sizeof(struct iphdr) + 4;
+
+ dev_hold(dev);
+ tunnels_wc[0] = tunnel;
+ return 0;
+}
+
+
+static struct net_protocol ipgre_protocol = {
+ .handler = ipgre_rcv,
+ .err_handler = ipgre_err,
+};
+
+
+/*
+ * And now the modules code and kernel interface.
+ */
+
+static int __init ipgre_init(void)
+{
+ int err;
+
+ printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
+
+ if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
+ printk(KERN_INFO "ipgre init: can't add protocol\n");
+ return -EAGAIN;
+ }
+
+ ipgre_fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
+ ipgre_tunnel_setup);
+ if (!ipgre_fb_tunnel_dev) {
+ err = -ENOMEM;
+ goto err1;
+ }
+
+ ipgre_fb_tunnel_dev->init = ipgre_fb_tunnel_init;
+
+ if ((err = register_netdev(ipgre_fb_tunnel_dev)))
+ goto err2;
+out:
+ return err;
+err2:
+ free_netdev(ipgre_fb_tunnel_dev);
+err1:
+ inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
+ goto out;
+}
+
+static void ipgre_fini(void)
+{
+ if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
+ printk(KERN_INFO "ipgre close: can't remove protocol\n");
+
+ unregister_netdev(ipgre_fb_tunnel_dev);
+}
+
+module_init(ipgre_init);
+module_exit(ipgre_fini);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
new file mode 100644
index 00000000000..a0d0833034b
--- /dev/null
+++ b/net/ipv4/ip_input.c
@@ -0,0 +1,431 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The Internet Protocol (IP) module.
+ *
+ * Version: $Id: ip_input.c,v 1.55 2002/01/12 07:39:45 davem Exp $
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Donald Becker, <becker@super.org>
+ * Alan Cox, <Alan.Cox@linux.org>
+ * Richard Underwood
+ * Stefan Becker, <stefanb@yello.ping.de>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *
+ *
+ * Fixes:
+ * Alan Cox : Commented a couple of minor bits of surplus code
+ * Alan Cox : Undefining IP_FORWARD doesn't include the code
+ * (just stops a compiler warning).
+ * Alan Cox : Frames with >=MAX_ROUTE record routes, strict routes or loose routes
+ * are junked rather than corrupting things.
+ * Alan Cox : Frames to bad broadcast subnets are dumped
+ * We used to process them non broadcast and
+ * boy could that cause havoc.
+ * Alan Cox : ip_forward sets the free flag on the
+ * new frame it queues. Still crap because
+ * it copies the frame but at least it
+ * doesn't eat memory too.
+ * Alan Cox : Generic queue code and memory fixes.
+ * Fred Van Kempen : IP fragment support (borrowed from NET2E)
+ * Gerhard Koerting: Forward fragmented frames correctly.
+ * Gerhard Koerting: Fixes to my fix of the above 8-).
+ * Gerhard Koerting: IP interface addressing fix.
+ * Linus Torvalds : More robustness checks
+ * Alan Cox : Even more checks: Still not as robust as it ought to be
+ * Alan Cox : Save IP header pointer for later
+ * Alan Cox : ip option setting
+ * Alan Cox : Use ip_tos/ip_ttl settings
+ * Alan Cox : Fragmentation bogosity removed
+ * (Thanks to Mark.Bush@prg.ox.ac.uk)
+ * Dmitry Gorodchanin : Send of a raw packet crash fix.
+ * Alan Cox : Silly ip bug when an overlength
+ * fragment turns up. Now frees the
+ * queue.
+ * Linus Torvalds/ : Memory leakage on fragmentation
+ * Alan Cox : handling.
+ * Gerhard Koerting: Forwarding uses IP priority hints
+ * Teemu Rantanen : Fragment problems.
+ * Alan Cox : General cleanup, comments and reformat
+ * Alan Cox : SNMP statistics
+ * Alan Cox : BSD address rule semantics. Also see
+ * UDP as there is a nasty checksum issue
+ * if you do things the wrong way.
+ * Alan Cox : Always defrag, moved IP_FORWARD to the config.in file
+ * Alan Cox : IP options adjust sk->priority.
+ * Pedro Roque : Fix mtu/length error in ip_forward.
+ * Alan Cox : Avoid ip_chk_addr when possible.
+ * Richard Underwood : IP multicasting.
+ * Alan Cox : Cleaned up multicast handlers.
+ * Alan Cox : RAW sockets demultiplex in the BSD style.
+ * Gunther Mayer : Fix the SNMP reporting typo
+ * Alan Cox : Always in group 224.0.0.1
+ * Pauline Middelink : Fast ip_checksum update when forwarding
+ * Masquerading support.
+ * Alan Cox : Multicast loopback error for 224.0.0.1
+ * Alan Cox : IP_MULTICAST_LOOP option.
+ * Alan Cox : Use notifiers.
+ * Bjorn Ekwall : Removed ip_csum (from slhc.c too)
+ * Bjorn Ekwall : Moved ip_fast_csum to ip.h (inline!)
+ * Stefan Becker : Send out ICMP HOST REDIRECT
+ * Arnt Gulbrandsen : ip_build_xmit
+ * Alan Cox : Per socket routing cache
+ * Alan Cox : Fixed routing cache, added header cache.
+ * Alan Cox : Loopback didn't work right in original ip_build_xmit - fixed it.
+ * Alan Cox : Only send ICMP_REDIRECT if src/dest are the same net.
+ * Alan Cox : Incoming IP option handling.
+ * Alan Cox : Set saddr on raw output frames as per BSD.
+ * Alan Cox : Stopped broadcast source route explosions.
+ * Alan Cox : Can disable source routing
+ * Takeshi Sone : Masquerading didn't work.
+ * Dave Bonn,Alan Cox : Faster IP forwarding whenever possible.
+ * Alan Cox : Memory leaks, tramples, misc debugging.
+ * Alan Cox : Fixed multicast (by popular demand 8))
+ * Alan Cox : Fixed forwarding (by even more popular demand 8))
+ * Alan Cox : Fixed SNMP statistics [I think]
+ * Gerhard Koerting : IP fragmentation forwarding fix
+ * Alan Cox : Device lock against page fault.
+ * Alan Cox : IP_HDRINCL facility.
+ * Werner Almesberger : Zero fragment bug
+ * Alan Cox : RAW IP frame length bug
+ * Alan Cox : Outgoing firewall on build_xmit
+ * A.N.Kuznetsov : IP_OPTIONS support throughout the kernel
+ * Alan Cox : Multicast routing hooks
+ * Jos Vos : Do accounting *before* call_in_firewall
+ * Willy Konynenberg : Transparent proxying support
+ *
+ *
+ *
+ * To Fix:
+ * IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient
+ * and could be made very efficient with the addition of some virtual memory hacks to permit
+ * the allocation of a buffer that can then be 'grown' by twiddling page tables.
+ * Output fragmentation wants updating along with the buffer management to use a single
+ * interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet
+ * output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause
+ * fragmentation anyway.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/system.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/config.h>
+
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/arp.h>
+#include <net/icmp.h>
+#include <net/raw.h>
+#include <net/checksum.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/xfrm.h>
+#include <linux/mroute.h>
+#include <linux/netlink.h>
+
+/*
+ * SNMP management statistics
+ */
+
+DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics);
+
+/*
+ * Process Router Attention IP option
+ */
+int ip_call_ra_chain(struct sk_buff *skb)
+{
+ struct ip_ra_chain *ra;
+ u8 protocol = skb->nh.iph->protocol;
+ struct sock *last = NULL;
+
+ read_lock(&ip_ra_lock);
+ for (ra = ip_ra_chain; ra; ra = ra->next) {
+ struct sock *sk = ra->sk;
+
+ /* If socket is bound to an interface, only report
+ * the packet if it came from that interface.
+ */
+ if (sk && inet_sk(sk)->num == protocol &&
+ (!sk->sk_bound_dev_if ||
+ sk->sk_bound_dev_if == skb->dev->ifindex)) {
+ if (skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
+ skb = ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN);
+ if (skb == NULL) {
+ read_unlock(&ip_ra_lock);
+ return 1;
+ }
+ }
+ if (last) {
+ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (skb2)
+ raw_rcv(last, skb2);
+ }
+ last = sk;
+ }
+ }
+
+ if (last) {
+ raw_rcv(last, skb);
+ read_unlock(&ip_ra_lock);
+ return 1;
+ }
+ read_unlock(&ip_ra_lock);
+ return 0;
+}
+
+static inline int ip_local_deliver_finish(struct sk_buff *skb)
+{
+ int ihl = skb->nh.iph->ihl*4;
+
+#ifdef CONFIG_NETFILTER_DEBUG
+ nf_debug_ip_local_deliver(skb);
+#endif /*CONFIG_NETFILTER_DEBUG*/
+
+ __skb_pull(skb, ihl);
+
+ /* Free reference early: we don't need it any more, and it may
+ hold ip_conntrack module loaded indefinitely. */
+ nf_reset(skb);
+
+ /* Point into the IP datagram, just past the header. */
+ skb->h.raw = skb->data;
+
+ rcu_read_lock();
+ {
+ /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */
+ int protocol = skb->nh.iph->protocol;
+ int hash;
+ struct sock *raw_sk;
+ struct net_protocol *ipprot;
+
+ resubmit:
+ hash = protocol & (MAX_INET_PROTOS - 1);
+ raw_sk = sk_head(&raw_v4_htable[hash]);
+
+ /* If there may be a raw socket we must check - if not we
+ * don't care
+ */
+ if (raw_sk)
+ raw_v4_input(skb, skb->nh.iph, hash);
+
+ if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) {
+ int ret;
+
+ if (!ipprot->no_policy &&
+ !xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ kfree_skb(skb);
+ goto out;
+ }
+ ret = ipprot->handler(skb);
+ if (ret < 0) {
+ protocol = -ret;
+ goto resubmit;
+ }
+ IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
+ } else {
+ if (!raw_sk) {
+ if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ IP_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS);
+ icmp_send(skb, ICMP_DEST_UNREACH,
+ ICMP_PROT_UNREACH, 0);
+ }
+ } else
+ IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
+ kfree_skb(skb);
+ }
+ }
+ out:
+ rcu_read_unlock();
+
+ return 0;
+}
+
+/*
+ * Deliver IP Packets to the higher protocol layers.
+ */
+int ip_local_deliver(struct sk_buff *skb)
+{
+ /*
+ * Reassemble IP fragments.
+ */
+
+ if (skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
+ skb = ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER);
+ if (!skb)
+ return 0;
+ }
+
+ return NF_HOOK(PF_INET, NF_IP_LOCAL_IN, skb, skb->dev, NULL,
+ ip_local_deliver_finish);
+}
+
+static inline int ip_rcv_finish(struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ struct iphdr *iph = skb->nh.iph;
+
+ /*
+ * Initialise the virtual path cache for the packet. It describes
+ * how the packet travels inside Linux networking.
+ */
+ if (skb->dst == NULL) {
+ if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))
+ goto drop;
+ }
+
+#ifdef CONFIG_NET_CLS_ROUTE
+ if (skb->dst->tclassid) {
+ struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id();
+ u32 idx = skb->dst->tclassid;
+ st[idx&0xFF].o_packets++;
+ st[idx&0xFF].o_bytes+=skb->len;
+ st[(idx>>16)&0xFF].i_packets++;
+ st[(idx>>16)&0xFF].i_bytes+=skb->len;
+ }
+#endif
+
+ if (iph->ihl > 5) {
+ struct ip_options *opt;
+
+		/* It looks like overkill, because not all
+		   IP options require packet mangling.
+		   But it is the easiest way for now, especially taking
+		   into account that the combination of IP options
+		   and a running sniffer is an extremely rare condition.
+		                                      --ANK (980813)
+		*/
+
+ if (skb_cow(skb, skb_headroom(skb))) {
+ IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
+ goto drop;
+ }
+ iph = skb->nh.iph;
+
+ if (ip_options_compile(NULL, skb))
+ goto inhdr_error;
+
+ opt = &(IPCB(skb)->opt);
+ if (opt->srr) {
+ struct in_device *in_dev = in_dev_get(dev);
+ if (in_dev) {
+ if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
+ if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
+ printk(KERN_INFO "source route option %u.%u.%u.%u -> %u.%u.%u.%u\n",
+ NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
+ in_dev_put(in_dev);
+ goto drop;
+ }
+ in_dev_put(in_dev);
+ }
+ if (ip_options_rcv_srr(skb))
+ goto drop;
+ }
+ }
+
+ return dst_input(skb);
+
+inhdr_error:
+ IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
+drop:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+/*
+ * Main IP Receive routine.
+ */
+int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+{
+ struct iphdr *iph;
+
+ /* When the interface is in promisc. mode, drop all the crap
+ * that it receives, do not try to analyse it.
+ */
+ if (skb->pkt_type == PACKET_OTHERHOST)
+ goto drop;
+
+ IP_INC_STATS_BH(IPSTATS_MIB_INRECEIVES);
+
+ if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
+ IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
+ goto out;
+ }
+
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ goto inhdr_error;
+
+ iph = skb->nh.iph;
+
+ /*
+ * RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the checksum.
+ *
+ * Is the datagram acceptable?
+ *
+ * 1. Length at least the size of an ip header
+ * 2. Version of 4
+ * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums]
+ * 4. Doesn't have a bogus length
+ */
+
+ if (iph->ihl < 5 || iph->version != 4)
+ goto inhdr_error;
+
+ if (!pskb_may_pull(skb, iph->ihl*4))
+ goto inhdr_error;
+
+ iph = skb->nh.iph;
+
+ if (ip_fast_csum((u8 *)iph, iph->ihl) != 0)
+ goto inhdr_error;
+
+ {
+ __u32 len = ntohs(iph->tot_len);
+ if (skb->len < len || len < (iph->ihl<<2))
+ goto inhdr_error;
+
+		/* Our transport medium may have padded the buffer out.  Now that
+		 * we know it is IP, we can trim to the true length of the frame.
+ * Note this now means skb->len holds ntohs(iph->tot_len).
+ */
+ if (pskb_trim_rcsum(skb, len)) {
+ IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
+ goto drop;
+ }
+ }
+
+ return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL,
+ ip_rcv_finish);
+
+inhdr_error:
+ IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
+drop:
+ kfree_skb(skb);
+out:
+ return NET_RX_DROP;
+}
+
+EXPORT_SYMBOL(ip_rcv);
+EXPORT_SYMBOL(ip_statistics);
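
The header checks spelled out above ip_rcv() (minimum header length, version 4,
a correct header checksum and a sane total length) are easy to mirror outside
the kernel.  The sketch below is illustrative only, not kernel code: it runs the
same four tests over a raw byte buffer, using a plain RFC 1071 one's-complement
sum instead of the architecture-optimised ip_fast_csum().

#include <stdint.h>
#include <stddef.h>

/* One's-complement sum over the IP header; 0 means the checksum is valid. */
static uint16_t ip_header_csum(const uint8_t *hdr, size_t len)
{
        uint32_t sum = 0;
        size_t i;

        for (i = 0; i + 1 < len; i += 2)
                sum += ((uint32_t)hdr[i] << 8) | hdr[i + 1];
        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}

/* Apply the same acceptance checks as ip_rcv() to a captured packet. */
static int ip_header_acceptable(const uint8_t *pkt, size_t caplen)
{
        size_t ihl, tot_len;

        if (caplen < 20)                        /* 1. room for a header   */
                return 0;
        if ((pkt[0] >> 4) != 4)                 /* 2. version must be 4   */
                return 0;
        ihl = (size_t)(pkt[0] & 0x0f) * 4;
        if (ihl < 20 || caplen < ihl)           /* 1. full header present */
                return 0;
        if (ip_header_csum(pkt, ihl) != 0)      /* 3. checksum correct    */
                return 0;
        tot_len = ((size_t)pkt[2] << 8) | pkt[3];
        if (tot_len < ihl || tot_len > caplen)  /* 4. sane total length   */
                return 0;
        return 1;
}
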
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
new file mode 100644
index 00000000000..6d89f3f3e70
--- /dev/null
+++ b/net/ipv4/ip_options.c
@@ -0,0 +1,625 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The options processing module for ip.c
+ *
+ * Version: $Id: ip_options.c,v 1.21 2001/09/01 00:31:50 davem Exp $
+ *
+ * Authors: A.N.Kuznetsov
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <asm/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+
+/*
+ * Write options to the IP header, record the destination address in the
+ * source route option and the address of the outgoing interface
+ * (we should already know it, so this function may be called only after
+ * the routing decision), and add a timestamp if we originate this
+ * datagram.
+ *
+ * daddr is the real destination address; the next hop is recorded in the
+ * IP header. saddr is the address of the outgoing interface.
+ */
+
+void ip_options_build(struct sk_buff * skb, struct ip_options * opt,
+ u32 daddr, struct rtable *rt, int is_frag)
+{
+ unsigned char * iph = skb->nh.raw;
+
+ memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options));
+ memcpy(iph+sizeof(struct iphdr), opt->__data, opt->optlen);
+ opt = &(IPCB(skb)->opt);
+ opt->is_data = 0;
+
+ if (opt->srr)
+ memcpy(iph+opt->srr+iph[opt->srr+1]-4, &daddr, 4);
+
+ if (!is_frag) {
+ if (opt->rr_needaddr)
+ ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, rt);
+ if (opt->ts_needaddr)
+ ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, rt);
+ if (opt->ts_needtime) {
+ struct timeval tv;
+ __u32 midtime;
+ do_gettimeofday(&tv);
+ midtime = htonl((tv.tv_sec % 86400) * 1000 + tv.tv_usec / 1000);
+ memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4);
+ }
+ return;
+ }
+ if (opt->rr) {
+ memset(iph+opt->rr, IPOPT_NOP, iph[opt->rr+1]);
+ opt->rr = 0;
+ opt->rr_needaddr = 0;
+ }
+ if (opt->ts) {
+ memset(iph+opt->ts, IPOPT_NOP, iph[opt->ts+1]);
+ opt->ts = 0;
+ opt->ts_needaddr = opt->ts_needtime = 0;
+ }
+}
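
The timestamp written above is the standard IP timestamp option value:
milliseconds since midnight UT, stored in network byte order.  A minimal
userspace equivalent of that computation (illustrative only) is:

#include <stdint.h>
#include <sys/time.h>
#include <arpa/inet.h>

/* Milliseconds since midnight UT, as carried in the IP timestamp option. */
static uint32_t ip_timestamp_now(void)
{
        struct timeval tv;

        gettimeofday(&tv, NULL);
        return htonl((tv.tv_sec % 86400) * 1000 + tv.tv_usec / 1000);
}
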
+
+/*
+ * Provided (sopt, skb) points to received options,
+ * build in dopt a compiled option set appropriate for answering,
+ * i.e. invert the SRR option, copy the others,
+ * and grab room in the RR/TS options.
+ *
+ * NOTE: dopt cannot point to skb.
+ */
+
+int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
+{
+ struct ip_options *sopt;
+ unsigned char *sptr, *dptr;
+ int soffset, doffset;
+ int optlen;
+ u32 daddr;
+
+ memset(dopt, 0, sizeof(struct ip_options));
+
+ dopt->is_data = 1;
+
+ sopt = &(IPCB(skb)->opt);
+
+ if (sopt->optlen == 0) {
+ dopt->optlen = 0;
+ return 0;
+ }
+
+ sptr = skb->nh.raw;
+ dptr = dopt->__data;
+
+ if (skb->dst)
+ daddr = ((struct rtable*)skb->dst)->rt_spec_dst;
+ else
+ daddr = skb->nh.iph->daddr;
+
+ if (sopt->rr) {
+ optlen = sptr[sopt->rr+1];
+ soffset = sptr[sopt->rr+2];
+ dopt->rr = dopt->optlen + sizeof(struct iphdr);
+ memcpy(dptr, sptr+sopt->rr, optlen);
+ if (sopt->rr_needaddr && soffset <= optlen) {
+ if (soffset + 3 > optlen)
+ return -EINVAL;
+ dptr[2] = soffset + 4;
+ dopt->rr_needaddr = 1;
+ }
+ dptr += optlen;
+ dopt->optlen += optlen;
+ }
+ if (sopt->ts) {
+ optlen = sptr[sopt->ts+1];
+ soffset = sptr[sopt->ts+2];
+ dopt->ts = dopt->optlen + sizeof(struct iphdr);
+ memcpy(dptr, sptr+sopt->ts, optlen);
+ if (soffset <= optlen) {
+ if (sopt->ts_needaddr) {
+ if (soffset + 3 > optlen)
+ return -EINVAL;
+ dopt->ts_needaddr = 1;
+ soffset += 4;
+ }
+ if (sopt->ts_needtime) {
+ if (soffset + 3 > optlen)
+ return -EINVAL;
+ if ((dptr[3]&0xF) != IPOPT_TS_PRESPEC) {
+ dopt->ts_needtime = 1;
+ soffset += 4;
+ } else {
+ dopt->ts_needtime = 0;
+
+ if (soffset + 8 <= optlen) {
+ __u32 addr;
+
+ memcpy(&addr, sptr+soffset-1, 4);
+ if (inet_addr_type(addr) != RTN_LOCAL) {
+ dopt->ts_needtime = 1;
+ soffset += 8;
+ }
+ }
+ }
+ }
+ dptr[2] = soffset;
+ }
+ dptr += optlen;
+ dopt->optlen += optlen;
+ }
+ if (sopt->srr) {
+ unsigned char * start = sptr+sopt->srr;
+ u32 faddr;
+
+ optlen = start[1];
+ soffset = start[2];
+ doffset = 0;
+ if (soffset > optlen)
+ soffset = optlen + 1;
+ soffset -= 4;
+ if (soffset > 3) {
+ memcpy(&faddr, &start[soffset-1], 4);
+ for (soffset-=4, doffset=4; soffset > 3; soffset-=4, doffset+=4)
+ memcpy(&dptr[doffset-1], &start[soffset-1], 4);
+ /*
+			 * RFC 1812 requires us to fix illegal source routes.
+ */
+ if (memcmp(&skb->nh.iph->saddr, &start[soffset+3], 4) == 0)
+ doffset -= 4;
+ }
+ if (doffset > 3) {
+ memcpy(&start[doffset-1], &daddr, 4);
+ dopt->faddr = faddr;
+ dptr[0] = start[0];
+ dptr[1] = doffset+3;
+ dptr[2] = 4;
+ dptr += doffset+3;
+ dopt->srr = dopt->optlen + sizeof(struct iphdr);
+ dopt->optlen += doffset+3;
+ dopt->is_strictroute = sopt->is_strictroute;
+ }
+ }
+ while (dopt->optlen & 3) {
+ *dptr++ = IPOPT_END;
+ dopt->optlen++;
+ }
+ return 0;
+}
+
+/*
+ * Option "fragmenting": just fill the options not
+ * allowed in fragments with NOOPs.
+ * Simple and stupid 8), but the most efficient way.
+ */
+
+void ip_options_fragment(struct sk_buff * skb)
+{
+ unsigned char * optptr = skb->nh.raw;
+ struct ip_options * opt = &(IPCB(skb)->opt);
+ int l = opt->optlen;
+ int optlen;
+
+ while (l > 0) {
+ switch (*optptr) {
+ case IPOPT_END:
+ return;
+ case IPOPT_NOOP:
+ l--;
+ optptr++;
+ continue;
+ }
+ optlen = optptr[1];
+ if (optlen<2 || optlen>l)
+ return;
+ if (!IPOPT_COPIED(*optptr))
+ memset(optptr, IPOPT_NOOP, optlen);
+ l -= optlen;
+ optptr += optlen;
+ }
+ opt->ts = 0;
+ opt->rr = 0;
+ opt->rr_needaddr = 0;
+ opt->ts_needaddr = 0;
+ opt->ts_needtime = 0;
+ return;
+}
+
+/*
+ * Verify options and fill pointers in struct options.
+ * Caller should clear *opt, and set opt->data.
+ * If opt == NULL, then skb->data should point to IP header.
+ */
+
+int ip_options_compile(struct ip_options * opt, struct sk_buff * skb)
+{
+ int l;
+ unsigned char * iph;
+ unsigned char * optptr;
+ int optlen;
+ unsigned char * pp_ptr = NULL;
+ struct rtable *rt = skb ? (struct rtable*)skb->dst : NULL;
+
+ if (!opt) {
+ opt = &(IPCB(skb)->opt);
+ memset(opt, 0, sizeof(struct ip_options));
+ iph = skb->nh.raw;
+ opt->optlen = ((struct iphdr *)iph)->ihl*4 - sizeof(struct iphdr);
+ optptr = iph + sizeof(struct iphdr);
+ opt->is_data = 0;
+ } else {
+ optptr = opt->is_data ? opt->__data : (unsigned char*)&(skb->nh.iph[1]);
+ iph = optptr - sizeof(struct iphdr);
+ }
+
+ for (l = opt->optlen; l > 0; ) {
+ switch (*optptr) {
+ case IPOPT_END:
+ for (optptr++, l--; l>0; optptr++, l--) {
+ if (*optptr != IPOPT_END) {
+ *optptr = IPOPT_END;
+ opt->is_changed = 1;
+ }
+ }
+ goto eol;
+ case IPOPT_NOOP:
+ l--;
+ optptr++;
+ continue;
+ }
+ optlen = optptr[1];
+ if (optlen<2 || optlen>l) {
+ pp_ptr = optptr;
+ goto error;
+ }
+ switch (*optptr) {
+ case IPOPT_SSRR:
+ case IPOPT_LSRR:
+ if (optlen < 3) {
+ pp_ptr = optptr + 1;
+ goto error;
+ }
+ if (optptr[2] < 4) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ /* NB: cf RFC-1812 5.2.4.1 */
+ if (opt->srr) {
+ pp_ptr = optptr;
+ goto error;
+ }
+ if (!skb) {
+ if (optptr[2] != 4 || optlen < 7 || ((optlen-3) & 3)) {
+ pp_ptr = optptr + 1;
+ goto error;
+ }
+ memcpy(&opt->faddr, &optptr[3], 4);
+ if (optlen > 7)
+ memmove(&optptr[3], &optptr[7], optlen-7);
+ }
+ opt->is_strictroute = (optptr[0] == IPOPT_SSRR);
+ opt->srr = optptr - iph;
+ break;
+ case IPOPT_RR:
+ if (opt->rr) {
+ pp_ptr = optptr;
+ goto error;
+ }
+ if (optlen < 3) {
+ pp_ptr = optptr + 1;
+ goto error;
+ }
+ if (optptr[2] < 4) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ if (optptr[2] <= optlen) {
+ if (optptr[2]+3 > optlen) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ if (skb) {
+ memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
+ opt->is_changed = 1;
+ }
+ optptr[2] += 4;
+ opt->rr_needaddr = 1;
+ }
+ opt->rr = optptr - iph;
+ break;
+ case IPOPT_TIMESTAMP:
+ if (opt->ts) {
+ pp_ptr = optptr;
+ goto error;
+ }
+ if (optlen < 4) {
+ pp_ptr = optptr + 1;
+ goto error;
+ }
+ if (optptr[2] < 5) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ if (optptr[2] <= optlen) {
+ __u32 * timeptr = NULL;
+ if (optptr[2]+3 > optptr[1]) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ switch (optptr[3]&0xF) {
+ case IPOPT_TS_TSONLY:
+ opt->ts = optptr - iph;
+ if (skb)
+ timeptr = (__u32*)&optptr[optptr[2]-1];
+ opt->ts_needtime = 1;
+ optptr[2] += 4;
+ break;
+ case IPOPT_TS_TSANDADDR:
+ if (optptr[2]+7 > optptr[1]) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ opt->ts = optptr - iph;
+ if (skb) {
+ memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
+ timeptr = (__u32*)&optptr[optptr[2]+3];
+ }
+ opt->ts_needaddr = 1;
+ opt->ts_needtime = 1;
+ optptr[2] += 8;
+ break;
+ case IPOPT_TS_PRESPEC:
+ if (optptr[2]+7 > optptr[1]) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ opt->ts = optptr - iph;
+ {
+ u32 addr;
+ memcpy(&addr, &optptr[optptr[2]-1], 4);
+ if (inet_addr_type(addr) == RTN_UNICAST)
+ break;
+ if (skb)
+ timeptr = (__u32*)&optptr[optptr[2]+3];
+ }
+ opt->ts_needtime = 1;
+ optptr[2] += 8;
+ break;
+ default:
+ if (!skb && !capable(CAP_NET_RAW)) {
+ pp_ptr = optptr + 3;
+ goto error;
+ }
+ break;
+ }
+ if (timeptr) {
+ struct timeval tv;
+ __u32 midtime;
+ do_gettimeofday(&tv);
+ midtime = htonl((tv.tv_sec % 86400) * 1000 + tv.tv_usec / 1000);
+ memcpy(timeptr, &midtime, sizeof(__u32));
+ opt->is_changed = 1;
+ }
+ } else {
+ unsigned overflow = optptr[3]>>4;
+ if (overflow == 15) {
+ pp_ptr = optptr + 3;
+ goto error;
+ }
+ opt->ts = optptr - iph;
+ if (skb) {
+ optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4);
+ opt->is_changed = 1;
+ }
+ }
+ break;
+ case IPOPT_RA:
+ if (optlen < 4) {
+ pp_ptr = optptr + 1;
+ goto error;
+ }
+ if (optptr[2] == 0 && optptr[3] == 0)
+ opt->router_alert = optptr - iph;
+ break;
+ case IPOPT_SEC:
+ case IPOPT_SID:
+ default:
+ if (!skb && !capable(CAP_NET_RAW)) {
+ pp_ptr = optptr;
+ goto error;
+ }
+ break;
+ }
+ l -= optlen;
+ optptr += optlen;
+ }
+
+eol:
+ if (!pp_ptr)
+ return 0;
+
+error:
+ if (skb) {
+ icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((pp_ptr-iph)<<24));
+ }
+ return -EINVAL;
+}
+
+
+/*
+ * Undo all the changes done by ip_options_compile().
+ */
+
+void ip_options_undo(struct ip_options * opt)
+{
+ if (opt->srr) {
+ unsigned char * optptr = opt->__data+opt->srr-sizeof(struct iphdr);
+ memmove(optptr+7, optptr+3, optptr[1]-7);
+ memcpy(optptr+3, &opt->faddr, 4);
+ }
+ if (opt->rr_needaddr) {
+ unsigned char * optptr = opt->__data+opt->rr-sizeof(struct iphdr);
+ optptr[2] -= 4;
+ memset(&optptr[optptr[2]-1], 0, 4);
+ }
+ if (opt->ts) {
+ unsigned char * optptr = opt->__data+opt->ts-sizeof(struct iphdr);
+ if (opt->ts_needtime) {
+ optptr[2] -= 4;
+ memset(&optptr[optptr[2]-1], 0, 4);
+ if ((optptr[3]&0xF) == IPOPT_TS_PRESPEC)
+ optptr[2] -= 4;
+ }
+ if (opt->ts_needaddr) {
+ optptr[2] -= 4;
+ memset(&optptr[optptr[2]-1], 0, 4);
+ }
+ }
+}
+
+int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, int user)
+{
+ struct ip_options *opt;
+
+ opt = kmalloc(sizeof(struct ip_options)+((optlen+3)&~3), GFP_KERNEL);
+ if (!opt)
+ return -ENOMEM;
+ memset(opt, 0, sizeof(struct ip_options));
+ if (optlen) {
+ if (user) {
+ if (copy_from_user(opt->__data, data, optlen)) {
+ kfree(opt);
+ return -EFAULT;
+ }
+ } else
+ memcpy(opt->__data, data, optlen);
+ }
+ while (optlen & 3)
+ opt->__data[optlen++] = IPOPT_END;
+ opt->optlen = optlen;
+ opt->is_data = 1;
+ opt->is_setbyuser = 1;
+ if (optlen && ip_options_compile(opt, NULL)) {
+ kfree(opt);
+ return -EINVAL;
+ }
+ if (*optp)
+ kfree(*optp);
+ *optp = opt;
+ return 0;
+}
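
ip_options_get() is the backend for the IP_OPTIONS socket option (and for
IP_RETOPTS control messages): userspace hands in the raw option bytes and the
kernel pads and compiles them.  As a hedged illustration, a program could ask
for a Record Route option on a socket roughly like this; the option layout
follows RFC 791, the option constants are written as literals, and error
handling is left to the caller.

#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>

/* Request a Record Route option with room for four addresses. */
static int request_record_route(int fd)
{
        unsigned char opt[3 + 4 * 4];

        memset(opt, 0, sizeof(opt));
        opt[0] = 7;             /* IPOPT_RR: record route           */
        opt[1] = sizeof(opt);   /* total option length (19 bytes)   */
        opt[2] = 4;             /* pointer to the first free slot   */

        return setsockopt(fd, IPPROTO_IP, IP_OPTIONS, opt, sizeof(opt));
}

The kernel rounds the buffer up to a multiple of four with IPOPT_END bytes, as
the while loop in ip_options_get() shows.
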
+
+void ip_forward_options(struct sk_buff *skb)
+{
+ struct ip_options * opt = &(IPCB(skb)->opt);
+ unsigned char * optptr;
+ struct rtable *rt = (struct rtable*)skb->dst;
+ unsigned char *raw = skb->nh.raw;
+
+ if (opt->rr_needaddr) {
+ optptr = (unsigned char *)raw + opt->rr;
+ ip_rt_get_source(&optptr[optptr[2]-5], rt);
+ opt->is_changed = 1;
+ }
+ if (opt->srr_is_hit) {
+ int srrptr, srrspace;
+
+ optptr = raw + opt->srr;
+
+ for ( srrptr=optptr[2], srrspace = optptr[1];
+ srrptr <= srrspace;
+ srrptr += 4
+ ) {
+ if (srrptr + 3 > srrspace)
+ break;
+ if (memcmp(&rt->rt_dst, &optptr[srrptr-1], 4) == 0)
+ break;
+ }
+ if (srrptr + 3 <= srrspace) {
+ opt->is_changed = 1;
+ ip_rt_get_source(&optptr[srrptr-1], rt);
+ skb->nh.iph->daddr = rt->rt_dst;
+ optptr[2] = srrptr+4;
+ } else if (net_ratelimit())
+ printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n");
+ if (opt->ts_needaddr) {
+ optptr = raw + opt->ts;
+ ip_rt_get_source(&optptr[optptr[2]-9], rt);
+ opt->is_changed = 1;
+ }
+ }
+ if (opt->is_changed) {
+ opt->is_changed = 0;
+ ip_send_check(skb->nh.iph);
+ }
+}
+
+int ip_options_rcv_srr(struct sk_buff *skb)
+{
+ struct ip_options *opt = &(IPCB(skb)->opt);
+ int srrspace, srrptr;
+ u32 nexthop;
+ struct iphdr *iph = skb->nh.iph;
+ unsigned char * optptr = skb->nh.raw + opt->srr;
+ struct rtable *rt = (struct rtable*)skb->dst;
+ struct rtable *rt2;
+ int err;
+
+ if (!opt->srr)
+ return 0;
+
+ if (skb->pkt_type != PACKET_HOST)
+ return -EINVAL;
+ if (rt->rt_type == RTN_UNICAST) {
+ if (!opt->is_strictroute)
+ return 0;
+ icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl(16<<24));
+ return -EINVAL;
+ }
+ if (rt->rt_type != RTN_LOCAL)
+ return -EINVAL;
+
+ for (srrptr=optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) {
+ if (srrptr + 3 > srrspace) {
+ icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24));
+ return -EINVAL;
+ }
+ memcpy(&nexthop, &optptr[srrptr-1], 4);
+
+ rt = (struct rtable*)skb->dst;
+ skb->dst = NULL;
+ err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev);
+ rt2 = (struct rtable*)skb->dst;
+ if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) {
+ ip_rt_put(rt2);
+ skb->dst = &rt->u.dst;
+ return -EINVAL;
+ }
+ ip_rt_put(rt);
+ if (rt2->rt_type != RTN_LOCAL)
+ break;
+ /* Superfast 8) loopback forward */
+ memcpy(&iph->daddr, &optptr[srrptr-1], 4);
+ opt->is_changed = 1;
+ }
+ if (srrptr <= srrspace) {
+ opt->srr_is_hit = 1;
+ opt->is_changed = 1;
+ }
+ return 0;
+}
+
+EXPORT_SYMBOL(ip_options_compile);
+EXPORT_SYMBOL(ip_options_undo);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
new file mode 100644
index 00000000000..30ab7b6ab76
--- /dev/null
+++ b/net/ipv4/ip_output.c
@@ -0,0 +1,1359 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The Internet Protocol (IP) output module.
+ *
+ * Version: $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Donald Becker, <becker@super.org>
+ * Alan Cox, <Alan.Cox@linux.org>
+ * Richard Underwood
+ * Stefan Becker, <stefanb@yello.ping.de>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Hirokazu Takahashi, <taka@valinux.co.jp>
+ *
+ * See ip_input.c for original log
+ *
+ * Fixes:
+ * Alan Cox : Missing nonblock feature in ip_build_xmit.
+ * Mike Kilburn : htons() missing in ip_build_xmit.
+ * Bradford Johnson: Fix faulty handling of some frames when
+ * no route is found.
+ * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
+ *					(in case the packet is not accepted by
+ * output firewall rules)
+ * Mike McLagan : Routing by source
+ * Alexey Kuznetsov: use new route cache
+ * Andi Kleen: Fix broken PMTU recovery and remove
+ * some redundant tests.
+ * Vitaly E. Lavrov : Transparent proxy revived after year coma.
+ * Andi Kleen : Replace ip_reply with ip_send_reply.
+ * Andi Kleen : Split fast and slow ip_build_xmit path
+ * for decreased register pressure on x86
+ *		and more readability.
+ * Marc Boucher : When call_out_firewall returns FW_QUEUE,
+ * silently drop skb instead of failing with -EPERM.
+ * Detlev Wengorz : Copy protocol for fragments.
+ * Hirokazu Takahashi: HW checksumming for outgoing UDP
+ * datagrams.
+ * Hirokazu Takahashi: sendfile() on UDP works now.
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/config.h>
+
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/arp.h>
+#include <net/icmp.h>
+#include <net/raw.h>
+#include <net/checksum.h>
+#include <net/inetpeer.h>
+#include <net/checksum.h>
+#include <linux/igmp.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/mroute.h>
+#include <linux/netlink.h>
+
+/*
+ * Shall we try to damage output packets if routing dev changes?
+ */
+
+int sysctl_ip_dynaddr;
+int sysctl_ip_default_ttl = IPDEFTTL;
+
+/* Generate a checksum for an outgoing IP datagram. */
+__inline__ void ip_send_check(struct iphdr *iph)
+{
+ iph->check = 0;
+ iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+}
+
+/* dev_loopback_xmit for use with netfilter. */
+static int ip_dev_loopback_xmit(struct sk_buff *newskb)
+{
+ newskb->mac.raw = newskb->data;
+ __skb_pull(newskb, newskb->nh.raw - newskb->data);
+ newskb->pkt_type = PACKET_LOOPBACK;
+ newskb->ip_summed = CHECKSUM_UNNECESSARY;
+ BUG_TRAP(newskb->dst);
+
+#ifdef CONFIG_NETFILTER_DEBUG
+ nf_debug_ip_loopback_xmit(newskb);
+#endif
+ netif_rx(newskb);
+ return 0;
+}
+
+static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
+{
+ int ttl = inet->uc_ttl;
+
+ if (ttl < 0)
+ ttl = dst_metric(dst, RTAX_HOPLIMIT);
+ return ttl;
+}
+
+/*
+ * Add an ip header to a skbuff and send it out.
+ *
+ */
+int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
+ u32 saddr, u32 daddr, struct ip_options *opt)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct rtable *rt = (struct rtable *)skb->dst;
+ struct iphdr *iph;
+
+ /* Build the IP header. */
+ if (opt)
+ iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
+ else
+ iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
+
+ iph->version = 4;
+ iph->ihl = 5;
+ iph->tos = inet->tos;
+ if (ip_dont_fragment(sk, &rt->u.dst))
+ iph->frag_off = htons(IP_DF);
+ else
+ iph->frag_off = 0;
+ iph->ttl = ip_select_ttl(inet, &rt->u.dst);
+ iph->daddr = rt->rt_dst;
+ iph->saddr = rt->rt_src;
+ iph->protocol = sk->sk_protocol;
+ iph->tot_len = htons(skb->len);
+ ip_select_ident(iph, &rt->u.dst, sk);
+ skb->nh.iph = iph;
+
+ if (opt && opt->optlen) {
+ iph->ihl += opt->optlen>>2;
+ ip_options_build(skb, opt, daddr, rt, 0);
+ }
+ ip_send_check(iph);
+
+ skb->priority = sk->sk_priority;
+
+ /* Send it out. */
+ return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
+ dst_output);
+}
+
+static inline int ip_finish_output2(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb->dst;
+ struct hh_cache *hh = dst->hh;
+ struct net_device *dev = dst->dev;
+ int hh_len = LL_RESERVED_SPACE(dev);
+
+ /* Be paranoid, rather than too clever. */
+ if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
+ struct sk_buff *skb2;
+
+ skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
+ if (skb2 == NULL) {
+ kfree_skb(skb);
+ return -ENOMEM;
+ }
+ if (skb->sk)
+ skb_set_owner_w(skb2, skb->sk);
+ kfree_skb(skb);
+ skb = skb2;
+ }
+
+#ifdef CONFIG_NETFILTER_DEBUG
+ nf_debug_ip_finish_output2(skb);
+#endif /*CONFIG_NETFILTER_DEBUG*/
+
+ if (hh) {
+ int hh_alen;
+
+ read_lock_bh(&hh->hh_lock);
+ hh_alen = HH_DATA_ALIGN(hh->hh_len);
+ memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
+ read_unlock_bh(&hh->hh_lock);
+ skb_push(skb, hh->hh_len);
+ return hh->hh_output(skb);
+ } else if (dst->neighbour)
+ return dst->neighbour->output(skb);
+
+ if (net_ratelimit())
+ printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+int ip_finish_output(struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dst->dev;
+
+ skb->dev = dev;
+ skb->protocol = htons(ETH_P_IP);
+
+ return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
+ ip_finish_output2);
+}
+
+int ip_mc_output(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ struct rtable *rt = (struct rtable*)skb->dst;
+ struct net_device *dev = rt->u.dst.dev;
+
+ /*
+ * If the indicated interface is up and running, send the packet.
+ */
+ IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
+
+ skb->dev = dev;
+ skb->protocol = htons(ETH_P_IP);
+
+ /*
+ * Multicasts are looped back for other local users
+ */
+
+ if (rt->rt_flags&RTCF_MULTICAST) {
+ if ((!sk || inet_sk(sk)->mc_loop)
+#ifdef CONFIG_IP_MROUTE
+		/* Small optimization: do not loop back non-local frames
+		   that have returned after forwarding; they will be dropped
+		   by ip_mr_input in any case.
+		   Note that local frames are looped back to be delivered
+		   to local recipients.
+
+ This check is duplicated in ip_mr_input at the moment.
+ */
+ && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
+#endif
+ ) {
+ struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
+ if (newskb)
+ NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
+ newskb->dev,
+ ip_dev_loopback_xmit);
+ }
+
+ /* Multicasts with ttl 0 must not go beyond the host */
+
+ if (skb->nh.iph->ttl == 0) {
+ kfree_skb(skb);
+ return 0;
+ }
+ }
+
+ if (rt->rt_flags&RTCF_BROADCAST) {
+ struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
+ if (newskb)
+ NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
+ newskb->dev, ip_dev_loopback_xmit);
+ }
+
+ if (skb->len > dst_mtu(&rt->u.dst))
+ return ip_fragment(skb, ip_finish_output);
+ else
+ return ip_finish_output(skb);
+}
+
+int ip_output(struct sk_buff *skb)
+{
+ IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
+
+ if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->tso_size)
+ return ip_fragment(skb, ip_finish_output);
+ else
+ return ip_finish_output(skb);
+}
+
+int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
+{
+ struct sock *sk = skb->sk;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_options *opt = inet->opt;
+ struct rtable *rt;
+ struct iphdr *iph;
+
+ /* Skip all of this if the packet is already routed,
+ * f.e. by something like SCTP.
+ */
+ rt = (struct rtable *) skb->dst;
+ if (rt != NULL)
+ goto packet_routed;
+
+ /* Make sure we can route this packet. */
+ rt = (struct rtable *)__sk_dst_check(sk, 0);
+ if (rt == NULL) {
+ u32 daddr;
+
+ /* Use correct destination address if we have options. */
+ daddr = inet->daddr;
+ if(opt && opt->srr)
+ daddr = opt->faddr;
+
+ {
+ struct flowi fl = { .oif = sk->sk_bound_dev_if,
+ .nl_u = { .ip4_u =
+ { .daddr = daddr,
+ .saddr = inet->saddr,
+ .tos = RT_CONN_FLAGS(sk) } },
+ .proto = sk->sk_protocol,
+ .uli_u = { .ports =
+ { .sport = inet->sport,
+ .dport = inet->dport } } };
+
+			/* If this fails, the retransmission mechanism of the
+			 * transport layer will keep trying until the route
+			 * appears or the connection times out.
+ */
+ if (ip_route_output_flow(&rt, &fl, sk, 0))
+ goto no_route;
+ }
+ __sk_dst_set(sk, &rt->u.dst);
+ tcp_v4_setup_caps(sk, &rt->u.dst);
+ }
+ skb->dst = dst_clone(&rt->u.dst);
+
+packet_routed:
+ if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
+ goto no_route;
+
+ /* OK, we know where to send it, allocate and build IP header. */
+ iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
+ *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
+ iph->tot_len = htons(skb->len);
+ if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
+ iph->frag_off = htons(IP_DF);
+ else
+ iph->frag_off = 0;
+ iph->ttl = ip_select_ttl(inet, &rt->u.dst);
+ iph->protocol = sk->sk_protocol;
+ iph->saddr = rt->rt_src;
+ iph->daddr = rt->rt_dst;
+ skb->nh.iph = iph;
+ /* Transport layer set skb->h.foo itself. */
+
+ if (opt && opt->optlen) {
+ iph->ihl += opt->optlen >> 2;
+ ip_options_build(skb, opt, inet->daddr, rt, 0);
+ }
+
+ ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);
+
+ /* Add an IP checksum. */
+ ip_send_check(iph);
+
+ skb->priority = sk->sk_priority;
+
+ return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
+ dst_output);
+
+no_route:
+ IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
+ kfree_skb(skb);
+ return -EHOSTUNREACH;
+}
+
+
+static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
+{
+ to->pkt_type = from->pkt_type;
+ to->priority = from->priority;
+ to->protocol = from->protocol;
+ to->security = from->security;
+ dst_release(to->dst);
+ to->dst = dst_clone(from->dst);
+ to->dev = from->dev;
+
+ /* Copy the flags to each fragment. */
+ IPCB(to)->flags = IPCB(from)->flags;
+
+#ifdef CONFIG_NET_SCHED
+ to->tc_index = from->tc_index;
+#endif
+#ifdef CONFIG_NETFILTER
+ to->nfmark = from->nfmark;
+ to->nfcache = from->nfcache;
+ /* Connection association is same as pre-frag packet */
+ nf_conntrack_put(to->nfct);
+ to->nfct = from->nfct;
+ nf_conntrack_get(to->nfct);
+ to->nfctinfo = from->nfctinfo;
+#ifdef CONFIG_BRIDGE_NETFILTER
+ nf_bridge_put(to->nf_bridge);
+ to->nf_bridge = from->nf_bridge;
+ nf_bridge_get(to->nf_bridge);
+#endif
+#ifdef CONFIG_NETFILTER_DEBUG
+ to->nf_debug = from->nf_debug;
+#endif
+#endif
+}
+
+/*
+ *	This IP datagram is too large to be sent in one piece.  Break it up
+ *	into smaller pieces (each carrying the IP header plus a block of the
+ *	original IP data) that will still fit in a single device frame, and
+ *	queue such frames for sending.
+ */
+
+int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
+{
+ struct iphdr *iph;
+ int raw = 0;
+ int ptr;
+ struct net_device *dev;
+ struct sk_buff *skb2;
+ unsigned int mtu, hlen, left, len, ll_rs;
+ int offset;
+ int not_last_frag;
+ struct rtable *rt = (struct rtable*)skb->dst;
+ int err = 0;
+
+ dev = rt->u.dst.dev;
+
+ /*
+ * Point into the IP datagram header.
+ */
+
+ iph = skb->nh.iph;
+
+ if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(dst_mtu(&rt->u.dst)));
+ kfree_skb(skb);
+ return -EMSGSIZE;
+ }
+
+ /*
+ * Setup starting values.
+ */
+
+ hlen = iph->ihl * 4;
+ mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */
+
+	/* When a frag_list is given, use it.  First, check its validity:
+	 * some transformers could create a wrong frag_list or break an
+	 * existing one; this is not prohibited.  In that case fall back
+	 * to copying.
+	 *
+	 * LATER: this step can be merged into the real generation of
+	 * fragments; we can switch to copying when we see the first bad
+	 * fragment.
+ */
+ if (skb_shinfo(skb)->frag_list) {
+ struct sk_buff *frag;
+ int first_len = skb_pagelen(skb);
+
+ if (first_len - hlen > mtu ||
+ ((first_len - hlen) & 7) ||
+ (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
+ skb_cloned(skb))
+ goto slow_path;
+
+ for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
+ /* Correct geometry. */
+ if (frag->len > mtu ||
+ ((frag->len & 7) && frag->next) ||
+ skb_headroom(frag) < hlen)
+ goto slow_path;
+
+ /* Partially cloned skb? */
+ if (skb_shared(frag))
+ goto slow_path;
+ }
+
+ /* Everything is OK. Generate! */
+
+ err = 0;
+ offset = 0;
+ frag = skb_shinfo(skb)->frag_list;
+ skb_shinfo(skb)->frag_list = NULL;
+ skb->data_len = first_len - skb_headlen(skb);
+ skb->len = first_len;
+ iph->tot_len = htons(first_len);
+ iph->frag_off = htons(IP_MF);
+ ip_send_check(iph);
+
+ for (;;) {
+			/* Prepare the header of the next frame
+			 * before the previous one goes down. */
+ if (frag) {
+ frag->ip_summed = CHECKSUM_NONE;
+ frag->h.raw = frag->data;
+ frag->nh.raw = __skb_push(frag, hlen);
+ memcpy(frag->nh.raw, iph, hlen);
+ iph = frag->nh.iph;
+ iph->tot_len = htons(frag->len);
+ ip_copy_metadata(frag, skb);
+ if (offset == 0)
+ ip_options_fragment(frag);
+ offset += skb->len - hlen;
+ iph->frag_off = htons(offset>>3);
+ if (frag->next != NULL)
+ iph->frag_off |= htons(IP_MF);
+ /* Ready, complete checksum */
+ ip_send_check(iph);
+ }
+
+ err = output(skb);
+
+ if (err || !frag)
+ break;
+
+ skb = frag;
+ frag = skb->next;
+ skb->next = NULL;
+ }
+
+ if (err == 0) {
+ IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
+ return 0;
+ }
+
+ while (frag) {
+ skb = frag->next;
+ kfree_skb(frag);
+ frag = skb;
+ }
+ IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
+ return err;
+ }
+
+slow_path:
+ left = skb->len - hlen; /* Space per frame */
+ ptr = raw + hlen; /* Where to start from */
+
+#ifdef CONFIG_BRIDGE_NETFILTER
+ /* for bridged IP traffic encapsulated inside f.e. a vlan header,
+ * we need to make room for the encapsulating header */
+ ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
+ mtu -= nf_bridge_pad(skb);
+#else
+ ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
+#endif
+ /*
+ * Fragment the datagram.
+ */
+
+ offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
+ not_last_frag = iph->frag_off & htons(IP_MF);
+
+ /*
+ * Keep copying data until we run out.
+ */
+
+ while(left > 0) {
+ len = left;
+ /* IF: it doesn't fit, use 'mtu' - the data space left */
+ if (len > mtu)
+ len = mtu;
+		/* IF: we are not sending up to and including the packet end
+		   then align the next start on an eight byte boundary */
+ if (len < left) {
+ len &= ~7;
+ }
+ /*
+ * Allocate buffer.
+ */
+
+ if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
+ NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ /*
+ * Set up data on packet
+ */
+
+ ip_copy_metadata(skb2, skb);
+ skb_reserve(skb2, ll_rs);
+ skb_put(skb2, len + hlen);
+ skb2->nh.raw = skb2->data;
+ skb2->h.raw = skb2->data + hlen;
+
+ /*
+ * Charge the memory for the fragment to any owner
+ * it might possess
+ */
+
+ if (skb->sk)
+ skb_set_owner_w(skb2, skb->sk);
+
+ /*
+ * Copy the packet header into the new buffer.
+ */
+
+ memcpy(skb2->nh.raw, skb->data, hlen);
+
+ /*
+ * Copy a block of the IP datagram.
+ */
+ if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
+ BUG();
+ left -= len;
+
+ /*
+ * Fill in the new header fields.
+ */
+ iph = skb2->nh.iph;
+ iph->frag_off = htons((offset >> 3));
+
+ /* ANK: dirty, but effective trick. Upgrade options only if
+ * the segment to be fragmented was THE FIRST (otherwise,
+ * options are already fixed) and make it ONCE
+ * on the initial skb, so that all the following fragments
+ * will inherit fixed options.
+ */
+ if (offset == 0)
+ ip_options_fragment(skb);
+
+ /*
+		 *	Added AC : If we are fragmenting a fragment that's not the
+		 *		   last fragment then keep the MF bit set on each one
+ */
+ if (left > 0 || not_last_frag)
+ iph->frag_off |= htons(IP_MF);
+ ptr += len;
+ offset += len;
+
+ /*
+ * Put this fragment into the sending queue.
+ */
+
+ IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
+
+ iph->tot_len = htons(len + hlen);
+
+ ip_send_check(iph);
+
+ err = output(skb2);
+ if (err)
+ goto fail;
+ }
+ kfree_skb(skb);
+ IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
+ return err;
+
+fail:
+ kfree_skb(skb);
+ IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
+ return err;
+}
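
Fragment offsets travel in the header in units of eight bytes, which is why the
code above stores offset>>3 in frag_off and trims every non-final fragment with
len &= ~7 in the slow path.  The standalone sketch below reproduces just that
arithmetic for an unfragmented datagram with a 20-byte header and no options
(the function name and printf format are illustrative).

#include <stdio.h>

/* Print the fragment boundaries ip_fragment()'s slow path would produce. */
static void show_fragments(unsigned int tot_len, unsigned int mtu)
{
        unsigned int hlen = 20;                 /* header without options */
        unsigned int left = tot_len - hlen;     /* payload still to send  */
        unsigned int offset = 0;

        while (left > 0) {
                unsigned int len = left > mtu - hlen ? mtu - hlen : left;
                int more;

                if (len < left)                 /* not the last fragment: */
                        len &= ~7U;             /* keep offsets 8-aligned */
                more = len < left;
                printf("frag: offset=%u len=%u MF=%d frag_off=%u\n",
                       offset, len, more, offset >> 3);
                left -= len;
                offset += len;
        }
}

For example, show_fragments(4000, 1500) yields payloads of 1480, 1480 and 1020
bytes at offsets 0, 1480 and 2960, with MF set on all but the last fragment.
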
+
+int
+ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
+{
+ struct iovec *iov = from;
+
+ if (skb->ip_summed == CHECKSUM_HW) {
+ if (memcpy_fromiovecend(to, iov, offset, len) < 0)
+ return -EFAULT;
+ } else {
+ unsigned int csum = 0;
+ if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
+ return -EFAULT;
+ skb->csum = csum_block_add(skb->csum, csum, odd);
+ }
+ return 0;
+}
+
+static inline unsigned int
+csum_page(struct page *page, int offset, int copy)
+{
+ char *kaddr;
+ unsigned int csum;
+ kaddr = kmap(page);
+ csum = csum_partial(kaddr + offset, copy, 0);
+ kunmap(page);
+ return csum;
+}
+
+/*
+ * ip_append_data() and ip_append_page() can make one large IP datagram
+ * from many pieces of data.  Each piece will be held on the socket
+ * until ip_push_pending_frames() is called. Each piece can be a page
+ * or non-page data.
+ *
+ * Not only UDP: other transport protocols - e.g. raw sockets - can
+ * potentially use this interface.
+ *
+ * LATER: length must be adjusted by pad at tail, when it is required.
+ */
+int ip_append_data(struct sock *sk,
+ int getfrag(void *from, char *to, int offset, int len,
+ int odd, struct sk_buff *skb),
+ void *from, int length, int transhdrlen,
+ struct ipcm_cookie *ipc, struct rtable *rt,
+ unsigned int flags)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct sk_buff *skb;
+
+ struct ip_options *opt = NULL;
+ int hh_len;
+ int exthdrlen;
+ int mtu;
+ int copy;
+ int err;
+ int offset = 0;
+ unsigned int maxfraglen, fragheaderlen;
+ int csummode = CHECKSUM_NONE;
+
+ if (flags&MSG_PROBE)
+ return 0;
+
+ if (skb_queue_empty(&sk->sk_write_queue)) {
+ /*
+ * setup for corking.
+ */
+ opt = ipc->opt;
+ if (opt) {
+ if (inet->cork.opt == NULL) {
+ inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
+ if (unlikely(inet->cork.opt == NULL))
+ return -ENOBUFS;
+ }
+ memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
+ inet->cork.flags |= IPCORK_OPT;
+ inet->cork.addr = ipc->addr;
+ }
+ dst_hold(&rt->u.dst);
+ inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
+ inet->cork.rt = rt;
+ inet->cork.length = 0;
+ sk->sk_sndmsg_page = NULL;
+ sk->sk_sndmsg_off = 0;
+ if ((exthdrlen = rt->u.dst.header_len) != 0) {
+ length += exthdrlen;
+ transhdrlen += exthdrlen;
+ }
+ } else {
+ rt = inet->cork.rt;
+ if (inet->cork.flags & IPCORK_OPT)
+ opt = inet->cork.opt;
+
+ transhdrlen = 0;
+ exthdrlen = 0;
+ mtu = inet->cork.fragsize;
+ }
+ hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
+
+ fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
+ maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
+
+ if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
+ ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
+ return -EMSGSIZE;
+ }
+
+ /*
+	 * transhdrlen > 0 means that this is the first fragment and we wish
+	 * it not to be fragmented in the future.
+ */
+ if (transhdrlen &&
+ length + fragheaderlen <= mtu &&
+ rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
+ !exthdrlen)
+ csummode = CHECKSUM_HW;
+
+ inet->cork.length += length;
+
+ /* So, what's going on in the loop below?
+ *
+	 * We use the calculated fragment length to generate a chained skb;
+	 * each of its segments is an IP fragment ready for sending to the
+	 * network after adding the appropriate IP header.
+ */
+
+ if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
+ goto alloc_new_skb;
+
+ while (length > 0) {
+ /* Check if the remaining data fits into current packet. */
+ copy = mtu - skb->len;
+ if (copy < length)
+ copy = maxfraglen - skb->len;
+ if (copy <= 0) {
+ char *data;
+ unsigned int datalen;
+ unsigned int fraglen;
+ unsigned int fraggap;
+ unsigned int alloclen;
+ struct sk_buff *skb_prev;
+alloc_new_skb:
+ skb_prev = skb;
+ if (skb_prev)
+ fraggap = skb_prev->len - maxfraglen;
+ else
+ fraggap = 0;
+
+ /*
+ * If remaining data exceeds the mtu,
+ * we know we need more fragment(s).
+ */
+ datalen = length + fraggap;
+ if (datalen > mtu - fragheaderlen)
+ datalen = maxfraglen - fragheaderlen;
+ fraglen = datalen + fragheaderlen;
+
+ if ((flags & MSG_MORE) &&
+ !(rt->u.dst.dev->features&NETIF_F_SG))
+ alloclen = mtu;
+ else
+ alloclen = datalen + fragheaderlen;
+
+ /* The last fragment gets additional space at tail.
+			 * Note that with MSG_MORE we overallocate on fragments,
+			 * because we have no idea which fragment will be
+			 * the last.
+ */
+ if (datalen == length)
+ alloclen += rt->u.dst.trailer_len;
+
+ if (transhdrlen) {
+ skb = sock_alloc_send_skb(sk,
+ alloclen + hh_len + 15,
+ (flags & MSG_DONTWAIT), &err);
+ } else {
+ skb = NULL;
+ if (atomic_read(&sk->sk_wmem_alloc) <=
+ 2 * sk->sk_sndbuf)
+ skb = sock_wmalloc(sk,
+ alloclen + hh_len + 15, 1,
+ sk->sk_allocation);
+ if (unlikely(skb == NULL))
+ err = -ENOBUFS;
+ }
+ if (skb == NULL)
+ goto error;
+
+ /*
+ * Fill in the control structures
+ */
+ skb->ip_summed = csummode;
+ skb->csum = 0;
+ skb_reserve(skb, hh_len);
+
+ /*
+ * Find where to start putting bytes.
+ */
+ data = skb_put(skb, fraglen);
+ skb->nh.raw = data + exthdrlen;
+ data += fragheaderlen;
+ skb->h.raw = data + exthdrlen;
+
+ if (fraggap) {
+ skb->csum = skb_copy_and_csum_bits(
+ skb_prev, maxfraglen,
+ data + transhdrlen, fraggap, 0);
+ skb_prev->csum = csum_sub(skb_prev->csum,
+ skb->csum);
+ data += fraggap;
+ skb_trim(skb_prev, maxfraglen);
+ }
+
+ copy = datalen - transhdrlen - fraggap;
+ if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
+ err = -EFAULT;
+ kfree_skb(skb);
+ goto error;
+ }
+
+ offset += copy;
+ length -= datalen - fraggap;
+ transhdrlen = 0;
+ exthdrlen = 0;
+ csummode = CHECKSUM_NONE;
+
+ /*
+ * Put the packet on the pending queue.
+ */
+ __skb_queue_tail(&sk->sk_write_queue, skb);
+ continue;
+ }
+
+ if (copy > length)
+ copy = length;
+
+ if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
+ unsigned int off;
+
+ off = skb->len;
+ if (getfrag(from, skb_put(skb, copy),
+ offset, copy, off, skb) < 0) {
+ __skb_trim(skb, off);
+ err = -EFAULT;
+ goto error;
+ }
+ } else {
+ int i = skb_shinfo(skb)->nr_frags;
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
+ struct page *page = sk->sk_sndmsg_page;
+ int off = sk->sk_sndmsg_off;
+ unsigned int left;
+
+ if (page && (left = PAGE_SIZE - off) > 0) {
+ if (copy >= left)
+ copy = left;
+ if (page != frag->page) {
+ if (i == MAX_SKB_FRAGS) {
+ err = -EMSGSIZE;
+ goto error;
+ }
+ get_page(page);
+ skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
+ frag = &skb_shinfo(skb)->frags[i];
+ }
+ } else if (i < MAX_SKB_FRAGS) {
+ if (copy > PAGE_SIZE)
+ copy = PAGE_SIZE;
+ page = alloc_pages(sk->sk_allocation, 0);
+ if (page == NULL) {
+ err = -ENOMEM;
+ goto error;
+ }
+ sk->sk_sndmsg_page = page;
+ sk->sk_sndmsg_off = 0;
+
+ skb_fill_page_desc(skb, i, page, 0, 0);
+ frag = &skb_shinfo(skb)->frags[i];
+ skb->truesize += PAGE_SIZE;
+ atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
+ } else {
+ err = -EMSGSIZE;
+ goto error;
+ }
+ if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
+ err = -EFAULT;
+ goto error;
+ }
+ sk->sk_sndmsg_off += copy;
+ frag->size += copy;
+ skb->len += copy;
+ skb->data_len += copy;
+ }
+ offset += copy;
+ length -= copy;
+ }
+
+ return 0;
+
+error:
+ inet->cork.length -= length;
+ IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
+ return err;
+}
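
From userspace this accumulate-then-push model is what UDP corking exposes:
each send() flagged MSG_MORE ends up in ip_append_data() and stays queued on
the socket, and the first send without MSG_MORE (or clearing UDP_CORK) makes
the protocol call ip_push_pending_frames(), so the pieces leave as a single IP
datagram.  A hedged sketch, assuming fd is an already connected UDP socket:

#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>

/* Build one UDP datagram out of three separate send() calls. */
static int send_in_pieces(int fd)
{
        const char *head = "head:";
        const char *body = "body";
        const char *tail = ":tail";

        if (send(fd, head, strlen(head), MSG_MORE) < 0)
                return -1;
        if (send(fd, body, strlen(body), MSG_MORE) < 0)
                return -1;
        /* No MSG_MORE: the pending fragments are pushed out now. */
        return send(fd, tail, strlen(tail), 0) < 0 ? -1 : 0;
}
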
+
+ssize_t ip_append_page(struct sock *sk, struct page *page,
+ int offset, size_t size, int flags)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct sk_buff *skb;
+ struct rtable *rt;
+ struct ip_options *opt = NULL;
+ int hh_len;
+ int mtu;
+ int len;
+ int err;
+ unsigned int maxfraglen, fragheaderlen, fraggap;
+
+ if (inet->hdrincl)
+ return -EPERM;
+
+ if (flags&MSG_PROBE)
+ return 0;
+
+ if (skb_queue_empty(&sk->sk_write_queue))
+ return -EINVAL;
+
+ rt = inet->cork.rt;
+ if (inet->cork.flags & IPCORK_OPT)
+ opt = inet->cork.opt;
+
+ if (!(rt->u.dst.dev->features&NETIF_F_SG))
+ return -EOPNOTSUPP;
+
+ hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
+ mtu = inet->cork.fragsize;
+
+ fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
+ maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
+
+ if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
+ ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
+ return -EMSGSIZE;
+ }
+
+ if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
+ return -EINVAL;
+
+ inet->cork.length += size;
+
+ while (size > 0) {
+ int i;
+
+ /* Check if the remaining data fits into current packet. */
+ len = mtu - skb->len;
+ if (len < size)
+ len = maxfraglen - skb->len;
+ if (len <= 0) {
+ struct sk_buff *skb_prev;
+ char *data;
+ struct iphdr *iph;
+ int alloclen;
+
+ skb_prev = skb;
+ if (skb_prev)
+ fraggap = skb_prev->len - maxfraglen;
+ else
+ fraggap = 0;
+
+ alloclen = fragheaderlen + hh_len + fraggap + 15;
+ skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
+ if (unlikely(!skb)) {
+ err = -ENOBUFS;
+ goto error;
+ }
+
+ /*
+ * Fill in the control structures
+ */
+ skb->ip_summed = CHECKSUM_NONE;
+ skb->csum = 0;
+ skb_reserve(skb, hh_len);
+
+ /*
+ * Find where to start putting bytes.
+ */
+ data = skb_put(skb, fragheaderlen + fraggap);
+ skb->nh.iph = iph = (struct iphdr *)data;
+ data += fragheaderlen;
+ skb->h.raw = data;
+
+ if (fraggap) {
+ skb->csum = skb_copy_and_csum_bits(
+ skb_prev, maxfraglen,
+ data, fraggap, 0);
+ skb_prev->csum = csum_sub(skb_prev->csum,
+ skb->csum);
+ skb_trim(skb_prev, maxfraglen);
+ }
+
+ /*
+ * Put the packet on the pending queue.
+ */
+ __skb_queue_tail(&sk->sk_write_queue, skb);
+ continue;
+ }
+
+ i = skb_shinfo(skb)->nr_frags;
+ if (len > size)
+ len = size;
+ if (skb_can_coalesce(skb, i, page, offset)) {
+ skb_shinfo(skb)->frags[i-1].size += len;
+ } else if (i < MAX_SKB_FRAGS) {
+ get_page(page);
+ skb_fill_page_desc(skb, i, page, offset, len);
+ } else {
+ err = -EMSGSIZE;
+ goto error;
+ }
+
+ if (skb->ip_summed == CHECKSUM_NONE) {
+ unsigned int csum;
+ csum = csum_page(page, offset, len);
+ skb->csum = csum_block_add(skb->csum, csum, skb->len);
+ }
+
+ skb->len += len;
+ skb->data_len += len;
+ offset += len;
+ size -= len;
+ }
+ return 0;
+
+error:
+ inet->cork.length -= size;
+ IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
+ return err;
+}
+
+/*
+ *	Combine all pending IP fragments on the socket into one IP datagram
+ *	and push them out.
+ */
+int ip_push_pending_frames(struct sock *sk)
+{
+ struct sk_buff *skb, *tmp_skb;
+ struct sk_buff **tail_skb;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_options *opt = NULL;
+ struct rtable *rt = inet->cork.rt;
+ struct iphdr *iph;
+ int df = 0;
+ __u8 ttl;
+ int err = 0;
+
+ if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
+ goto out;
+ tail_skb = &(skb_shinfo(skb)->frag_list);
+
+ /* move skb->data to ip header from ext header */
+ if (skb->data < skb->nh.raw)
+ __skb_pull(skb, skb->nh.raw - skb->data);
+ while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
+ __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
+ *tail_skb = tmp_skb;
+ tail_skb = &(tmp_skb->next);
+ skb->len += tmp_skb->len;
+ skb->data_len += tmp_skb->len;
+ skb->truesize += tmp_skb->truesize;
+ __sock_put(tmp_skb->sk);
+ tmp_skb->destructor = NULL;
+ tmp_skb->sk = NULL;
+ }
+
+	/* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we
+	 * allow the frame generated here to be fragmented.  No matter how
+	 * transforms change the size of the packet, it will go out.
+ */
+ if (inet->pmtudisc != IP_PMTUDISC_DO)
+ skb->local_df = 1;
+
+ /* DF bit is set when we want to see DF on outgoing frames.
+	 * If local_df is set too, we still allow this frame to be
+	 * fragmented locally. */
+ if (inet->pmtudisc == IP_PMTUDISC_DO ||
+ (skb->len <= dst_mtu(&rt->u.dst) &&
+ ip_dont_fragment(sk, &rt->u.dst)))
+ df = htons(IP_DF);
+
+ if (inet->cork.flags & IPCORK_OPT)
+ opt = inet->cork.opt;
+
+ if (rt->rt_type == RTN_MULTICAST)
+ ttl = inet->mc_ttl;
+ else
+ ttl = ip_select_ttl(inet, &rt->u.dst);
+
+ iph = (struct iphdr *)skb->data;
+ iph->version = 4;
+ iph->ihl = 5;
+ if (opt) {
+ iph->ihl += opt->optlen>>2;
+ ip_options_build(skb, opt, inet->cork.addr, rt, 0);
+ }
+ iph->tos = inet->tos;
+ iph->tot_len = htons(skb->len);
+ iph->frag_off = df;
+ if (!df) {
+ __ip_select_ident(iph, &rt->u.dst, 0);
+ } else {
+ iph->id = htons(inet->id++);
+ }
+ iph->ttl = ttl;
+ iph->protocol = sk->sk_protocol;
+ iph->saddr = rt->rt_src;
+ iph->daddr = rt->rt_dst;
+ ip_send_check(iph);
+
+ skb->priority = sk->sk_priority;
+ skb->dst = dst_clone(&rt->u.dst);
+
+	/* Netfilter gets the whole, not yet fragmented skb. */
+ err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
+ skb->dst->dev, dst_output);
+ if (err) {
+ if (err > 0)
+ err = inet->recverr ? net_xmit_errno(err) : 0;
+ if (err)
+ goto error;
+ }
+
+out:
+ inet->cork.flags &= ~IPCORK_OPT;
+ if (inet->cork.opt) {
+ kfree(inet->cork.opt);
+ inet->cork.opt = NULL;
+ }
+ if (inet->cork.rt) {
+ ip_rt_put(inet->cork.rt);
+ inet->cork.rt = NULL;
+ }
+ return err;
+
+error:
+ IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
+ goto out;
+}
+
+/*
+ * Throw away all pending data on the socket.
+ */
+void ip_flush_pending_frames(struct sock *sk)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct sk_buff *skb;
+
+ while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
+ kfree_skb(skb);
+
+ inet->cork.flags &= ~IPCORK_OPT;
+ if (inet->cork.opt) {
+ kfree(inet->cork.opt);
+ inet->cork.opt = NULL;
+ }
+ if (inet->cork.rt) {
+ ip_rt_put(inet->cork.rt);
+ inet->cork.rt = NULL;
+ }
+}
+
+
+/*
+ * Fetch data from kernel space and fill in checksum if needed.
+ */
+static int ip_reply_glue_bits(void *dptr, char *to, int offset,
+ int len, int odd, struct sk_buff *skb)
+{
+ unsigned int csum;
+
+ csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
+ skb->csum = csum_block_add(skb->csum, csum, odd);
+ return 0;
+}
+
+/*
+ *	Generic function to send a packet as a reply to another packet.
+ * Used to send TCP resets so far. ICMP should use this function too.
+ *
+ * Should run single threaded per socket because it uses the sock
+ * structure to pass arguments.
+ *
+ * LATER: switch from ip_build_xmit to ip_append_*
+ */
+void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
+ unsigned int len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct {
+ struct ip_options opt;
+ char data[40];
+ } replyopts;
+ struct ipcm_cookie ipc;
+ u32 daddr;
+ struct rtable *rt = (struct rtable*)skb->dst;
+
+ if (ip_options_echo(&replyopts.opt, skb))
+ return;
+
+ daddr = ipc.addr = rt->rt_src;
+ ipc.opt = NULL;
+
+ if (replyopts.opt.optlen) {
+ ipc.opt = &replyopts.opt;
+
+ if (ipc.opt->srr)
+ daddr = replyopts.opt.faddr;
+ }
+
+ {
+ struct flowi fl = { .nl_u = { .ip4_u =
+ { .daddr = daddr,
+ .saddr = rt->rt_spec_dst,
+ .tos = RT_TOS(skb->nh.iph->tos) } },
+ /* Not quite clean, but right. */
+ .uli_u = { .ports =
+ { .sport = skb->h.th->dest,
+ .dport = skb->h.th->source } },
+ .proto = sk->sk_protocol };
+ if (ip_route_output_key(&rt, &fl))
+ return;
+ }
+
+ /* And let IP do all the hard work.
+
+	   This chunk is not reentrant, hence the spinlock.
+	   Note that it relies on the fact that this function is called
+	   with BH disabled locally and that sk cannot already be locked.
+ */
+ bh_lock_sock(sk);
+ inet->tos = skb->nh.iph->tos;
+ sk->sk_priority = skb->priority;
+ sk->sk_protocol = skb->nh.iph->protocol;
+ ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
+ &ipc, rt, MSG_DONTWAIT);
+ if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
+ if (arg->csumoffset >= 0)
+ *((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
+ skb->ip_summed = CHECKSUM_NONE;
+ ip_push_pending_frames(sk);
+ }
+
+ bh_unlock_sock(sk);
+
+ ip_rt_put(rt);
+}
+
+/*
+ * IP protocol layer initialiser
+ */
+
+static struct packet_type ip_packet_type = {
+ .type = __constant_htons(ETH_P_IP),
+ .func = ip_rcv,
+};
+
+/*
+ * IP registers the packet type and then calls the subprotocol initialisers
+ */
+
+void __init ip_init(void)
+{
+ dev_add_pack(&ip_packet_type);
+
+ ip_rt_init();
+ inet_initpeers();
+
+#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
+ igmp_mc_proc_init();
+#endif
+}
+
+EXPORT_SYMBOL(ip_finish_output);
+EXPORT_SYMBOL(ip_fragment);
+EXPORT_SYMBOL(ip_generic_getfrag);
+EXPORT_SYMBOL(ip_queue_xmit);
+EXPORT_SYMBOL(ip_send_check);
+
+#ifdef CONFIG_SYSCTL
+EXPORT_SYMBOL(sysctl_ip_default_ttl);
+#endif
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
new file mode 100644
index 00000000000..47012b93cad
--- /dev/null
+++ b/net/ipv4/ip_sockglue.c
@@ -0,0 +1,1093 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The IP to API glue.
+ *
+ * Version: $Id: ip_sockglue.c,v 1.62 2002/02/01 22:01:04 davem Exp $
+ *
+ * Authors: see ip.c
+ *
+ * Fixes:
+ * Many : Split from ip.c , see ip.c for history.
+ * Martin Mares : TOS setting fixed.
+ * Alan Cox : Fixed a couple of oopses in Martin's
+ * TOS tweaks.
+ * Mike McLagan : Routing by source
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netdevice.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/igmp.h>
+#include <linux/netfilter.h>
+#include <linux/route.h>
+#include <linux/mroute.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#include <net/transp_v6.h>
+#endif
+
+#include <linux/errqueue.h>
+#include <asm/uaccess.h>
+
+#define IP_CMSG_PKTINFO 1
+#define IP_CMSG_TTL 2
+#define IP_CMSG_TOS 4
+#define IP_CMSG_RECVOPTS 8
+#define IP_CMSG_RETOPTS 16
+
+/*
+ * SOL_IP control messages.
+ */
+
+static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
+{
+ struct in_pktinfo info;
+ struct rtable *rt = (struct rtable *)skb->dst;
+
+ info.ipi_addr.s_addr = skb->nh.iph->daddr;
+ if (rt) {
+ info.ipi_ifindex = rt->rt_iif;
+ info.ipi_spec_dst.s_addr = rt->rt_spec_dst;
+ } else {
+ info.ipi_ifindex = 0;
+ info.ipi_spec_dst.s_addr = 0;
+ }
+
+ put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
+}
+
+static void ip_cmsg_recv_ttl(struct msghdr *msg, struct sk_buff *skb)
+{
+ int ttl = skb->nh.iph->ttl;
+ put_cmsg(msg, SOL_IP, IP_TTL, sizeof(int), &ttl);
+}
+
+static void ip_cmsg_recv_tos(struct msghdr *msg, struct sk_buff *skb)
+{
+ put_cmsg(msg, SOL_IP, IP_TOS, 1, &skb->nh.iph->tos);
+}
+
+static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb)
+{
+ if (IPCB(skb)->opt.optlen == 0)
+ return;
+
+ put_cmsg(msg, SOL_IP, IP_RECVOPTS, IPCB(skb)->opt.optlen, skb->nh.iph+1);
+}
+
+
+static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)
+{
+ unsigned char optbuf[sizeof(struct ip_options) + 40];
+ struct ip_options * opt = (struct ip_options*)optbuf;
+
+ if (IPCB(skb)->opt.optlen == 0)
+ return;
+
+ if (ip_options_echo(opt, skb)) {
+ msg->msg_flags |= MSG_CTRUNC;
+ return;
+ }
+ ip_options_undo(opt);
+
+ put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data);
+}
+
+
+void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
+{
+ struct inet_sock *inet = inet_sk(skb->sk);
+ unsigned flags = inet->cmsg_flags;
+
+ /* Ordered by supposed usage frequency */
+ if (flags & 1)
+ ip_cmsg_recv_pktinfo(msg, skb);
+ if ((flags>>=1) == 0)
+ return;
+
+ if (flags & 1)
+ ip_cmsg_recv_ttl(msg, skb);
+ if ((flags>>=1) == 0)
+ return;
+
+ if (flags & 1)
+ ip_cmsg_recv_tos(msg, skb);
+ if ((flags>>=1) == 0)
+ return;
+
+ if (flags & 1)
+ ip_cmsg_recv_opts(msg, skb);
+ if ((flags>>=1) == 0)
+ return;
+
+ if (flags & 1)
+ ip_cmsg_recv_retopts(msg, skb);
+}
+
+int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc)
+{
+ int err;
+ struct cmsghdr *cmsg;
+
+ for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+ if (!CMSG_OK(msg, cmsg))
+ return -EINVAL;
+ if (cmsg->cmsg_level != SOL_IP)
+ continue;
+ switch (cmsg->cmsg_type) {
+ case IP_RETOPTS:
+ err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
+ err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40, 0);
+ if (err)
+ return err;
+ break;
+ case IP_PKTINFO:
+ {
+ struct in_pktinfo *info;
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo)))
+ return -EINVAL;
+ info = (struct in_pktinfo *)CMSG_DATA(cmsg);
+ ipc->oif = info->ipi_ifindex;
+ ipc->addr = info->ipi_spec_dst.s_addr;
+ break;
+ }
+ default:
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
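
On the receiving side, the control messages generated by ip_cmsg_recv() show up
as ancillary data on recvmsg() once the matching socket option has been
switched on.  A hedged userspace sketch for IP_PKTINFO (the helper name and its
parameters are illustrative, error handling is minimal):

#define _GNU_SOURCE             /* struct in_pktinfo */
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>

/*
 * Receive one datagram and copy out the interface index and the local
 * destination address recorded by the kernel.  IP_PKTINFO must have been
 * enabled first:
 *     int on = 1;
 *     setsockopt(fd, IPPROTO_IP, IP_PKTINFO, &on, sizeof(on));
 */
static int recv_with_pktinfo(int fd, void *buf, size_t len,
                             struct in_pktinfo *out)
{
        union {
                struct cmsghdr align;
                char buf[CMSG_SPACE(sizeof(struct in_pktinfo))];
        } cbuf;
        struct iovec iov = { .iov_base = buf, .iov_len = len };
        struct msghdr msg = {
                .msg_iov = &iov,          .msg_iovlen = 1,
                .msg_control = cbuf.buf,  .msg_controllen = sizeof(cbuf.buf),
        };
        struct cmsghdr *cmsg;
        ssize_t n = recvmsg(fd, &msg, 0);

        if (n < 0)
                return -1;
        memset(out, 0, sizeof(*out));
        for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
                if (cmsg->cmsg_level == IPPROTO_IP &&
                    cmsg->cmsg_type == IP_PKTINFO)
                        memcpy(out, CMSG_DATA(cmsg), sizeof(*out));
        return (int)n;
}
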
+
+
+/* Special input handler for packets caught by the router alert option.
+   They are selected only by the protocol field and then processed like
+   local ones, but only if someone wants them!  Otherwise, a router not
+   running rsvpd would kill RSVP.
+
+   What to do with them is a user level problem.
+   I have no idea how it will masquerade or NAT them (it is a joke, joke :-)),
+   but the receiver should be clever enough, e.g. to forward mtrace requests
+   sent to a multicast group so that they reach the destination's designated
+   router.
+ */
+struct ip_ra_chain *ip_ra_chain;
+DEFINE_RWLOCK(ip_ra_lock);
+
+int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct sock *))
+{
+ struct ip_ra_chain *ra, *new_ra, **rap;
+
+ if (sk->sk_type != SOCK_RAW || inet_sk(sk)->num == IPPROTO_RAW)
+ return -EINVAL;
+
+ new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
+
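+ /* Walk the chain under the write lock: drop this socket's entry when
+  * turning the option off, or refuse a duplicate when turning it on.
+  */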
+ write_lock_bh(&ip_ra_lock);
+ for (rap = &ip_ra_chain; (ra=*rap) != NULL; rap = &ra->next) {
+ if (ra->sk == sk) {
+ if (on) {
+ write_unlock_bh(&ip_ra_lock);
+ if (new_ra)
+ kfree(new_ra);
+ return -EADDRINUSE;
+ }
+ *rap = ra->next;
+ write_unlock_bh(&ip_ra_lock);
+
+ if (ra->destructor)
+ ra->destructor(sk);
+ sock_put(sk);
+ kfree(ra);
+ return 0;
+ }
+ }
+ if (new_ra == NULL) {
+ write_unlock_bh(&ip_ra_lock);
+ return -ENOBUFS;
+ }
+ new_ra->sk = sk;
+ new_ra->destructor = destructor;
+
+ new_ra->next = ra;
+ *rap = new_ra;
+ sock_hold(sk);
+ write_unlock_bh(&ip_ra_lock);
+
+ return 0;
+}
+
+void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
+ u16 port, u32 info, u8 *payload)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct sock_exterr_skb *serr;
+
+ if (!inet->recverr)
+ return;
+
+ skb = skb_clone(skb, GFP_ATOMIC);
+ if (!skb)
+ return;
+
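+ /* Fill in the extended error block and queue the clone on the socket's
+  * error queue, where ip_recv_error() will pick it up via MSG_ERRQUEUE.
+  */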
+ serr = SKB_EXT_ERR(skb);
+ serr->ee.ee_errno = err;
+ serr->ee.ee_origin = SO_EE_ORIGIN_ICMP;
+ serr->ee.ee_type = skb->h.icmph->type;
+ serr->ee.ee_code = skb->h.icmph->code;
+ serr->ee.ee_pad = 0;
+ serr->ee.ee_info = info;
+ serr->ee.ee_data = 0;
+ serr->addr_offset = (u8*)&(((struct iphdr*)(skb->h.icmph+1))->daddr) - skb->nh.raw;
+ serr->port = port;
+
+ skb->h.raw = payload;
+ if (!skb_pull(skb, payload - skb->data) ||
+ sock_queue_err_skb(sk, skb))
+ kfree_skb(skb);
+}
+
+void ip_local_error(struct sock *sk, int err, u32 daddr, u16 port, u32 info)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct sock_exterr_skb *serr;
+ struct iphdr *iph;
+ struct sk_buff *skb;
+
+ if (!inet->recverr)
+ return;
+
+ skb = alloc_skb(sizeof(struct iphdr), GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ iph = (struct iphdr*)skb_put(skb, sizeof(struct iphdr));
+ skb->nh.iph = iph;
+ iph->daddr = daddr;
+
+ serr = SKB_EXT_ERR(skb);
+ serr->ee.ee_errno = err;
+ serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL;
+ serr->ee.ee_type = 0;
+ serr->ee.ee_code = 0;
+ serr->ee.ee_pad = 0;
+ serr->ee.ee_info = info;
+ serr->ee.ee_data = 0;
+ serr->addr_offset = (u8*)&iph->daddr - skb->nh.raw;
+ serr->port = port;
+
+ skb->h.raw = skb->tail;
+ __skb_pull(skb, skb->tail - skb->data);
+
+ if (sock_queue_err_skb(sk, skb))
+ kfree_skb(skb);
+}
+
+/*
+ * Handle MSG_ERRQUEUE
+ */
+int ip_recv_error(struct sock *sk, struct msghdr *msg, int len)
+{
+ struct sock_exterr_skb *serr;
+ struct sk_buff *skb, *skb2;
+ struct sockaddr_in *sin;
+ struct {
+ struct sock_extended_err ee;
+ struct sockaddr_in offender;
+ } errhdr;
+ int err;
+ int copied;
+
+ err = -EAGAIN;
+ skb = skb_dequeue(&sk->sk_error_queue);
+ if (skb == NULL)
+ goto out;
+
+ copied = skb->len;
+ if (copied > len) {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
+ }
+ err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+ if (err)
+ goto out_free_skb;
+
+ sock_recv_timestamp(msg, sk, skb);
+
+ serr = SKB_EXT_ERR(skb);
+
+ sin = (struct sockaddr_in *)msg->msg_name;
+ if (sin) {
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = *(u32*)(skb->nh.raw + serr->addr_offset);
+ sin->sin_port = serr->port;
+ memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+ }
+
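+ /* The "offender" is whoever generated the error: the ICMP sender for
+  * network errors, left as AF_UNSPEC for locally generated ones.
+  */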
+ memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err));
+ sin = &errhdr.offender;
+ sin->sin_family = AF_UNSPEC;
+ if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = skb->nh.iph->saddr;
+ sin->sin_port = 0;
+ memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+ if (inet->cmsg_flags)
+ ip_cmsg_recv(msg, skb);
+ }
+
+ put_cmsg(msg, SOL_IP, IP_RECVERR, sizeof(errhdr), &errhdr);
+
+ /* Now we could try to dump offended packet options */
+
+ msg->msg_flags |= MSG_ERRQUEUE;
+ err = copied;
+
+ /* Reset and regenerate socket error */
+ spin_lock_irq(&sk->sk_error_queue.lock);
+ sk->sk_err = 0;
+ if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
+ sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
+ spin_unlock_irq(&sk->sk_error_queue.lock);
+ sk->sk_error_report(sk);
+ } else
+ spin_unlock_irq(&sk->sk_error_queue.lock);
+
+out_free_skb:
+ kfree_skb(skb);
+out:
+ return err;
+}
+
+
+/*
+ * Socket option code for IP. This is the end of the line after any
+ * TCP, UDP, etc. options on an IP socket.
+ */
+
+int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int optlen)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ int val=0,err;
+
+ if (level != SOL_IP)
+ return -ENOPROTOOPT;
+
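+ /* These options take an int (or, for a few of them, a single byte);
+  * fetch the value up front so the cases below can simply use "val".
+  */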
+ if (((1<<optname) & ((1<<IP_PKTINFO) | (1<<IP_RECVTTL) |
+ (1<<IP_RECVOPTS) | (1<<IP_RECVTOS) |
+ (1<<IP_RETOPTS) | (1<<IP_TOS) |
+ (1<<IP_TTL) | (1<<IP_HDRINCL) |
+ (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) |
+ (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND))) ||
+ optname == IP_MULTICAST_TTL ||
+ optname == IP_MULTICAST_LOOP) {
+ if (optlen >= sizeof(int)) {
+ if (get_user(val, (int __user *) optval))
+ return -EFAULT;
+ } else if (optlen >= sizeof(char)) {
+ unsigned char ucval;
+
+ if (get_user(ucval, (unsigned char __user *) optval))
+ return -EFAULT;
+ val = (int) ucval;
+ }
+ }
+
+ /* If optlen==0, it is equivalent to val == 0 */
+
+#ifdef CONFIG_IP_MROUTE
+ if (optname >= MRT_BASE && optname <= (MRT_BASE + 10))
+ return ip_mroute_setsockopt(sk,optname,optval,optlen);
+#endif
+
+ err = 0;
+ lock_sock(sk);
+
+ switch (optname) {
+ case IP_OPTIONS:
+ {
+ struct ip_options * opt = NULL;
+ if (optlen > 40 || optlen < 0)
+ goto e_inval;
+ err = ip_options_get(&opt, optval, optlen, 1);
+ if (err)
+ break;
+ if (sk->sk_type == SOCK_STREAM) {
+ struct tcp_sock *tp = tcp_sk(sk);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ if (sk->sk_family == PF_INET ||
+ (!((1 << sk->sk_state) &
+ (TCPF_LISTEN | TCPF_CLOSE)) &&
+ inet->daddr != LOOPBACK4_IPV6)) {
+#endif
+ if (inet->opt)
+ tp->ext_header_len -= inet->opt->optlen;
+ if (opt)
+ tp->ext_header_len += opt->optlen;
+ tcp_sync_mss(sk, tp->pmtu_cookie);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ }
+#endif
+ }
+ opt = xchg(&inet->opt, opt);
+ if (opt)
+ kfree(opt);
+ break;
+ }
+ case IP_PKTINFO:
+ if (val)
+ inet->cmsg_flags |= IP_CMSG_PKTINFO;
+ else
+ inet->cmsg_flags &= ~IP_CMSG_PKTINFO;
+ break;
+ case IP_RECVTTL:
+ if (val)
+ inet->cmsg_flags |= IP_CMSG_TTL;
+ else
+ inet->cmsg_flags &= ~IP_CMSG_TTL;
+ break;
+ case IP_RECVTOS:
+ if (val)
+ inet->cmsg_flags |= IP_CMSG_TOS;
+ else
+ inet->cmsg_flags &= ~IP_CMSG_TOS;
+ break;
+ case IP_RECVOPTS:
+ if (val)
+ inet->cmsg_flags |= IP_CMSG_RECVOPTS;
+ else
+ inet->cmsg_flags &= ~IP_CMSG_RECVOPTS;
+ break;
+ case IP_RETOPTS:
+ if (val)
+ inet->cmsg_flags |= IP_CMSG_RETOPTS;
+ else
+ inet->cmsg_flags &= ~IP_CMSG_RETOPTS;
+ break;
+ case IP_TOS: /* This sets both TOS and Precedence */
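+ /* For SOCK_STREAM sockets keep the two low-order (ECN) bits
+  * unchanged; only the TOS/precedence bits above them may be
+  * changed here. */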
+ if (sk->sk_type == SOCK_STREAM) {
+ val &= ~3;
+ val |= inet->tos & 3;
+ }
+ if (IPTOS_PREC(val) >= IPTOS_PREC_CRITIC_ECP &&
+ !capable(CAP_NET_ADMIN)) {
+ err = -EPERM;
+ break;
+ }
+ if (inet->tos != val) {
+ inet->tos = val;
+ sk->sk_priority = rt_tos2priority(val);
+ sk_dst_reset(sk);
+ }
+ break;
+ case IP_TTL:
+ if (optlen<1)
+ goto e_inval;
+ if (val != -1 && (val < 1 || val>255))
+ goto e_inval;
+ inet->uc_ttl = val;
+ break;
+ case IP_HDRINCL:
+ if (sk->sk_type != SOCK_RAW) {
+ err = -ENOPROTOOPT;
+ break;
+ }
+ inet->hdrincl = val ? 1 : 0;
+ break;
+ case IP_MTU_DISCOVER:
+ if (val<0 || val>2)
+ goto e_inval;
+ inet->pmtudisc = val;
+ break;
+ case IP_RECVERR:
+ inet->recverr = !!val;
+ if (!val)
+ skb_queue_purge(&sk->sk_error_queue);
+ break;
+ case IP_MULTICAST_TTL:
+ if (sk->sk_type == SOCK_STREAM)
+ goto e_inval;
+ if (optlen<1)
+ goto e_inval;
+ if (val==-1)
+ val = 1;
+ if (val < 0 || val > 255)
+ goto e_inval;
+ inet->mc_ttl = val;
+ break;
+ case IP_MULTICAST_LOOP:
+ if (optlen<1)
+ goto e_inval;
+ inet->mc_loop = !!val;
+ break;
+ case IP_MULTICAST_IF:
+ {
+ struct ip_mreqn mreq;
+ struct net_device *dev = NULL;
+
+ if (sk->sk_type == SOCK_STREAM)
+ goto e_inval;
+ /*
+ * Check the arguments are allowable
+ */
+
+ err = -EFAULT;
+ if (optlen >= sizeof(struct ip_mreqn)) {
+ if (copy_from_user(&mreq,optval,sizeof(mreq)))
+ break;
+ } else {
+ memset(&mreq, 0, sizeof(mreq));
+ if (optlen >= sizeof(struct in_addr) &&
+ copy_from_user(&mreq.imr_address,optval,sizeof(struct in_addr)))
+ break;
+ }
+
+ if (!mreq.imr_ifindex) {
+ if (mreq.imr_address.s_addr == INADDR_ANY) {
+ inet->mc_index = 0;
+ inet->mc_addr = 0;
+ err = 0;
+ break;
+ }
+ dev = ip_dev_find(mreq.imr_address.s_addr);
+ if (dev) {
+ mreq.imr_ifindex = dev->ifindex;
+ dev_put(dev);
+ }
+ } else
+ dev = __dev_get_by_index(mreq.imr_ifindex);
+
+
+ err = -EADDRNOTAVAIL;
+ if (!dev)
+ break;
+
+ err = -EINVAL;
+ if (sk->sk_bound_dev_if &&
+ mreq.imr_ifindex != sk->sk_bound_dev_if)
+ break;
+
+ inet->mc_index = mreq.imr_ifindex;
+ inet->mc_addr = mreq.imr_address.s_addr;
+ err = 0;
+ break;
+ }
+
+ case IP_ADD_MEMBERSHIP:
+ case IP_DROP_MEMBERSHIP:
+ {
+ struct ip_mreqn mreq;
+
+ if (optlen < sizeof(struct ip_mreq))
+ goto e_inval;
+ err = -EFAULT;
+ if (optlen >= sizeof(struct ip_mreqn)) {
+ if(copy_from_user(&mreq,optval,sizeof(mreq)))
+ break;
+ } else {
+ memset(&mreq, 0, sizeof(mreq));
+ if (copy_from_user(&mreq,optval,sizeof(struct ip_mreq)))
+ break;
+ }
+
+ if (optname == IP_ADD_MEMBERSHIP)
+ err = ip_mc_join_group(sk, &mreq);
+ else
+ err = ip_mc_leave_group(sk, &mreq);
+ break;
+ }
+ case IP_MSFILTER:
+ {
+ extern int sysctl_optmem_max;
+ extern int sysctl_igmp_max_msf;
+ struct ip_msfilter *msf;
+
+ if (optlen < IP_MSFILTER_SIZE(0))
+ goto e_inval;
+ if (optlen > sysctl_optmem_max) {
+ err = -ENOBUFS;
+ break;
+ }
+ msf = (struct ip_msfilter *)kmalloc(optlen, GFP_KERNEL);
+ if (msf == 0) {
+ err = -ENOBUFS;
+ break;
+ }
+ err = -EFAULT;
+ if (copy_from_user(msf, optval, optlen)) {
+ kfree(msf);
+ break;
+ }
+ /* numsrc >= (1G-4) overflow in 32 bits */
+ if (msf->imsf_numsrc >= 0x3ffffffcU ||
+ msf->imsf_numsrc > sysctl_igmp_max_msf) {
+ kfree(msf);
+ err = -ENOBUFS;
+ break;
+ }
+ if (IP_MSFILTER_SIZE(msf->imsf_numsrc) > optlen) {
+ kfree(msf);
+ err = -EINVAL;
+ break;
+ }
+ err = ip_mc_msfilter(sk, msf, 0);
+ kfree(msf);
+ break;
+ }
+ case IP_BLOCK_SOURCE:
+ case IP_UNBLOCK_SOURCE:
+ case IP_ADD_SOURCE_MEMBERSHIP:
+ case IP_DROP_SOURCE_MEMBERSHIP:
+ {
+ struct ip_mreq_source mreqs;
+ int omode, add;
+
+ if (optlen != sizeof(struct ip_mreq_source))
+ goto e_inval;
+ if (copy_from_user(&mreqs, optval, sizeof(mreqs))) {
+ err = -EFAULT;
+ break;
+ }
+ if (optname == IP_BLOCK_SOURCE) {
+ omode = MCAST_EXCLUDE;
+ add = 1;
+ } else if (optname == IP_UNBLOCK_SOURCE) {
+ omode = MCAST_EXCLUDE;
+ add = 0;
+ } else if (optname == IP_ADD_SOURCE_MEMBERSHIP) {
+ struct ip_mreqn mreq;
+
+ mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr;
+ mreq.imr_address.s_addr = mreqs.imr_interface;
+ mreq.imr_ifindex = 0;
+ err = ip_mc_join_group(sk, &mreq);
+ if (err)
+ break;
+ omode = MCAST_INCLUDE;
+ add = 1;
+ } else /*IP_DROP_SOURCE_MEMBERSHIP */ {
+ omode = MCAST_INCLUDE;
+ add = 0;
+ }
+ err = ip_mc_source(add, omode, sk, &mreqs, 0);
+ break;
+ }
+ case MCAST_JOIN_GROUP:
+ case MCAST_LEAVE_GROUP:
+ {
+ struct group_req greq;
+ struct sockaddr_in *psin;
+ struct ip_mreqn mreq;
+
+ if (optlen < sizeof(struct group_req))
+ goto e_inval;
+ err = -EFAULT;
+ if(copy_from_user(&greq, optval, sizeof(greq)))
+ break;
+ psin = (struct sockaddr_in *)&greq.gr_group;
+ if (psin->sin_family != AF_INET)
+ goto e_inval;
+ memset(&mreq, 0, sizeof(mreq));
+ mreq.imr_multiaddr = psin->sin_addr;
+ mreq.imr_ifindex = greq.gr_interface;
+
+ if (optname == MCAST_JOIN_GROUP)
+ err = ip_mc_join_group(sk, &mreq);
+ else
+ err = ip_mc_leave_group(sk, &mreq);
+ break;
+ }
+ case MCAST_JOIN_SOURCE_GROUP:
+ case MCAST_LEAVE_SOURCE_GROUP:
+ case MCAST_BLOCK_SOURCE:
+ case MCAST_UNBLOCK_SOURCE:
+ {
+ struct group_source_req greqs;
+ struct ip_mreq_source mreqs;
+ struct sockaddr_in *psin;
+ int omode, add;
+
+ if (optlen != sizeof(struct group_source_req))
+ goto e_inval;
+ if (copy_from_user(&greqs, optval, sizeof(greqs))) {
+ err = -EFAULT;
+ break;
+ }
+ if (greqs.gsr_group.ss_family != AF_INET ||
+ greqs.gsr_source.ss_family != AF_INET) {
+ err = -EADDRNOTAVAIL;
+ break;
+ }
+ psin = (struct sockaddr_in *)&greqs.gsr_group;
+ mreqs.imr_multiaddr = psin->sin_addr.s_addr;
+ psin = (struct sockaddr_in *)&greqs.gsr_source;
+ mreqs.imr_sourceaddr = psin->sin_addr.s_addr;
+ mreqs.imr_interface = 0; /* use index for mc_source */
+
+ if (optname == MCAST_BLOCK_SOURCE) {
+ omode = MCAST_EXCLUDE;
+ add = 1;
+ } else if (optname == MCAST_UNBLOCK_SOURCE) {
+ omode = MCAST_EXCLUDE;
+ add = 0;
+ } else if (optname == MCAST_JOIN_SOURCE_GROUP) {
+ struct ip_mreqn mreq;
+
+ psin = (struct sockaddr_in *)&greqs.gsr_group;
+ mreq.imr_multiaddr = psin->sin_addr;
+ mreq.imr_address.s_addr = 0;
+ mreq.imr_ifindex = greqs.gsr_interface;
+ err = ip_mc_join_group(sk, &mreq);
+ if (err)
+ break;
+ greqs.gsr_interface = mreq.imr_ifindex;
+ omode = MCAST_INCLUDE;
+ add = 1;
+ } else /* MCAST_LEAVE_SOURCE_GROUP */ {
+ omode = MCAST_INCLUDE;
+ add = 0;
+ }
+ err = ip_mc_source(add, omode, sk, &mreqs,
+ greqs.gsr_interface);
+ break;
+ }
+ case MCAST_MSFILTER:
+ {
+ extern int sysctl_optmem_max;
+ extern int sysctl_igmp_max_msf;
+ struct sockaddr_in *psin;
+ struct ip_msfilter *msf = NULL;
+ struct group_filter *gsf = NULL;
+ int msize, i, ifindex;
+
+ if (optlen < GROUP_FILTER_SIZE(0))
+ goto e_inval;
+ if (optlen > sysctl_optmem_max) {
+ err = -ENOBUFS;
+ break;
+ }
+ gsf = (struct group_filter *)kmalloc(optlen,GFP_KERNEL);
+ if (gsf == 0) {
+ err = -ENOBUFS;
+ break;
+ }
+ err = -EFAULT;
+ if (copy_from_user(gsf, optval, optlen)) {
+ goto mc_msf_out;
+ }
+ /* numsrc >= (4G-140)/128 overflow in 32 bits */
+ if (gsf->gf_numsrc >= 0x1ffffff ||
+ gsf->gf_numsrc > sysctl_igmp_max_msf) {
+ err = -ENOBUFS;
+ goto mc_msf_out;
+ }
+ if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) {
+ err = -EINVAL;
+ goto mc_msf_out;
+ }
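+ /* Convert the sockaddr-based group_filter into the in_addr-based
+  * ip_msfilter that ip_mc_msfilter() expects. */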
+ msize = IP_MSFILTER_SIZE(gsf->gf_numsrc);
+ msf = (struct ip_msfilter *)kmalloc(msize,GFP_KERNEL);
+ if (msf == 0) {
+ err = -ENOBUFS;
+ goto mc_msf_out;
+ }
+ ifindex = gsf->gf_interface;
+ psin = (struct sockaddr_in *)&gsf->gf_group;
+ if (psin->sin_family != AF_INET) {
+ err = -EADDRNOTAVAIL;
+ goto mc_msf_out;
+ }
+ msf->imsf_multiaddr = psin->sin_addr.s_addr;
+ msf->imsf_interface = 0;
+ msf->imsf_fmode = gsf->gf_fmode;
+ msf->imsf_numsrc = gsf->gf_numsrc;
+ err = -EADDRNOTAVAIL;
+ for (i=0; i<gsf->gf_numsrc; ++i) {
+ psin = (struct sockaddr_in *)&gsf->gf_slist[i];
+
+ if (psin->sin_family != AF_INET)
+ goto mc_msf_out;
+ msf->imsf_slist[i] = psin->sin_addr.s_addr;
+ }
+ kfree(gsf);
+ gsf = NULL;
+
+ err = ip_mc_msfilter(sk, msf, ifindex);
+mc_msf_out:
+ if (msf)
+ kfree(msf);
+ if (gsf)
+ kfree(gsf);
+ break;
+ }
+ case IP_ROUTER_ALERT:
+ err = ip_ra_control(sk, val ? 1 : 0, NULL);
+ break;
+
+ case IP_FREEBIND:
+ if (optlen<1)
+ goto e_inval;
+ inet->freebind = !!val;
+ break;
+
+ case IP_IPSEC_POLICY:
+ case IP_XFRM_POLICY:
+ err = xfrm_user_policy(sk, optname, optval, optlen);
+ break;
+
+ default:
+#ifdef CONFIG_NETFILTER
+ err = nf_setsockopt(sk, PF_INET, optname, optval,
+ optlen);
+#else
+ err = -ENOPROTOOPT;
+#endif
+ break;
+ }
+ release_sock(sk);
+ return err;
+
+e_inval:
+ release_sock(sk);
+ return -EINVAL;
+}
+
+/*
+ * Get the options. Note for future reference: the GET of IP options gets
+ * the _received_ ones, while the SET sets the _sent_ ones.
+ */
+
+int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ int val;
+ int len;
+
+ if(level!=SOL_IP)
+ return -EOPNOTSUPP;
+
+#ifdef CONFIG_IP_MROUTE
+ if(optname>=MRT_BASE && optname <=MRT_BASE+10)
+ {
+ return ip_mroute_getsockopt(sk,optname,optval,optlen);
+ }
+#endif
+
+ if(get_user(len,optlen))
+ return -EFAULT;
+ if(len < 0)
+ return -EINVAL;
+
+ lock_sock(sk);
+
+ switch(optname) {
+ case IP_OPTIONS:
+ {
+ unsigned char optbuf[sizeof(struct ip_options)+40];
+ struct ip_options * opt = (struct ip_options*)optbuf;
+ opt->optlen = 0;
+ if (inet->opt)
+ memcpy(optbuf, inet->opt,
+ sizeof(struct ip_options)+
+ inet->opt->optlen);
+ release_sock(sk);
+
+ if (opt->optlen == 0)
+ return put_user(0, optlen);
+
+ ip_options_undo(opt);
+
+ len = min_t(unsigned int, len, opt->optlen);
+ if(put_user(len, optlen))
+ return -EFAULT;
+ if(copy_to_user(optval, opt->__data, len))
+ return -EFAULT;
+ return 0;
+ }
+ case IP_PKTINFO:
+ val = (inet->cmsg_flags & IP_CMSG_PKTINFO) != 0;
+ break;
+ case IP_RECVTTL:
+ val = (inet->cmsg_flags & IP_CMSG_TTL) != 0;
+ break;
+ case IP_RECVTOS:
+ val = (inet->cmsg_flags & IP_CMSG_TOS) != 0;
+ break;
+ case IP_RECVOPTS:
+ val = (inet->cmsg_flags & IP_CMSG_RECVOPTS) != 0;
+ break;
+ case IP_RETOPTS:
+ val = (inet->cmsg_flags & IP_CMSG_RETOPTS) != 0;
+ break;
+ case IP_TOS:
+ val = inet->tos;
+ break;
+ case IP_TTL:
+ val = (inet->uc_ttl == -1 ?
+ sysctl_ip_default_ttl :
+ inet->uc_ttl);
+ break;
+ case IP_HDRINCL:
+ val = inet->hdrincl;
+ break;
+ case IP_MTU_DISCOVER:
+ val = inet->pmtudisc;
+ break;
+ case IP_MTU:
+ {
+ struct dst_entry *dst;
+ val = 0;
+ dst = sk_dst_get(sk);
+ if (dst) {
+ val = dst_mtu(dst);
+ dst_release(dst);
+ }
+ if (!val) {
+ release_sock(sk);
+ return -ENOTCONN;
+ }
+ break;
+ }
+ case IP_RECVERR:
+ val = inet->recverr;
+ break;
+ case IP_MULTICAST_TTL:
+ val = inet->mc_ttl;
+ break;
+ case IP_MULTICAST_LOOP:
+ val = inet->mc_loop;
+ break;
+ case IP_MULTICAST_IF:
+ {
+ struct in_addr addr;
+ len = min_t(unsigned int, len, sizeof(struct in_addr));
+ addr.s_addr = inet->mc_addr;
+ release_sock(sk);
+
+ if(put_user(len, optlen))
+ return -EFAULT;
+ if(copy_to_user(optval, &addr, len))
+ return -EFAULT;
+ return 0;
+ }
+ case IP_MSFILTER:
+ {
+ struct ip_msfilter msf;
+ int err;
+
+ if (len < IP_MSFILTER_SIZE(0)) {
+ release_sock(sk);
+ return -EINVAL;
+ }
+ if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) {
+ release_sock(sk);
+ return -EFAULT;
+ }
+ err = ip_mc_msfget(sk, &msf,
+ (struct ip_msfilter __user *)optval, optlen);
+ release_sock(sk);
+ return err;
+ }
+ case MCAST_MSFILTER:
+ {
+ struct group_filter gsf;
+ int err;
+
+ if (len < GROUP_FILTER_SIZE(0)) {
+ release_sock(sk);
+ return -EINVAL;
+ }
+ if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) {
+ release_sock(sk);
+ return -EFAULT;
+ }
+ err = ip_mc_gsfget(sk, &gsf,
+ (struct group_filter __user *)optval, optlen);
+ release_sock(sk);
+ return err;
+ }
+ case IP_PKTOPTIONS:
+ {
+ struct msghdr msg;
+
+ release_sock(sk);
+
+ if (sk->sk_type != SOCK_STREAM)
+ return -ENOPROTOOPT;
+
+ msg.msg_control = optval;
+ msg.msg_controllen = len;
+ msg.msg_flags = 0;
+
+ if (inet->cmsg_flags & IP_CMSG_PKTINFO) {
+ struct in_pktinfo info;
+
+ info.ipi_addr.s_addr = inet->rcv_saddr;
+ info.ipi_spec_dst.s_addr = inet->rcv_saddr;
+ info.ipi_ifindex = inet->mc_index;
+ put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
+ }
+ if (inet->cmsg_flags & IP_CMSG_TTL) {
+ int hlim = inet->mc_ttl;
+ put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim);
+ }
+ len -= msg.msg_controllen;
+ return put_user(len, optlen);
+ }
+ case IP_FREEBIND:
+ val = inet->freebind;
+ break;
+ default:
+#ifdef CONFIG_NETFILTER
+ val = nf_getsockopt(sk, PF_INET, optname, optval,
+ &len);
+ release_sock(sk);
+ if (val >= 0)
+ val = put_user(len, optlen);
+ return val;
+#else
+ release_sock(sk);
+ return -ENOPROTOOPT;
+#endif
+ }
+ release_sock(sk);
+
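+ /* If the caller's buffer is smaller than an int and the value fits in
+  * a byte, return a single byte; otherwise copy out a full int.
+  */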
+ if (len < sizeof(int) && len > 0 && val>=0 && val<255) {
+ unsigned char ucval = (unsigned char)val;
+ len = 1;
+ if(put_user(len, optlen))
+ return -EFAULT;
+ if(copy_to_user(optval,&ucval,1))
+ return -EFAULT;
+ } else {
+ len = min_t(unsigned int, sizeof(int), len);
+ if(put_user(len, optlen))
+ return -EFAULT;
+ if(copy_to_user(optval,&val,len))
+ return -EFAULT;
+ }
+ return 0;
+}
+
+EXPORT_SYMBOL(ip_cmsg_recv);
+
+#ifdef CONFIG_IP_SCTP_MODULE
+EXPORT_SYMBOL(ip_getsockopt);
+EXPORT_SYMBOL(ip_setsockopt);
+#endif
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
new file mode 100644
index 00000000000..1a23c5263b9
--- /dev/null
+++ b/net/ipv4/ipcomp.c
@@ -0,0 +1,524 @@
+/*
+ * IP Payload Compression Protocol (IPComp) - RFC3173.
+ *
+ * Copyright (c) 2003 James Morris <jmorris@intercode.com.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Todo:
+ * - Tunable compression parameters.
+ * - Compression stats.
+ * - Adaptive compression.
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <asm/scatterlist.h>
+#include <asm/semaphore.h>
+#include <linux/crypto.h>
+#include <linux/pfkeyv2.h>
+#include <linux/percpu.h>
+#include <linux/smp.h>
+#include <linux/list.h>
+#include <linux/vmalloc.h>
+#include <linux/rtnetlink.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/icmp.h>
+#include <net/ipcomp.h>
+
+struct ipcomp_tfms {
+ struct list_head list;
+ struct crypto_tfm **tfms;
+ int users;
+};
+
+static DECLARE_MUTEX(ipcomp_resource_sem);
+static void **ipcomp_scratches;
+static int ipcomp_scratch_users;
+static LIST_HEAD(ipcomp_tfms_list);
+
+static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err, plen, dlen;
+ struct iphdr *iph;
+ struct ipcomp_data *ipcd = x->data;
+ u8 *start, *scratch;
+ struct crypto_tfm *tfm;
+ int cpu;
+
+ plen = skb->len;
+ dlen = IPCOMP_SCRATCH_SIZE;
+ start = skb->data;
+
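+ /* get_cpu() disables preemption, so the per-cpu scratch buffer and
+  * compression tfm remain ours until put_cpu(). */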
+ cpu = get_cpu();
+ scratch = *per_cpu_ptr(ipcomp_scratches, cpu);
+ tfm = *per_cpu_ptr(ipcd->tfms, cpu);
+
+ err = crypto_comp_decompress(tfm, start, plen, scratch, &dlen);
+ if (err)
+ goto out;
+
+ if (dlen < (plen + sizeof(struct ip_comp_hdr))) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = pskb_expand_head(skb, 0, dlen - plen, GFP_ATOMIC);
+ if (err)
+ goto out;
+
+ skb_put(skb, dlen - plen);
+ memcpy(skb->data, scratch, dlen);
+ iph = skb->nh.iph;
+ iph->tot_len = htons(dlen + iph->ihl * 4);
+out:
+ put_cpu();
+ return err;
+}
+
+static int ipcomp_input(struct xfrm_state *x,
+ struct xfrm_decap_state *decap, struct sk_buff *skb)
+{
+ u8 nexthdr;
+ int err = 0;
+ struct iphdr *iph;
+ union {
+ struct iphdr iph;
+ char buf[60];
+ } tmp_iph;
+
+
+ if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
+ skb_linearize(skb, GFP_ATOMIC) != 0) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ /* Remove ipcomp header and decompress original payload */
+ iph = skb->nh.iph;
+ memcpy(&tmp_iph, iph, iph->ihl * 4);
+ nexthdr = *(u8 *)skb->data;
+ skb_pull(skb, sizeof(struct ip_comp_hdr));
+ skb->nh.raw += sizeof(struct ip_comp_hdr);
+ memcpy(skb->nh.raw, &tmp_iph, tmp_iph.iph.ihl * 4);
+ iph = skb->nh.iph;
+ iph->tot_len = htons(ntohs(iph->tot_len) - sizeof(struct ip_comp_hdr));
+ iph->protocol = nexthdr;
+ skb->h.raw = skb->data;
+ err = ipcomp_decompress(x, skb);
+
+out:
+ return err;
+}
+
+static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err, plen, dlen, ihlen;
+ struct iphdr *iph = skb->nh.iph;
+ struct ipcomp_data *ipcd = x->data;
+ u8 *start, *scratch;
+ struct crypto_tfm *tfm;
+ int cpu;
+
+ ihlen = iph->ihl * 4;
+ plen = skb->len - ihlen;
+ dlen = IPCOMP_SCRATCH_SIZE;
+ start = skb->data + ihlen;
+
+ cpu = get_cpu();
+ scratch = *per_cpu_ptr(ipcomp_scratches, cpu);
+ tfm = *per_cpu_ptr(ipcd->tfms, cpu);
+
+ err = crypto_comp_compress(tfm, start, plen, scratch, &dlen);
+ if (err)
+ goto out;
+
+ if ((dlen + sizeof(struct ip_comp_hdr)) >= plen) {
+ err = -EMSGSIZE;
+ goto out;
+ }
+
+ memcpy(start + sizeof(struct ip_comp_hdr), scratch, dlen);
+ put_cpu();
+
+ pskb_trim(skb, ihlen + dlen + sizeof(struct ip_comp_hdr));
+ return 0;
+
+out:
+ put_cpu();
+ return err;
+}
+
+static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err;
+ struct iphdr *iph;
+ struct ip_comp_hdr *ipch;
+ struct ipcomp_data *ipcd = x->data;
+ int hdr_len = 0;
+
+ iph = skb->nh.iph;
+ iph->tot_len = htons(skb->len);
+ hdr_len = iph->ihl * 4;
+ if ((skb->len - hdr_len) < ipcd->threshold) {
+ /* Don't bother compressing */
+ goto out_ok;
+ }
+
+ if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
+ skb_linearize(skb, GFP_ATOMIC) != 0) {
+ goto out_ok;
+ }
+
+ err = ipcomp_compress(x, skb);
+ iph = skb->nh.iph;
+
+ if (err) {
+ goto out_ok;
+ }
+
+ /* Install ipcomp header, convert into ipcomp datagram. */
+ iph->tot_len = htons(skb->len);
+ ipch = (struct ip_comp_hdr *)((char *)iph + iph->ihl * 4);
+ ipch->nexthdr = iph->protocol;
+ ipch->flags = 0;
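+ /* The 16-bit CPI is carried in the low-order half of the SPI. */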
+ ipch->cpi = htons((u16 )ntohl(x->id.spi));
+ iph->protocol = IPPROTO_COMP;
+ ip_send_check(iph);
+ return 0;
+
+out_ok:
+ if (x->props.mode)
+ ip_send_check(iph);
+ return 0;
+}
+
+static void ipcomp4_err(struct sk_buff *skb, u32 info)
+{
+ u32 spi;
+ struct iphdr *iph = (struct iphdr *)skb->data;
+ struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
+ struct xfrm_state *x;
+
+ if (skb->h.icmph->type != ICMP_DEST_UNREACH ||
+ skb->h.icmph->code != ICMP_FRAG_NEEDED)
+ return;
+
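+ /* The CPI is only 16 bits wide; widen it to the 32-bit SPI form used
+  * for the xfrm state lookup. */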
+ spi = ntohl(ntohs(ipch->cpi));
+ x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr,
+ spi, IPPROTO_COMP, AF_INET);
+ if (!x)
+ return;
+ NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n",
+ spi, NIPQUAD(iph->daddr)));
+ xfrm_state_put(x);
+}
+
+/* We always hold one tunnel user reference to indicate a tunnel */
+static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
+{
+ struct xfrm_state *t;
+
+ t = xfrm_state_alloc();
+ if (t == NULL)
+ goto out;
+
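+ /* Fake up an IPIP tunnel state, keyed by our source address in the
+  * SPI field, to carry tunnel-mode IPComp traffic. */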
+ t->id.proto = IPPROTO_IPIP;
+ t->id.spi = x->props.saddr.a4;
+ t->id.daddr.a4 = x->id.daddr.a4;
+ memcpy(&t->sel, &x->sel, sizeof(t->sel));
+ t->props.family = AF_INET;
+ t->props.mode = 1;
+ t->props.saddr.a4 = x->props.saddr.a4;
+ t->props.flags = x->props.flags;
+
+ t->type = xfrm_get_type(IPPROTO_IPIP, t->props.family);
+ if (t->type == NULL)
+ goto error;
+
+ if (t->type->init_state(t, NULL))
+ goto error;
+
+ t->km.state = XFRM_STATE_VALID;
+ atomic_set(&t->tunnel_users, 1);
+out:
+ return t;
+
+error:
+ t->km.state = XFRM_STATE_DEAD;
+ xfrm_state_put(t);
+ t = NULL;
+ goto out;
+}
+
+/*
+ * Must be protected by xfrm_cfg_sem. State and tunnel user references are
+ * always incremented on success.
+ */
+static int ipcomp_tunnel_attach(struct xfrm_state *x)
+{
+ int err = 0;
+ struct xfrm_state *t;
+
+ t = xfrm_state_lookup((xfrm_address_t *)&x->id.daddr.a4,
+ x->props.saddr.a4, IPPROTO_IPIP, AF_INET);
+ if (!t) {
+ t = ipcomp_tunnel_create(x);
+ if (!t) {
+ err = -EINVAL;
+ goto out;
+ }
+ xfrm_state_insert(t);
+ xfrm_state_hold(t);
+ }
+ x->tunnel = t;
+ atomic_inc(&t->tunnel_users);
+out:
+ return err;
+}
+
+static void ipcomp_free_scratches(void)
+{
+ int i;
+ void **scratches;
+
+ if (--ipcomp_scratch_users)
+ return;
+
+ scratches = ipcomp_scratches;
+ if (!scratches)
+ return;
+
+ for_each_cpu(i) {
+ void *scratch = *per_cpu_ptr(scratches, i);
+ if (scratch)
+ vfree(scratch);
+ }
+
+ free_percpu(scratches);
+}
+
+static void **ipcomp_alloc_scratches(void)
+{
+ int i;
+ void **scratches;
+
+ if (ipcomp_scratch_users++)
+ return ipcomp_scratches;
+
+ scratches = alloc_percpu(void *);
+ if (!scratches)
+ return NULL;
+
+ ipcomp_scratches = scratches;
+
+ for_each_cpu(i) {
+ void *scratch = vmalloc(IPCOMP_SCRATCH_SIZE);
+ if (!scratch)
+ return NULL;
+ *per_cpu_ptr(scratches, i) = scratch;
+ }
+
+ return scratches;
+}
+
+static void ipcomp_free_tfms(struct crypto_tfm **tfms)
+{
+ struct ipcomp_tfms *pos;
+ int cpu;
+
+ list_for_each_entry(pos, &ipcomp_tfms_list, list) {
+ if (pos->tfms == tfms)
+ break;
+ }
+
+ BUG_TRAP(pos);
+
+ if (--pos->users)
+ return;
+
+ list_del(&pos->list);
+ kfree(pos);
+
+ if (!tfms)
+ return;
+
+ for_each_cpu(cpu) {
+ struct crypto_tfm *tfm = *per_cpu_ptr(tfms, cpu);
+ if (tfm)
+ crypto_free_tfm(tfm);
+ }
+ free_percpu(tfms);
+}
+
+static struct crypto_tfm **ipcomp_alloc_tfms(const char *alg_name)
+{
+ struct ipcomp_tfms *pos;
+ struct crypto_tfm **tfms;
+ int cpu;
+
+ /* This can be any valid CPU ID so we don't need locking. */
+ cpu = smp_processor_id();
+
+ list_for_each_entry(pos, &ipcomp_tfms_list, list) {
+ struct crypto_tfm *tfm;
+
+ tfms = pos->tfms;
+ tfm = *per_cpu_ptr(tfms, cpu);
+
+ if (!strcmp(crypto_tfm_alg_name(tfm), alg_name)) {
+ pos->users++;
+ return tfms;
+ }
+ }
+
+ pos = kmalloc(sizeof(*pos), GFP_KERNEL);
+ if (!pos)
+ return NULL;
+
+ pos->users = 1;
+ INIT_LIST_HEAD(&pos->list);
+ list_add(&pos->list, &ipcomp_tfms_list);
+
+ pos->tfms = tfms = alloc_percpu(struct crypto_tfm *);
+ if (!tfms)
+ goto error;
+
+ for_each_cpu(cpu) {
+ struct crypto_tfm *tfm = crypto_alloc_tfm(alg_name, 0);
+ if (!tfm)
+ goto error;
+ *per_cpu_ptr(tfms, cpu) = tfm;
+ }
+
+ return tfms;
+
+error:
+ ipcomp_free_tfms(tfms);
+ return NULL;
+}
+
+static void ipcomp_free_data(struct ipcomp_data *ipcd)
+{
+ if (ipcd->tfms)
+ ipcomp_free_tfms(ipcd->tfms);
+ ipcomp_free_scratches();
+}
+
+static void ipcomp_destroy(struct xfrm_state *x)
+{
+ struct ipcomp_data *ipcd = x->data;
+ if (!ipcd)
+ return;
+ xfrm_state_delete_tunnel(x);
+ down(&ipcomp_resource_sem);
+ ipcomp_free_data(ipcd);
+ up(&ipcomp_resource_sem);
+ kfree(ipcd);
+}
+
+static int ipcomp_init_state(struct xfrm_state *x, void *args)
+{
+ int err;
+ struct ipcomp_data *ipcd;
+ struct xfrm_algo_desc *calg_desc;
+
+ err = -EINVAL;
+ if (!x->calg)
+ goto out;
+
+ if (x->encap)
+ goto out;
+
+ err = -ENOMEM;
+ ipcd = kmalloc(sizeof(*ipcd), GFP_KERNEL);
+ if (!ipcd)
+ goto out;
+
+ memset(ipcd, 0, sizeof(*ipcd));
+ x->props.header_len = 0;
+ if (x->props.mode)
+ x->props.header_len += sizeof(struct iphdr);
+
+ down(&ipcomp_resource_sem);
+ if (!ipcomp_alloc_scratches())
+ goto error;
+
+ ipcd->tfms = ipcomp_alloc_tfms(x->calg->alg_name);
+ if (!ipcd->tfms)
+ goto error;
+ up(&ipcomp_resource_sem);
+
+ if (x->props.mode) {
+ err = ipcomp_tunnel_attach(x);
+ if (err)
+ goto error_tunnel;
+ }
+
+ calg_desc = xfrm_calg_get_byname(x->calg->alg_name, 0);
+ BUG_ON(!calg_desc);
+ ipcd->threshold = calg_desc->uinfo.comp.threshold;
+ x->data = ipcd;
+ err = 0;
+out:
+ return err;
+
+error_tunnel:
+ down(&ipcomp_resource_sem);
+error:
+ ipcomp_free_data(ipcd);
+ up(&ipcomp_resource_sem);
+ kfree(ipcd);
+ goto out;
+}
+
+static struct xfrm_type ipcomp_type = {
+ .description = "IPCOMP4",
+ .owner = THIS_MODULE,
+ .proto = IPPROTO_COMP,
+ .init_state = ipcomp_init_state,
+ .destructor = ipcomp_destroy,
+ .input = ipcomp_input,
+ .output = ipcomp_output
+};
+
+static struct net_protocol ipcomp4_protocol = {
+ .handler = xfrm4_rcv,
+ .err_handler = ipcomp4_err,
+ .no_policy = 1,
+};
+
+static int __init ipcomp4_init(void)
+{
+ if (xfrm_register_type(&ipcomp_type, AF_INET) < 0) {
+ printk(KERN_INFO "ipcomp init: can't add xfrm type\n");
+ return -EAGAIN;
+ }
+ if (inet_add_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) {
+ printk(KERN_INFO "ipcomp init: can't add protocol\n");
+ xfrm_unregister_type(&ipcomp_type, AF_INET);
+ return -EAGAIN;
+ }
+ return 0;
+}
+
+static void __exit ipcomp4_fini(void)
+{
+ if (inet_del_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0)
+ printk(KERN_INFO "ip ipcomp close: can't remove protocol\n");
+ if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0)
+ printk(KERN_INFO "ip ipcomp close: can't remove xfrm type\n");
+}
+
+module_init(ipcomp4_init);
+module_exit(ipcomp4_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp) - RFC3173");
+MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
+
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
new file mode 100644
index 00000000000..f2509034ce7
--- /dev/null
+++ b/net/ipv4/ipconfig.c
@@ -0,0 +1,1507 @@
+/*
+ * $Id: ipconfig.c,v 1.46 2002/02/01 22:01:04 davem Exp $
+ *
+ * Automatic Configuration of IP -- use DHCP, BOOTP, RARP, or
+ * user-supplied information to configure own IP address and routes.
+ *
+ * Copyright (C) 1996-1998 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+ *
+ * Derived from network configuration code in fs/nfs/nfsroot.c,
+ * originally Copyright (C) 1995, 1996 Gero Kuhlmann and me.
+ *
+ * BOOTP rewritten to construct and analyse packets itself instead
+ * of misusing the IP layer. num_bugs_causing_wrong_arp_replies--;
+ * -- MJ, December 1998
+ *
+ * Fixed ip_auto_config_setup calling at startup in the new "Linker Magic"
+ * initialization scheme.
+ * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>, 08/11/1999
+ *
+ * DHCP support added. To users this looks like a whole separate
+ * protocol, but we know it's just a bag on the side of BOOTP.
+ * -- Chip Salzenberg <chip@valinux.com>, May 2000
+ *
+ * Ported DHCP support from 2.2.16 to 2.4.0-test4
+ * -- Eric Biederman <ebiederman@lnxi.com>, 30 Aug 2000
+ *
+ * Merged changes from 2.2.19 into 2.4.3
+ * -- Eric Biederman <ebiederman@lnxi.com>, 22 April Aug 2001
+ *
+ * Multiple Nameservers in /proc/net/pnp
+ * -- Josef Siemes <jsiemes@web.de>, Aug 2002
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/utsname.h>
+#include <linux/in.h>
+#include <linux/if.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/socket.h>
+#include <linux/route.h>
+#include <linux/udp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/major.h>
+#include <linux/root_dev.h>
+#include <linux/delay.h>
+#include <net/arp.h>
+#include <net/ip.h>
+#include <net/ipconfig.h>
+
+#include <asm/uaccess.h>
+#include <net/checksum.h>
+#include <asm/processor.h>
+
+/* Define this to allow debugging output */
+#undef IPCONFIG_DEBUG
+
+#ifdef IPCONFIG_DEBUG
+#define DBG(x) printk x
+#else
+#define DBG(x) do { } while(0)
+#endif
+
+#if defined(CONFIG_IP_PNP_DHCP)
+#define IPCONFIG_DHCP
+#endif
+#if defined(CONFIG_IP_PNP_BOOTP) || defined(CONFIG_IP_PNP_DHCP)
+#define IPCONFIG_BOOTP
+#endif
+#if defined(CONFIG_IP_PNP_RARP)
+#define IPCONFIG_RARP
+#endif
+#if defined(IPCONFIG_BOOTP) || defined(IPCONFIG_RARP)
+#define IPCONFIG_DYNAMIC
+#endif
+
+/* Define the friendly delay before and after opening net devices */
+#define CONF_PRE_OPEN 500 /* Before opening: 1/2 second */
+#define CONF_POST_OPEN 1 /* After opening: 1 second */
+
+/* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */
+#define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */
+#define CONF_SEND_RETRIES 6 /* Send six requests per open */
+#define CONF_INTER_TIMEOUT (HZ/2) /* Inter-device timeout: 1/2 second */
+#define CONF_BASE_TIMEOUT (HZ*2) /* Initial timeout: 2 seconds */
+#define CONF_TIMEOUT_RANDOM (HZ) /* Maximum amount of randomization */
+#define CONF_TIMEOUT_MULT *7/4 /* Rate of timeout growth */
+#define CONF_TIMEOUT_MAX (HZ*30) /* Maximum allowed timeout */
+#define CONF_NAMESERVERS_MAX 3 /* Maximum number of nameservers
+ - '3' from resolv.h */
+
+
+/*
+ * Public IP configuration
+ */
+
+/* This is used by platforms which might be able to set the ipconfig
+ * variables using firmware environment vars. If this is set, it will
+ * ignore such firmware variables.
+ */
+int ic_set_manually __initdata = 0; /* IPconfig parameters set manually */
+
+static int ic_enable __initdata = 0; /* IP config enabled? */
+
+/* Protocol choice */
+int ic_proto_enabled __initdata = 0
+#ifdef IPCONFIG_BOOTP
+ | IC_BOOTP
+#endif
+#ifdef CONFIG_IP_PNP_DHCP
+ | IC_USE_DHCP
+#endif
+#ifdef IPCONFIG_RARP
+ | IC_RARP
+#endif
+ ;
+
+static int ic_host_name_set __initdata = 0; /* Host name set by us? */
+
+u32 ic_myaddr = INADDR_NONE; /* My IP address */
+static u32 ic_netmask = INADDR_NONE; /* Netmask for local subnet */
+u32 ic_gateway = INADDR_NONE; /* Gateway IP address */
+
+u32 ic_servaddr = INADDR_NONE; /* Boot server IP address */
+
+u32 root_server_addr = INADDR_NONE; /* Address of NFS server */
+u8 root_server_path[256] = { 0, }; /* Path to mount as root */
+
+/* Persistent data: */
+
+static int ic_proto_used; /* Protocol used, if any */
+static u32 ic_nameservers[CONF_NAMESERVERS_MAX]; /* DNS Server IP addresses */
+static u8 ic_domain[64]; /* DNS (not NIS) domain name */
+
+/*
+ * Private state.
+ */
+
+/* Name of user-selected boot device */
+static char user_dev_name[IFNAMSIZ] __initdata = { 0, };
+
+/* Protocols supported by available interfaces */
+static int ic_proto_have_if __initdata = 0;
+
+#ifdef IPCONFIG_DYNAMIC
+static DEFINE_SPINLOCK(ic_recv_lock);
+static volatile int ic_got_reply __initdata = 0; /* Proto(s) that replied */
+#endif
+#ifdef IPCONFIG_DHCP
+static int ic_dhcp_msgtype __initdata = 0; /* DHCP msg type received */
+#endif
+
+
+/*
+ * Network devices
+ */
+
+struct ic_device {
+ struct ic_device *next;
+ struct net_device *dev;
+ unsigned short flags;
+ short able;
+ u32 xid;
+};
+
+static struct ic_device *ic_first_dev __initdata = NULL; /* List of open devices */
+static struct net_device *ic_dev __initdata = NULL; /* Selected device */
+
+static int __init ic_open_devs(void)
+{
+ struct ic_device *d, **last;
+ struct net_device *dev;
+ unsigned short oflags;
+
+ last = &ic_first_dev;
+ rtnl_shlock();
+
+ /* bring loopback device up first */
+ if (dev_change_flags(&loopback_dev, loopback_dev.flags | IFF_UP) < 0)
+ printk(KERN_ERR "IP-Config: Failed to open %s\n", loopback_dev.name);
+
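+ /* Walk all interfaces: take the user-named device if one was given,
+  * otherwise any non-loopback, non-dummy broadcast or point-to-point
+  * device, and note which protocols each one can support.
+  */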
+ for (dev = dev_base; dev; dev = dev->next) {
+ if (dev == &loopback_dev)
+ continue;
+ if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) :
+ (!(dev->flags & IFF_LOOPBACK) &&
+ (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) &&
+ strncmp(dev->name, "dummy", 5))) {
+ int able = 0;
+ if (dev->mtu >= 364)
+ able |= IC_BOOTP;
+ else
+ printk(KERN_WARNING "DHCP/BOOTP: Ignoring device %s, MTU %d too small", dev->name, dev->mtu);
+ if (!(dev->flags & IFF_NOARP))
+ able |= IC_RARP;
+ able &= ic_proto_enabled;
+ if (ic_proto_enabled && !able)
+ continue;
+ oflags = dev->flags;
+ if (dev_change_flags(dev, oflags | IFF_UP) < 0) {
+ printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name);
+ continue;
+ }
+ if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) {
+ rtnl_shunlock();
+ return -1;
+ }
+ d->dev = dev;
+ *last = d;
+ last = &d->next;
+ d->flags = oflags;
+ d->able = able;
+ if (able & IC_BOOTP)
+ get_random_bytes(&d->xid, sizeof(u32));
+ else
+ d->xid = 0;
+ ic_proto_have_if |= able;
+ DBG(("IP-Config: %s UP (able=%d, xid=%08x)\n",
+ dev->name, able, d->xid));
+ }
+ }
+ rtnl_shunlock();
+
+ *last = NULL;
+
+ if (!ic_first_dev) {
+ if (user_dev_name[0])
+ printk(KERN_ERR "IP-Config: Device `%s' not found.\n", user_dev_name);
+ else
+ printk(KERN_ERR "IP-Config: No network devices available.\n");
+ return -1;
+ }
+ return 0;
+}
+
+static void __init ic_close_devs(void)
+{
+ struct ic_device *d, *next;
+ struct net_device *dev;
+
+ rtnl_shlock();
+ next = ic_first_dev;
+ while ((d = next)) {
+ next = d->next;
+ dev = d->dev;
+ if (dev != ic_dev) {
+ DBG(("IP-Config: Downing %s\n", dev->name));
+ dev_change_flags(dev, d->flags);
+ }
+ kfree(d);
+ }
+ rtnl_shunlock();
+}
+
+/*
+ * Interface to various network functions.
+ */
+
+static inline void
+set_sockaddr(struct sockaddr_in *sin, u32 addr, u16 port)
+{
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = addr;
+ sin->sin_port = port;
+}
+
+static int __init ic_dev_ioctl(unsigned int cmd, struct ifreq *arg)
+{
+ int res;
+
+ mm_segment_t oldfs = get_fs();
+ set_fs(get_ds());
+ res = devinet_ioctl(cmd, (struct ifreq __user *) arg);
+ set_fs(oldfs);
+ return res;
+}
+
+static int __init ic_route_ioctl(unsigned int cmd, struct rtentry *arg)
+{
+ int res;
+
+ mm_segment_t oldfs = get_fs();
+ set_fs(get_ds());
+ res = ip_rt_ioctl(cmd, (void __user *) arg);
+ set_fs(oldfs);
+ return res;
+}
+
+/*
+ * Set up interface addresses and routes.
+ */
+
+static int __init ic_setup_if(void)
+{
+ struct ifreq ir;
+ struct sockaddr_in *sin = (void *) &ir.ifr_ifru.ifru_addr;
+ int err;
+
+ memset(&ir, 0, sizeof(ir));
+ strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->name);
+ set_sockaddr(sin, ic_myaddr, 0);
+ if ((err = ic_dev_ioctl(SIOCSIFADDR, &ir)) < 0) {
+ printk(KERN_ERR "IP-Config: Unable to set interface address (%d).\n", err);
+ return -1;
+ }
+ set_sockaddr(sin, ic_netmask, 0);
+ if ((err = ic_dev_ioctl(SIOCSIFNETMASK, &ir)) < 0) {
+ printk(KERN_ERR "IP-Config: Unable to set interface netmask (%d).\n", err);
+ return -1;
+ }
+ set_sockaddr(sin, ic_myaddr | ~ic_netmask, 0);
+ if ((err = ic_dev_ioctl(SIOCSIFBRDADDR, &ir)) < 0) {
+ printk(KERN_ERR "IP-Config: Unable to set interface broadcast address (%d).\n", err);
+ return -1;
+ }
+ return 0;
+}
+
+static int __init ic_setup_routes(void)
+{
+ /* No need to setup device routes, only the default route... */
+
+ if (ic_gateway != INADDR_NONE) {
+ struct rtentry rm;
+ int err;
+
+ memset(&rm, 0, sizeof(rm));
+ if ((ic_gateway ^ ic_myaddr) & ic_netmask) {
+ printk(KERN_ERR "IP-Config: Gateway not on directly connected network.\n");
+ return -1;
+ }
+ set_sockaddr((struct sockaddr_in *) &rm.rt_dst, 0, 0);
+ set_sockaddr((struct sockaddr_in *) &rm.rt_genmask, 0, 0);
+ set_sockaddr((struct sockaddr_in *) &rm.rt_gateway, ic_gateway, 0);
+ rm.rt_flags = RTF_UP | RTF_GATEWAY;
+ if ((err = ic_route_ioctl(SIOCADDRT, &rm)) < 0) {
+ printk(KERN_ERR "IP-Config: Cannot add default route (%d).\n", err);
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Fill in default values for all missing parameters.
+ */
+
+static int __init ic_defaults(void)
+{
+ /*
+ * At this point we have no userspace running, so we need not
+ * claim locks on system_utsname.
+ */
+
+ if (!ic_host_name_set)
+ sprintf(system_utsname.nodename, "%u.%u.%u.%u", NIPQUAD(ic_myaddr));
+
+ if (root_server_addr == INADDR_NONE)
+ root_server_addr = ic_servaddr;
+
+ if (ic_netmask == INADDR_NONE) {
+ if (IN_CLASSA(ntohl(ic_myaddr)))
+ ic_netmask = htonl(IN_CLASSA_NET);
+ else if (IN_CLASSB(ntohl(ic_myaddr)))
+ ic_netmask = htonl(IN_CLASSB_NET);
+ else if (IN_CLASSC(ntohl(ic_myaddr)))
+ ic_netmask = htonl(IN_CLASSC_NET);
+ else {
+ printk(KERN_ERR "IP-Config: Unable to guess netmask for address %u.%u.%u.%u\n",
+ NIPQUAD(ic_myaddr));
+ return -1;
+ }
+ printk("IP-Config: Guessing netmask %u.%u.%u.%u\n", NIPQUAD(ic_netmask));
+ }
+
+ return 0;
+}
+
+/*
+ * RARP support.
+ */
+
+#ifdef IPCONFIG_RARP
+
+static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt);
+
+static struct packet_type rarp_packet_type __initdata = {
+ .type = __constant_htons(ETH_P_RARP),
+ .func = ic_rarp_recv,
+};
+
+static inline void ic_rarp_init(void)
+{
+ dev_add_pack(&rarp_packet_type);
+}
+
+static inline void ic_rarp_cleanup(void)
+{
+ dev_remove_pack(&rarp_packet_type);
+}
+
+/*
+ * Process received RARP packet.
+ */
+static int __init
+ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+{
+ struct arphdr *rarp;
+ unsigned char *rarp_ptr;
+ unsigned long sip, tip;
+ unsigned char *sha, *tha; /* s for "source", t for "target" */
+ struct ic_device *d;
+
+ if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+ return NET_RX_DROP;
+
+ if (!pskb_may_pull(skb, sizeof(struct arphdr)))
+ goto drop;
+
+ /* Basic sanity checks can be done without the lock. */
+ rarp = (struct arphdr *)skb->h.raw;
+
+ /* If the hardware address length or type doesn't match this
+ * device, it isn't for us; ignore it.
+ */
+ if (rarp->ar_hln != dev->addr_len || dev->type != ntohs(rarp->ar_hrd))
+ goto drop;
+
+ /* If it's not a RARP reply, delete it. */
+ if (rarp->ar_op != htons(ARPOP_RREPLY))
+ goto drop;
+
+ /* If it doesn't carry IPv4 protocol addresses, delete it. */
+ if (rarp->ar_pro != htons(ETH_P_IP))
+ goto drop;
+
+ if (!pskb_may_pull(skb,
+ sizeof(struct arphdr) +
+ (2 * dev->addr_len) +
+ (2 * 4)))
+ goto drop;
+
+ /* OK, it is all there and looks valid, process... */
+ rarp = (struct arphdr *)skb->h.raw;
+ rarp_ptr = (unsigned char *) (rarp + 1);
+
+ /* One reply at a time, please. */
+ spin_lock(&ic_recv_lock);
+
+ /* If we already have a reply, just drop the packet */
+ if (ic_got_reply)
+ goto drop_unlock;
+
+ /* Find the ic_device that the packet arrived on */
+ d = ic_first_dev;
+ while (d && d->dev != dev)
+ d = d->next;
+ if (!d)
+ goto drop_unlock; /* should never happen */
+
+ /* Extract variable-width fields */
+ sha = rarp_ptr;
+ rarp_ptr += dev->addr_len;
+ memcpy(&sip, rarp_ptr, 4);
+ rarp_ptr += 4;
+ tha = rarp_ptr;
+ rarp_ptr += dev->addr_len;
+ memcpy(&tip, rarp_ptr, 4);
+
+ /* Discard packets which are not meant for us. */
+ if (memcmp(tha, dev->dev_addr, dev->addr_len))
+ goto drop_unlock;
+
+ /* Discard packets which are not from specified server. */
+ if (ic_servaddr != INADDR_NONE && ic_servaddr != sip)
+ goto drop_unlock;
+
+ /* We have a winner! */
+ ic_dev = dev;
+ if (ic_myaddr == INADDR_NONE)
+ ic_myaddr = tip;
+ ic_servaddr = sip;
+ ic_got_reply = IC_RARP;
+
+drop_unlock:
+ /* Show's over. Nothing to see here. */
+ spin_unlock(&ic_recv_lock);
+
+drop:
+ /* Throw the packet out. */
+ kfree_skb(skb);
+ return 0;
+}
+
+
+/*
+ * Send RARP request packet over a single interface.
+ */
+static void __init ic_rarp_send_if(struct ic_device *d)
+{
+ struct net_device *dev = d->dev;
+ arp_send(ARPOP_RREQUEST, ETH_P_RARP, 0, dev, 0, NULL,
+ dev->dev_addr, dev->dev_addr);
+}
+#endif
+
+/*
+ * DHCP/BOOTP support.
+ */
+
+#ifdef IPCONFIG_BOOTP
+
+struct bootp_pkt { /* BOOTP packet format */
+ struct iphdr iph; /* IP header */
+ struct udphdr udph; /* UDP header */
+ u8 op; /* 1=request, 2=reply */
+ u8 htype; /* HW address type */
+ u8 hlen; /* HW address length */
+ u8 hops; /* Used only by gateways */
+ u32 xid; /* Transaction ID */
+ u16 secs; /* Seconds since we started */
+ u16 flags; /* Just what it says */
+ u32 client_ip; /* Client's IP address if known */
+ u32 your_ip; /* Assigned IP address */
+ u32 server_ip; /* (Next, e.g. NFS) Server's IP address */
+ u32 relay_ip; /* IP address of BOOTP relay */
+ u8 hw_addr[16]; /* Client's HW address */
+ u8 serv_name[64]; /* Server host name */
+ u8 boot_file[128]; /* Name of boot file */
+ u8 exten[312]; /* DHCP options / BOOTP vendor extensions */
+};
+
+/* packet ops */
+#define BOOTP_REQUEST 1
+#define BOOTP_REPLY 2
+
+/* DHCP message types */
+#define DHCPDISCOVER 1
+#define DHCPOFFER 2
+#define DHCPREQUEST 3
+#define DHCPDECLINE 4
+#define DHCPACK 5
+#define DHCPNAK 6
+#define DHCPRELEASE 7
+#define DHCPINFORM 8
+
+static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt);
+
+static struct packet_type bootp_packet_type __initdata = {
+ .type = __constant_htons(ETH_P_IP),
+ .func = ic_bootp_recv,
+};
+
+
+/*
+ * Initialize DHCP/BOOTP extension fields in the request.
+ */
+
+static const u8 ic_bootp_cookie[4] = { 99, 130, 83, 99 };
+
+#ifdef IPCONFIG_DHCP
+
+static void __init
+ic_dhcp_init_options(u8 *options)
+{
+ u8 mt = ((ic_servaddr == INADDR_NONE)
+ ? DHCPDISCOVER : DHCPREQUEST);
+ u8 *e = options;
+
+#ifdef IPCONFIG_DEBUG
+ printk("DHCP: Sending message type %d\n", mt);
+#endif
+
+ memcpy(e, ic_bootp_cookie, 4); /* RFC1048 Magic Cookie */
+ e += 4;
+
+ *e++ = 53; /* DHCP message type */
+ *e++ = 1;
+ *e++ = mt;
+
+ if (mt == DHCPREQUEST) {
+ *e++ = 54; /* Server ID (IP address) */
+ *e++ = 4;
+ memcpy(e, &ic_servaddr, 4);
+ e += 4;
+
+ *e++ = 50; /* Requested IP address */
+ *e++ = 4;
+ memcpy(e, &ic_myaddr, 4);
+ e += 4;
+ }
+
+ /* always? */
+ {
+ static const u8 ic_req_params[] = {
+ 1, /* Subnet mask */
+ 3, /* Default gateway */
+ 6, /* DNS server */
+ 12, /* Host name */
+ 15, /* Domain name */
+ 17, /* Boot path */
+ 40, /* NIS domain name */
+ };
+
+ *e++ = 55; /* Parameter request list */
+ *e++ = sizeof(ic_req_params);
+ memcpy(e, ic_req_params, sizeof(ic_req_params));
+ e += sizeof(ic_req_params);
+ }
+
+ *e++ = 255; /* End of the list */
+}
+
+#endif /* IPCONFIG_DHCP */
+
+static void __init ic_bootp_init_ext(u8 *e)
+{
+ memcpy(e, ic_bootp_cookie, 4); /* RFC1048 Magic Cookie */
+ e += 4;
+ *e++ = 1; /* Subnet mask request */
+ *e++ = 4;
+ e += 4;
+ *e++ = 3; /* Default gateway request */
+ *e++ = 4;
+ e += 4;
+ *e++ = 5; /* Name server request */
+ *e++ = 8;
+ e += 8;
+ *e++ = 12; /* Host name request */
+ *e++ = 32;
+ e += 32;
+ *e++ = 40; /* NIS Domain name request */
+ *e++ = 32;
+ e += 32;
+ *e++ = 17; /* Boot path */
+ *e++ = 40;
+ e += 40;
+
+ *e++ = 57; /* set extension buffer size for reply */
+ *e++ = 2;
+ *e++ = 1; /* 128+236+8+20+14, see dhcpd sources */
+ *e++ = 150;
+
+ *e++ = 255; /* End of the list */
+}
+
+
+/*
+ * Initialize the DHCP/BOOTP mechanism.
+ */
+static inline void ic_bootp_init(void)
+{
+ int i;
+
+ for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
+ ic_nameservers[i] = INADDR_NONE;
+
+ dev_add_pack(&bootp_packet_type);
+}
+
+
+/*
+ * DHCP/BOOTP cleanup.
+ */
+static inline void ic_bootp_cleanup(void)
+{
+ dev_remove_pack(&bootp_packet_type);
+}
+
+
+/*
+ * Send DHCP/BOOTP request to single interface.
+ */
+static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_diff)
+{
+ struct net_device *dev = d->dev;
+ struct sk_buff *skb;
+ struct bootp_pkt *b;
+ int hh_len = LL_RESERVED_SPACE(dev);
+ struct iphdr *h;
+
+ /* Allocate packet */
+ skb = alloc_skb(sizeof(struct bootp_pkt) + hh_len + 15, GFP_KERNEL);
+ if (!skb)
+ return;
+ skb_reserve(skb, hh_len);
+ b = (struct bootp_pkt *) skb_put(skb, sizeof(struct bootp_pkt));
+ memset(b, 0, sizeof(struct bootp_pkt));
+
+ /* Construct IP header */
+ skb->nh.iph = h = &b->iph;
+ h->version = 4;
+ h->ihl = 5;
+ h->tot_len = htons(sizeof(struct bootp_pkt));
+ h->frag_off = htons(IP_DF);
+ h->ttl = 64;
+ h->protocol = IPPROTO_UDP;
+ h->daddr = INADDR_BROADCAST;
+ h->check = ip_fast_csum((unsigned char *) h, h->ihl);
+
+ /* Construct UDP header */
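+ /* Client port 68 (bootpc) to server port 67 (bootps). */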
+ b->udph.source = htons(68);
+ b->udph.dest = htons(67);
+ b->udph.len = htons(sizeof(struct bootp_pkt) - sizeof(struct iphdr));
+ /* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */
+
+ /* Construct DHCP/BOOTP header */
+ b->op = BOOTP_REQUEST;
+ if (dev->type < 256) /* check for false types */
+ b->htype = dev->type;
+ else if (dev->type == ARPHRD_IEEE802_TR) /* fix for token ring */
+ b->htype = ARPHRD_IEEE802;
+ else if (dev->type == ARPHRD_FDDI)
+ b->htype = ARPHRD_ETHER;
+ else {
+ printk("Unknown ARP type 0x%04x for device %s\n", dev->type, dev->name);
+ b->htype = dev->type; /* can cause undefined behavior */
+ }
+ b->hlen = dev->addr_len;
+ b->your_ip = INADDR_NONE;
+ b->server_ip = INADDR_NONE;
+ memcpy(b->hw_addr, dev->dev_addr, dev->addr_len);
+ b->secs = htons(jiffies_diff / HZ);
+ b->xid = d->xid;
+
+ /* add DHCP options or BOOTP extensions */
+#ifdef IPCONFIG_DHCP
+ if (ic_proto_enabled & IC_USE_DHCP)
+ ic_dhcp_init_options(b->exten);
+ else
+#endif
+ ic_bootp_init_ext(b->exten);
+
+ /* Chain packet down the line... */
+ skb->dev = dev;
+ skb->protocol = htons(ETH_P_IP);
+ if ((dev->hard_header &&
+ dev->hard_header(skb, dev, ntohs(skb->protocol), dev->broadcast, dev->dev_addr, skb->len) < 0) ||
+ dev_queue_xmit(skb) < 0)
+ printk("E");
+}
+
+
+/*
+ * Copy BOOTP-supplied string if not already set.
+ */
+static int __init ic_bootp_string(char *dest, char *src, int len, int max)
+{
+ if (!len)
+ return 0;
+ if (len > max-1)
+ len = max-1;
+ memcpy(dest, src, len);
+ dest[len] = '\0';
+ return 1;
+}
+
+
+/*
+ * Process BOOTP extensions.
+ */
+static void __init ic_do_bootp_ext(u8 *ext)
+{
+ u8 servers;
+ int i;
+
+#ifdef IPCONFIG_DEBUG
+ u8 *c;
+
+ printk("DHCP/BOOTP: Got extension %d:",*ext);
+ for(c=ext+2; c<ext+2+ext[1]; c++)
+ printk(" %02x", *c);
+ printk("\n");
+#endif
+
+ switch (*ext++) {
+ case 1: /* Subnet mask */
+ if (ic_netmask == INADDR_NONE)
+ memcpy(&ic_netmask, ext+1, 4);
+ break;
+ case 3: /* Default gateway */
+ if (ic_gateway == INADDR_NONE)
+ memcpy(&ic_gateway, ext+1, 4);
+ break;
+ case 6: /* DNS server */
+ servers= *ext/4;
+ if (servers > CONF_NAMESERVERS_MAX)
+ servers = CONF_NAMESERVERS_MAX;
+ for (i = 0; i < servers; i++) {
+ if (ic_nameservers[i] == INADDR_NONE)
+ memcpy(&ic_nameservers[i], ext+1+4*i, 4);
+ }
+ break;
+ case 12: /* Host name */
+ ic_bootp_string(system_utsname.nodename, ext+1, *ext, __NEW_UTS_LEN);
+ ic_host_name_set = 1;
+ break;
+ case 15: /* Domain name (DNS) */
+ ic_bootp_string(ic_domain, ext+1, *ext, sizeof(ic_domain));
+ break;
+ case 17: /* Root path */
+ if (!root_server_path[0])
+ ic_bootp_string(root_server_path, ext+1, *ext, sizeof(root_server_path));
+ break;
+ case 40: /* NIS Domain name (_not_ DNS) */
+ ic_bootp_string(system_utsname.domainname, ext+1, *ext, __NEW_UTS_LEN);
+ break;
+ }
+}
+
+
+/*
+ * Receive BOOTP reply.
+ */
+static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+{
+ struct bootp_pkt *b;
+ struct iphdr *h;
+ struct ic_device *d;
+ int len, ext_len;
+
+ /* Perform verifications before taking the lock. */
+ if (skb->pkt_type == PACKET_OTHERHOST)
+ goto drop;
+
+ if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+ return NET_RX_DROP;
+
+ if (!pskb_may_pull(skb,
+ sizeof(struct iphdr) +
+ sizeof(struct udphdr)))
+ goto drop;
+
+ b = (struct bootp_pkt *) skb->nh.iph;
+ h = &b->iph;
+
+ if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP)
+ goto drop;
+
+ /* Fragments are not supported */
+ if (h->frag_off & htons(IP_OFFSET | IP_MF)) {
+ if (net_ratelimit())
+ printk(KERN_ERR "DHCP/BOOTP: Ignoring fragmented "
+ "reply.\n");
+ goto drop;
+ }
+
+ if (skb->len < ntohs(h->tot_len))
+ goto drop;
+
+ if (ip_fast_csum((char *) h, h->ihl))
+ goto drop;
+
+ if (b->udph.source != htons(67) || b->udph.dest != htons(68))
+ goto drop;
+
+ if (ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr))
+ goto drop;
+
+ len = ntohs(b->udph.len) - sizeof(struct udphdr);
+ ext_len = len - (sizeof(*b) -
+ sizeof(struct iphdr) -
+ sizeof(struct udphdr) -
+ sizeof(b->exten));
+ if (ext_len < 0)
+ goto drop;
+
+ /* Ok the front looks good, make sure we can get at the rest. */
+ if (!pskb_may_pull(skb, skb->len))
+ goto drop;
+
+ b = (struct bootp_pkt *) skb->nh.iph;
+ h = &b->iph;
+
+ /* One reply at a time, please. */
+ spin_lock(&ic_recv_lock);
+
+ /* If we already have a reply, just drop the packet */
+ if (ic_got_reply)
+ goto drop_unlock;
+
+ /* Find the ic_device that the packet arrived on */
+ d = ic_first_dev;
+ while (d && d->dev != dev)
+ d = d->next;
+ if (!d)
+ goto drop_unlock; /* should never happen */
+
+ /* Is it a reply to our BOOTP request? */
+ if (b->op != BOOTP_REPLY ||
+ b->xid != d->xid) {
+ if (net_ratelimit())
+ printk(KERN_ERR "DHCP/BOOTP: Reply not for us, "
+ "op[%x] xid[%x]\n",
+ b->op, b->xid);
+ goto drop_unlock;
+ }
+
+ /* Parse extensions */
+ if (ext_len >= 4 &&
+ !memcmp(b->exten, ic_bootp_cookie, 4)) { /* Check magic cookie */
+ u8 *end = (u8 *) b + ntohs(b->iph.tot_len);
+ u8 *ext;
+
+#ifdef IPCONFIG_DHCP
+ if (ic_proto_enabled & IC_USE_DHCP) {
+ u32 server_id = INADDR_NONE;
+ int mt = 0;
+
+ ext = &b->exten[4];
+ while (ext < end && *ext != 0xff) {
+ u8 *opt = ext++;
+ if (*opt == 0) /* Padding */
+ continue;
+ ext += *ext + 1;
+ if (ext >= end)
+ break;
+ switch (*opt) {
+ case 53: /* Message type */
+ if (opt[1])
+ mt = opt[2];
+ break;
+ case 54: /* Server ID (IP address) */
+ if (opt[1] >= 4)
+ memcpy(&server_id, opt + 2, 4);
+ break;
+ };
+ }
+
+#ifdef IPCONFIG_DEBUG
+ printk("DHCP: Got message type %d\n", mt);
+#endif
+
+ switch (mt) {
+ case DHCPOFFER:
+ /* While in the process of accepting one offer,
+ * ignore all others.
+ */
+ if (ic_myaddr != INADDR_NONE)
+ goto drop_unlock;
+
+ /* Let's accept that offer. */
+ ic_myaddr = b->your_ip;
+ ic_servaddr = server_id;
+#ifdef IPCONFIG_DEBUG
+ printk("DHCP: Offered address %u.%u.%u.%u",
+ NIPQUAD(ic_myaddr));
+ printk(" by server %u.%u.%u.%u\n",
+ NIPQUAD(ic_servaddr));
+#endif
+ /* The DHCP indicated server address takes
+ * precedence over the bootp header one if
+ * they are different.
+ */
+ if ((server_id != INADDR_NONE) &&
+ (b->server_ip != server_id))
+ b->server_ip = ic_servaddr;
+ break;
+
+ case DHCPACK:
+ if (memcmp(dev->dev_addr, b->hw_addr, dev->addr_len) != 0)
+ goto drop_unlock;
+
+ /* Yeah! */
+ break;
+
+ default:
+ /* Urque. Forget it. */
+ ic_myaddr = INADDR_NONE;
+ ic_servaddr = INADDR_NONE;
+ goto drop_unlock;
+ };
+
+ ic_dhcp_msgtype = mt;
+
+ }
+#endif /* IPCONFIG_DHCP */
+
+ ext = &b->exten[4];
+ while (ext < end && *ext != 0xff) {
+ u8 *opt = ext++;
+ if (*opt == 0) /* Padding */
+ continue;
+ ext += *ext + 1;
+ if (ext < end)
+ ic_do_bootp_ext(opt);
+ }
+ }
+
+ /* We have a winner! */
+ ic_dev = dev;
+ ic_myaddr = b->your_ip;
+ ic_servaddr = b->server_ip;
+ if (ic_gateway == INADDR_NONE && b->relay_ip)
+ ic_gateway = b->relay_ip;
+ if (ic_nameservers[0] == INADDR_NONE)
+ ic_nameservers[0] = ic_servaddr;
+ ic_got_reply = IC_BOOTP;
+
+drop_unlock:
+ /* Show's over. Nothing to see here. */
+ spin_unlock(&ic_recv_lock);
+
+drop:
+ /* Throw the packet out. */
+ kfree_skb(skb);
+
+ return 0;
+}
+
+
+#endif
+
+
+/*
+ * Dynamic IP configuration -- DHCP, BOOTP, RARP.
+ */
+
+#ifdef IPCONFIG_DYNAMIC
+
+static int __init ic_dynamic(void)
+{
+ int retries;
+ struct ic_device *d;
+ unsigned long start_jiffies, timeout, jiff;
+ int do_bootp = ic_proto_have_if & IC_BOOTP;
+ int do_rarp = ic_proto_have_if & IC_RARP;
+
+ /*
+ * If none of DHCP/BOOTP/RARP was selected, return with an error.
+	 * This routine only gets called when some pieces of information
+	 * are missing, and without DHCP/BOOTP/RARP we are unable to get them.
+ */
+ if (!ic_proto_enabled) {
+ printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n");
+ return -1;
+ }
+
+#ifdef IPCONFIG_BOOTP
+ if ((ic_proto_enabled ^ ic_proto_have_if) & IC_BOOTP)
+ printk(KERN_ERR "DHCP/BOOTP: No suitable device found.\n");
+#endif
+#ifdef IPCONFIG_RARP
+ if ((ic_proto_enabled ^ ic_proto_have_if) & IC_RARP)
+ printk(KERN_ERR "RARP: No suitable device found.\n");
+#endif
+
+ if (!ic_proto_have_if)
+ /* Error message already printed */
+ return -1;
+
+ /*
+ * Setup protocols
+ */
+#ifdef IPCONFIG_BOOTP
+ if (do_bootp)
+ ic_bootp_init();
+#endif
+#ifdef IPCONFIG_RARP
+ if (do_rarp)
+ ic_rarp_init();
+#endif
+
+ /*
+ * Send requests and wait, until we get an answer. This loop
+ * seems to be a terrible waste of CPU time, but actually there is
+ * only one process running at all, so we don't need to use any
+ * scheduler functions.
+ * [Actually we could now, but the nothing else running note still
+ * applies.. - AC]
+ */
+ printk(KERN_NOTICE "Sending %s%s%s requests .",
+ do_bootp
+ ? ((ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP") : "",
+ (do_bootp && do_rarp) ? " and " : "",
+ do_rarp ? "RARP" : "");
+
+ start_jiffies = jiffies;
+ d = ic_first_dev;
+ retries = CONF_SEND_RETRIES;
+ get_random_bytes(&timeout, sizeof(timeout));
+ timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned) CONF_TIMEOUT_RANDOM);
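+	/* Requests go out one device at a time; after each send we wait
+	 * CONF_INTER_TIMEOUT if more devices remain in this pass, or the
+	 * current timeout at the end of a pass.  The timeout starts at
+	 * CONF_BASE_TIMEOUT plus up to CONF_TIMEOUT_RANDOM of jitter and
+	 * grows after every full pass (capped at CONF_TIMEOUT_MAX); at most
+	 * CONF_SEND_RETRIES full passes are attempted before giving up. */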
+ for(;;) {
+#ifdef IPCONFIG_BOOTP
+ if (do_bootp && (d->able & IC_BOOTP))
+ ic_bootp_send_if(d, jiffies - start_jiffies);
+#endif
+#ifdef IPCONFIG_RARP
+ if (do_rarp && (d->able & IC_RARP))
+ ic_rarp_send_if(d);
+#endif
+
+ jiff = jiffies + (d->next ? CONF_INTER_TIMEOUT : timeout);
+ while (time_before(jiffies, jiff) && !ic_got_reply) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(1);
+ }
+#ifdef IPCONFIG_DHCP
+ /* DHCP isn't done until we get a DHCPACK. */
+ if ((ic_got_reply & IC_BOOTP)
+ && (ic_proto_enabled & IC_USE_DHCP)
+ && ic_dhcp_msgtype != DHCPACK)
+ {
+ ic_got_reply = 0;
+ printk(",");
+ continue;
+ }
+#endif /* IPCONFIG_DHCP */
+
+ if (ic_got_reply) {
+ printk(" OK\n");
+ break;
+ }
+
+ if ((d = d->next))
+ continue;
+
+ if (! --retries) {
+ printk(" timed out!\n");
+ break;
+ }
+
+ d = ic_first_dev;
+
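+		/* Note: CONF_TIMEOUT_MULT is a macro defined earlier in this
+		 * file that is expected to expand to an arithmetic suffix
+		 * (on the order of *7/4), so the line below is valid C and
+		 * scales the timeout despite its unusual appearance. */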
+ timeout = timeout CONF_TIMEOUT_MULT;
+ if (timeout > CONF_TIMEOUT_MAX)
+ timeout = CONF_TIMEOUT_MAX;
+
+ printk(".");
+ }
+
+#ifdef IPCONFIG_BOOTP
+ if (do_bootp)
+ ic_bootp_cleanup();
+#endif
+#ifdef IPCONFIG_RARP
+ if (do_rarp)
+ ic_rarp_cleanup();
+#endif
+
+ if (!ic_got_reply)
+ return -1;
+
+ printk("IP-Config: Got %s answer from %u.%u.%u.%u, ",
+ ((ic_got_reply & IC_RARP) ? "RARP"
+ : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"),
+ NIPQUAD(ic_servaddr));
+ printk("my address is %u.%u.%u.%u\n", NIPQUAD(ic_myaddr));
+
+ return 0;
+}
+
+#endif /* IPCONFIG_DYNAMIC */
+
+#ifdef CONFIG_PROC_FS
+
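+/*
+ * /proc/net/pnp mirrors the autoconfiguration results in a resolv.conf-like
+ * form (or "#MANUAL" when nothing was autoconfigured), for example (values
+ * purely illustrative):
+ *
+ *	#PROTO: DHCP
+ *	domain example.org
+ *	nameserver 192.168.0.1
+ *	bootserver 192.168.0.1
+ */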
+static int pnp_seq_show(struct seq_file *seq, void *v)
+{
+ int i;
+
+ if (ic_proto_used & IC_PROTO)
+ seq_printf(seq, "#PROTO: %s\n",
+ (ic_proto_used & IC_RARP) ? "RARP"
+ : (ic_proto_used & IC_USE_DHCP) ? "DHCP" : "BOOTP");
+ else
+ seq_puts(seq, "#MANUAL\n");
+
+ if (ic_domain[0])
+ seq_printf(seq,
+ "domain %s\n", ic_domain);
+ for (i = 0; i < CONF_NAMESERVERS_MAX; i++) {
+ if (ic_nameservers[i] != INADDR_NONE)
+ seq_printf(seq,
+ "nameserver %u.%u.%u.%u\n",
+ NIPQUAD(ic_nameservers[i]));
+ }
+ if (ic_servaddr != INADDR_NONE)
+ seq_printf(seq,
+ "bootserver %u.%u.%u.%u\n",
+ NIPQUAD(ic_servaddr));
+ return 0;
+}
+
+static int pnp_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, pnp_seq_show, NULL);
+}
+
+static struct file_operations pnp_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = pnp_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif /* CONFIG_PROC_FS */
+
+/*
+ * Extract IP address from the parameter string if needed. Note that we
+ * need to have root_server_addr set _before_ IPConfig gets called as it
+ * can override it.
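+ *
+ * For example (illustrative values), given "10.0.0.1:/tftpboot/target" this
+ * returns 10.0.0.1 and rewrites the string to "/tftpboot/target"; strings
+ * that do not begin with a dotted quad return INADDR_NONE and are left
+ * untouched.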
+ */
+u32 __init root_nfs_parse_addr(char *name)
+{
+ u32 addr;
+ int octets = 0;
+ char *cp, *cq;
+
+ cp = cq = name;
+ while (octets < 4) {
+ while (*cp >= '0' && *cp <= '9')
+ cp++;
+ if (cp == cq || cp - cq > 3)
+ break;
+ if (*cp == '.' || octets == 3)
+ octets++;
+ if (octets < 4)
+ cp++;
+ cq = cp;
+ }
+ if (octets == 4 && (*cp == ':' || *cp == '\0')) {
+ if (*cp == ':')
+ *cp++ = '\0';
+ addr = in_aton(name);
+ memmove(name, cp, strlen(cp) + 1);
+ } else
+ addr = INADDR_NONE;
+
+ return addr;
+}
+
+/*
+ * IP Autoconfig dispatcher.
+ */
+
+static int __init ip_auto_config(void)
+{
+ u32 addr;
+
+#ifdef CONFIG_PROC_FS
+ proc_net_fops_create("pnp", S_IRUGO, &pnp_seq_fops);
+#endif /* CONFIG_PROC_FS */
+
+ if (!ic_enable)
+ return 0;
+
+ DBG(("IP-Config: Entered.\n"));
+#ifdef IPCONFIG_DYNAMIC
+ try_try_again:
+#endif
+ /* Give hardware a chance to settle */
+ msleep(CONF_PRE_OPEN);
+
+ /* Setup all network devices */
+ if (ic_open_devs() < 0)
+ return -1;
+
+ /* Give drivers a chance to settle */
+ ssleep(CONF_POST_OPEN);
+
+ /*
+ * If the config information is insufficient (e.g., our IP address or
+ * IP address of the boot server is missing or we have multiple network
+ * interfaces and no default was set), use BOOTP or RARP to get the
+ * missing values.
+ */
+ if (ic_myaddr == INADDR_NONE ||
+#ifdef CONFIG_ROOT_NFS
+ (MAJOR(ROOT_DEV) == UNNAMED_MAJOR
+ && root_server_addr == INADDR_NONE
+ && ic_servaddr == INADDR_NONE) ||
+#endif
+ ic_first_dev->next) {
+#ifdef IPCONFIG_DYNAMIC
+
+ int retries = CONF_OPEN_RETRIES;
+
+ if (ic_dynamic() < 0) {
+ ic_close_devs();
+
+ /*
+ * I don't know why, but sometimes the
+ * eepro100 driver (at least) gets upset and
+ * doesn't work the first time it's opened.
+ * But then if you close it and reopen it, it
+ * works just fine. So we need to try that at
+ * least once before giving up.
+ *
+ * Also, if the root will be NFS-mounted, we
+ * have nowhere to go if DHCP fails. So we
+ * just have to keep trying forever.
+ *
+ * -- Chip
+ */
+#ifdef CONFIG_ROOT_NFS
+ if (ROOT_DEV == Root_NFS) {
+ printk(KERN_ERR
+ "IP-Config: Retrying forever (NFS root)...\n");
+ goto try_try_again;
+ }
+#endif
+
+ if (--retries) {
+ printk(KERN_ERR
+ "IP-Config: Reopening network devices...\n");
+ goto try_try_again;
+ }
+
+ /* Oh, well. At least we tried. */
+ printk(KERN_ERR "IP-Config: Auto-configuration of network failed.\n");
+ return -1;
+ }
+#else /* !DYNAMIC */
+ printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n");
+ ic_close_devs();
+ return -1;
+#endif /* IPCONFIG_DYNAMIC */
+ } else {
+ /* Device selected manually or only one device -> use it */
+ ic_dev = ic_first_dev->dev;
+ }
+
+ addr = root_nfs_parse_addr(root_server_path);
+ if (root_server_addr == INADDR_NONE)
+ root_server_addr = addr;
+
+ /*
+	 * Use defaults wherever applicable.
+ */
+ if (ic_defaults() < 0)
+ return -1;
+
+ /*
+ * Close all network devices except the device we've
+ * autoconfigured and set up routes.
+ */
+ ic_close_devs();
+ if (ic_setup_if() < 0 || ic_setup_routes() < 0)
+ return -1;
+
+ /*
+ * Record which protocol was actually used.
+ */
+#ifdef IPCONFIG_DYNAMIC
+ ic_proto_used = ic_got_reply | (ic_proto_enabled & IC_USE_DHCP);
+#endif
+
+#ifndef IPCONFIG_SILENT
+ /*
+ * Clue in the operator.
+ */
+ printk("IP-Config: Complete:");
+ printk("\n device=%s", ic_dev->name);
+ printk(", addr=%u.%u.%u.%u", NIPQUAD(ic_myaddr));
+ printk(", mask=%u.%u.%u.%u", NIPQUAD(ic_netmask));
+ printk(", gw=%u.%u.%u.%u", NIPQUAD(ic_gateway));
+ printk(",\n host=%s, domain=%s, nis-domain=%s",
+ system_utsname.nodename, ic_domain, system_utsname.domainname);
+ printk(",\n bootserver=%u.%u.%u.%u", NIPQUAD(ic_servaddr));
+ printk(", rootserver=%u.%u.%u.%u", NIPQUAD(root_server_addr));
+ printk(", rootpath=%s", root_server_path);
+ printk("\n");
+#endif /* !SILENT */
+
+ return 0;
+}
+
+late_initcall(ip_auto_config);
+
+
+/*
+ * Decode any IP configuration options in the "ip=" or "nfsaddrs=" kernel
+ * command line parameter. It consists of option fields separated by colons in
+ * the following order:
+ *
+ * <client-ip>:<server-ip>:<gw-ip>:<netmask>:<host name>:<device>:<PROTO>
+ *
+ * Any of the fields can be empty which means to use a default value:
+ * <client-ip> - address given by BOOTP or RARP
+ * <server-ip> - address of host returning BOOTP or RARP packet
+ * <gw-ip> - none, or the address returned by BOOTP
+ * <netmask> - automatically determined from <client-ip>, or the
+ * one returned by BOOTP
+ * <host name> - <client-ip> in ASCII notation, or the name returned
+ * by BOOTP
+ * <device> - use all available devices
+ * <PROTO>:
+ * off|none - don't do autoconfig at all (DEFAULT)
+ * on|any - use any configured protocol
+ * dhcp|bootp|rarp - use only the specified protocol
+ * both - use both BOOTP and RARP (not DHCP)
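+ *
+ * Example (illustrative values only):
+ *	ip=10.0.0.2:10.0.0.1:10.0.0.1:255.255.255.0:myhost:eth0:off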
+ */
+static int __init ic_proto_name(char *name)
+{
+ if (!strcmp(name, "on") || !strcmp(name, "any")) {
+ return 1;
+ }
+#ifdef CONFIG_IP_PNP_DHCP
+ else if (!strcmp(name, "dhcp")) {
+ ic_proto_enabled &= ~IC_RARP;
+ return 1;
+ }
+#endif
+#ifdef CONFIG_IP_PNP_BOOTP
+ else if (!strcmp(name, "bootp")) {
+ ic_proto_enabled &= ~(IC_RARP | IC_USE_DHCP);
+ return 1;
+ }
+#endif
+#ifdef CONFIG_IP_PNP_RARP
+ else if (!strcmp(name, "rarp")) {
+ ic_proto_enabled &= ~(IC_BOOTP | IC_USE_DHCP);
+ return 1;
+ }
+#endif
+#ifdef IPCONFIG_DYNAMIC
+ else if (!strcmp(name, "both")) {
+ ic_proto_enabled &= ~IC_USE_DHCP; /* backward compat :-( */
+ return 1;
+ }
+#endif
+ return 0;
+}
+
+static int __init ip_auto_config_setup(char *addrs)
+{
+ char *cp, *ip, *dp;
+ int num = 0;
+
+ ic_set_manually = 1;
+
+ ic_enable = (*addrs &&
+ (strcmp(addrs, "off") != 0) &&
+ (strcmp(addrs, "none") != 0));
+ if (!ic_enable)
+ return 1;
+
+ if (ic_proto_name(addrs))
+ return 1;
+
+ /* Parse the whole string */
+ ip = addrs;
+ while (ip && *ip) {
+ if ((cp = strchr(ip, ':')))
+ *cp++ = '\0';
+ if (strlen(ip) > 0) {
+ DBG(("IP-Config: Parameter #%d: `%s'\n", num, ip));
+ switch (num) {
+ case 0:
+ if ((ic_myaddr = in_aton(ip)) == INADDR_ANY)
+ ic_myaddr = INADDR_NONE;
+ break;
+ case 1:
+ if ((ic_servaddr = in_aton(ip)) == INADDR_ANY)
+ ic_servaddr = INADDR_NONE;
+ break;
+ case 2:
+ if ((ic_gateway = in_aton(ip)) == INADDR_ANY)
+ ic_gateway = INADDR_NONE;
+ break;
+ case 3:
+ if ((ic_netmask = in_aton(ip)) == INADDR_ANY)
+ ic_netmask = INADDR_NONE;
+ break;
+ case 4:
+ if ((dp = strchr(ip, '.'))) {
+ *dp++ = '\0';
+ strlcpy(system_utsname.domainname, dp,
+ sizeof(system_utsname.domainname));
+ }
+ strlcpy(system_utsname.nodename, ip,
+ sizeof(system_utsname.nodename));
+ ic_host_name_set = 1;
+ break;
+ case 5:
+ strlcpy(user_dev_name, ip, sizeof(user_dev_name));
+ break;
+ case 6:
+ ic_proto_name(ip);
+ break;
+ }
+ }
+ ip = cp;
+ num++;
+ }
+
+ return 1;
+}
+
+static int __init nfsaddrs_config_setup(char *addrs)
+{
+ return ip_auto_config_setup(addrs);
+}
+
+__setup("ip=", ip_auto_config_setup);
+__setup("nfsaddrs=", nfsaddrs_config_setup);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
new file mode 100644
index 00000000000..68a78731f72
--- /dev/null
+++ b/net/ipv4/ipip.c
@@ -0,0 +1,905 @@
+/*
+ * Linux NET3: IP/IP protocol decoder.
+ *
+ * Version: $Id: ipip.c,v 1.50 2001/10/02 02:22:36 davem Exp $
+ *
+ * Authors:
+ * Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95
+ *
+ * Fixes:
+ *	Alan Cox	:	Merged and made usable non-modular (it's so tiny it's silly as
+ * a module taking up 2 pages).
+ * Alan Cox : Fixed bug with 1.3.18 and IPIP not working (now needs to set skb->h.iph)
+ * to keep ip_forward happy.
+ * Alan Cox : More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8).
+ * Kai Schulte : Fixed #defines for IP_FIREWALL->FIREWALL
+ * David Woodhouse : Perform some basic ICMP handling.
+ * IPIP Routing without decapsulation.
+ * Carlos Picoto : GRE over IP support
+ * Alexey Kuznetsov: Reworked. Really, now it is truncated version of ipv4/ip_gre.c.
+ * I do not want to merge them together.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+/* tunnel.c: an IP tunnel driver
+
+ The purpose of this driver is to provide an IP tunnel through
+ which you can tunnel network traffic transparently across subnets.
+
+ This was written by looking at Nick Holloway's dummy driver
+ Thanks for the great code!
+
+ -Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95
+
+ Minor tweaks:
+ Cleaned up the code a little and added some pre-1.3.0 tweaks.
+ dev->hard_header/hard_header_len changed to use no headers.
+ Comments/bracketing tweaked.
+ Made the tunnels use dev->name not tunnel: when error reporting.
+ Added tx_dropped stat
+
+ -Alan Cox (Alan.Cox@linux.org) 21 March 95
+
+ Reworked:
+ Changed to tunnel to destination gateway in addition to the
+ tunnel's pointopoint address
+ Almost completely rewritten
+ Note: There is currently no firewall or ICMP handling done.
+
+ -Sam Lantinga (slouken@cs.ucdavis.edu) 02/13/96
+
+*/
+
+/* Things I wish I had known when writing the tunnel driver:
+
+ When the tunnel_xmit() function is called, the skb contains the
+ packet to be sent (plus a great deal of extra info), and dev
+ contains the tunnel device that _we_ are.
+
+ When we are passed a packet, we are expected to fill in the
+ source address with our source IP address.
+
+ What is the proper way to allocate, copy and free a buffer?
+ After you allocate it, it is a "0 length" chunk of memory
+ starting at zero. If you want to add headers to the buffer
+ later, you'll have to call "skb_reserve(skb, amount)" with
+ the amount of memory you want reserved. Then, you call
+ "skb_put(skb, amount)" with the amount of space you want in
+ the buffer. skb_put() returns a pointer to the top (#0) of
+ that buffer. skb->len is set to the amount of space you have
+ "allocated" with skb_put(). You can then write up to skb->len
+ bytes to that buffer. If you need more, you can call skb_put()
+ again with the additional amount of space you need. You can
+ find out how much more space you can allocate by calling
+ "skb_tailroom(skb)".
+ Now, to add header space, call "skb_push(skb, header_len)".
+ This creates space at the beginning of the buffer and returns
+ a pointer to this new space. If later you need to strip a
+ header from a buffer, call "skb_pull(skb, header_len)".
+ skb_headroom() will return how much space is left at the top
+ of the buffer (before the main data). Remember, this headroom
+ space must be reserved before the skb_put() function is called.
+ */
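+/* A minimal sketch of the allocation sequence described above (illustrative
+   only, not used by this driver; "dev", "data" and "payload_len" are
+   hypothetical, error handling omitted):
+
+	skb = alloc_skb(LL_RESERVED_SPACE(dev) + payload_len, GFP_ATOMIC);
+	skb_reserve(skb, LL_RESERVED_SPACE(dev));		-- reserve headroom
+	memcpy(skb_put(skb, payload_len), data, payload_len);	-- append payload
+	iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr)); -- prepend header
+ */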
+
+/*
+   This version of net/ipv4/ipip.c is cloned from net/ipv4/ip_gre.c
+
+ For comments look at net/ipv4/ip_gre.c --ANK
+ */
+
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <asm/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/ipip.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+
+#define HASH_SIZE 16
+#define HASH(addr) ((addr^(addr>>4))&0xF)
+
+static int ipip_fb_tunnel_init(struct net_device *dev);
+static int ipip_tunnel_init(struct net_device *dev);
+static void ipip_tunnel_setup(struct net_device *dev);
+
+static struct net_device *ipip_fb_tunnel_dev;
+
+static struct ip_tunnel *tunnels_r_l[HASH_SIZE];
+static struct ip_tunnel *tunnels_r[HASH_SIZE];
+static struct ip_tunnel *tunnels_l[HASH_SIZE];
+static struct ip_tunnel *tunnels_wc[1];
+static struct ip_tunnel **tunnels[4] = { tunnels_wc, tunnels_l, tunnels_r, tunnels_r_l };
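+/* Tunnels are hashed by their endpoint addresses: index 0 holds the single
+ * wildcard (fallback) tunnel, index 1 tunnels keyed only by local address,
+ * index 2 only by remote address, and index 3 by both, with HASH() folding
+ * each address down to four bits.  ipip_bucket() below computes the same
+ * (prio, hash) pair used by the lookup.
+ */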
+
+static DEFINE_RWLOCK(ipip_lock);
+
+static struct ip_tunnel * ipip_tunnel_lookup(u32 remote, u32 local)
+{
+ unsigned h0 = HASH(remote);
+ unsigned h1 = HASH(local);
+ struct ip_tunnel *t;
+
+ for (t = tunnels_r_l[h0^h1]; t; t = t->next) {
+ if (local == t->parms.iph.saddr &&
+ remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
+ return t;
+ }
+ for (t = tunnels_r[h0]; t; t = t->next) {
+ if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
+ return t;
+ }
+ for (t = tunnels_l[h1]; t; t = t->next) {
+ if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
+ return t;
+ }
+ if ((t = tunnels_wc[0]) != NULL && (t->dev->flags&IFF_UP))
+ return t;
+ return NULL;
+}
+
+static struct ip_tunnel **ipip_bucket(struct ip_tunnel *t)
+{
+ u32 remote = t->parms.iph.daddr;
+ u32 local = t->parms.iph.saddr;
+ unsigned h = 0;
+ int prio = 0;
+
+ if (remote) {
+ prio |= 2;
+ h ^= HASH(remote);
+ }
+ if (local) {
+ prio |= 1;
+ h ^= HASH(local);
+ }
+ return &tunnels[prio][h];
+}
+
+
+static void ipip_tunnel_unlink(struct ip_tunnel *t)
+{
+ struct ip_tunnel **tp;
+
+ for (tp = ipip_bucket(t); *tp; tp = &(*tp)->next) {
+ if (t == *tp) {
+ write_lock_bh(&ipip_lock);
+ *tp = t->next;
+ write_unlock_bh(&ipip_lock);
+ break;
+ }
+ }
+}
+
+static void ipip_tunnel_link(struct ip_tunnel *t)
+{
+ struct ip_tunnel **tp = ipip_bucket(t);
+
+ t->next = *tp;
+ write_lock_bh(&ipip_lock);
+ *tp = t;
+ write_unlock_bh(&ipip_lock);
+}
+
+static struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int create)
+{
+ u32 remote = parms->iph.daddr;
+ u32 local = parms->iph.saddr;
+ struct ip_tunnel *t, **tp, *nt;
+ struct net_device *dev;
+ unsigned h = 0;
+ int prio = 0;
+ char name[IFNAMSIZ];
+
+ if (remote) {
+ prio |= 2;
+ h ^= HASH(remote);
+ }
+ if (local) {
+ prio |= 1;
+ h ^= HASH(local);
+ }
+ for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) {
+ if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
+ return t;
+ }
+ if (!create)
+ return NULL;
+
+ if (parms->name[0])
+ strlcpy(name, parms->name, IFNAMSIZ);
+ else {
+ int i;
+ for (i=1; i<100; i++) {
+ sprintf(name, "tunl%d", i);
+ if (__dev_get_by_name(name) == NULL)
+ break;
+ }
+ if (i==100)
+ goto failed;
+ }
+
+ dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup);
+ if (dev == NULL)
+ return NULL;
+
+ nt = dev->priv;
+ SET_MODULE_OWNER(dev);
+ dev->init = ipip_tunnel_init;
+ nt->parms = *parms;
+
+ if (register_netdevice(dev) < 0) {
+ free_netdev(dev);
+ goto failed;
+ }
+
+ dev_hold(dev);
+ ipip_tunnel_link(nt);
+ /* Do not decrement MOD_USE_COUNT here. */
+ return nt;
+
+failed:
+ return NULL;
+}
+
+static void ipip_tunnel_uninit(struct net_device *dev)
+{
+ if (dev == ipip_fb_tunnel_dev) {
+ write_lock_bh(&ipip_lock);
+ tunnels_wc[0] = NULL;
+ write_unlock_bh(&ipip_lock);
+ } else
+ ipip_tunnel_unlink((struct ip_tunnel*)dev->priv);
+ dev_put(dev);
+}
+
+static void ipip_err(struct sk_buff *skb, void *__unused)
+{
+#ifndef I_WISH_WORLD_WERE_PERFECT
+
+/* It is not :-( All the routers (except for Linux) return only
+ 8 bytes of packet payload. It means, that precise relaying of
+ ICMP in the real Internet is absolutely infeasible.
+ */
+ struct iphdr *iph = (struct iphdr*)skb->data;
+ int type = skb->h.icmph->type;
+ int code = skb->h.icmph->code;
+ struct ip_tunnel *t;
+
+ switch (type) {
+ default:
+ case ICMP_PARAMETERPROB:
+ return;
+
+ case ICMP_DEST_UNREACH:
+ switch (code) {
+ case ICMP_SR_FAILED:
+ case ICMP_PORT_UNREACH:
+ /* Impossible event. */
+ return;
+ case ICMP_FRAG_NEEDED:
+ /* Soft state for pmtu is maintained by IP core. */
+ return;
+ default:
+ /* All others are translated to HOST_UNREACH.
+ rfc2003 contains "deep thoughts" about NET_UNREACH,
+ I believe they are just ether pollution. --ANK
+ */
+ break;
+ }
+ break;
+ case ICMP_TIME_EXCEEDED:
+ if (code != ICMP_EXC_TTL)
+ return;
+ break;
+ }
+
+ read_lock(&ipip_lock);
+ t = ipip_tunnel_lookup(iph->daddr, iph->saddr);
+ if (t == NULL || t->parms.iph.daddr == 0)
+ goto out;
+ if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
+ goto out;
+
+ if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
+ t->err_count++;
+ else
+ t->err_count = 1;
+ t->err_time = jiffies;
+out:
+ read_unlock(&ipip_lock);
+ return;
+#else
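+	/* The branch below is compiled out (I_WISH_WORLD_WERE_PERFECT is not
+	 * defined) and appears to be kept only as a sketch of precise ICMP
+	 * relaying; as written it refers to dp, len and key, which are not
+	 * declared in this function. */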
+ struct iphdr *iph = (struct iphdr*)dp;
+ int hlen = iph->ihl<<2;
+ struct iphdr *eiph;
+ int type = skb->h.icmph->type;
+ int code = skb->h.icmph->code;
+ int rel_type = 0;
+ int rel_code = 0;
+ int rel_info = 0;
+ struct sk_buff *skb2;
+ struct flowi fl;
+ struct rtable *rt;
+
+ if (len < hlen + sizeof(struct iphdr))
+ return;
+ eiph = (struct iphdr*)(dp + hlen);
+
+ switch (type) {
+ default:
+ return;
+ case ICMP_PARAMETERPROB:
+ if (skb->h.icmph->un.gateway < hlen)
+ return;
+
+		/* So... This guy found something strange INSIDE the encapsulated
+		   packet. Well, he is a fool, but what can we do?
+ */
+ rel_type = ICMP_PARAMETERPROB;
+ rel_info = skb->h.icmph->un.gateway - hlen;
+ break;
+
+ case ICMP_DEST_UNREACH:
+ switch (code) {
+ case ICMP_SR_FAILED:
+ case ICMP_PORT_UNREACH:
+ /* Impossible event. */
+ return;
+ case ICMP_FRAG_NEEDED:
+ /* And it is the only really necessary thing :-) */
+ rel_info = ntohs(skb->h.icmph->un.frag.mtu);
+ if (rel_info < hlen+68)
+ return;
+ rel_info -= hlen;
+ /* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */
+ if (rel_info > ntohs(eiph->tot_len))
+ return;
+ break;
+ default:
+ /* All others are translated to HOST_UNREACH.
+ rfc2003 contains "deep thoughts" about NET_UNREACH,
+ I believe, it is just ether pollution. --ANK
+ */
+ rel_type = ICMP_DEST_UNREACH;
+ rel_code = ICMP_HOST_UNREACH;
+ break;
+ }
+ break;
+ case ICMP_TIME_EXCEEDED:
+ if (code != ICMP_EXC_TTL)
+ return;
+ break;
+ }
+
+ /* Prepare fake skb to feed it to icmp_send */
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (skb2 == NULL)
+ return;
+ dst_release(skb2->dst);
+ skb2->dst = NULL;
+ skb_pull(skb2, skb->data - (u8*)eiph);
+ skb2->nh.raw = skb2->data;
+
+ /* Try to guess incoming interface */
+ memset(&fl, 0, sizeof(fl));
+ fl.fl4_daddr = eiph->saddr;
+ fl.fl4_tos = RT_TOS(eiph->tos);
+ fl.proto = IPPROTO_IPIP;
+ if (ip_route_output_key(&rt, &key)) {
+ kfree_skb(skb2);
+ return;
+ }
+ skb2->dev = rt->u.dst.dev;
+
+ /* route "incoming" packet */
+ if (rt->rt_flags&RTCF_LOCAL) {
+ ip_rt_put(rt);
+ rt = NULL;
+ fl.fl4_daddr = eiph->daddr;
+ fl.fl4_src = eiph->saddr;
+ fl.fl4_tos = eiph->tos;
+ if (ip_route_output_key(&rt, &fl) ||
+ rt->u.dst.dev->type != ARPHRD_TUNNEL) {
+ ip_rt_put(rt);
+ kfree_skb(skb2);
+ return;
+ }
+ } else {
+ ip_rt_put(rt);
+ if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
+ skb2->dst->dev->type != ARPHRD_TUNNEL) {
+ kfree_skb(skb2);
+ return;
+ }
+ }
+
+ /* change mtu on this route */
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+ if (rel_info > dst_mtu(skb2->dst)) {
+ kfree_skb(skb2);
+ return;
+ }
+ skb2->dst->ops->update_pmtu(skb2->dst, rel_info);
+ rel_info = htonl(rel_info);
+ } else if (type == ICMP_TIME_EXCEEDED) {
+ struct ip_tunnel *t = (struct ip_tunnel*)skb2->dev->priv;
+ if (t->parms.iph.ttl) {
+ rel_type = ICMP_DEST_UNREACH;
+ rel_code = ICMP_HOST_UNREACH;
+ }
+ }
+
+ icmp_send(skb2, rel_type, rel_code, rel_info);
+ kfree_skb(skb2);
+ return;
+#endif
+}
+
+static inline void ipip_ecn_decapsulate(struct iphdr *outer_iph, struct sk_buff *skb)
+{
+ struct iphdr *inner_iph = skb->nh.iph;
+
+ if (INET_ECN_is_ce(outer_iph->tos))
+ IP_ECN_set_ce(inner_iph);
+}
+
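+/*
+ * Receive path: we are handed the packet with skb->nh.iph still pointing at
+ * the outer header (used for the tunnel lookup) while skb->data has already
+ * been advanced to the inner header.  On a match we strip the outer header
+ * by repointing skb->nh at skb->data, clear stale state (options, dst,
+ * conntrack), propagate an outer ECN CE mark to the inner header and feed
+ * the packet back into the stack via netif_rx().
+ */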
+static int ipip_rcv(struct sk_buff *skb)
+{
+ struct iphdr *iph;
+ struct ip_tunnel *tunnel;
+
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ goto out;
+
+ iph = skb->nh.iph;
+
+ read_lock(&ipip_lock);
+ if ((tunnel = ipip_tunnel_lookup(iph->saddr, iph->daddr)) != NULL) {
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ read_unlock(&ipip_lock);
+ kfree_skb(skb);
+ return 0;
+ }
+
+ secpath_reset(skb);
+
+ skb->mac.raw = skb->nh.raw;
+ skb->nh.raw = skb->data;
+ memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
+ skb->protocol = htons(ETH_P_IP);
+ skb->pkt_type = PACKET_HOST;
+
+ tunnel->stat.rx_packets++;
+ tunnel->stat.rx_bytes += skb->len;
+ skb->dev = tunnel->dev;
+ dst_release(skb->dst);
+ skb->dst = NULL;
+ nf_reset(skb);
+ ipip_ecn_decapsulate(iph, skb);
+ netif_rx(skb);
+ read_unlock(&ipip_lock);
+ return 0;
+ }
+ read_unlock(&ipip_lock);
+
+out:
+ return -1;
+}
+
+/*
+ * This function assumes it is being called from dev_queue_xmit()
+ * and that skb is filled properly by that function.
+ */
+
+static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
+ struct net_device_stats *stats = &tunnel->stat;
+ struct iphdr *tiph = &tunnel->parms.iph;
+ u8 tos = tunnel->parms.iph.tos;
+ u16 df = tiph->frag_off;
+ struct rtable *rt; /* Route to the other host */
+ struct net_device *tdev; /* Device to other host */
+ struct iphdr *old_iph = skb->nh.iph;
+ struct iphdr *iph; /* Our new IP header */
+ int max_headroom; /* The extra header space needed */
+ u32 dst = tiph->daddr;
+ int mtu;
+
+ if (tunnel->recursion++) {
+ tunnel->stat.collisions++;
+ goto tx_error;
+ }
+
+ if (skb->protocol != htons(ETH_P_IP))
+ goto tx_error;
+
+ if (tos&1)
+ tos = old_iph->tos;
+
+ if (!dst) {
+ /* NBMA tunnel */
+ if ((rt = (struct rtable*)skb->dst) == NULL) {
+ tunnel->stat.tx_fifo_errors++;
+ goto tx_error;
+ }
+ if ((dst = rt->rt_gateway) == 0)
+ goto tx_error_icmp;
+ }
+
+ {
+ struct flowi fl = { .oif = tunnel->parms.link,
+ .nl_u = { .ip4_u =
+ { .daddr = dst,
+ .saddr = tiph->saddr,
+ .tos = RT_TOS(tos) } },
+ .proto = IPPROTO_IPIP };
+ if (ip_route_output_key(&rt, &fl)) {
+ tunnel->stat.tx_carrier_errors++;
+ goto tx_error_icmp;
+ }
+ }
+ tdev = rt->u.dst.dev;
+
+ if (tdev == dev) {
+ ip_rt_put(rt);
+ tunnel->stat.collisions++;
+ goto tx_error;
+ }
+
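+	/* Pick the MTU the inner packet must fit into: if the tunnel forces
+	 * DF on the outer header, use the path MTU towards the tunnel
+	 * endpoint minus the outer IP header, otherwise fall back to the
+	 * inner route's MTU (or the device MTU).  Oversized packets carrying
+	 * DF get an ICMP FRAG_NEEDED back instead of being tunnelled. */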
+ if (tiph->frag_off)
+ mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
+ else
+ mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
+
+ if (mtu < 68) {
+ tunnel->stat.collisions++;
+ ip_rt_put(rt);
+ goto tx_error;
+ }
+ if (skb->dst)
+ skb->dst->ops->update_pmtu(skb->dst, mtu);
+
+ df |= (old_iph->frag_off&htons(IP_DF));
+
+ if ((old_iph->frag_off&htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) {
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+ ip_rt_put(rt);
+ goto tx_error;
+ }
+
+ if (tunnel->err_count > 0) {
+ if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
+ tunnel->err_count--;
+ dst_link_failure(skb);
+ } else
+ tunnel->err_count = 0;
+ }
+
+ /*
+ * Okay, now see if we can stuff it in the buffer as-is.
+ */
+ max_headroom = (LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr));
+
+ if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) {
+ struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
+ if (!new_skb) {
+ ip_rt_put(rt);
+ stats->tx_dropped++;
+ dev_kfree_skb(skb);
+ tunnel->recursion--;
+ return 0;
+ }
+ if (skb->sk)
+ skb_set_owner_w(new_skb, skb->sk);
+ dev_kfree_skb(skb);
+ skb = new_skb;
+ old_iph = skb->nh.iph;
+ }
+
+ skb->h.raw = skb->nh.raw;
+ skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+ dst_release(skb->dst);
+ skb->dst = &rt->u.dst;
+
+ /*
+ * Push down and install the IPIP header.
+ */
+
+ iph = skb->nh.iph;
+ iph->version = 4;
+ iph->ihl = sizeof(struct iphdr)>>2;
+ iph->frag_off = df;
+ iph->protocol = IPPROTO_IPIP;
+ iph->tos = INET_ECN_encapsulate(tos, old_iph->tos);
+ iph->daddr = rt->rt_dst;
+ iph->saddr = rt->rt_src;
+
+ if ((iph->ttl = tiph->ttl) == 0)
+ iph->ttl = old_iph->ttl;
+
+ nf_reset(skb);
+
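+	/* IPTUNNEL_XMIT() (from include/net/ipip.h) is expected to finish
+	 * the outer header (tot_len, id, checksum), update the tx statistics
+	 * and hand the skb to dst_output() via the LOCAL_OUT netfilter hook. */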
+ IPTUNNEL_XMIT();
+ tunnel->recursion--;
+ return 0;
+
+tx_error_icmp:
+ dst_link_failure(skb);
+tx_error:
+ stats->tx_errors++;
+ dev_kfree_skb(skb);
+ tunnel->recursion--;
+ return 0;
+}
+
+static int
+ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+ int err = 0;
+ struct ip_tunnel_parm p;
+ struct ip_tunnel *t;
+
+ switch (cmd) {
+ case SIOCGETTUNNEL:
+ t = NULL;
+ if (dev == ipip_fb_tunnel_dev) {
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
+ err = -EFAULT;
+ break;
+ }
+ t = ipip_tunnel_locate(&p, 0);
+ }
+ if (t == NULL)
+ t = (struct ip_tunnel*)dev->priv;
+ memcpy(&p, &t->parms, sizeof(p));
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ err = -EFAULT;
+ break;
+
+ case SIOCADDTUNNEL:
+ case SIOCCHGTUNNEL:
+ err = -EPERM;
+ if (!capable(CAP_NET_ADMIN))
+ goto done;
+
+ err = -EFAULT;
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ goto done;
+
+ err = -EINVAL;
+ if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
+ p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
+ goto done;
+ if (p.iph.ttl)
+ p.iph.frag_off |= htons(IP_DF);
+
+ t = ipip_tunnel_locate(&p, cmd == SIOCADDTUNNEL);
+
+ if (dev != ipip_fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
+ if (t != NULL) {
+ if (t->dev != dev) {
+ err = -EEXIST;
+ break;
+ }
+ } else {
+ if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) ||
+ (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) {
+ err = -EINVAL;
+ break;
+ }
+ t = (struct ip_tunnel*)dev->priv;
+ ipip_tunnel_unlink(t);
+ t->parms.iph.saddr = p.iph.saddr;
+ t->parms.iph.daddr = p.iph.daddr;
+ memcpy(dev->dev_addr, &p.iph.saddr, 4);
+ memcpy(dev->broadcast, &p.iph.daddr, 4);
+ ipip_tunnel_link(t);
+ netdev_state_change(dev);
+ }
+ }
+
+ if (t) {
+ err = 0;
+ if (cmd == SIOCCHGTUNNEL) {
+ t->parms.iph.ttl = p.iph.ttl;
+ t->parms.iph.tos = p.iph.tos;
+ t->parms.iph.frag_off = p.iph.frag_off;
+ }
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
+ err = -EFAULT;
+ } else
+ err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
+ break;
+
+ case SIOCDELTUNNEL:
+ err = -EPERM;
+ if (!capable(CAP_NET_ADMIN))
+ goto done;
+
+ if (dev == ipip_fb_tunnel_dev) {
+ err = -EFAULT;
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ goto done;
+ err = -ENOENT;
+ if ((t = ipip_tunnel_locate(&p, 0)) == NULL)
+ goto done;
+ err = -EPERM;
+ if (t->dev == ipip_fb_tunnel_dev)
+ goto done;
+ dev = t->dev;
+ }
+ err = unregister_netdevice(dev);
+ break;
+
+ default:
+ err = -EINVAL;
+ }
+
+done:
+ return err;
+}
+
+static struct net_device_stats *ipip_tunnel_get_stats(struct net_device *dev)
+{
+ return &(((struct ip_tunnel*)dev->priv)->stat);
+}
+
+static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
+{
+ if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr))
+ return -EINVAL;
+ dev->mtu = new_mtu;
+ return 0;
+}
+
+static void ipip_tunnel_setup(struct net_device *dev)
+{
+ SET_MODULE_OWNER(dev);
+ dev->uninit = ipip_tunnel_uninit;
+ dev->hard_start_xmit = ipip_tunnel_xmit;
+ dev->get_stats = ipip_tunnel_get_stats;
+ dev->do_ioctl = ipip_tunnel_ioctl;
+ dev->change_mtu = ipip_tunnel_change_mtu;
+ dev->destructor = free_netdev;
+
+ dev->type = ARPHRD_TUNNEL;
+ dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
+ dev->mtu = 1500 - sizeof(struct iphdr);
+ dev->flags = IFF_NOARP;
+ dev->iflink = 0;
+ dev->addr_len = 4;
+}
+
+static int ipip_tunnel_init(struct net_device *dev)
+{
+ struct net_device *tdev = NULL;
+ struct ip_tunnel *tunnel;
+ struct iphdr *iph;
+
+ tunnel = (struct ip_tunnel*)dev->priv;
+ iph = &tunnel->parms.iph;
+
+ tunnel->dev = dev;
+ strcpy(tunnel->parms.name, dev->name);
+
+ memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
+ memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
+
+ if (iph->daddr) {
+ struct flowi fl = { .oif = tunnel->parms.link,
+ .nl_u = { .ip4_u =
+ { .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .tos = RT_TOS(iph->tos) } },
+ .proto = IPPROTO_IPIP };
+ struct rtable *rt;
+ if (!ip_route_output_key(&rt, &fl)) {
+ tdev = rt->u.dst.dev;
+ ip_rt_put(rt);
+ }
+ dev->flags |= IFF_POINTOPOINT;
+ }
+
+ if (!tdev && tunnel->parms.link)
+ tdev = __dev_get_by_index(tunnel->parms.link);
+
+ if (tdev) {
+ dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr);
+ dev->mtu = tdev->mtu - sizeof(struct iphdr);
+ }
+ dev->iflink = tunnel->parms.link;
+
+ return 0;
+}
+
+static int __init ipip_fb_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = dev->priv;
+ struct iphdr *iph = &tunnel->parms.iph;
+
+ tunnel->dev = dev;
+ strcpy(tunnel->parms.name, dev->name);
+
+ iph->version = 4;
+ iph->protocol = IPPROTO_IPIP;
+ iph->ihl = 5;
+
+ dev_hold(dev);
+ tunnels_wc[0] = tunnel;
+ return 0;
+}
+
+static struct xfrm_tunnel ipip_handler = {
+ .handler = ipip_rcv,
+ .err_handler = ipip_err,
+};
+
+static char banner[] __initdata =
+ KERN_INFO "IPv4 over IPv4 tunneling driver\n";
+
+static int __init ipip_init(void)
+{
+ int err;
+
+ printk(banner);
+
+ if (xfrm4_tunnel_register(&ipip_handler) < 0) {
+ printk(KERN_INFO "ipip init: can't register tunnel\n");
+ return -EAGAIN;
+ }
+
+ ipip_fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel),
+ "tunl0",
+ ipip_tunnel_setup);
+ if (!ipip_fb_tunnel_dev) {
+ err = -ENOMEM;
+ goto err1;
+ }
+
+ ipip_fb_tunnel_dev->init = ipip_fb_tunnel_init;
+
+ if ((err = register_netdev(ipip_fb_tunnel_dev)))
+ goto err2;
+ out:
+ return err;
+ err2:
+ free_netdev(ipip_fb_tunnel_dev);
+ err1:
+ xfrm4_tunnel_deregister(&ipip_handler);
+ goto out;
+}
+
+static void __exit ipip_fini(void)
+{
+ if (xfrm4_tunnel_deregister(&ipip_handler) < 0)
+ printk(KERN_INFO "ipip close: can't deregister tunnel\n");
+
+ unregister_netdev(ipip_fb_tunnel_dev);
+}
+
+module_init(ipip_init);
+module_exit(ipip_fini);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
new file mode 100644
index 00000000000..e21c049ec62
--- /dev/null
+++ b/net/ipv4/ipmr.c
@@ -0,0 +1,1900 @@
+/*
+ * IP multicast routing support for mrouted 3.6/3.8
+ *
+ * (c) 1995 Alan Cox, <alan@redhat.com>
+ * Linux Consultancy and Custom Driver Development
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $
+ *
+ * Fixes:
+ * Michael Chastain : Incorrect size of copying.
+ * Alan Cox : Added the cache manager code
+ * Alan Cox : Fixed the clone/copy bug and device race.
+ * Mike McLagan : Routing by source
+ * Malcolm Beattie : Buffer handling fixes.
+ * Alexey Kuznetsov : Double buffer free and other fixes.
+ * SVR Anand : Fixed several multicast bugs and problems.
+ * Alexey Kuznetsov : Status, optimisations and more.
+ * Brad Parker : Better behaviour on mrouted upcall
+ * overflow.
+ * Carlos Picoto : PIMv1 Support
+ * Pavlin Ivanov Radoslavov: PIMv2 Registers must checksum only PIM header
+ *					Relax this requirement to work with older peers.
+ *
+ */
+
+#include <linux/config.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <linux/notifier.h>
+#include <linux/if_arp.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/ipip.h>
+#include <net/checksum.h>
+
+#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
+#define CONFIG_IP_PIMSM 1
+#endif
+
+static struct sock *mroute_socket;
+
+
+/* Big lock, protecting vif table, mrt cache and mroute socket state.
+ Note that the changes are semaphored via rtnl_lock.
+ */
+
+static DEFINE_RWLOCK(mrt_lock);
+
+/*
+ * Multicast router control variables
+ */
+
+static struct vif_device vif_table[MAXVIFS]; /* Devices */
+static int maxvif;
+
+#define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)
+
+static int mroute_do_assert; /* Set in PIM assert */
+static int mroute_do_pim;
+
+static struct mfc_cache *mfc_cache_array[MFC_LINES]; /* Forwarding cache */
+
+static struct mfc_cache *mfc_unres_queue; /* Queue of unresolved entries */
+static atomic_t cache_resolve_queue_len; /* Size of unresolved */
+
+/* Special spinlock for queue of unresolved entries */
+static DEFINE_SPINLOCK(mfc_unres_lock);
+
+/* We return to Alan's original scheme. Hash table of resolved
+ entries is changed only in process context and protected
+ with weak lock mrt_lock. Queue of unresolved entries is protected
+ with strong spinlock mfc_unres_lock.
+
+ In this case data path is free of exclusive locks at all.
+ */
+
+static kmem_cache_t *mrt_cachep;
+
+static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
+static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
+static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
+
+#ifdef CONFIG_IP_PIMSM_V2
+static struct net_protocol pim_protocol;
+#endif
+
+static struct timer_list ipmr_expire_timer;
+
+/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
+
+static
+struct net_device *ipmr_new_tunnel(struct vifctl *v)
+{
+ struct net_device *dev;
+
+ dev = __dev_get_by_name("tunl0");
+
+ if (dev) {
+ int err;
+ struct ifreq ifr;
+ mm_segment_t oldfs;
+ struct ip_tunnel_parm p;
+ struct in_device *in_dev;
+
+ memset(&p, 0, sizeof(p));
+ p.iph.daddr = v->vifc_rmt_addr.s_addr;
+ p.iph.saddr = v->vifc_lcl_addr.s_addr;
+ p.iph.version = 4;
+ p.iph.ihl = 5;
+ p.iph.protocol = IPPROTO_IPIP;
+ sprintf(p.name, "dvmrp%d", v->vifc_vifi);
+ ifr.ifr_ifru.ifru_data = (void*)&p;
+
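+		/* Drive the tunnel driver's SIOCADDTUNNEL ioctl from kernel
+		 * space: do_ioctl() expects a user pointer in ifr_data, so
+		 * temporarily lift the address limit with set_fs(KERNEL_DS)
+		 * to let copy_from_user() accept the on-stack parameters. */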
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
+ set_fs(oldfs);
+
+ dev = NULL;
+
+ if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) {
+ dev->flags |= IFF_MULTICAST;
+
+ in_dev = __in_dev_get(dev);
+ if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL)
+ goto failure;
+ in_dev->cnf.rp_filter = 0;
+
+ if (dev_open(dev))
+ goto failure;
+ }
+ }
+ return dev;
+
+failure:
+ /* allow the register to be completed before unregistering. */
+ rtnl_unlock();
+ rtnl_lock();
+
+ unregister_netdevice(dev);
+ return NULL;
+}
+
+#ifdef CONFIG_IP_PIMSM
+
+static int reg_vif_num = -1;
+
+static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ read_lock(&mrt_lock);
+ ((struct net_device_stats*)dev->priv)->tx_bytes += skb->len;
+ ((struct net_device_stats*)dev->priv)->tx_packets++;
+ ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
+ read_unlock(&mrt_lock);
+ kfree_skb(skb);
+ return 0;
+}
+
+static struct net_device_stats *reg_vif_get_stats(struct net_device *dev)
+{
+ return (struct net_device_stats*)dev->priv;
+}
+
+static void reg_vif_setup(struct net_device *dev)
+{
+ dev->type = ARPHRD_PIMREG;
+ dev->mtu = 1500 - sizeof(struct iphdr) - 8;
+ dev->flags = IFF_NOARP;
+ dev->hard_start_xmit = reg_vif_xmit;
+ dev->get_stats = reg_vif_get_stats;
+ dev->destructor = free_netdev;
+}
+
+static struct net_device *ipmr_reg_vif(void)
+{
+ struct net_device *dev;
+ struct in_device *in_dev;
+
+ dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg",
+ reg_vif_setup);
+
+ if (dev == NULL)
+ return NULL;
+
+ if (register_netdevice(dev)) {
+ free_netdev(dev);
+ return NULL;
+ }
+ dev->iflink = 0;
+
+ if ((in_dev = inetdev_init(dev)) == NULL)
+ goto failure;
+
+ in_dev->cnf.rp_filter = 0;
+
+ if (dev_open(dev))
+ goto failure;
+
+ return dev;
+
+failure:
+ /* allow the register to be completed before unregistering. */
+ rtnl_unlock();
+ rtnl_lock();
+
+ unregister_netdevice(dev);
+ return NULL;
+}
+#endif
+
+/*
+ * Delete a VIF entry
+ */
+
+static int vif_delete(int vifi)
+{
+ struct vif_device *v;
+ struct net_device *dev;
+ struct in_device *in_dev;
+
+ if (vifi < 0 || vifi >= maxvif)
+ return -EADDRNOTAVAIL;
+
+ v = &vif_table[vifi];
+
+ write_lock_bh(&mrt_lock);
+ dev = v->dev;
+ v->dev = NULL;
+
+ if (!dev) {
+ write_unlock_bh(&mrt_lock);
+ return -EADDRNOTAVAIL;
+ }
+
+#ifdef CONFIG_IP_PIMSM
+ if (vifi == reg_vif_num)
+ reg_vif_num = -1;
+#endif
+
+ if (vifi+1 == maxvif) {
+ int tmp;
+ for (tmp=vifi-1; tmp>=0; tmp--) {
+ if (VIF_EXISTS(tmp))
+ break;
+ }
+ maxvif = tmp+1;
+ }
+
+ write_unlock_bh(&mrt_lock);
+
+ dev_set_allmulti(dev, -1);
+
+ if ((in_dev = __in_dev_get(dev)) != NULL) {
+ in_dev->cnf.mc_forwarding--;
+ ip_rt_multicast_event(in_dev);
+ }
+
+ if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
+ unregister_netdevice(dev);
+
+ dev_put(dev);
+ return 0;
+}
+
+/* Destroy an unresolved cache entry, killing queued skbs
+ and reporting error to netlink readers.
+ */
+
+static void ipmr_destroy_unres(struct mfc_cache *c)
+{
+ struct sk_buff *skb;
+
+ atomic_dec(&cache_resolve_queue_len);
+
+ while((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
+ if (skb->nh.iph->version == 0) {
+ struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
+ nlh->nlmsg_type = NLMSG_ERROR;
+ nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
+ skb_trim(skb, nlh->nlmsg_len);
+ ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -ETIMEDOUT;
+ netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
+ } else
+ kfree_skb(skb);
+ }
+
+ kmem_cache_free(mrt_cachep, c);
+}
+
+
+/* Single timer process for all the unresolved queue. */
+
+static void ipmr_expire_process(unsigned long dummy)
+{
+ unsigned long now;
+ unsigned long expires;
+ struct mfc_cache *c, **cp;
+
+ if (!spin_trylock(&mfc_unres_lock)) {
+ mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
+ return;
+ }
+
+ if (atomic_read(&cache_resolve_queue_len) == 0)
+ goto out;
+
+ now = jiffies;
+ expires = 10*HZ;
+ cp = &mfc_unres_queue;
+
+ while ((c=*cp) != NULL) {
+ if (time_after(c->mfc_un.unres.expires, now)) {
+ unsigned long interval = c->mfc_un.unres.expires - now;
+ if (interval < expires)
+ expires = interval;
+ cp = &c->next;
+ continue;
+ }
+
+ *cp = c->next;
+
+ ipmr_destroy_unres(c);
+ }
+
+ if (atomic_read(&cache_resolve_queue_len))
+ mod_timer(&ipmr_expire_timer, jiffies + expires);
+
+out:
+ spin_unlock(&mfc_unres_lock);
+}
+
+/* Fill oifs list. It is called under write locked mrt_lock. */
+
+static void ipmr_update_threshoulds(struct mfc_cache *cache, unsigned char *ttls)
+{
+ int vifi;
+
+ cache->mfc_un.res.minvif = MAXVIFS;
+ cache->mfc_un.res.maxvif = 0;
+ memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
+
+ for (vifi=0; vifi<maxvif; vifi++) {
+ if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
+ cache->mfc_un.res.ttls[vifi] = ttls[vifi];
+ if (cache->mfc_un.res.minvif > vifi)
+ cache->mfc_un.res.minvif = vifi;
+ if (cache->mfc_un.res.maxvif <= vifi)
+ cache->mfc_un.res.maxvif = vifi + 1;
+ }
+ }
+}
+
+static int vif_add(struct vifctl *vifc, int mrtsock)
+{
+ int vifi = vifc->vifc_vifi;
+ struct vif_device *v = &vif_table[vifi];
+ struct net_device *dev;
+ struct in_device *in_dev;
+
+ /* Is vif busy ? */
+ if (VIF_EXISTS(vifi))
+ return -EADDRINUSE;
+
+ switch (vifc->vifc_flags) {
+#ifdef CONFIG_IP_PIMSM
+ case VIFF_REGISTER:
+ /*
+ * Special Purpose VIF in PIM
+ * All the packets will be sent to the daemon
+ */
+ if (reg_vif_num >= 0)
+ return -EADDRINUSE;
+ dev = ipmr_reg_vif();
+ if (!dev)
+ return -ENOBUFS;
+ break;
+#endif
+ case VIFF_TUNNEL:
+ dev = ipmr_new_tunnel(vifc);
+ if (!dev)
+ return -ENOBUFS;
+ break;
+ case 0:
+ dev=ip_dev_find(vifc->vifc_lcl_addr.s_addr);
+ if (!dev)
+ return -EADDRNOTAVAIL;
+ __dev_put(dev);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if ((in_dev = __in_dev_get(dev)) == NULL)
+ return -EADDRNOTAVAIL;
+ in_dev->cnf.mc_forwarding++;
+ dev_set_allmulti(dev, +1);
+ ip_rt_multicast_event(in_dev);
+
+ /*
+ * Fill in the VIF structures
+ */
+ v->rate_limit=vifc->vifc_rate_limit;
+ v->local=vifc->vifc_lcl_addr.s_addr;
+ v->remote=vifc->vifc_rmt_addr.s_addr;
+ v->flags=vifc->vifc_flags;
+ if (!mrtsock)
+ v->flags |= VIFF_STATIC;
+ v->threshold=vifc->vifc_threshold;
+ v->bytes_in = 0;
+ v->bytes_out = 0;
+ v->pkt_in = 0;
+ v->pkt_out = 0;
+ v->link = dev->ifindex;
+ if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
+ v->link = dev->iflink;
+
+ /* And finish update writing critical data */
+ write_lock_bh(&mrt_lock);
+ dev_hold(dev);
+ v->dev=dev;
+#ifdef CONFIG_IP_PIMSM
+ if (v->flags&VIFF_REGISTER)
+ reg_vif_num = vifi;
+#endif
+ if (vifi+1 > maxvif)
+ maxvif = vifi+1;
+ write_unlock_bh(&mrt_lock);
+ return 0;
+}
+
+static struct mfc_cache *ipmr_cache_find(__u32 origin, __u32 mcastgrp)
+{
+ int line=MFC_HASH(mcastgrp,origin);
+ struct mfc_cache *c;
+
+ for (c=mfc_cache_array[line]; c; c = c->next) {
+ if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
+ break;
+ }
+ return c;
+}
+
+/*
+ * Allocate a multicast cache entry
+ */
+static struct mfc_cache *ipmr_cache_alloc(void)
+{
+ struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_KERNEL);
+ if(c==NULL)
+ return NULL;
+ memset(c, 0, sizeof(*c));
+ c->mfc_un.res.minvif = MAXVIFS;
+ return c;
+}
+
+static struct mfc_cache *ipmr_cache_alloc_unres(void)
+{
+ struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_ATOMIC);
+ if(c==NULL)
+ return NULL;
+ memset(c, 0, sizeof(*c));
+ skb_queue_head_init(&c->mfc_un.unres.unresolved);
+ c->mfc_un.unres.expires = jiffies + 10*HZ;
+ return c;
+}
+
+/*
+ * A cache entry has gone into a resolved state from queued
+ */
+
+static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
+{
+ struct sk_buff *skb;
+
+ /*
+ * Play the pending entries through our router
+ */
+
+ while((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
+ if (skb->nh.iph->version == 0) {
+ int err;
+ struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
+
+ if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
+ nlh->nlmsg_len = skb->tail - (u8*)nlh;
+ } else {
+ nlh->nlmsg_type = NLMSG_ERROR;
+ nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
+ skb_trim(skb, nlh->nlmsg_len);
+ ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -EMSGSIZE;
+ }
+ err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
+ } else
+ ip_mr_forward(skb, c, 0);
+ }
+}
+
+/*
+ * Bounce a cache query up to mrouted. We could use netlink for this but mrouted
+ * expects the following bizarre scheme.
+ *
+ * Called under mrt_lock.
+ */
+
+static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
+{
+ struct sk_buff *skb;
+ int ihl = pkt->nh.iph->ihl<<2;
+ struct igmphdr *igmp;
+ struct igmpmsg *msg;
+ int ret;
+
+#ifdef CONFIG_IP_PIMSM
+ if (assert == IGMPMSG_WHOLEPKT)
+ skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
+ else
+#endif
+ skb = alloc_skb(128, GFP_ATOMIC);
+
+ if(!skb)
+ return -ENOBUFS;
+
+#ifdef CONFIG_IP_PIMSM
+ if (assert == IGMPMSG_WHOLEPKT) {
+ /* Ugly, but we have no choice with this interface.
+ Duplicate old header, fix ihl, length etc.
+ And all this only to mangle msg->im_msgtype and
+ to set msg->im_mbz to "mbz" :-)
+ */
+ msg = (struct igmpmsg*)skb_push(skb, sizeof(struct iphdr));
+ skb->nh.raw = skb->h.raw = (u8*)msg;
+ memcpy(msg, pkt->nh.raw, sizeof(struct iphdr));
+ msg->im_msgtype = IGMPMSG_WHOLEPKT;
+ msg->im_mbz = 0;
+ msg->im_vif = reg_vif_num;
+ skb->nh.iph->ihl = sizeof(struct iphdr) >> 2;
+ skb->nh.iph->tot_len = htons(ntohs(pkt->nh.iph->tot_len) + sizeof(struct iphdr));
+ } else
+#endif
+ {
+
+ /*
+ * Copy the IP header
+ */
+
+ skb->nh.iph = (struct iphdr *)skb_put(skb, ihl);
+ memcpy(skb->data,pkt->data,ihl);
+ skb->nh.iph->protocol = 0; /* Flag to the kernel this is a route add */
+ msg = (struct igmpmsg*)skb->nh.iph;
+ msg->im_vif = vifi;
+ skb->dst = dst_clone(pkt->dst);
+
+ /*
+ * Add our header
+ */
+
+ igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
+ igmp->type =
+ msg->im_msgtype = assert;
+ igmp->code = 0;
+ skb->nh.iph->tot_len=htons(skb->len); /* Fix the length */
+ skb->h.raw = skb->nh.raw;
+ }
+
+ if (mroute_socket == NULL) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ /*
+ * Deliver to mrouted
+ */
+ if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
+ if (net_ratelimit())
+ printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
+ kfree_skb(skb);
+ }
+
+ return ret;
+}
+
+/*
+ * Queue a packet for resolution. It gets locked cache entry!
+ */
+
+static int
+ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
+{
+ int err;
+ struct mfc_cache *c;
+
+ spin_lock_bh(&mfc_unres_lock);
+ for (c=mfc_unres_queue; c; c=c->next) {
+ if (c->mfc_mcastgrp == skb->nh.iph->daddr &&
+ c->mfc_origin == skb->nh.iph->saddr)
+ break;
+ }
+
+ if (c == NULL) {
+ /*
+ * Create a new entry if allowable
+ */
+
+ if (atomic_read(&cache_resolve_queue_len)>=10 ||
+ (c=ipmr_cache_alloc_unres())==NULL) {
+ spin_unlock_bh(&mfc_unres_lock);
+
+ kfree_skb(skb);
+ return -ENOBUFS;
+ }
+
+ /*
+ * Fill in the new cache entry
+ */
+ c->mfc_parent=-1;
+ c->mfc_origin=skb->nh.iph->saddr;
+ c->mfc_mcastgrp=skb->nh.iph->daddr;
+
+ /*
+ * Reflect first query at mrouted.
+ */
+ if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
+ /* If the report failed throw the cache entry
+ out - Brad Parker
+ */
+ spin_unlock_bh(&mfc_unres_lock);
+
+ kmem_cache_free(mrt_cachep, c);
+ kfree_skb(skb);
+ return err;
+ }
+
+ atomic_inc(&cache_resolve_queue_len);
+ c->next = mfc_unres_queue;
+ mfc_unres_queue = c;
+
+ mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
+ }
+
+ /*
+ * See if we can append the packet
+ */
+ if (c->mfc_un.unres.unresolved.qlen>3) {
+ kfree_skb(skb);
+ err = -ENOBUFS;
+ } else {
+ skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
+ err = 0;
+ }
+
+ spin_unlock_bh(&mfc_unres_lock);
+ return err;
+}
+
+/*
+ * MFC cache manipulation by user space mroute daemon
+ */
+
+static int ipmr_mfc_delete(struct mfcctl *mfc)
+{
+ int line;
+ struct mfc_cache *c, **cp;
+
+ line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
+
+ for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
+ if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
+ c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
+ write_lock_bh(&mrt_lock);
+ *cp = c->next;
+ write_unlock_bh(&mrt_lock);
+
+ kmem_cache_free(mrt_cachep, c);
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
+{
+ int line;
+ struct mfc_cache *uc, *c, **cp;
+
+ line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
+
+ for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
+ if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
+ c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
+ break;
+ }
+
+ if (c != NULL) {
+ write_lock_bh(&mrt_lock);
+ c->mfc_parent = mfc->mfcc_parent;
+ ipmr_update_threshoulds(c, mfc->mfcc_ttls);
+ if (!mrtsock)
+ c->mfc_flags |= MFC_STATIC;
+ write_unlock_bh(&mrt_lock);
+ return 0;
+ }
+
+ if(!MULTICAST(mfc->mfcc_mcastgrp.s_addr))
+ return -EINVAL;
+
+ c=ipmr_cache_alloc();
+ if (c==NULL)
+ return -ENOMEM;
+
+ c->mfc_origin=mfc->mfcc_origin.s_addr;
+ c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
+ c->mfc_parent=mfc->mfcc_parent;
+ ipmr_update_threshoulds(c, mfc->mfcc_ttls);
+ if (!mrtsock)
+ c->mfc_flags |= MFC_STATIC;
+
+ write_lock_bh(&mrt_lock);
+ c->next = mfc_cache_array[line];
+ mfc_cache_array[line] = c;
+ write_unlock_bh(&mrt_lock);
+
+ /*
+ * Check to see if we resolved a queued list. If so we
+ * need to send on the frames and tidy up.
+ */
+ spin_lock_bh(&mfc_unres_lock);
+ for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
+ cp = &uc->next) {
+ if (uc->mfc_origin == c->mfc_origin &&
+ uc->mfc_mcastgrp == c->mfc_mcastgrp) {
+ *cp = uc->next;
+ if (atomic_dec_and_test(&cache_resolve_queue_len))
+ del_timer(&ipmr_expire_timer);
+ break;
+ }
+ }
+ spin_unlock_bh(&mfc_unres_lock);
+
+ if (uc) {
+ ipmr_cache_resolve(uc, c);
+ kmem_cache_free(mrt_cachep, uc);
+ }
+ return 0;
+}
+
+/*
+ * Close the multicast socket, and clear the vif tables etc
+ */
+
+static void mroute_clean_tables(struct sock *sk)
+{
+ int i;
+
+ /*
+ * Shut down all active vif entries
+ */
+ for(i=0; i<maxvif; i++) {
+ if (!(vif_table[i].flags&VIFF_STATIC))
+ vif_delete(i);
+ }
+
+ /*
+ * Wipe the cache
+ */
+ for (i=0;i<MFC_LINES;i++) {
+ struct mfc_cache *c, **cp;
+
+ cp = &mfc_cache_array[i];
+ while ((c = *cp) != NULL) {
+ if (c->mfc_flags&MFC_STATIC) {
+ cp = &c->next;
+ continue;
+ }
+ write_lock_bh(&mrt_lock);
+ *cp = c->next;
+ write_unlock_bh(&mrt_lock);
+
+ kmem_cache_free(mrt_cachep, c);
+ }
+ }
+
+ if (atomic_read(&cache_resolve_queue_len) != 0) {
+ struct mfc_cache *c;
+
+ spin_lock_bh(&mfc_unres_lock);
+ while (mfc_unres_queue != NULL) {
+ c = mfc_unres_queue;
+ mfc_unres_queue = c->next;
+ spin_unlock_bh(&mfc_unres_lock);
+
+ ipmr_destroy_unres(c);
+
+ spin_lock_bh(&mfc_unres_lock);
+ }
+ spin_unlock_bh(&mfc_unres_lock);
+ }
+}
+
+static void mrtsock_destruct(struct sock *sk)
+{
+ rtnl_lock();
+ if (sk == mroute_socket) {
+ ipv4_devconf.mc_forwarding--;
+
+ write_lock_bh(&mrt_lock);
+ mroute_socket=NULL;
+ write_unlock_bh(&mrt_lock);
+
+ mroute_clean_tables(sk);
+ }
+ rtnl_unlock();
+}
+
+/*
+ * Socket options and virtual interface manipulation. The whole
+ * virtual interface system is a complete heap, but unfortunately
+ * that's how BSD mrouted happens to think. Maybe one day with a proper
+ * MOSPF/PIM router set up we can clean this up.
+ */
+
+int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
+{
+ int ret;
+ struct vifctl vif;
+ struct mfcctl mfc;
+
+ if(optname!=MRT_INIT)
+ {
+ if(sk!=mroute_socket && !capable(CAP_NET_ADMIN))
+ return -EACCES;
+ }
+
+ switch(optname)
+ {
+ case MRT_INIT:
+ if (sk->sk_type != SOCK_RAW ||
+ inet_sk(sk)->num != IPPROTO_IGMP)
+ return -EOPNOTSUPP;
+ if(optlen!=sizeof(int))
+ return -ENOPROTOOPT;
+
+ rtnl_lock();
+ if (mroute_socket) {
+ rtnl_unlock();
+ return -EADDRINUSE;
+ }
+
+ ret = ip_ra_control(sk, 1, mrtsock_destruct);
+ if (ret == 0) {
+ write_lock_bh(&mrt_lock);
+ mroute_socket=sk;
+ write_unlock_bh(&mrt_lock);
+
+ ipv4_devconf.mc_forwarding++;
+ }
+ rtnl_unlock();
+ return ret;
+ case MRT_DONE:
+ if (sk!=mroute_socket)
+ return -EACCES;
+ return ip_ra_control(sk, 0, NULL);
+ case MRT_ADD_VIF:
+ case MRT_DEL_VIF:
+ if(optlen!=sizeof(vif))
+ return -EINVAL;
+ if (copy_from_user(&vif,optval,sizeof(vif)))
+ return -EFAULT;
+ if(vif.vifc_vifi >= MAXVIFS)
+ return -ENFILE;
+ rtnl_lock();
+ if (optname==MRT_ADD_VIF) {
+ ret = vif_add(&vif, sk==mroute_socket);
+ } else {
+ ret = vif_delete(vif.vifc_vifi);
+ }
+ rtnl_unlock();
+ return ret;
+
+ /*
+ * Manipulate the forwarding caches. These live
+ * in a sort of kernel/user symbiosis.
+ */
+ case MRT_ADD_MFC:
+ case MRT_DEL_MFC:
+ if(optlen!=sizeof(mfc))
+ return -EINVAL;
+ if (copy_from_user(&mfc,optval, sizeof(mfc)))
+ return -EFAULT;
+ rtnl_lock();
+ if (optname==MRT_DEL_MFC)
+ ret = ipmr_mfc_delete(&mfc);
+ else
+ ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
+ rtnl_unlock();
+ return ret;
+ /*
+ * Control PIM assert.
+ */
+ case MRT_ASSERT:
+ {
+ int v;
+ if(get_user(v,(int __user *)optval))
+ return -EFAULT;
+ mroute_do_assert=(v)?1:0;
+ return 0;
+ }
+#ifdef CONFIG_IP_PIMSM
+ case MRT_PIM:
+ {
+ int v, ret;
+ if(get_user(v,(int __user *)optval))
+ return -EFAULT;
+ v = (v)?1:0;
+ rtnl_lock();
+ ret = 0;
+ if (v != mroute_do_pim) {
+ mroute_do_pim = v;
+ mroute_do_assert = v;
+#ifdef CONFIG_IP_PIMSM_V2
+ if (mroute_do_pim)
+ ret = inet_add_protocol(&pim_protocol,
+ IPPROTO_PIM);
+ else
+ ret = inet_del_protocol(&pim_protocol,
+ IPPROTO_PIM);
+ if (ret < 0)
+ ret = -EAGAIN;
+#endif
+ }
+ rtnl_unlock();
+ return ret;
+ }
+#endif
+ /*
+ * Spurious command, or MRT_VERSION which you cannot
+ * set.
+ */
+ default:
+ return -ENOPROTOOPT;
+ }
+}
+
+/*
+ * Getsock opt support for the multicast routing system.
+ */
+
+int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
+{
+ int olr;
+ int val;
+
+ if(optname!=MRT_VERSION &&
+#ifdef CONFIG_IP_PIMSM
+ optname!=MRT_PIM &&
+#endif
+ optname!=MRT_ASSERT)
+ return -ENOPROTOOPT;
+
+ if (get_user(olr, optlen))
+ return -EFAULT;
+
+ olr = min_t(unsigned int, olr, sizeof(int));
+ if (olr < 0)
+ return -EINVAL;
+
+ if(put_user(olr,optlen))
+ return -EFAULT;
+ if(optname==MRT_VERSION)
+ val=0x0305;
+#ifdef CONFIG_IP_PIMSM
+ else if(optname==MRT_PIM)
+ val=mroute_do_pim;
+#endif
+ else
+ val=mroute_do_assert;
+ if(copy_to_user(optval,&val,olr))
+ return -EFAULT;
+ return 0;
+}
+
+/*
+ * The IP multicast ioctl support routines.
+ */
+
+int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
+{
+ struct sioc_sg_req sr;
+ struct sioc_vif_req vr;
+ struct vif_device *vif;
+ struct mfc_cache *c;
+
+ switch(cmd)
+ {
+ case SIOCGETVIFCNT:
+ if (copy_from_user(&vr,arg,sizeof(vr)))
+ return -EFAULT;
+ if(vr.vifi>=maxvif)
+ return -EINVAL;
+ read_lock(&mrt_lock);
+ vif=&vif_table[vr.vifi];
+ if(VIF_EXISTS(vr.vifi)) {
+ vr.icount=vif->pkt_in;
+ vr.ocount=vif->pkt_out;
+ vr.ibytes=vif->bytes_in;
+ vr.obytes=vif->bytes_out;
+ read_unlock(&mrt_lock);
+
+ if (copy_to_user(arg,&vr,sizeof(vr)))
+ return -EFAULT;
+ return 0;
+ }
+ read_unlock(&mrt_lock);
+ return -EADDRNOTAVAIL;
+ case SIOCGETSGCNT:
+ if (copy_from_user(&sr,arg,sizeof(sr)))
+ return -EFAULT;
+
+ read_lock(&mrt_lock);
+ c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
+ if (c) {
+ sr.pktcnt = c->mfc_un.res.pkt;
+ sr.bytecnt = c->mfc_un.res.bytes;
+ sr.wrong_if = c->mfc_un.res.wrong_if;
+ read_unlock(&mrt_lock);
+
+ if (copy_to_user(arg,&sr,sizeof(sr)))
+ return -EFAULT;
+ return 0;
+ }
+ read_unlock(&mrt_lock);
+ return -EADDRNOTAVAIL;
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
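
For illustration only, a userspace caller could read the per-vif counters exposed by SIOCGETVIFCNT roughly as follows; the struct sioc_vif_req fields match the ones filled in above, and mrt_fd is assumed to be the raw IGMP mroute socket from the previous sketch.

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/mroute.h>

/* mrt_fd: the raw IGMP socket that issued MRT_INIT. */
static void dump_vif_counters(int mrt_fd, int vifi)
{
	struct sioc_vif_req vr;

	memset(&vr, 0, sizeof(vr));
	vr.vifi = vifi;

	/* EINVAL for vifi >= maxvif, EADDRNOTAVAIL if the vif is unused. */
	if (ioctl(mrt_fd, SIOCGETVIFCNT, &vr) < 0) {
		perror("SIOCGETVIFCNT");
		return;
	}
	printf("vif %d: in %lu pkts / %lu bytes, out %lu pkts / %lu bytes\n",
	       vifi, vr.icount, vr.ibytes, vr.ocount, vr.obytes);
}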
+
+
+static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct vif_device *v;
+ int ct;
+ if (event != NETDEV_UNREGISTER)
+ return NOTIFY_DONE;
+ v=&vif_table[0];
+ for(ct=0;ct<maxvif;ct++,v++) {
+ if (v->dev==ptr)
+ vif_delete(ct);
+ }
+ return NOTIFY_DONE;
+}
+
+
+static struct notifier_block ip_mr_notifier={
+ .notifier_call = ipmr_device_event,
+};
+
+/*
+ * Encapsulate a packet by attaching a valid IPIP header to it.
+ * This avoids tunnel drivers and other mess and gives us the speed so
+ * important for multicast video.
+ */
+
+static void ip_encap(struct sk_buff *skb, u32 saddr, u32 daddr)
+{
+ struct iphdr *iph = (struct iphdr *)skb_push(skb,sizeof(struct iphdr));
+
+ iph->version = 4;
+ iph->tos = skb->nh.iph->tos;
+ iph->ttl = skb->nh.iph->ttl;
+ iph->frag_off = 0;
+ iph->daddr = daddr;
+ iph->saddr = saddr;
+ iph->protocol = IPPROTO_IPIP;
+ iph->ihl = 5;
+ iph->tot_len = htons(skb->len);
+ ip_select_ident(iph, skb->dst, NULL);
+ ip_send_check(iph);
+
+ skb->h.ipiph = skb->nh.iph;
+ skb->nh.iph = iph;
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+ nf_reset(skb);
+}
+
+static inline int ipmr_forward_finish(struct sk_buff *skb)
+{
+ struct ip_options * opt = &(IPCB(skb)->opt);
+
+ IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
+
+ if (unlikely(opt->optlen))
+ ip_forward_options(skb);
+
+ return dst_output(skb);
+}
+
+/*
+ * Processing handlers for ipmr_forward
+ */
+
+static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
+{
+ struct iphdr *iph = skb->nh.iph;
+ struct vif_device *vif = &vif_table[vifi];
+ struct net_device *dev;
+ struct rtable *rt;
+ int encap = 0;
+
+ if (vif->dev == NULL)
+ goto out_free;
+
+#ifdef CONFIG_IP_PIMSM
+ if (vif->flags & VIFF_REGISTER) {
+ vif->pkt_out++;
+ vif->bytes_out+=skb->len;
+ ((struct net_device_stats*)vif->dev->priv)->tx_bytes += skb->len;
+ ((struct net_device_stats*)vif->dev->priv)->tx_packets++;
+ ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
+ kfree_skb(skb);
+ return;
+ }
+#endif
+
+ if (vif->flags&VIFF_TUNNEL) {
+ struct flowi fl = { .oif = vif->link,
+ .nl_u = { .ip4_u =
+ { .daddr = vif->remote,
+ .saddr = vif->local,
+ .tos = RT_TOS(iph->tos) } },
+ .proto = IPPROTO_IPIP };
+ if (ip_route_output_key(&rt, &fl))
+ goto out_free;
+ encap = sizeof(struct iphdr);
+ } else {
+ struct flowi fl = { .oif = vif->link,
+ .nl_u = { .ip4_u =
+ { .daddr = iph->daddr,
+ .tos = RT_TOS(iph->tos) } },
+ .proto = IPPROTO_IPIP };
+ if (ip_route_output_key(&rt, &fl))
+ goto out_free;
+ }
+
+ dev = rt->u.dst.dev;
+
+ if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
+ /* Do not fragment multicasts. Alas, IPv4 does not
+ allow us to send ICMP here, so the packets will
+ simply disappear into a black hole.
+ */
+
+ IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
+ ip_rt_put(rt);
+ goto out_free;
+ }
+
+ encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
+
+ if (skb_cow(skb, encap)) {
+ ip_rt_put(rt);
+ goto out_free;
+ }
+
+ vif->pkt_out++;
+ vif->bytes_out+=skb->len;
+
+ dst_release(skb->dst);
+ skb->dst = &rt->u.dst;
+ iph = skb->nh.iph;
+ ip_decrease_ttl(iph);
+
+ /* FIXME: forward and output firewalls used to be called here.
+ * What do we do with netfilter? -- RR */
+ if (vif->flags & VIFF_TUNNEL) {
+ ip_encap(skb, vif->local, vif->remote);
+ /* FIXME: extra output firewall step used to be here. --RR */
+ ((struct ip_tunnel *)vif->dev->priv)->stat.tx_packets++;
+ ((struct ip_tunnel *)vif->dev->priv)->stat.tx_bytes+=skb->len;
+ }
+
+ IPCB(skb)->flags |= IPSKB_FORWARDED;
+
+ /*
+ * RFC1584 teaches that a DVMRP/PIM router must deliver packets locally
+ * not only before forwarding, but also after forwarding on all output
+ * interfaces. Clearly, if the mrouter runs a multicast application,
+ * that application should receive packets regardless of which
+ * interface it has joined on.
+ * If we do not do this, the application would have to join on all
+ * interfaces. On the other hand, a multihomed host (or router, but
+ * not an mrouter) cannot join on more than one interface - that would
+ * result in receiving duplicate packets.
+ */
+ NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev,
+ ipmr_forward_finish);
+ return;
+
+out_free:
+ kfree_skb(skb);
+ return;
+}
+
+static int ipmr_find_vif(struct net_device *dev)
+{
+ int ct;
+ for (ct=maxvif-1; ct>=0; ct--) {
+ if (vif_table[ct].dev == dev)
+ break;
+ }
+ return ct;
+}
+
+/* "local" means that we should preserve one skb (for local delivery) */
+
+static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
+{
+ int psend = -1;
+ int vif, ct;
+
+ vif = cache->mfc_parent;
+ cache->mfc_un.res.pkt++;
+ cache->mfc_un.res.bytes += skb->len;
+
+ /*
+ * Wrong interface: drop packet and (maybe) send PIM assert.
+ */
+ if (vif_table[vif].dev != skb->dev) {
+ int true_vifi;
+
+ if (((struct rtable*)skb->dst)->fl.iif == 0) {
+ /* It is our own packet, looped back.
+ Very complicated situation...
+
+ The best workaround until the routing daemons are
+ fixed is not to redistribute a packet if it was
+ sent through the wrong interface. It means that
+ multicast applications WILL NOT work for
+ (S,G) entries whose default multicast route points
+ to the wrong oif. In any case, it is not a good
+ idea to run multicast applications on a router.
+ */
+ goto dont_forward;
+ }
+
+ cache->mfc_un.res.wrong_if++;
+ true_vifi = ipmr_find_vif(skb->dev);
+
+ if (true_vifi >= 0 && mroute_do_assert &&
+ /* pimsm uses asserts when switching from RPT to SPT,
+ so we cannot check that the packet arrived on an oif.
+ It is bad, but otherwise we would need to move a pretty
+ large chunk of pimd into the kernel. Ough... --ANK
+ */
+ (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
+ time_after(jiffies,
+ cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
+ cache->mfc_un.res.last_assert = jiffies;
+ ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
+ }
+ goto dont_forward;
+ }
+
+ vif_table[vif].pkt_in++;
+ vif_table[vif].bytes_in+=skb->len;
+
+ /*
+ * Forward the frame
+ */
+ for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
+ if (skb->nh.iph->ttl > cache->mfc_un.res.ttls[ct]) {
+ if (psend != -1) {
+ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (skb2)
+ ipmr_queue_xmit(skb2, cache, psend);
+ }
+ psend=ct;
+ }
+ }
+ if (psend != -1) {
+ if (local) {
+ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (skb2)
+ ipmr_queue_xmit(skb2, cache, psend);
+ } else {
+ ipmr_queue_xmit(skb, cache, psend);
+ return 0;
+ }
+ }
+
+dont_forward:
+ if (!local)
+ kfree_skb(skb);
+ return 0;
+}
+
+
+/*
+ * Multicast packets for forwarding arrive here
+ */
+
+int ip_mr_input(struct sk_buff *skb)
+{
+ struct mfc_cache *cache;
+ int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;
+
+ /* The packet was looped back after forwarding; it should not be
+ forwarded a second time, but it can still be delivered locally.
+ */
+ if (IPCB(skb)->flags&IPSKB_FORWARDED)
+ goto dont_forward;
+
+ if (!local) {
+ if (IPCB(skb)->opt.router_alert) {
+ if (ip_call_ra_chain(skb))
+ return 0;
+ } else if (skb->nh.iph->protocol == IPPROTO_IGMP){
+ /* IGMPv1 (and broken IGMPv2 implementations such as
+ Cisco IOS <= 11.2(8)) do not put the router alert
+ option into IGMP packets destined for routable
+ groups. It is very bad, because it means
+ that we can forward NO IGMP messages.
+ */
+ read_lock(&mrt_lock);
+ if (mroute_socket) {
+ raw_rcv(mroute_socket, skb);
+ read_unlock(&mrt_lock);
+ return 0;
+ }
+ read_unlock(&mrt_lock);
+ }
+ }
+
+ read_lock(&mrt_lock);
+ cache = ipmr_cache_find(skb->nh.iph->saddr, skb->nh.iph->daddr);
+
+ /*
+ * No usable cache entry
+ */
+ if (cache==NULL) {
+ int vif;
+
+ if (local) {
+ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+ ip_local_deliver(skb);
+ if (skb2 == NULL) {
+ read_unlock(&mrt_lock);
+ return -ENOBUFS;
+ }
+ skb = skb2;
+ }
+
+ vif = ipmr_find_vif(skb->dev);
+ if (vif >= 0) {
+ int err = ipmr_cache_unresolved(vif, skb);
+ read_unlock(&mrt_lock);
+
+ return err;
+ }
+ read_unlock(&mrt_lock);
+ kfree_skb(skb);
+ return -ENODEV;
+ }
+
+ ip_mr_forward(skb, cache, local);
+
+ read_unlock(&mrt_lock);
+
+ if (local)
+ return ip_local_deliver(skb);
+
+ return 0;
+
+dont_forward:
+ if (local)
+ return ip_local_deliver(skb);
+ kfree_skb(skb);
+ return 0;
+}
+
+#ifdef CONFIG_IP_PIMSM_V1
+/*
+ * Handle IGMP messages of PIMv1
+ */
+
+int pim_rcv_v1(struct sk_buff * skb)
+{
+ struct igmphdr *pim;
+ struct iphdr *encap;
+ struct net_device *reg_dev = NULL;
+
+ if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
+ goto drop;
+
+ pim = (struct igmphdr*)skb->h.raw;
+
+ if (!mroute_do_pim ||
+ skb->len < sizeof(*pim) + sizeof(*encap) ||
+ pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
+ goto drop;
+
+ encap = (struct iphdr*)(skb->h.raw + sizeof(struct igmphdr));
+ /*
+ Check that:
+ a. packet is really destined to a multicast group
+ b. packet is not a NULL-REGISTER
+ c. packet is not truncated
+ */
+ if (!MULTICAST(encap->daddr) ||
+ encap->tot_len == 0 ||
+ ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
+ goto drop;
+
+ read_lock(&mrt_lock);
+ if (reg_vif_num >= 0)
+ reg_dev = vif_table[reg_vif_num].dev;
+ if (reg_dev)
+ dev_hold(reg_dev);
+ read_unlock(&mrt_lock);
+
+ if (reg_dev == NULL)
+ goto drop;
+
+ skb->mac.raw = skb->nh.raw;
+ skb_pull(skb, (u8*)encap - skb->data);
+ skb->nh.iph = (struct iphdr *)skb->data;
+ skb->dev = reg_dev;
+ memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
+ skb->protocol = htons(ETH_P_IP);
+ skb->ip_summed = 0;
+ skb->pkt_type = PACKET_HOST;
+ dst_release(skb->dst);
+ skb->dst = NULL;
+ ((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len;
+ ((struct net_device_stats*)reg_dev->priv)->rx_packets++;
+ nf_reset(skb);
+ netif_rx(skb);
+ dev_put(reg_dev);
+ return 0;
+ drop:
+ kfree_skb(skb);
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_IP_PIMSM_V2
+static int pim_rcv(struct sk_buff * skb)
+{
+ struct pimreghdr *pim;
+ struct iphdr *encap;
+ struct net_device *reg_dev = NULL;
+
+ if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
+ goto drop;
+
+ pim = (struct pimreghdr*)skb->h.raw;
+ if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
+ (pim->flags&PIM_NULL_REGISTER) ||
+ (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
+ (u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))))
+ goto drop;
+
+ /* check if the inner packet is destined to mcast group */
+ encap = (struct iphdr*)(skb->h.raw + sizeof(struct pimreghdr));
+ if (!MULTICAST(encap->daddr) ||
+ encap->tot_len == 0 ||
+ ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
+ goto drop;
+
+ read_lock(&mrt_lock);
+ if (reg_vif_num >= 0)
+ reg_dev = vif_table[reg_vif_num].dev;
+ if (reg_dev)
+ dev_hold(reg_dev);
+ read_unlock(&mrt_lock);
+
+ if (reg_dev == NULL)
+ goto drop;
+
+ skb->mac.raw = skb->nh.raw;
+ skb_pull(skb, (u8*)encap - skb->data);
+ skb->nh.iph = (struct iphdr *)skb->data;
+ skb->dev = reg_dev;
+ memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
+ skb->protocol = htons(ETH_P_IP);
+ skb->ip_summed = 0;
+ skb->pkt_type = PACKET_HOST;
+ dst_release(skb->dst);
+ ((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len;
+ ((struct net_device_stats*)reg_dev->priv)->rx_packets++;
+ skb->dst = NULL;
+ nf_reset(skb);
+ netif_rx(skb);
+ dev_put(reg_dev);
+ return 0;
+ drop:
+ kfree_skb(skb);
+ return 0;
+}
+#endif
+
+static int
+ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
+{
+ int ct;
+ struct rtnexthop *nhp;
+ struct net_device *dev = vif_table[c->mfc_parent].dev;
+ u8 *b = skb->tail;
+ struct rtattr *mp_head;
+
+ if (dev)
+ RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
+
+ mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));
+
+ for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
+ if (c->mfc_un.res.ttls[ct] < 255) {
+ if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
+ goto rtattr_failure;
+ nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
+ nhp->rtnh_flags = 0;
+ nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
+ nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
+ nhp->rtnh_len = sizeof(*nhp);
+ }
+ }
+ mp_head->rta_type = RTA_MULTIPATH;
+ mp_head->rta_len = skb->tail - (u8*)mp_head;
+ rtm->rtm_type = RTN_MULTICAST;
+ return 1;
+
+rtattr_failure:
+ skb_trim(skb, b - skb->data);
+ return -EMSGSIZE;
+}
+
+int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
+{
+ int err;
+ struct mfc_cache *cache;
+ struct rtable *rt = (struct rtable*)skb->dst;
+
+ read_lock(&mrt_lock);
+ cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
+
+ if (cache==NULL) {
+ struct net_device *dev;
+ int vif;
+
+ if (nowait) {
+ read_unlock(&mrt_lock);
+ return -EAGAIN;
+ }
+
+ dev = skb->dev;
+ if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
+ read_unlock(&mrt_lock);
+ return -ENODEV;
+ }
+ skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
+ skb->nh.iph->ihl = sizeof(struct iphdr)>>2;
+ skb->nh.iph->saddr = rt->rt_src;
+ skb->nh.iph->daddr = rt->rt_dst;
+ skb->nh.iph->version = 0;
+ err = ipmr_cache_unresolved(vif, skb);
+ read_unlock(&mrt_lock);
+ return err;
+ }
+
+ if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
+ cache->mfc_flags |= MFC_NOTIFY;
+ err = ipmr_fill_mroute(skb, cache, rtm);
+ read_unlock(&mrt_lock);
+ return err;
+}
+
+#ifdef CONFIG_PROC_FS
+/*
+ *	The /proc interfaces to multicast routing: /proc/net/ip_mr_cache and /proc/net/ip_mr_vif
+ */
+struct ipmr_vif_iter {
+ int ct;
+};
+
+static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
+ loff_t pos)
+{
+ for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
+ if(!VIF_EXISTS(iter->ct))
+ continue;
+ if (pos-- == 0)
+ return &vif_table[iter->ct];
+ }
+ return NULL;
+}
+
+static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ read_lock(&mrt_lock);
+ return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
+ : SEQ_START_TOKEN;
+}
+
+static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct ipmr_vif_iter *iter = seq->private;
+
+ ++*pos;
+ if (v == SEQ_START_TOKEN)
+ return ipmr_vif_seq_idx(iter, 0);
+
+ while (++iter->ct < maxvif) {
+ if(!VIF_EXISTS(iter->ct))
+ continue;
+ return &vif_table[iter->ct];
+ }
+ return NULL;
+}
+
+static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
+{
+ read_unlock(&mrt_lock);
+}
+
+static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq,
+ "Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n");
+ } else {
+ const struct vif_device *vif = v;
+ const char *name = vif->dev ? vif->dev->name : "none";
+
+ seq_printf(seq,
+ "%2Zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n",
+ vif - vif_table,
+ name, vif->bytes_in, vif->pkt_in,
+ vif->bytes_out, vif->pkt_out,
+ vif->flags, vif->local, vif->remote);
+ }
+ return 0;
+}
+
+static struct seq_operations ipmr_vif_seq_ops = {
+ .start = ipmr_vif_seq_start,
+ .next = ipmr_vif_seq_next,
+ .stop = ipmr_vif_seq_stop,
+ .show = ipmr_vif_seq_show,
+};
+
+static int ipmr_vif_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int rc = -ENOMEM;
+ struct ipmr_vif_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+ if (!s)
+ goto out;
+
+ rc = seq_open(file, &ipmr_vif_seq_ops);
+ if (rc)
+ goto out_kfree;
+
+ s->ct = 0;
+ seq = file->private_data;
+ seq->private = s;
+out:
+ return rc;
+out_kfree:
+ kfree(s);
+ goto out;
+
+}
+
+static struct file_operations ipmr_vif_fops = {
+ .owner = THIS_MODULE,
+ .open = ipmr_vif_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+struct ipmr_mfc_iter {
+ struct mfc_cache **cache;
+ int ct;
+};
+
+
+static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
+{
+ struct mfc_cache *mfc;
+
+ it->cache = mfc_cache_array;
+ read_lock(&mrt_lock);
+ for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
+ for(mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
+ if (pos-- == 0)
+ return mfc;
+ read_unlock(&mrt_lock);
+
+ it->cache = &mfc_unres_queue;
+ spin_lock_bh(&mfc_unres_lock);
+ for(mfc = mfc_unres_queue; mfc; mfc = mfc->next)
+ if (pos-- == 0)
+ return mfc;
+ spin_unlock_bh(&mfc_unres_lock);
+
+ it->cache = NULL;
+ return NULL;
+}
+
+
+static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct ipmr_mfc_iter *it = seq->private;
+ it->cache = NULL;
+ it->ct = 0;
+ return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
+ : SEQ_START_TOKEN;
+}
+
+static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct mfc_cache *mfc = v;
+ struct ipmr_mfc_iter *it = seq->private;
+
+ ++*pos;
+
+ if (v == SEQ_START_TOKEN)
+ return ipmr_mfc_seq_idx(seq->private, 0);
+
+ if (mfc->next)
+ return mfc->next;
+
+ if (it->cache == &mfc_unres_queue)
+ goto end_of_list;
+
+ BUG_ON(it->cache != mfc_cache_array);
+
+ while (++it->ct < MFC_LINES) {
+ mfc = mfc_cache_array[it->ct];
+ if (mfc)
+ return mfc;
+ }
+
+ /* exhausted cache_array, show unresolved */
+ read_unlock(&mrt_lock);
+ it->cache = &mfc_unres_queue;
+ it->ct = 0;
+
+ spin_lock_bh(&mfc_unres_lock);
+ mfc = mfc_unres_queue;
+ if (mfc)
+ return mfc;
+
+ end_of_list:
+ spin_unlock_bh(&mfc_unres_lock);
+ it->cache = NULL;
+
+ return NULL;
+}
+
+static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
+{
+ struct ipmr_mfc_iter *it = seq->private;
+
+ if (it->cache == &mfc_unres_queue)
+ spin_unlock_bh(&mfc_unres_lock);
+ else if (it->cache == mfc_cache_array)
+ read_unlock(&mrt_lock);
+}
+
+static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
+{
+ int n;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq,
+ "Group Origin Iif Pkts Bytes Wrong Oifs\n");
+ } else {
+ const struct mfc_cache *mfc = v;
+ const struct ipmr_mfc_iter *it = seq->private;
+
+ seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
+ (unsigned long) mfc->mfc_mcastgrp,
+ (unsigned long) mfc->mfc_origin,
+ mfc->mfc_parent,
+ mfc->mfc_un.res.pkt,
+ mfc->mfc_un.res.bytes,
+ mfc->mfc_un.res.wrong_if);
+
+ if (it->cache != &mfc_unres_queue) {
+ for(n = mfc->mfc_un.res.minvif;
+ n < mfc->mfc_un.res.maxvif; n++ ) {
+ if(VIF_EXISTS(n)
+ && mfc->mfc_un.res.ttls[n] < 255)
+ seq_printf(seq,
+ " %2d:%-3d",
+ n, mfc->mfc_un.res.ttls[n]);
+ }
+ }
+ seq_putc(seq, '\n');
+ }
+ return 0;
+}
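
Since both tables are plain-text seq_files, a monitoring tool can simply read them; a minimal sketch, assuming the /proc/net/ip_mr_vif and /proc/net/ip_mr_cache names registered in ip_mr_init() below:

#include <stdio.h>

static void dump_mroute_proc(void)
{
	static const char *files[] = {
		"/proc/net/ip_mr_vif", "/proc/net/ip_mr_cache"
	};
	char line[256];
	int i;

	for (i = 0; i < 2; i++) {
		FILE *f = fopen(files[i], "r");

		if (!f)
			continue;
		/* First line is the header emitted by the *_seq_show()
		 * handlers above; each further line is one vif or one
		 * (S,G) cache entry. */
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
	}
}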
+
+static struct seq_operations ipmr_mfc_seq_ops = {
+ .start = ipmr_mfc_seq_start,
+ .next = ipmr_mfc_seq_next,
+ .stop = ipmr_mfc_seq_stop,
+ .show = ipmr_mfc_seq_show,
+};
+
+static int ipmr_mfc_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int rc = -ENOMEM;
+ struct ipmr_mfc_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+ if (!s)
+ goto out;
+
+ rc = seq_open(file, &ipmr_mfc_seq_ops);
+ if (rc)
+ goto out_kfree;
+
+ seq = file->private_data;
+ seq->private = s;
+out:
+ return rc;
+out_kfree:
+ kfree(s);
+ goto out;
+
+}
+
+static struct file_operations ipmr_mfc_fops = {
+ .owner = THIS_MODULE,
+ .open = ipmr_mfc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+#endif
+
+#ifdef CONFIG_IP_PIMSM_V2
+static struct net_protocol pim_protocol = {
+ .handler = pim_rcv,
+};
+#endif
+
+
+/*
+ * Setup for IP multicast routing
+ */
+
+void __init ip_mr_init(void)
+{
+ mrt_cachep = kmem_cache_create("ip_mrt_cache",
+ sizeof(struct mfc_cache),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+ if (!mrt_cachep)
+ panic("cannot allocate ip_mrt_cache");
+
+ init_timer(&ipmr_expire_timer);
+ ipmr_expire_timer.function=ipmr_expire_process;
+ register_netdevice_notifier(&ip_mr_notifier);
+#ifdef CONFIG_PROC_FS
+ proc_net_fops_create("ip_mr_vif", 0, &ipmr_vif_fops);
+ proc_net_fops_create("ip_mr_cache", 0, &ipmr_mfc_fops);
+#endif
+}
diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig
new file mode 100644
index 00000000000..63a82b4b64b
--- /dev/null
+++ b/net/ipv4/ipvs/Kconfig
@@ -0,0 +1,244 @@
+#
+# IP Virtual Server configuration
+#
+menu "IP: Virtual Server Configuration"
+ depends on INET && NETFILTER
+
+config IP_VS
+ tristate "IP virtual server support (EXPERIMENTAL)"
+ depends on INET && NETFILTER
+ ---help---
+ IP Virtual Server support will let you build a high-performance
+ virtual server based on a cluster of two or more real servers. This
+ option must be enabled for at least one of the clustered computers
+ that will take care of intercepting incoming connections to a
+ single IP address and scheduling them to real servers.
+
+ Three request dispatching techniques are implemented: virtual
+ server via NAT, virtual server via tunneling and virtual server
+ via direct routing. Several scheduling algorithms can be used to
+ choose which server a connection is directed to, so that load
+ balancing can be achieved among the servers.  For more
+ information and its administration program, please visit the
+ following URL: <http://www.linuxvirtualserver.org/>.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_DEBUG
+ bool "IP virtual server debugging"
+ depends on IP_VS
+ ---help---
+ Say Y here if you want to get additional messages useful in
+ debugging the IP virtual server code. You can change the debug
+ level in /proc/sys/net/ipv4/vs/debug_level
+
+config IP_VS_TAB_BITS
+ int "IPVS connection table size (the Nth power of 2)"
+ depends on IP_VS
+ default "12"
+ ---help---
+ The IPVS connection hash table uses the chaining scheme to handle
+ hash collisions. Using a big IPVS connection hash table will greatly
+ reduce conflicts when there are hundreds of thousands of connections
+ in the hash table.
+
+ Note that the table size must be a power of 2. The table size will
+ be 2 raised to the number you enter here. The number to choose is
+ from 8 to 20; the default is 12, which means a table size of 4096.
+ Don't make the number too small, or you will lose performance. You
+ can adapt the table size to your virtual server application. As a
+ rule of thumb, the table size should not be far less than the number
+ of connections per second multiplied by the average lifetime of a
+ connection in the table.  For example, if your virtual server gets
+ 200 connections per second and a connection stays in the table for
+ 200 seconds on average, the table size should not be far less than
+ 200x200; 32768 (2**15) is a good choice.
+
+ Also note that each connection effectively occupies 128 bytes and
+ each hash entry uses 8 bytes, so you can estimate how much memory
+ is needed for your box.
+
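
The sizing rule above can be expressed as a small calculation; the helper below is purely illustrative (the function name is invented) and only mirrors the arithmetic in the help text.

#include <stdio.h>

/* Smallest exponent in [8, 20] whose table size (2^bits) reaches
 * conn_rate * avg_lifetime, the product used in the help text. */
static int suggested_tab_bits(unsigned long conn_rate, unsigned long avg_lifetime)
{
	unsigned long target = conn_rate * avg_lifetime;
	int bits = 8;

	while (bits < 20 && (1UL << bits) < target)
		bits++;
	return bits;
}

int main(void)
{
	/* The help text's example: 200 conn/s lasting ~200 s each gives
	 * 40000 entries, i.e. 2^16; the text also accepts 32768 (2^15)
	 * as "not far less" than the product. */
	printf("suggested IP_VS_TAB_BITS = %d\n", suggested_tab_bits(200, 200));
	return 0;
}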
+comment "IPVS transport protocol load balancing support"
+ depends on IP_VS
+
+config IP_VS_PROTO_TCP
+ bool "TCP load balancing support"
+ depends on IP_VS
+ ---help---
+ This option enables support for load balancing TCP transport
+ protocol. Say Y if unsure.
+
+config IP_VS_PROTO_UDP
+ bool "UDP load balancing support"
+ depends on IP_VS
+ ---help---
+ This option enables support for load balancing UDP transport
+ protocol. Say Y if unsure.
+
+config IP_VS_PROTO_ESP
+ bool "ESP load balancing support"
+ depends on IP_VS
+ ---help---
+ This option enables support for load balancing ESP (Encapsulation
+ Security Payload) transport protocol. Say Y if unsure.
+
+config IP_VS_PROTO_AH
+ bool "AH load balancing support"
+ depends on IP_VS
+ ---help---
+ This option enables support for load balancing AH (Authentication
+ Header) transport protocol. Say Y if unsure.
+
+comment "IPVS scheduler"
+ depends on IP_VS
+
+config IP_VS_RR
+ tristate "round-robin scheduling"
+ depends on IP_VS
+ ---help---
+ The round-robin scheduling algorithm simply directs network
+ connections to different real servers in a round-robin manner.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_WRR
+ tristate "weighted round-robin scheduling"
+ depends on IP_VS
+ ---help---
+ The weighted round-robin scheduling algorithm directs network
+ connections to different real servers based on server weights
+ in a round-robin manner. Servers with higher weights receive
+ new connections before those with lower weights and get more
+ connections than those with lower weights; servers with equal
+ weights get an equal share of connections.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_LC
+ tristate "least-connection scheduling"
+ depends on IP_VS
+ ---help---
+ The least-connection scheduling algorithm directs network
+ connections to the server with the least number of active
+ connections.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_WLC
+ tristate "weighted least-connection scheduling"
+ depends on IP_VS
+ ---help---
+ The weighted least-connection scheduling algorithm directs network
+ connections to the server with the least active connections
+ normalized by the server weight.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_LBLC
+ tristate "locality-based least-connection scheduling"
+ depends on IP_VS
+ ---help---
+ The locality-based least-connection scheduling algorithm is for
+ destination IP load balancing. It is usually used in a cache
+ cluster. This algorithm usually directs packets destined for an IP
+ address to its assigned server if that server is alive and not
+ overloaded. If the server is overloaded (its active connection
+ count is larger than its weight) and there is a server running at
+ half of its load, then the weighted least-connection server is
+ allocated to this IP address.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_LBLCR
+ tristate "locality-based least-connection with replication scheduling"
+ depends on IP_VS
+ ---help---
+ The locality-based least-connection with replication scheduling
+ algorithm is also for destination IP load balancing. It is
+ usually used in a cache cluster. It differs from LBLC scheduling
+ as follows: the load balancer maintains mappings from a target
+ to a set of server nodes that can serve the target. Requests for
+ a target are assigned to the least-connection node in the target's
+ server set. If all the nodes in the server set are overloaded,
+ it picks the least-connection node in the cluster and adds it
+ to the server set for the target. If the server set has not been
+ modified for the specified time, the most loaded node is removed
+ from the server set, in order to avoid a high degree of replication.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_DH
+ tristate "destination hashing scheduling"
+ depends on IP_VS
+ ---help---
+ The destination hashing scheduling algorithm assigns network
+ connections to the servers through looking up a statically assigned
+ hash table by their destination IP addresses.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_SH
+ tristate "source hashing scheduling"
+ depends on IP_VS
+ ---help---
+ The source hashing scheduling algorithm assigns network
+ connections to the servers through looking up a statically assigned
+ hash table by their source IP addresses.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_SED
+ tristate "shortest expected delay scheduling"
+ depends on IP_VS
+ ---help---
+ The shortest expected delay scheduling algorithm assigns network
+ connections to the server with the shortest expected delay. The
+ expected delay that the job will experience is (Ci + 1) / Ui if
+ sent to the ith server, in which Ci is the number of connections
+ on the ith server and Ui is the fixed service rate (weight)
+ of the ith server.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_NQ
+ tristate "never queue scheduling"
+ depends on IP_VS
+ ---help---
+ The never queue scheduling algorithm adopts a two-speed model.
+ When there is an idle server available, the job will be sent to
+ the idle server, instead of waiting for a fast one. When there
+ is no idle server available, the job will be sent to the server
+ that minimizes its expected delay (the Shortest Expected Delay
+ scheduling algorithm).
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
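
The SED and NQ descriptions above boil down to one selection rule; the sketch below illustrates it over a hypothetical server array (the struct and field names are invented for the sketch, and this is not the kernel scheduler code). The comparison is done with cross-multiplication so it stays in integer arithmetic.

struct srv {
	int active_conns;	/* Ci */
	int weight;		/* Ui, assumed > 0 */
};

/* Returns the index of the chosen server, or -1 if none. */
static int pick_server(const struct srv *s, int n, int never_queue)
{
	int best = -1, i;

	for (i = 0; i < n; i++) {
		/* NQ: an idle server wins immediately. */
		if (never_queue && s[i].active_conns == 0)
			return i;
		/* SED: minimize (Ci + 1) / Ui, compared cross-multiplied. */
		if (best < 0 ||
		    (s[i].active_conns + 1) * s[best].weight <
		    (s[best].active_conns + 1) * s[i].weight)
			best = i;
	}
	return best;
}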
+comment "IPVS application helper"
+ depends on IP_VS
+
+config IP_VS_FTP
+ tristate "FTP protocol helper"
+ depends on IP_VS && IP_VS_PROTO_TCP
+ ---help---
+ FTP is a protocol that transfers IP addresses and/or port numbers
+ in its payload. With virtual server via Network Address Translation,
+ the IP address and port number of a real server cannot be sent to
+ clients in FTP connections directly, so the FTP protocol helper is
+ required to track the connection and mangle it back to that of the
+ virtual service.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+endmenu
diff --git a/net/ipv4/ipvs/Makefile b/net/ipv4/ipvs/Makefile
new file mode 100644
index 00000000000..a788461a40c
--- /dev/null
+++ b/net/ipv4/ipvs/Makefile
@@ -0,0 +1,34 @@
+#
+# Makefile for the IPVS modules on top of IPv4.
+#
+
+# IPVS transport protocol load balancing support
+ip_vs_proto-objs-y :=
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_ESP) += ip_vs_proto_esp.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH) += ip_vs_proto_ah.o
+
+ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \
+ ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \
+ ip_vs_est.o ip_vs_proto.o ip_vs_proto_icmp.o \
+ $(ip_vs_proto-objs-y)
+
+
+# IPVS core
+obj-$(CONFIG_IP_VS) += ip_vs.o
+
+# IPVS schedulers
+obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o
+obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o
+obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o
+obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o
+obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
+obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
+obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
+obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
+obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
+obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
+
+# IPVS application helpers
+obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c
new file mode 100644
index 00000000000..d9212addd19
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_app.c
@@ -0,0 +1,658 @@
+/*
+ * ip_vs_app.c: Application module support for IPVS
+ *
+ * Version: $Id: ip_vs_app.c,v 1.17 2003/03/22 06:31:21 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference
+ * is that the ip_vs_app module handles the reverse direction (incoming requests
+ * and outgoing responses).
+ *
+ * IP_MASQ_APP application masquerading module
+ *
+ * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <net/protocol.h>
+#include <asm/system.h>
+#include <linux/stat.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+#include <net/ip_vs.h>
+
+EXPORT_SYMBOL(register_ip_vs_app);
+EXPORT_SYMBOL(unregister_ip_vs_app);
+EXPORT_SYMBOL(register_ip_vs_app_inc);
+
+/* ipvs application list head */
+static LIST_HEAD(ip_vs_app_list);
+static DECLARE_MUTEX(__ip_vs_app_mutex);
+
+
+/*
+ * Get an ip_vs_app object
+ */
+static inline int ip_vs_app_get(struct ip_vs_app *app)
+{
+ /* test and get the module atomically */
+ if (app->module)
+ return try_module_get(app->module);
+ else
+ return 1;
+}
+
+
+static inline void ip_vs_app_put(struct ip_vs_app *app)
+{
+ if (app->module)
+ module_put(app->module);
+}
+
+
+/*
+ * Allocate/initialize app incarnation and register it in proto apps.
+ */
+static int
+ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
+{
+ struct ip_vs_protocol *pp;
+ struct ip_vs_app *inc;
+ int ret;
+
+ if (!(pp = ip_vs_proto_get(proto)))
+ return -EPROTONOSUPPORT;
+
+ if (!pp->unregister_app)
+ return -EOPNOTSUPP;
+
+ inc = kmalloc(sizeof(struct ip_vs_app), GFP_KERNEL);
+ if (!inc)
+ return -ENOMEM;
+ memcpy(inc, app, sizeof(*inc));
+ INIT_LIST_HEAD(&inc->p_list);
+ INIT_LIST_HEAD(&inc->incs_list);
+ inc->app = app;
+ inc->port = htons(port);
+ atomic_set(&inc->usecnt, 0);
+
+ if (app->timeouts) {
+ inc->timeout_table =
+ ip_vs_create_timeout_table(app->timeouts,
+ app->timeouts_size);
+ if (!inc->timeout_table) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ ret = pp->register_app(inc);
+ if (ret)
+ goto out;
+
+ list_add(&inc->a_list, &app->incs_list);
+ IP_VS_DBG(9, "%s application %s:%u registered\n",
+ pp->name, inc->name, inc->port);
+
+ return 0;
+
+ out:
+ if (inc->timeout_table)
+ kfree(inc->timeout_table);
+ kfree(inc);
+ return ret;
+}
+
+
+/*
+ * Release app incarnation
+ */
+static void
+ip_vs_app_inc_release(struct ip_vs_app *inc)
+{
+ struct ip_vs_protocol *pp;
+
+ if (!(pp = ip_vs_proto_get(inc->protocol)))
+ return;
+
+ if (pp->unregister_app)
+ pp->unregister_app(inc);
+
+ IP_VS_DBG(9, "%s App %s:%u unregistered\n",
+ pp->name, inc->name, inc->port);
+
+ list_del(&inc->a_list);
+
+ if (inc->timeout_table != NULL)
+ kfree(inc->timeout_table);
+ kfree(inc);
+}
+
+
+/*
+ * Get reference to app inc (only called from softirq)
+ *
+ */
+int ip_vs_app_inc_get(struct ip_vs_app *inc)
+{
+ int result;
+
+ atomic_inc(&inc->usecnt);
+ if (unlikely((result = ip_vs_app_get(inc->app)) != 1))
+ atomic_dec(&inc->usecnt);
+ return result;
+}
+
+
+/*
+ * Put the app inc (only called from timer or net softirq)
+ */
+void ip_vs_app_inc_put(struct ip_vs_app *inc)
+{
+ ip_vs_app_put(inc->app);
+ atomic_dec(&inc->usecnt);
+}
+
+
+/*
+ * Register an application incarnation in protocol applications
+ */
+int
+register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port)
+{
+ int result;
+
+ down(&__ip_vs_app_mutex);
+
+ result = ip_vs_app_inc_new(app, proto, port);
+
+ up(&__ip_vs_app_mutex);
+
+ return result;
+}
+
+
+/*
+ * ip_vs_app registration routine
+ */
+int register_ip_vs_app(struct ip_vs_app *app)
+{
+ /* increase the module use count */
+ ip_vs_use_count_inc();
+
+ down(&__ip_vs_app_mutex);
+
+ list_add(&app->a_list, &ip_vs_app_list);
+
+ up(&__ip_vs_app_mutex);
+
+ return 0;
+}
+
+
+/*
+ * ip_vs_app unregistration routine
+ * We are sure there are no app incarnations attached to services
+ */
+void unregister_ip_vs_app(struct ip_vs_app *app)
+{
+ struct ip_vs_app *inc, *nxt;
+
+ down(&__ip_vs_app_mutex);
+
+ list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) {
+ ip_vs_app_inc_release(inc);
+ }
+
+ list_del(&app->a_list);
+
+ up(&__ip_vs_app_mutex);
+
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+}
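
To show how the registration API above is meant to be used, here is a rough sketch of a helper module in the style of ip_vs_ftp. The helper_* and my_helper names are invented, the callback body is reduced to a stub, and it assumes <net/ip_vs.h> plus the usual module headers; it is not a drop-in helper.

#include <linux/module.h>
#include <linux/in.h>
#include <net/ip_vs.h>

static int helper_pkt_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
			  struct sk_buff **pskb, int *diff)
{
	if (diff)
		*diff = 0;	/* payload not resized in this stub */
	return 1;		/* non-zero means "handled" */
}

static struct ip_vs_app my_helper = {
	.name		= "example",
	.protocol	= IPPROTO_TCP,
	.module		= THIS_MODULE,
	.pkt_out	= helper_pkt_out,	/* mangle server->client data */
};

static int __init my_helper_init(void)
{
	int ret = register_ip_vs_app(&my_helper);

	if (ret)
		return ret;
	/* one incarnation per service port, e.g. 21 for FTP control */
	ret = register_ip_vs_app_inc(&my_helper, IPPROTO_TCP, 21);
	if (ret)
		unregister_ip_vs_app(&my_helper);
	return ret;
}

static void __exit my_helper_exit(void)
{
	unregister_ip_vs_app(&my_helper);
}

module_init(my_helper_init);
module_exit(my_helper_exit);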
+
+
+#if 0000
+/*
+ * Get reference to app by name (called from user context)
+ */
+struct ip_vs_app *ip_vs_app_get_by_name(char *appname)
+{
+ struct ip_vs_app *app, *a = NULL;
+
+ down(&__ip_vs_app_mutex);
+
+ list_for_each_entry(app, &ip_vs_app_list, a_list) {
+ if (strcmp(app->name, appname))
+ continue;
+
+ /* softirq may call ip_vs_app_get too, so the caller
+ must disable softirq on the current CPU */
+ if (ip_vs_app_get(app))
+ a = app;
+ break;
+ }
+
+ up(&__ip_vs_app_mutex);
+
+ return a;
+}
+#endif
+
+
+/*
+ * Bind ip_vs_conn to its ip_vs_app (called by cp constructor)
+ */
+int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp)
+{
+ return pp->app_conn_bind(cp);
+}
+
+
+/*
+ * Unbind cp from application incarnation (called by cp destructor)
+ */
+void ip_vs_unbind_app(struct ip_vs_conn *cp)
+{
+ struct ip_vs_app *inc = cp->app;
+
+ if (!inc)
+ return;
+
+ if (inc->unbind_conn)
+ inc->unbind_conn(inc, cp);
+ if (inc->done_conn)
+ inc->done_conn(inc, cp);
+ ip_vs_app_inc_put(inc);
+ cp->app = NULL;
+}
+
+
+/*
+ * Fixes th->seq based on ip_vs_seq info.
+ */
+static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
+{
+ __u32 seq = ntohl(th->seq);
+
+ /*
+ * Adjust seq with delta-offset for all packets after
+ * the most recent resized pkt seq and with previous_delta offset
+ * for all packets before most recent resized pkt seq.
+ */
+ if (vseq->delta || vseq->previous_delta) {
+ if(after(seq, vseq->init_seq)) {
+ th->seq = htonl(seq + vseq->delta);
+ IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n",
+ vseq->delta);
+ } else {
+ th->seq = htonl(seq + vseq->previous_delta);
+ IP_VS_DBG(9, "vs_fix_seq(): added previous_delta "
+ "(%d) to seq\n", vseq->previous_delta);
+ }
+ }
+}
+
+
+/*
+ * Fixes th->ack_seq based on ip_vs_seq info.
+ */
+static inline void
+vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
+{
+ __u32 ack_seq = ntohl(th->ack_seq);
+
+ /*
+ * Adjust ack_seq with the delta-offset for packets AFTER the most
+ * recent resized pkt; for packets before the most recent resized
+ * pkt, use previous_delta.
+ */
+ if (vseq->delta || vseq->previous_delta) {
+ /* ack_seq is the sequence number of the octet expected
+ to be received next, so compare it with init_seq+delta */
+ if(after(ack_seq, vseq->init_seq+vseq->delta)) {
+ th->ack_seq = htonl(ack_seq - vseq->delta);
+ IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta "
+ "(%d) from ack_seq\n", vseq->delta);
+
+ } else {
+ th->ack_seq = htonl(ack_seq - vseq->previous_delta);
+ IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted "
+ "previous_delta (%d) from ack_seq\n",
+ vseq->previous_delta);
+ }
+ }
+}
+
+
+/*
+ * Updates ip_vs_seq if pkt has been resized
+ * Assumes already checked proto==IPPROTO_TCP and diff!=0.
+ */
+static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
+ unsigned flag, __u32 seq, int diff)
+{
+ /* spinlock is to keep updating cp->flags atomic */
+ spin_lock(&cp->lock);
+ if (!(cp->flags & flag) || after(seq, vseq->init_seq)) {
+ vseq->previous_delta = vseq->delta;
+ vseq->delta += diff;
+ vseq->init_seq = seq;
+ cp->flags |= flag;
+ }
+ spin_unlock(&cp->lock);
+}
+
+static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff **pskb,
+ struct ip_vs_app *app)
+{
+ int diff;
+ unsigned int tcp_offset = (*pskb)->nh.iph->ihl*4;
+ struct tcphdr *th;
+ __u32 seq;
+
+ if (!ip_vs_make_skb_writable(pskb, tcp_offset + sizeof(*th)))
+ return 0;
+
+ th = (struct tcphdr *)((*pskb)->nh.raw + tcp_offset);
+
+ /*
+ * Remember seq number in case this pkt gets resized
+ */
+ seq = ntohl(th->seq);
+
+ /*
+ * Fix seq stuff if flagged as so.
+ */
+ if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
+ vs_fix_seq(&cp->out_seq, th);
+ if (cp->flags & IP_VS_CONN_F_IN_SEQ)
+ vs_fix_ack_seq(&cp->in_seq, th);
+
+ /*
+ * Call private output hook function
+ */
+ if (app->pkt_out == NULL)
+ return 1;
+
+ if (!app->pkt_out(app, cp, pskb, &diff))
+ return 0;
+
+ /*
+ * Update ip_vs seq stuff if len has changed.
+ */
+ if (diff != 0)
+ vs_seq_update(cp, &cp->out_seq,
+ IP_VS_CONN_F_OUT_SEQ, seq, diff);
+
+ return 1;
+}
+
+/*
+ *	Output pkt hook. Will call the bound ip_vs_app specific function.
+ *	Called by the ipvs packet handler; assumes cp!=NULL was checked
+ *	previously. Returns false if it can't handle the packet (oom).
+ */
+int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff **pskb)
+{
+ struct ip_vs_app *app;
+
+ /*
+ * check if application module is bound to
+ * this ip_vs_conn.
+ */
+ if ((app = cp->app) == NULL)
+ return 1;
+
+ /* TCP is complicated */
+ if (cp->protocol == IPPROTO_TCP)
+ return app_tcp_pkt_out(cp, pskb, app);
+
+ /*
+ * Call private output hook function
+ */
+ if (app->pkt_out == NULL)
+ return 1;
+
+ return app->pkt_out(app, cp, pskb, NULL);
+}
+
+
+static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff **pskb,
+ struct ip_vs_app *app)
+{
+ int diff;
+ unsigned int tcp_offset = (*pskb)->nh.iph->ihl*4;
+ struct tcphdr *th;
+ __u32 seq;
+
+ if (!ip_vs_make_skb_writable(pskb, tcp_offset + sizeof(*th)))
+ return 0;
+
+ th = (struct tcphdr *)((*pskb)->nh.raw + tcp_offset);
+
+ /*
+ * Remember seq number in case this pkt gets resized
+ */
+ seq = ntohl(th->seq);
+
+ /*
+ * Fix seq stuff if flagged as so.
+ */
+ if (cp->flags & IP_VS_CONN_F_IN_SEQ)
+ vs_fix_seq(&cp->in_seq, th);
+ if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
+ vs_fix_ack_seq(&cp->out_seq, th);
+
+ /*
+ * Call private input hook function
+ */
+ if (app->pkt_in == NULL)
+ return 1;
+
+ if (!app->pkt_in(app, cp, pskb, &diff))
+ return 0;
+
+ /*
+ * Update ip_vs seq stuff if len has changed.
+ */
+ if (diff != 0)
+ vs_seq_update(cp, &cp->in_seq,
+ IP_VS_CONN_F_IN_SEQ, seq, diff);
+
+ return 1;
+}
+
+/*
+ *	Input pkt hook. Will call the bound ip_vs_app specific function.
+ *	Called by the ipvs packet handler; assumes cp!=NULL was checked
+ *	previously. Returns false if it can't handle the packet (oom).
+ */
+int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff **pskb)
+{
+ struct ip_vs_app *app;
+
+ /*
+ * check if application module is bound to
+ * this ip_vs_conn.
+ */
+ if ((app = cp->app) == NULL)
+ return 1;
+
+ /* TCP is complicated */
+ if (cp->protocol == IPPROTO_TCP)
+ return app_tcp_pkt_in(cp, pskb, app);
+
+ /*
+ * Call private input hook function
+ */
+ if (app->pkt_in == NULL)
+ return 1;
+
+ return app->pkt_in(app, cp, pskb, NULL);
+}
+
+
+#ifdef CONFIG_PROC_FS
+/*
+ * /proc/net/ip_vs_app entry function
+ */
+
+static struct ip_vs_app *ip_vs_app_idx(loff_t pos)
+{
+ struct ip_vs_app *app, *inc;
+
+ list_for_each_entry(app, &ip_vs_app_list, a_list) {
+ list_for_each_entry(inc, &app->incs_list, a_list) {
+ if (pos-- == 0)
+ return inc;
+ }
+ }
+ return NULL;
+
+}
+
+static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ down(&__ip_vs_app_mutex);
+
+ return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct ip_vs_app *inc, *app;
+ struct list_head *e;
+
+ ++*pos;
+ if (v == SEQ_START_TOKEN)
+ return ip_vs_app_idx(0);
+
+ inc = v;
+ app = inc->app;
+
+ if ((e = inc->a_list.next) != &app->incs_list)
+ return list_entry(e, struct ip_vs_app, a_list);
+
+ /* go on to next application */
+ for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) {
+ app = list_entry(e, struct ip_vs_app, a_list);
+ list_for_each_entry(inc, &app->incs_list, a_list) {
+ return inc;
+ }
+ }
+ return NULL;
+}
+
+static void ip_vs_app_seq_stop(struct seq_file *seq, void *v)
+{
+ up(&__ip_vs_app_mutex);
+}
+
+static int ip_vs_app_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq, "prot port usecnt name\n");
+ else {
+ const struct ip_vs_app *inc = v;
+
+ seq_printf(seq, "%-3s %-7u %-6d %-17s\n",
+ ip_vs_proto_name(inc->protocol),
+ ntohs(inc->port),
+ atomic_read(&inc->usecnt),
+ inc->name);
+ }
+ return 0;
+}
+
+static struct seq_operations ip_vs_app_seq_ops = {
+ .start = ip_vs_app_seq_start,
+ .next = ip_vs_app_seq_next,
+ .stop = ip_vs_app_seq_stop,
+ .show = ip_vs_app_seq_show,
+};
+
+static int ip_vs_app_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &ip_vs_app_seq_ops);
+}
+
+static struct file_operations ip_vs_app_fops = {
+ .owner = THIS_MODULE,
+ .open = ip_vs_app_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+#endif
+
+
+/*
+ * Replace a segment of data with a new segment
+ */
+int ip_vs_skb_replace(struct sk_buff *skb, int pri,
+ char *o_buf, int o_len, char *n_buf, int n_len)
+{
+ struct iphdr *iph;
+ int diff;
+ int o_offset;
+ int o_left;
+
+ EnterFunction(9);
+
+ diff = n_len - o_len;
+ o_offset = o_buf - (char *)skb->data;
+ /* The length of left data after o_buf+o_len in the skb data */
+ o_left = skb->len - (o_offset + o_len);
+
+ if (diff <= 0) {
+ memmove(o_buf + n_len, o_buf + o_len, o_left);
+ memcpy(o_buf, n_buf, n_len);
+ skb_trim(skb, skb->len + diff);
+ } else if (diff <= skb_tailroom(skb)) {
+ skb_put(skb, diff);
+ memmove(o_buf + n_len, o_buf + o_len, o_left);
+ memcpy(o_buf, n_buf, n_len);
+ } else {
+ if (pskb_expand_head(skb, skb_headroom(skb), diff, pri))
+ return -ENOMEM;
+ skb_put(skb, diff);
+ memmove(skb->data + o_offset + n_len,
+ skb->data + o_offset + o_len, o_left);
+ memcpy(skb->data + o_offset, n_buf, n_len);
+ }
+
+ /* must update the iph total length here */
+ iph = skb->nh.iph;
+ iph->tot_len = htons(skb->len);
+
+ LeaveFunction(9);
+ return 0;
+}
+
+
+int ip_vs_app_init(void)
+{
+ /* we will replace it with proc_net_ipvs_create() soon */
+ proc_net_fops_create("ip_vs_app", 0, &ip_vs_app_fops);
+ return 0;
+}
+
+
+void ip_vs_app_cleanup(void)
+{
+ proc_net_remove("ip_vs_app");
+}
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
new file mode 100644
index 00000000000..fd6feb5499f
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -0,0 +1,920 @@
+/*
+ * IPVS An implementation of the IP virtual server support for the
+ * LINUX operating system. IPVS is now implemented as a module
+ * over the Netfilter framework. IPVS can be used to build a
+ * high-performance and highly available server based on a
+ * cluster of servers.
+ *
+ * Version: $Id: ip_vs_conn.c,v 1.31 2003/04/18 09:03:16 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Peter Kese <peter.kese@ijs.si>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
+ * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
+ * and others. Much of the code here is taken from the IP MASQ code of kernel 2.2.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+#include <linux/proc_fs.h> /* for proc_net_* */
+#include <linux/seq_file.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ *	Connection hash table: for input and output packet lookups of IPVS
+ */
+static struct list_head *ip_vs_conn_tab;
+
+/* SLAB cache for IPVS connections */
+static kmem_cache_t *ip_vs_conn_cachep;
+
+/* counter for current IPVS connections */
+static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
+
+/* counter for no client port connections */
+static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
+
+/* random value for IPVS connection hash */
+static unsigned int ip_vs_conn_rnd;
+
+/*
+ * Fine locking granularity for big connection hash table
+ */
+#define CT_LOCKARRAY_BITS 4
+#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS)
+#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1)
+
+struct ip_vs_aligned_lock
+{
+ rwlock_t l;
+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
+
+/* lock array for conn table */
+static struct ip_vs_aligned_lock
+__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
+
+static inline void ct_read_lock(unsigned key)
+{
+ read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_read_unlock(unsigned key)
+{
+ read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_write_lock(unsigned key)
+{
+ write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_write_unlock(unsigned key)
+{
+ write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_read_lock_bh(unsigned key)
+{
+ read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_read_unlock_bh(unsigned key)
+{
+ read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_write_lock_bh(unsigned key)
+{
+ write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_write_unlock_bh(unsigned key)
+{
+ write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+
+/*
+ * Returns hash value for IPVS connection entry
+ */
+static unsigned int ip_vs_conn_hashkey(unsigned proto, __u32 addr, __u16 port)
+{
+ return jhash_3words(addr, port, proto, ip_vs_conn_rnd)
+ & IP_VS_CONN_TAB_MASK;
+}
+
+
+/*
+ * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
+ * returns bool success.
+ */
+static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
+{
+ unsigned hash;
+ int ret;
+
+ /* Hash by protocol, client address and port */
+ hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
+
+ ct_write_lock(hash);
+
+ if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
+ list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
+ cp->flags |= IP_VS_CONN_F_HASHED;
+ atomic_inc(&cp->refcnt);
+ ret = 1;
+ } else {
+ IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, "
+ "called from %p\n", __builtin_return_address(0));
+ ret = 0;
+ }
+
+ ct_write_unlock(hash);
+
+ return ret;
+}
+
+
+/*
+ * UNhashes ip_vs_conn from ip_vs_conn_tab.
+ * returns bool success.
+ */
+static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
+{
+ unsigned hash;
+ int ret;
+
+ /* unhash it and decrease its reference counter */
+ hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
+
+ ct_write_lock(hash);
+
+ if (cp->flags & IP_VS_CONN_F_HASHED) {
+ list_del(&cp->c_list);
+ cp->flags &= ~IP_VS_CONN_F_HASHED;
+ atomic_dec(&cp->refcnt);
+ ret = 1;
+ } else
+ ret = 0;
+
+ ct_write_unlock(hash);
+
+ return ret;
+}
+
+
+/*
+ * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
+ * Called for pkts coming from OUTside-to-INside.
+ * s_addr, s_port: pkt source address (foreign host)
+ * d_addr, d_port: pkt dest address (load balancer)
+ */
+static inline struct ip_vs_conn *__ip_vs_conn_in_get
+(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
+{
+ unsigned hash;
+ struct ip_vs_conn *cp;
+
+ hash = ip_vs_conn_hashkey(protocol, s_addr, s_port);
+
+ ct_read_lock(hash);
+
+ list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+ if (s_addr==cp->caddr && s_port==cp->cport &&
+ d_port==cp->vport && d_addr==cp->vaddr &&
+ protocol==cp->protocol) {
+ /* HIT */
+ atomic_inc(&cp->refcnt);
+ ct_read_unlock(hash);
+ return cp;
+ }
+ }
+
+ ct_read_unlock(hash);
+
+ return NULL;
+}
+
+struct ip_vs_conn *ip_vs_conn_in_get
+(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
+{
+ struct ip_vs_conn *cp;
+
+ cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port);
+ if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
+ cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port);
+
+ IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
+ ip_vs_proto_name(protocol),
+ NIPQUAD(s_addr), ntohs(s_port),
+ NIPQUAD(d_addr), ntohs(d_port),
+ cp?"hit":"not hit");
+
+ return cp;
+}
+
+
+/*
+ * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
+ * Called for pkts coming from inside-to-OUTside.
+ * s_addr, s_port: pkt source address (inside host)
+ * d_addr, d_port: pkt dest address (foreign host)
+ */
+struct ip_vs_conn *ip_vs_conn_out_get
+(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
+{
+ unsigned hash;
+ struct ip_vs_conn *cp, *ret=NULL;
+
+ /*
+ * Check for "full" addressed entries
+ */
+ hash = ip_vs_conn_hashkey(protocol, d_addr, d_port);
+
+ ct_read_lock(hash);
+
+ list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+ if (d_addr == cp->caddr && d_port == cp->cport &&
+ s_port == cp->dport && s_addr == cp->daddr &&
+ protocol == cp->protocol) {
+ /* HIT */
+ atomic_inc(&cp->refcnt);
+ ret = cp;
+ break;
+ }
+ }
+
+ ct_read_unlock(hash);
+
+ IP_VS_DBG(7, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
+ ip_vs_proto_name(protocol),
+ NIPQUAD(s_addr), ntohs(s_port),
+ NIPQUAD(d_addr), ntohs(d_port),
+ ret?"hit":"not hit");
+
+ return ret;
+}
+
+
+/*
+ * Put back the conn and restart its timer with its timeout
+ */
+void ip_vs_conn_put(struct ip_vs_conn *cp)
+{
+ /* reset the timer so the conn expires after its timeout */
+ mod_timer(&cp->timer, jiffies+cp->timeout);
+
+ __ip_vs_conn_put(cp);
+}
+
+
+/*
+ * Fill a no_client_port connection with a client port number
+ */
+void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __u16 cport)
+{
+ if (ip_vs_conn_unhash(cp)) {
+ spin_lock(&cp->lock);
+ if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
+ atomic_dec(&ip_vs_conn_no_cport_cnt);
+ cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
+ cp->cport = cport;
+ }
+ spin_unlock(&cp->lock);
+
+ /* rehash with the new cport */
+ ip_vs_conn_hash(cp);
+ }
+}
+
+
+/*
+ * Bind a connection entry with the corresponding packet_xmit.
+ * Called by ip_vs_conn_new.
+ */
+static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
+{
+ switch (IP_VS_FWD_METHOD(cp)) {
+ case IP_VS_CONN_F_MASQ:
+ cp->packet_xmit = ip_vs_nat_xmit;
+ break;
+
+ case IP_VS_CONN_F_TUNNEL:
+ cp->packet_xmit = ip_vs_tunnel_xmit;
+ break;
+
+ case IP_VS_CONN_F_DROUTE:
+ cp->packet_xmit = ip_vs_dr_xmit;
+ break;
+
+ case IP_VS_CONN_F_LOCALNODE:
+ cp->packet_xmit = ip_vs_null_xmit;
+ break;
+
+ case IP_VS_CONN_F_BYPASS:
+ cp->packet_xmit = ip_vs_bypass_xmit;
+ break;
+ }
+}
+
+
+static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
+{
+ return atomic_read(&dest->activeconns)
+ + atomic_read(&dest->inactconns);
+}
+
+/*
+ * Bind a connection entry with a virtual service destination
+ * Called just after a new connection entry is created.
+ */
+static inline void
+ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
+{
+ /* if dest is NULL, then return directly */
+ if (!dest)
+ return;
+
+ /* Increase the refcnt counter of the dest */
+ atomic_inc(&dest->refcnt);
+
+ /* Bind with the destination and its corresponding transmitter */
+ cp->flags |= atomic_read(&dest->conn_flags);
+ cp->dest = dest;
+
+ IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
+ "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n",
+ ip_vs_proto_name(cp->protocol),
+ NIPQUAD(cp->caddr), ntohs(cp->cport),
+ NIPQUAD(cp->vaddr), ntohs(cp->vport),
+ NIPQUAD(cp->daddr), ntohs(cp->dport),
+ ip_vs_fwd_tag(cp), cp->state,
+ cp->flags, atomic_read(&cp->refcnt),
+ atomic_read(&dest->refcnt));
+
+ /* Update the connection counters */
+ if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) {
+ /* It is a normal connection, so increase the inactive
+ connection counter because it is in TCP SYNRECV
+ state (inactive) or another protocol's inactive state */
+ atomic_inc(&dest->inactconns);
+ } else {
+ /* It is a persistent connection/template, so increase
+ the persistent connection counter */
+ atomic_inc(&dest->persistconns);
+ }
+
+ if (dest->u_threshold != 0 &&
+ ip_vs_dest_totalconns(dest) >= dest->u_threshold)
+ dest->flags |= IP_VS_DEST_F_OVERLOAD;
+}
+
+
+/*
+ * Unbind a connection entry with its VS destination
+ * Called by the ip_vs_conn_expire function.
+ */
+static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
+{
+ struct ip_vs_dest *dest = cp->dest;
+
+ if (!dest)
+ return;
+
+ IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
+ "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n",
+ ip_vs_proto_name(cp->protocol),
+ NIPQUAD(cp->caddr), ntohs(cp->cport),
+ NIPQUAD(cp->vaddr), ntohs(cp->vport),
+ NIPQUAD(cp->daddr), ntohs(cp->dport),
+ ip_vs_fwd_tag(cp), cp->state,
+ cp->flags, atomic_read(&cp->refcnt),
+ atomic_read(&dest->refcnt));
+
+ /* Update the connection counters */
+ if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) {
+ /* It is a normal connection, so decrease the inactconns
+ or activeconns counter */
+ if (cp->flags & IP_VS_CONN_F_INACTIVE) {
+ atomic_dec(&dest->inactconns);
+ } else {
+ atomic_dec(&dest->activeconns);
+ }
+ } else {
+ /* It is a persistent connection/template, so decrease
+ the persistent connection counter */
+ atomic_dec(&dest->persistconns);
+ }
+
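+ /* clear the overload flag once the load drops below the lower
+ threshold, or below 3/4 of the upper threshold when only an
+ upper threshold is configured */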
+ if (dest->l_threshold != 0) {
+ if (ip_vs_dest_totalconns(dest) < dest->l_threshold)
+ dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+ } else if (dest->u_threshold != 0) {
+ if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3)
+ dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+ } else {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+ }
+
+ /*
+ * Simply decrease the refcnt of the dest, because the
+ * dest will be either in service's destination list
+ * or in the trash.
+ */
+ atomic_dec(&dest->refcnt);
+}
+
+
+/*
+ * Checking if the destination of a connection template is available.
+ * If available, return 1, otherwise invalidate this connection
+ * template and return 0.
+ */
+int ip_vs_check_template(struct ip_vs_conn *ct)
+{
+ struct ip_vs_dest *dest = ct->dest;
+
+ /*
+ * Checking the dest server status.
+ */
+ if ((dest == NULL) ||
+ !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
+ (sysctl_ip_vs_expire_quiescent_template &&
+ (atomic_read(&dest->weight) == 0))) {
+ IP_VS_DBG(9, "check_template: dest not available for "
+ "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
+ "-> d:%u.%u.%u.%u:%d\n",
+ ip_vs_proto_name(ct->protocol),
+ NIPQUAD(ct->caddr), ntohs(ct->cport),
+ NIPQUAD(ct->vaddr), ntohs(ct->vport),
+ NIPQUAD(ct->daddr), ntohs(ct->dport));
+
+ /*
+ * Invalidate the connection template: rehash it with altered
+ * ports so that subsequent template lookups cannot match it.
+ */
+ if (ct->cport) {
+ if (ip_vs_conn_unhash(ct)) {
+ ct->dport = 65535;
+ ct->vport = 65535;
+ ct->cport = 0;
+ ip_vs_conn_hash(ct);
+ }
+ }
+
+ /*
+ * Simply decrease the refcnt of the template,
+ * don't restart its timer.
+ */
+ atomic_dec(&ct->refcnt);
+ return 0;
+ }
+ return 1;
+}
+
+static void ip_vs_conn_expire(unsigned long data)
+{
+ struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
+
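+ /* delay used when re-arming the timer if the entry cannot
+ be released yet (see the expire_later path below) */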
+ cp->timeout = 60*HZ;
+
+ /*
+ * hey, I'm using it
+ */
+ atomic_inc(&cp->refcnt);
+
+ /*
+ * do I control anybody?
+ */
+ if (atomic_read(&cp->n_control))
+ goto expire_later;
+
+ /*
+ * unhash it if it is hashed in the conn table
+ */
+ if (!ip_vs_conn_unhash(cp))
+ goto expire_later;
+
+ /*
+ * refcnt==1 implies I'm the only referrer
+ */
+ if (likely(atomic_read(&cp->refcnt) == 1)) {
+ /* delete the timer if it is activated by other users */
+ if (timer_pending(&cp->timer))
+ del_timer(&cp->timer);
+
+ /* does anybody control me? */
+ if (cp->control)
+ ip_vs_control_del(cp);
+
+ if (unlikely(cp->app != NULL))
+ ip_vs_unbind_app(cp);
+ ip_vs_unbind_dest(cp);
+ if (cp->flags & IP_VS_CONN_F_NO_CPORT)
+ atomic_dec(&ip_vs_conn_no_cport_cnt);
+ atomic_dec(&ip_vs_conn_count);
+
+ kmem_cache_free(ip_vs_conn_cachep, cp);
+ return;
+ }
+
+ /* hash it back to the table */
+ ip_vs_conn_hash(cp);
+
+ expire_later:
+ IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n",
+ atomic_read(&cp->refcnt)-1,
+ atomic_read(&cp->n_control));
+
+ ip_vs_conn_put(cp);
+}
+
+
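+/*
+ * Expire the connection right away: if its timer is pending,
+ * re-arm it to fire immediately, then drop the reference held
+ * by the caller.
+ */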
+void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
+{
+ if (del_timer(&cp->timer))
+ mod_timer(&cp->timer, jiffies);
+ __ip_vs_conn_put(cp);
+}
+
+
+/*
+ * Create a new connection entry and hash it into the ip_vs_conn_tab
+ */
+struct ip_vs_conn *
+ip_vs_conn_new(int proto, __u32 caddr, __u16 cport, __u32 vaddr, __u16 vport,
+ __u32 daddr, __u16 dport, unsigned flags,
+ struct ip_vs_dest *dest)
+{
+ struct ip_vs_conn *cp;
+ struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
+
+ cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
+ if (cp == NULL) {
+ IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n");
+ return NULL;
+ }
+
+ memset(cp, 0, sizeof(*cp));
+ INIT_LIST_HEAD(&cp->c_list);
+ init_timer(&cp->timer);
+ cp->timer.data = (unsigned long)cp;
+ cp->timer.function = ip_vs_conn_expire;
+ cp->protocol = proto;
+ cp->caddr = caddr;
+ cp->cport = cport;
+ cp->vaddr = vaddr;
+ cp->vport = vport;
+ cp->daddr = daddr;
+ cp->dport = dport;
+ cp->flags = flags;
+ spin_lock_init(&cp->lock);
+
+ /*
+ * Mark the entry as referenced by the current thread before hashing
+ * it into the table, so that another thread running
+ * ip_vs_random_dropentry cannot drop this entry.
+ */
+ atomic_set(&cp->refcnt, 1);
+
+ atomic_set(&cp->n_control, 0);
+ atomic_set(&cp->in_pkts, 0);
+
+ atomic_inc(&ip_vs_conn_count);
+ if (flags & IP_VS_CONN_F_NO_CPORT)
+ atomic_inc(&ip_vs_conn_no_cport_cnt);
+
+ /* Bind the connection with a destination server */
+ ip_vs_bind_dest(cp, dest);
+
+ /* Set its state and timeout */
+ cp->state = 0;
+ cp->timeout = 3*HZ;
+
+ /* Bind its packet transmitter */
+ ip_vs_bind_xmit(cp);
+
+ if (unlikely(pp && atomic_read(&pp->appcnt)))
+ ip_vs_bind_app(cp, pp);
+
+ /* Hash it in the ip_vs_conn_tab finally */
+ ip_vs_conn_hash(cp);
+
+ return cp;
+}
+
+
+/*
+ * /proc/net/ip_vs_conn entries
+ */
+#ifdef CONFIG_PROC_FS
+
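+/*
+ * Return the pos'th connection in the table (0-based). On success
+ * the entry's hash bucket is left read-locked and seq->private
+ * points at it, so that _next/_stop know which lock to release.
+ */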
+static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
+{
+ int idx;
+ struct ip_vs_conn *cp;
+
+ for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
+ ct_read_lock_bh(idx);
+ list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+ if (pos-- == 0) {
+ seq->private = &ip_vs_conn_tab[idx];
+ return cp;
+ }
+ }
+ ct_read_unlock_bh(idx);
+ }
+
+ return NULL;
+}
+
+static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ seq->private = NULL;
+ return *pos ? ip_vs_conn_array(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct ip_vs_conn *cp = v;
+ struct list_head *e, *l = seq->private;
+ int idx;
+
+ ++*pos;
+ if (v == SEQ_START_TOKEN)
+ return ip_vs_conn_array(seq, 0);
+
+ /* more on same hash chain? */
+ if ((e = cp->c_list.next) != l)
+ return list_entry(e, struct ip_vs_conn, c_list);
+
+ idx = l - ip_vs_conn_tab;
+ ct_read_unlock_bh(idx);
+
+ while (++idx < IP_VS_CONN_TAB_SIZE) {
+ ct_read_lock_bh(idx);
+ list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+ seq->private = &ip_vs_conn_tab[idx];
+ return cp;
+ }
+ ct_read_unlock_bh(idx);
+ }
+ seq->private = NULL;
+ return NULL;
+}
+
+static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
+{
+ struct list_head *l = seq->private;
+
+ if (l)
+ ct_read_unlock_bh(l - ip_vs_conn_tab);
+}
+
+static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
+{
+
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq,
+ "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n");
+ else {
+ const struct ip_vs_conn *cp = v;
+
+ seq_printf(seq,
+ "%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu\n",
+ ip_vs_proto_name(cp->protocol),
+ ntohl(cp->caddr), ntohs(cp->cport),
+ ntohl(cp->vaddr), ntohs(cp->vport),
+ ntohl(cp->daddr), ntohs(cp->dport),
+ ip_vs_state_name(cp->protocol, cp->state),
+ (cp->timer.expires-jiffies)/HZ);
+ }
+ return 0;
+}
+
+static struct seq_operations ip_vs_conn_seq_ops = {
+ .start = ip_vs_conn_seq_start,
+ .next = ip_vs_conn_seq_next,
+ .stop = ip_vs_conn_seq_stop,
+ .show = ip_vs_conn_seq_show,
+};
+
+static int ip_vs_conn_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &ip_vs_conn_seq_ops);
+}
+
+static struct file_operations ip_vs_conn_fops = {
+ .owner = THIS_MODULE,
+ .open = ip_vs_conn_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+#endif
+
+
+/*
+ * Randomly drop connection entries before running out of memory
+ */
+static inline int todrop_entry(struct ip_vs_conn *cp)
+{
+ /*
+ * The drop rate array needs tuning for real environments.
+ * Called from timer bh only => no locking
+ */
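+ /* todrop_rate[i] applies to entries that have seen i incoming
+ packets: one of every todrop_rate[i] candidates is dropped,
+ with todrop_counter[i] counting down between drops; a rate
+ of 0 means such entries are never dropped */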
+ static char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
+ static char todrop_counter[9] = {0};
+ int i;
+
+ /* if the conn entry hasn't lasted for 60 seconds, don't drop it.
+ This will leave enough time for a normal connection to get
+ through. */
+ if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ))
+ return 0;
+
+ /* Don't drop the entry if its number of incoming packets is not
+ located in [0, 8] */
+ i = atomic_read(&cp->in_pkts);
+ if (i > 8 || i < 0) return 0;
+
+ if (!todrop_rate[i]) return 0;
+ if (--todrop_counter[i] > 0) return 0;
+
+ todrop_counter[i] = todrop_rate[i];
+ return 1;
+}
+
+
+void ip_vs_random_dropentry(void)
+{
+ int idx;
+ struct ip_vs_conn *cp;
+ struct ip_vs_conn *ct;
+
+ /*
+ * Randomly scan 1/32 of the whole table every second
+ */
+ for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) {
+ unsigned hash = net_random() & IP_VS_CONN_TAB_MASK;
+
+ /*
+ * Lock is actually needed in this loop.
+ */
+ ct_write_lock(hash);
+
+ list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+ if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT))
+ /* connection template */
+ continue;
+
+ if (cp->protocol == IPPROTO_TCP) {
+ switch(cp->state) {
+ case IP_VS_TCP_S_SYN_RECV:
+ case IP_VS_TCP_S_SYNACK:
+ break;
+
+ case IP_VS_TCP_S_ESTABLISHED:
+ if (todrop_entry(cp))
+ break;
+ continue;
+
+ default:
+ continue;
+ }
+ } else {
+ if (!todrop_entry(cp))
+ continue;
+ }
+
+ /*
+ * Drop the entry, and drop its ct if not referenced
+ */
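+ /* cp is referenced below so it stays valid; the bucket lock is
+ released while expiring cp and its controlling template, and
+ re-taken before the scan continues */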
+ atomic_inc(&cp->refcnt);
+ ct_write_unlock(hash);
+
+ if ((ct = cp->control))
+ atomic_inc(&ct->refcnt);
+ IP_VS_DBG(4, "del connection\n");
+ ip_vs_conn_expire_now(cp);
+ if (ct) {
+ IP_VS_DBG(4, "del conn template\n");
+ ip_vs_conn_expire_now(ct);
+ }
+ ct_write_lock(hash);
+ }
+ ct_write_unlock(hash);
+ }
+}
+
+
+/*
+ * Flush all the connection entries in the ip_vs_conn_tab
+ */
+static void ip_vs_conn_flush(void)
+{
+ int idx;
+ struct ip_vs_conn *cp;
+ struct ip_vs_conn *ct;
+
+ flush_again:
+ for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
+ /*
+ * Lock is actually needed in this loop.
+ */
+ ct_write_lock_bh(idx);
+
+ list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+ atomic_inc(&cp->refcnt);
+ ct_write_unlock(idx);
+
+ if ((ct = cp->control))
+ atomic_inc(&ct->refcnt);
+ IP_VS_DBG(4, "del connection\n");
+ ip_vs_conn_expire_now(cp);
+ if (ct) {
+ IP_VS_DBG(4, "del conn template\n");
+ ip_vs_conn_expire_now(ct);
+ }
+ ct_write_lock(idx);
+ }
+ ct_write_unlock_bh(idx);
+ }
+
+ /* the counter may not be zero, because some conn entries may
+ still be handled by a slow timer handler, or may be unhashed
+ but still referenced */
+ if (atomic_read(&ip_vs_conn_count) != 0) {
+ schedule();
+ goto flush_again;
+ }
+}
+
+
+int ip_vs_conn_init(void)
+{
+ int idx;
+
+ /*
+ * Allocate the connection hash table and initialize its list heads
+ */
+ ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head));
+ if (!ip_vs_conn_tab)
+ return -ENOMEM;
+
+ /* Allocate ip_vs_conn slab cache */
+ ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
+ sizeof(struct ip_vs_conn), 0,
+ SLAB_HWCACHE_ALIGN, NULL, NULL);
+ if (!ip_vs_conn_cachep) {
+ vfree(ip_vs_conn_tab);
+ return -ENOMEM;
+ }
+
+ IP_VS_INFO("Connection hash table configured "
+ "(size=%d, memory=%ldKbytes)\n",
+ IP_VS_CONN_TAB_SIZE,
+ (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
+ IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
+ sizeof(struct ip_vs_conn));
+
+ for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
+ INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
+ }
+
+ for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
+ rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
+ }
+
+ proc_net_fops_create("ip_vs_conn", 0, &ip_vs_conn_fops);
+
+ /* calculate the random value for connection hash */
+ get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
+
+ return 0;
+}
+
+
+void ip_vs_conn_cleanup(void)
+{
+ /* flush all the connection entries first */
+ ip_vs_conn_flush();
+
+ /* Release the empty cache */
+ kmem_cache_destroy(ip_vs_conn_cachep);
+ proc_net_remove("ip_vs_conn");
+ vfree(ip_vs_conn_tab);
+}
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
new file mode 100644
index 00000000000..5fb257dd07c
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -0,0 +1,1191 @@
+/*
+ * IPVS An implementation of the IP virtual server support for the
+ * LINUX operating system. IPVS is now implemented as a module
+ * over the Netfilter framework. IPVS can be used to build a
+ * high-performance and highly available server based on a
+ * cluster of servers.
+ *
+ * Version: $Id: ip_vs_core.c,v 1.34 2003/05/10 03:05:23 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Peter Kese <peter.kese@ijs.si>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
+ * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
+ * and others.
+ *
+ * Changes:
+ * Paul `Rusty' Russell properly handle non-linear skbs
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/icmp.h>
+
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/icmp.h> /* for icmp_send */
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+
+EXPORT_SYMBOL(register_ip_vs_scheduler);
+EXPORT_SYMBOL(unregister_ip_vs_scheduler);
+EXPORT_SYMBOL(ip_vs_skb_replace);
+EXPORT_SYMBOL(ip_vs_proto_name);
+EXPORT_SYMBOL(ip_vs_conn_new);
+EXPORT_SYMBOL(ip_vs_conn_in_get);
+EXPORT_SYMBOL(ip_vs_conn_out_get);
+#ifdef CONFIG_IP_VS_PROTO_TCP
+EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
+#endif
+EXPORT_SYMBOL(ip_vs_conn_put);
+#ifdef CONFIG_IP_VS_DEBUG
+EXPORT_SYMBOL(ip_vs_get_debug_level);
+#endif
+EXPORT_SYMBOL(ip_vs_make_skb_writable);
+
+
+/* ID used in ICMP lookups */
+#define icmp_id(icmph) (((icmph)->un).echo.id)
+
+const char *ip_vs_proto_name(unsigned proto)
+{
+ static char buf[20];
+
+ switch (proto) {
+ case IPPROTO_IP:
+ return "IP";
+ case IPPROTO_UDP:
+ return "UDP";
+ case IPPROTO_TCP:
+ return "TCP";
+ case IPPROTO_ICMP:
+ return "ICMP";
+ default:
+ sprintf(buf, "IP_%d", proto);
+ return buf;
+ }
+}
+
+void ip_vs_init_hash_table(struct list_head *table, int rows)
+{
+ while (--rows >= 0)
+ INIT_LIST_HEAD(&table[rows]);
+}
+
+static inline void
+ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+ struct ip_vs_dest *dest = cp->dest;
+ if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+ spin_lock(&dest->stats.lock);
+ dest->stats.inpkts++;
+ dest->stats.inbytes += skb->len;
+ spin_unlock(&dest->stats.lock);
+
+ spin_lock(&dest->svc->stats.lock);
+ dest->svc->stats.inpkts++;
+ dest->svc->stats.inbytes += skb->len;
+ spin_unlock(&dest->svc->stats.lock);
+
+ spin_lock(&ip_vs_stats.lock);
+ ip_vs_stats.inpkts++;
+ ip_vs_stats.inbytes += skb->len;
+ spin_unlock(&ip_vs_stats.lock);
+ }
+}
+
+
+static inline void
+ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+ struct ip_vs_dest *dest = cp->dest;
+ if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+ spin_lock(&dest->stats.lock);
+ dest->stats.outpkts++;
+ dest->stats.outbytes += skb->len;
+ spin_unlock(&dest->stats.lock);
+
+ spin_lock(&dest->svc->stats.lock);
+ dest->svc->stats.outpkts++;
+ dest->svc->stats.outbytes += skb->len;
+ spin_unlock(&dest->svc->stats.lock);
+
+ spin_lock(&ip_vs_stats.lock);
+ ip_vs_stats.outpkts++;
+ ip_vs_stats.outbytes += skb->len;
+ spin_unlock(&ip_vs_stats.lock);
+ }
+}
+
+
+static inline void
+ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
+{
+ spin_lock(&cp->dest->stats.lock);
+ cp->dest->stats.conns++;
+ spin_unlock(&cp->dest->stats.lock);
+
+ spin_lock(&svc->stats.lock);
+ svc->stats.conns++;
+ spin_unlock(&svc->stats.lock);
+
+ spin_lock(&ip_vs_stats.lock);
+ ip_vs_stats.conns++;
+ spin_unlock(&ip_vs_stats.lock);
+}
+
+
+static inline int
+ip_vs_set_state(struct ip_vs_conn *cp, int direction,
+ const struct sk_buff *skb,
+ struct ip_vs_protocol *pp)
+{
+ if (unlikely(!pp->state_transition))
+ return 0;
+ return pp->state_transition(cp, direction, skb, pp);
+}
+
+
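+/*
+ * Make the first writable_len bytes of the skb safe to modify in
+ * place: copy the whole skb if it is shared or owned by a socket,
+ * copy its data if it is cloned, otherwise just pull the headers
+ * into the linear area. Returns non-zero on success, 0 on failure.
+ */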
+int ip_vs_make_skb_writable(struct sk_buff **pskb, int writable_len)
+{
+ struct sk_buff *skb = *pskb;
+
+ /* skb is already used, better copy skb and its payload */
+ if (unlikely(skb_shared(skb) || skb->sk))
+ goto copy_skb;
+
+ /* skb data is already used, copy it */
+ if (unlikely(skb_cloned(skb)))
+ goto copy_data;
+
+ return pskb_may_pull(skb, writable_len);
+
+ copy_data:
+ if (unlikely(writable_len > skb->len))
+ return 0;
+ return !pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+
+ copy_skb:
+ if (unlikely(writable_len > skb->len))
+ return 0;
+ skb = skb_copy(skb, GFP_ATOMIC);
+ if (!skb)
+ return 0;
+ BUG_ON(skb_is_nonlinear(skb));
+
+ /* Rest of kernel will get very unhappy if we pass it a
+ suddenly-orphaned skbuff */
+ if ((*pskb)->sk)
+ skb_set_owner_w(skb, (*pskb)->sk);
+ kfree_skb(*pskb);
+ *pskb = skb;
+ return 1;
+}
+
+/*
+ * IPVS persistent scheduling function
+ * It creates a connection entry according to its template if one exists,
+ * or selects a server and creates a connection entry plus a template.
+ * Locking: we are svc user (svc->refcnt), so we hold all dests too
+ * Protocols supported: TCP, UDP
+ */
+static struct ip_vs_conn *
+ip_vs_sched_persist(struct ip_vs_service *svc,
+ const struct sk_buff *skb,
+ __u16 ports[2])
+{
+ struct ip_vs_conn *cp = NULL;
+ struct iphdr *iph = skb->nh.iph;
+ struct ip_vs_dest *dest;
+ struct ip_vs_conn *ct;
+ __u16 dport; /* destination port to forward */
+ __u32 snet; /* source network of the client, after masking */
+
+ /* Mask saddr with the netmask to adjust template granularity */
+ snet = iph->saddr & svc->netmask;
+
+ IP_VS_DBG(6, "p-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u "
+ "mnet %u.%u.%u.%u\n",
+ NIPQUAD(iph->saddr), ntohs(ports[0]),
+ NIPQUAD(iph->daddr), ntohs(ports[1]),
+ NIPQUAD(snet));
+
+ /*
+ * As far as we know, FTP is a very complicated network protocol, and
+ * it uses a control connection and data connections. For active FTP,
+ * the FTP server initializes the data connection to the client, and
+ * its source port is often 20. For passive FTP, the FTP server tells
+ * the client the port that it passively listens on, and the client
+ * issues the data connection. In the tunneling or direct routing
+ * mode, the load balancer is on the client-to-server half of the
+ * connection, so the port number is unknown to the load balancer.
+ * Therefore, a conn template like <caddr, 0, vaddr, 0, daddr, 0> is
+ * created for persistent FTP service, and a template like
+ * <caddr, 0, vaddr, vport, daddr, dport> is created for other
+ * persistent services.
+ */
+ if (ports[1] == svc->port) {
+ /* Check if a template already exists */
+ if (svc->port != FTPPORT)
+ ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
+ iph->daddr, ports[1]);
+ else
+ ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
+ iph->daddr, 0);
+
+ if (!ct || !ip_vs_check_template(ct)) {
+ /*
+ * No template found or the dest of the connection
+ * template is not available.
+ */
+ dest = svc->scheduler->schedule(svc, skb);
+ if (dest == NULL) {
+ IP_VS_DBG(1, "p-schedule: no dest found.\n");
+ return NULL;
+ }
+
+ /*
+ * Create a template like <protocol,caddr,0,
+ * vaddr,vport,daddr,dport> for non-ftp service,
+ * and <protocol,caddr,0,vaddr,0,daddr,0>
+ * for ftp service.
+ */
+ if (svc->port != FTPPORT)
+ ct = ip_vs_conn_new(iph->protocol,
+ snet, 0,
+ iph->daddr,
+ ports[1],
+ dest->addr, dest->port,
+ 0,
+ dest);
+ else
+ ct = ip_vs_conn_new(iph->protocol,
+ snet, 0,
+ iph->daddr, 0,
+ dest->addr, 0,
+ 0,
+ dest);
+ if (ct == NULL)
+ return NULL;
+
+ ct->timeout = svc->timeout;
+ } else {
+ /* set destination with the found template */
+ dest = ct->dest;
+ }
+ dport = dest->port;
+ } else {
+ /*
+ * Note: persistent fwmark-based services and persistent
+ * port zero service are handled here.
+ * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
+ * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
+ */
+ if (svc->fwmark)
+ ct = ip_vs_conn_in_get(IPPROTO_IP, snet, 0,
+ htonl(svc->fwmark), 0);
+ else
+ ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
+ iph->daddr, 0);
+
+ if (!ct || !ip_vs_check_template(ct)) {
+ /*
+ * If it is not persistent port zero, return NULL,
+ * otherwise create a connection template.
+ */
+ if (svc->port)
+ return NULL;
+
+ dest = svc->scheduler->schedule(svc, skb);
+ if (dest == NULL) {
+ IP_VS_DBG(1, "p-schedule: no dest found.\n");
+ return NULL;
+ }
+
+ /*
+ * Create a template according to the service
+ */
+ if (svc->fwmark)
+ ct = ip_vs_conn_new(IPPROTO_IP,
+ snet, 0,
+ htonl(svc->fwmark), 0,
+ dest->addr, 0,
+ 0,
+ dest);
+ else
+ ct = ip_vs_conn_new(iph->protocol,
+ snet, 0,
+ iph->daddr, 0,
+ dest->addr, 0,
+ 0,
+ dest);
+ if (ct == NULL)
+ return NULL;
+
+ ct->timeout = svc->timeout;
+ } else {
+ /* set destination with the found template */
+ dest = ct->dest;
+ }
+ dport = ports[1];
+ }
+
+ /*
+ * Create a new connection according to the template
+ */
+ cp = ip_vs_conn_new(iph->protocol,
+ iph->saddr, ports[0],
+ iph->daddr, ports[1],
+ dest->addr, dport,
+ 0,
+ dest);
+ if (cp == NULL) {
+ ip_vs_conn_put(ct);
+ return NULL;
+ }
+
+ /*
+ * Add its control
+ */
+ ip_vs_control_add(cp, ct);
+ ip_vs_conn_put(ct);
+
+ ip_vs_conn_stats(cp, svc);
+ return cp;
+}
+
+
+/*
+ * IPVS main scheduling function
+ * It selects a server according to the virtual service, and
+ * creates a connection entry.
+ * Protocols supported: TCP, UDP
+ */
+struct ip_vs_conn *
+ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+ struct ip_vs_conn *cp = NULL;
+ struct iphdr *iph = skb->nh.iph;
+ struct ip_vs_dest *dest;
+ __u16 _ports[2], *pptr;
+
+ pptr = skb_header_pointer(skb, iph->ihl*4,
+ sizeof(_ports), _ports);
+ if (pptr == NULL)
+ return NULL;
+
+ /*
+ * Persistent service
+ */
+ if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+ return ip_vs_sched_persist(svc, skb, pptr);
+
+ /*
+ * Non-persistent service
+ */
+ if (!svc->fwmark && pptr[1] != svc->port) {
+ if (!svc->port)
+ IP_VS_ERR("Schedule: port zero only supported "
+ "in persistent services, "
+ "check your ipvs configuration\n");
+ return NULL;
+ }
+
+ dest = svc->scheduler->schedule(svc, skb);
+ if (dest == NULL) {
+ IP_VS_DBG(1, "Schedule: no dest found.\n");
+ return NULL;
+ }
+
+ /*
+ * Create a connection entry.
+ */
+ cp = ip_vs_conn_new(iph->protocol,
+ iph->saddr, pptr[0],
+ iph->daddr, pptr[1],
+ dest->addr, dest->port?dest->port:pptr[1],
+ 0,
+ dest);
+ if (cp == NULL)
+ return NULL;
+
+ IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u "
+ "d:%u.%u.%u.%u:%u flg:%X cnt:%d\n",
+ ip_vs_fwd_tag(cp),
+ NIPQUAD(cp->caddr), ntohs(cp->cport),
+ NIPQUAD(cp->vaddr), ntohs(cp->vport),
+ NIPQUAD(cp->daddr), ntohs(cp->dport),
+ cp->flags, atomic_read(&cp->refcnt));
+
+ ip_vs_conn_stats(cp, svc);
+ return cp;
+}
+
+
+/*
+ * Pass or drop the packet.
+ * Called by ip_vs_in, when the virtual service is available but
+ * no destination is available for a new connection.
+ */
+int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
+ struct ip_vs_protocol *pp)
+{
+ __u16 _ports[2], *pptr;
+ struct iphdr *iph = skb->nh.iph;
+
+ pptr = skb_header_pointer(skb, iph->ihl*4,
+ sizeof(_ports), _ports);
+ if (pptr == NULL) {
+ ip_vs_service_put(svc);
+ return NF_DROP;
+ }
+
+ /* if it is a fwmark-based service, the cache_bypass sysctl is set
+ and the destination is RTN_UNICAST (and not local), then create
+ a cache_bypass connection entry */
+ if (sysctl_ip_vs_cache_bypass && svc->fwmark
+ && (inet_addr_type(iph->daddr) == RTN_UNICAST)) {
+ int ret, cs;
+ struct ip_vs_conn *cp;
+
+ ip_vs_service_put(svc);
+
+ /* create a new connection entry */
+ IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n");
+ cp = ip_vs_conn_new(iph->protocol,
+ iph->saddr, pptr[0],
+ iph->daddr, pptr[1],
+ 0, 0,
+ IP_VS_CONN_F_BYPASS,
+ NULL);
+ if (cp == NULL)
+ return NF_DROP;
+
+ /* statistics */
+ ip_vs_in_stats(cp, skb);
+
+ /* set state */
+ cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
+
+ /* transmit the first SYN packet */
+ ret = cp->packet_xmit(skb, cp, pp);
+ /* do not touch skb anymore */
+
+ atomic_inc(&cp->in_pkts);
+ ip_vs_conn_put(cp);
+ return ret;
+ }
+
+ /*
+ * When a virtual ftp service is present, packets destined
+ * for other services on the VIP may get here (except services
+ * listed in the ipvs table); pass them through, because it is
+ * not ipvs's job to decide to drop them.
+ */
+ if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
+ ip_vs_service_put(svc);
+ return NF_ACCEPT;
+ }
+
+ ip_vs_service_put(svc);
+
+ /*
+ * Notify the client that the destination is unreachable, and
+ * release the socket buffer.
+ * Since we are in the IP layer, no TCP socket is actually
+ * created, so a TCP RST packet cannot be sent; instead,
+ * ICMP_PORT_UNREACH is sent here whether it is TCP or UDP. --WZ
+ */
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+ return NF_DROP;
+}
+
+
+/*
+ * It is hooked before NF_IP_PRI_NAT_SRC at the NF_IP_POST_ROUTING
+ * chain, and is used for VS/NAT.
+ * It detects packets for VS/NAT connections and sends them out
+ * immediately, so that iptable_nat cannot mangle packets that
+ * belong to VS/NAT.
+ */
+static unsigned int ip_vs_post_routing(unsigned int hooknum,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ if (!((*pskb)->nfcache & NFC_IPVS_PROPERTY))
+ return NF_ACCEPT;
+
+ /* The packet was sent from IPVS, exit this chain */
+ (*okfn)(*pskb);
+
+ return NF_STOLEN;
+}
+
+u16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
+{
+ return (u16) csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
+}
+
+static inline struct sk_buff *
+ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
+{
+ skb = ip_defrag(skb, user);
+ if (skb)
+ ip_send_check(skb->nh.iph);
+ return skb;
+}
+
+/*
+ * Packet has been made sufficiently writable in caller
+ * - inout: 1=in->out, 0=out->in
+ */
+void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, int inout)
+{
+ struct iphdr *iph = skb->nh.iph;
+ unsigned int icmp_offset = iph->ihl*4;
+ struct icmphdr *icmph = (struct icmphdr *)(skb->nh.raw + icmp_offset);
+ struct iphdr *ciph = (struct iphdr *)(icmph + 1);
+
+ if (inout) {
+ iph->saddr = cp->vaddr;
+ ip_send_check(iph);
+ ciph->daddr = cp->vaddr;
+ ip_send_check(ciph);
+ } else {
+ iph->daddr = cp->daddr;
+ ip_send_check(iph);
+ ciph->saddr = cp->daddr;
+ ip_send_check(ciph);
+ }
+
+ /* the TCP/UDP port */
+ if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) {
+ __u16 *ports = (void *)ciph + ciph->ihl*4;
+
+ if (inout)
+ ports[1] = cp->vport;
+ else
+ ports[0] = cp->dport;
+ }
+
+ /* And finally the ICMP checksum */
+ icmph->checksum = 0;
+ icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+ if (inout)
+ IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
+ "Forwarding altered outgoing ICMP");
+ else
+ IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
+ "Forwarding altered incoming ICMP");
+}
+
+/*
+ * Handle ICMP messages in the inside-to-outside direction (outgoing).
+ * Find any that might be relevant, check against existing connections,
+ * forward to the right destination host if relevant.
+ * Currently handles error types - unreachable, quench, ttl exceeded.
+ * (Only used in VS/NAT)
+ */
+static int ip_vs_out_icmp(struct sk_buff **pskb, int *related)
+{
+ struct sk_buff *skb = *pskb;
+ struct iphdr *iph;
+ struct icmphdr _icmph, *ic;
+ struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
+ struct ip_vs_conn *cp;
+ struct ip_vs_protocol *pp;
+ unsigned int offset, ihl, verdict;
+
+ *related = 1;
+
+ /* reassemble IP fragments */
+ if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
+ skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT);
+ if (!skb)
+ return NF_STOLEN;
+ *pskb = skb;
+ }
+
+ iph = skb->nh.iph;
+ offset = ihl = iph->ihl * 4;
+ ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+ if (ic == NULL)
+ return NF_DROP;
+
+ IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
+ ic->type, ntohs(icmp_id(ic)),
+ NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
+
+ /*
+ * Work through seeing if this is for us.
+ * These checks are supposed to be in an order that means easy
+ * things are checked first to speed up processing.... however
+ * this means that some packets will manage to get a long way
+ * down this stack and then be rejected, but that's life.
+ */
+ if ((ic->type != ICMP_DEST_UNREACH) &&
+ (ic->type != ICMP_SOURCE_QUENCH) &&
+ (ic->type != ICMP_TIME_EXCEEDED)) {
+ *related = 0;
+ return NF_ACCEPT;
+ }
+
+ /* Now find the contained IP header */
+ offset += sizeof(_icmph);
+ cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
+ if (cih == NULL)
+ return NF_ACCEPT; /* The packet looks wrong, ignore */
+
+ pp = ip_vs_proto_get(cih->protocol);
+ if (!pp)
+ return NF_ACCEPT;
+
+ /* Is the embedded protocol header present? */
+ if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) &&
+ pp->dont_defrag))
+ return NF_ACCEPT;
+
+ IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
+
+ offset += cih->ihl * 4;
+
+ /* The embedded headers contain source and dest in reverse order */
+ cp = pp->conn_out_get(skb, pp, cih, offset, 1);
+ if (!cp)
+ return NF_ACCEPT;
+
+ verdict = NF_DROP;
+
+ if (IP_VS_FWD_METHOD(cp) != 0) {
+ IP_VS_ERR("shouldn't reach here, because the box is on the"
+ "half connection in the tun/dr module.\n");
+ }
+
+ /* Ensure the checksum is correct */
+ if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
+ ip_vs_checksum_complete(skb, ihl)) {
+ /* Failed checksum! */
+ IP_VS_DBG(1, "Forward ICMP: failed checksum from %d.%d.%d.%d!\n",
+ NIPQUAD(iph->saddr));
+ goto out;
+ }
+
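+ /* the embedded TCP/UDP ports are rewritten by ip_vs_nat_icmp(),
+ so make them writable as well */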
+ if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
+ offset += 2 * sizeof(__u16);
+ if (!ip_vs_make_skb_writable(pskb, offset))
+ goto out;
+ skb = *pskb;
+
+ ip_vs_nat_icmp(skb, pp, cp, 1);
+
+ /* do the statistics and put it back */
+ ip_vs_out_stats(cp, skb);
+
+ skb->nfcache |= NFC_IPVS_PROPERTY;
+ verdict = NF_ACCEPT;
+
+ out:
+ __ip_vs_conn_put(cp);
+
+ return verdict;
+}
+
+static inline int is_tcp_reset(const struct sk_buff *skb)
+{
+ struct tcphdr _tcph, *th;
+
+ th = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
+ sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ return 0;
+ return th->rst;
+}
+
+/*
+ * It is hooked at the NF_IP_FORWARD chain, used only for VS/NAT.
+ * Check if outgoing packet belongs to the established ip_vs_conn,
+ * rewrite addresses of the packet and send it on its way...
+ */
+static unsigned int
+ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
+ const struct net_device *in, const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ struct sk_buff *skb = *pskb;
+ struct iphdr *iph;
+ struct ip_vs_protocol *pp;
+ struct ip_vs_conn *cp;
+ int ihl;
+
+ EnterFunction(11);
+
+ if (skb->nfcache & NFC_IPVS_PROPERTY)
+ return NF_ACCEPT;
+
+ iph = skb->nh.iph;
+ if (unlikely(iph->protocol == IPPROTO_ICMP)) {
+ int related, verdict = ip_vs_out_icmp(pskb, &related);
+
+ if (related)
+ return verdict;
+ skb = *pskb;
+ iph = skb->nh.iph;
+ }
+
+ pp = ip_vs_proto_get(iph->protocol);
+ if (unlikely(!pp))
+ return NF_ACCEPT;
+
+ /* reassemble IP fragments */
+ if (unlikely(iph->frag_off & __constant_htons(IP_MF|IP_OFFSET) &&
+ !pp->dont_defrag)) {
+ skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT);
+ if (!skb)
+ return NF_STOLEN;
+ iph = skb->nh.iph;
+ *pskb = skb;
+ }
+
+ ihl = iph->ihl << 2;
+
+ /*
+ * Check if the packet belongs to an existing entry
+ */
+ cp = pp->conn_out_get(skb, pp, iph, ihl, 0);
+
+ if (unlikely(!cp)) {
+ if (sysctl_ip_vs_nat_icmp_send &&
+ (pp->protocol == IPPROTO_TCP ||
+ pp->protocol == IPPROTO_UDP)) {
+ __u16 _ports[2], *pptr;
+
+ pptr = skb_header_pointer(skb, ihl,
+ sizeof(_ports), _ports);
+ if (pptr == NULL)
+ return NF_ACCEPT; /* Not for me */
+ if (ip_vs_lookup_real_service(iph->protocol,
+ iph->saddr, pptr[0])) {
+ /*
+ * Notify the real server that there
+ * is no existing entry, unless the
+ * packet is a TCP RST.
+ */
+ if (iph->protocol != IPPROTO_TCP
+ || !is_tcp_reset(skb)) {
+ icmp_send(skb,ICMP_DEST_UNREACH,
+ ICMP_PORT_UNREACH, 0);
+ return NF_DROP;
+ }
+ }
+ }
+ IP_VS_DBG_PKT(12, pp, skb, 0,
+ "packet continues traversal as normal");
+ return NF_ACCEPT;
+ }
+
+ IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
+
+ if (!ip_vs_make_skb_writable(pskb, ihl))
+ goto drop;
+
+ /* mangle the packet */
+ if (pp->snat_handler && !pp->snat_handler(pskb, pp, cp))
+ goto drop;
+ skb = *pskb;
+ skb->nh.iph->saddr = cp->vaddr;
+ ip_send_check(skb->nh.iph);
+
+ IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
+
+ ip_vs_out_stats(cp, skb);
+ ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
+ ip_vs_conn_put(cp);
+
+ skb->nfcache |= NFC_IPVS_PROPERTY;
+
+ LeaveFunction(11);
+ return NF_ACCEPT;
+
+ drop:
+ ip_vs_conn_put(cp);
+ kfree_skb(*pskb);
+ return NF_STOLEN;
+}
+
+
+/*
+ * Handle ICMP messages in the outside-to-inside direction (incoming).
+ * Find any that might be relevant, check against existing connections,
+ * forward to the right destination host if relevant.
+ * Currently handles error types - unreachable, quench, ttl exceeded.
+ */
+static int
+ip_vs_in_icmp(struct sk_buff **pskb, int *related, unsigned int hooknum)
+{
+ struct sk_buff *skb = *pskb;
+ struct iphdr *iph;
+ struct icmphdr _icmph, *ic;
+ struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
+ struct ip_vs_conn *cp;
+ struct ip_vs_protocol *pp;
+ unsigned int offset, ihl, verdict;
+
+ *related = 1;
+
+ /* reassemble IP fragments */
+ if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
+ skb = ip_vs_gather_frags(skb,
+ hooknum == NF_IP_LOCAL_IN ?
+ IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD);
+ if (!skb)
+ return NF_STOLEN;
+ *pskb = skb;
+ }
+
+ iph = skb->nh.iph;
+ offset = ihl = iph->ihl * 4;
+ ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+ if (ic == NULL)
+ return NF_DROP;
+
+ IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
+ ic->type, ntohs(icmp_id(ic)),
+ NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
+
+ /*
+ * Work through seeing if this is for us.
+ * These checks are supposed to be in an order that means easy
+ * things are checked first to speed up processing.... however
+ * this means that some packets will manage to get a long way
+ * down this stack and then be rejected, but that's life.
+ */
+ if ((ic->type != ICMP_DEST_UNREACH) &&
+ (ic->type != ICMP_SOURCE_QUENCH) &&
+ (ic->type != ICMP_TIME_EXCEEDED)) {
+ *related = 0;
+ return NF_ACCEPT;
+ }
+
+ /* Now find the contained IP header */
+ offset += sizeof(_icmph);
+ cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
+ if (cih == NULL)
+ return NF_ACCEPT; /* The packet looks wrong, ignore */
+
+ pp = ip_vs_proto_get(cih->protocol);
+ if (!pp)
+ return NF_ACCEPT;
+
+ /* Is the embedded protocol header present? */
+ if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) &&
+ pp->dont_defrag))
+ return NF_ACCEPT;
+
+ IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
+
+ offset += cih->ihl * 4;
+
+ /* The embedded headers contain source and dest in reverse order */
+ cp = pp->conn_in_get(skb, pp, cih, offset, 1);
+ if (!cp)
+ return NF_ACCEPT;
+
+ verdict = NF_DROP;
+
+ /* Ensure the checksum is correct */
+ if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
+ ip_vs_checksum_complete(skb, ihl)) {
+ /* Failed checksum! */
+ IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n",
+ NIPQUAD(iph->saddr));
+ goto out;
+ }
+
+ /* do the statistics and put it back */
+ ip_vs_in_stats(cp, skb);
+ if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
+ offset += 2 * sizeof(__u16);
+ verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
+ /* do not touch skb anymore */
+
+ out:
+ __ip_vs_conn_put(cp);
+
+ return verdict;
+}
+
+/*
+ * Check if it's for virtual services, look it up,
+ * and send it on its way...
+ */
+static unsigned int
+ip_vs_in(unsigned int hooknum, struct sk_buff **pskb,
+ const struct net_device *in, const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ struct sk_buff *skb = *pskb;
+ struct iphdr *iph;
+ struct ip_vs_protocol *pp;
+ struct ip_vs_conn *cp;
+ int ret, restart;
+ int ihl;
+
+ /*
+ * Big tappo: only PACKET_HOST (neither loopback nor mcasts)
+ * ... don't know why 1st test DOES NOT include 2nd (?)
+ */
+ if (unlikely(skb->pkt_type != PACKET_HOST
+ || skb->dev == &loopback_dev || skb->sk)) {
+ IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n",
+ skb->pkt_type,
+ skb->nh.iph->protocol,
+ NIPQUAD(skb->nh.iph->daddr));
+ return NF_ACCEPT;
+ }
+
+ iph = skb->nh.iph;
+ if (unlikely(iph->protocol == IPPROTO_ICMP)) {
+ int related, verdict = ip_vs_in_icmp(pskb, &related, hooknum);
+
+ if (related)
+ return verdict;
+ skb = *pskb;
+ iph = skb->nh.iph;
+ }
+
+ /* Protocol supported? */
+ pp = ip_vs_proto_get(iph->protocol);
+ if (unlikely(!pp))
+ return NF_ACCEPT;
+
+ ihl = iph->ihl << 2;
+
+ /*
+ * Check if the packet belongs to an existing connection entry
+ */
+ cp = pp->conn_in_get(skb, pp, iph, ihl, 0);
+
+ if (unlikely(!cp)) {
+ int v;
+
+ if (!pp->conn_schedule(skb, pp, &v, &cp))
+ return v;
+ }
+
+ if (unlikely(!cp)) {
+ /* sorry, all this trouble for a no-hit :) */
+ IP_VS_DBG_PKT(12, pp, skb, 0,
+ "packet continues traversal as normal");
+ return NF_ACCEPT;
+ }
+
+ IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
+
+ /* Check the server status */
+ if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+ /* the destination server is not available */
+
+ if (sysctl_ip_vs_expire_nodest_conn) {
+ /* try to expire the connection immediately */
+ ip_vs_conn_expire_now(cp);
+ } else {
+ /* don't restart its timer, and silently
+ drop the packet. */
+ __ip_vs_conn_put(cp);
+ }
+ return NF_DROP;
+ }
+
+ ip_vs_in_stats(cp, skb);
+ restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
+ if (cp->packet_xmit)
+ ret = cp->packet_xmit(skb, cp, pp);
+ /* do not touch skb anymore */
+ else {
+ IP_VS_DBG_RL("warning: packet_xmit is null");
+ ret = NF_ACCEPT;
+ }
+
+ /* increase its packet counter and check whether it needs
+ to be synchronized */
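+ /* a connection is synced when its packet count modulo
+ sync_threshold[1] equals sync_threshold[0]; with the default
+ thresholds (3, 50) that is the 3rd packet and then every
+ 50th packet */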
+ atomic_inc(&cp->in_pkts);
+ if ((ip_vs_sync_state & IP_VS_STATE_MASTER) &&
+ (cp->protocol != IPPROTO_TCP ||
+ cp->state == IP_VS_TCP_S_ESTABLISHED) &&
+ (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
+ == sysctl_ip_vs_sync_threshold[0]))
+ ip_vs_sync_conn(cp);
+
+ ip_vs_conn_put(cp);
+ return ret;
+}
+
+
+/*
+ * It is hooked at the NF_IP_FORWARD chain, in order to catch ICMP
+ * related packets destined for 0.0.0.0/0.
+ * When fwmark-based virtual service is used, such as transparent
+ * cache cluster, TCP packets can be marked and routed to ip_vs_in,
+ * but ICMP destined for 0.0.0.0/0 cannot be easily marked and
+ * sent to ip_vs_in_icmp. So, catch them at the NF_IP_FORWARD chain
+ * and send them to ip_vs_in_icmp.
+ */
+static unsigned int
+ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff **pskb,
+ const struct net_device *in, const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ int r;
+
+ if ((*pskb)->nh.iph->protocol != IPPROTO_ICMP)
+ return NF_ACCEPT;
+
+ return ip_vs_in_icmp(pskb, &r, hooknum);
+}
+
+
+/* After packet filtering, forward packet through VS/DR, VS/TUN,
+ or VS/NAT(change destination), so that filtering rules can be
+ applied to IPVS. */
+static struct nf_hook_ops ip_vs_in_ops = {
+ .hook = ip_vs_in,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_LOCAL_IN,
+ .priority = 100,
+};
+
+/* After packet filtering, change source only for VS/NAT */
+static struct nf_hook_ops ip_vs_out_ops = {
+ .hook = ip_vs_out,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_FORWARD,
+ .priority = 100,
+};
+
+/* After packet filtering (but before ip_vs_out_icmp), catch icmp
+ destined for 0.0.0.0/0, which is for incoming IPVS connections */
+static struct nf_hook_ops ip_vs_forward_icmp_ops = {
+ .hook = ip_vs_forward_icmp,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_FORWARD,
+ .priority = 99,
+};
+
+/* Before the netfilter connection tracking, exit from POST_ROUTING */
+static struct nf_hook_ops ip_vs_post_routing_ops = {
+ .hook = ip_vs_post_routing,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_POST_ROUTING,
+ .priority = NF_IP_PRI_NAT_SRC-1,
+};
+
+
+/*
+ * Initialize IP Virtual Server
+ */
+static int __init ip_vs_init(void)
+{
+ int ret;
+
+ ret = ip_vs_control_init();
+ if (ret < 0) {
+ IP_VS_ERR("can't setup control.\n");
+ goto cleanup_nothing;
+ }
+
+ ip_vs_protocol_init();
+
+ ret = ip_vs_app_init();
+ if (ret < 0) {
+ IP_VS_ERR("can't setup application helper.\n");
+ goto cleanup_protocol;
+ }
+
+ ret = ip_vs_conn_init();
+ if (ret < 0) {
+ IP_VS_ERR("can't setup connection table.\n");
+ goto cleanup_app;
+ }
+
+ ret = nf_register_hook(&ip_vs_in_ops);
+ if (ret < 0) {
+ IP_VS_ERR("can't register in hook.\n");
+ goto cleanup_conn;
+ }
+
+ ret = nf_register_hook(&ip_vs_out_ops);
+ if (ret < 0) {
+ IP_VS_ERR("can't register out hook.\n");
+ goto cleanup_inops;
+ }
+ ret = nf_register_hook(&ip_vs_post_routing_ops);
+ if (ret < 0) {
+ IP_VS_ERR("can't register post_routing hook.\n");
+ goto cleanup_outops;
+ }
+ ret = nf_register_hook(&ip_vs_forward_icmp_ops);
+ if (ret < 0) {
+ IP_VS_ERR("can't register forward_icmp hook.\n");
+ goto cleanup_postroutingops;
+ }
+
+ IP_VS_INFO("ipvs loaded.\n");
+ return ret;
+
+ cleanup_postroutingops:
+ nf_unregister_hook(&ip_vs_post_routing_ops);
+ cleanup_outops:
+ nf_unregister_hook(&ip_vs_out_ops);
+ cleanup_inops:
+ nf_unregister_hook(&ip_vs_in_ops);
+ cleanup_conn:
+ ip_vs_conn_cleanup();
+ cleanup_app:
+ ip_vs_app_cleanup();
+ cleanup_protocol:
+ ip_vs_protocol_cleanup();
+ ip_vs_control_cleanup();
+ cleanup_nothing:
+ return ret;
+}
+
+static void __exit ip_vs_cleanup(void)
+{
+ nf_unregister_hook(&ip_vs_forward_icmp_ops);
+ nf_unregister_hook(&ip_vs_post_routing_ops);
+ nf_unregister_hook(&ip_vs_out_ops);
+ nf_unregister_hook(&ip_vs_in_ops);
+ ip_vs_conn_cleanup();
+ ip_vs_app_cleanup();
+ ip_vs_protocol_cleanup();
+ ip_vs_control_cleanup();
+ IP_VS_INFO("ipvs unloaded.\n");
+}
+
+module_init(ip_vs_init);
+module_exit(ip_vs_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
new file mode 100644
index 00000000000..218d9701036
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -0,0 +1,2391 @@
+/*
+ * IPVS An implementation of the IP virtual server support for the
+ * LINUX operating system. IPVS is now implemented as a module
+ * over the NetFilter framework. IPVS can be used to build a
+ * high-performance and highly available server based on a
+ * cluster of servers.
+ *
+ * Version: $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Peter Kese <peter.kese@ijs.si>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include <linux/workqueue.h>
+#include <linux/swap.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip.h>
+#include <net/sock.h>
+
+#include <asm/uaccess.h>
+
+#include <net/ip_vs.h>
+
+/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
+static DECLARE_MUTEX(__ip_vs_mutex);
+
+/* lock for service table */
+static DEFINE_RWLOCK(__ip_vs_svc_lock);
+
+/* lock for table with the real services */
+static DEFINE_RWLOCK(__ip_vs_rs_lock);
+
+/* lock for state and timeout tables */
+static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
+
+/* lock for drop entry handling */
+static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
+
+/* lock for drop packet handling */
+static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
+
+/* 1/rate drop and drop-entry variables */
+int ip_vs_drop_rate = 0;
+int ip_vs_drop_counter = 0;
+static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
+
+/* number of virtual services */
+static int ip_vs_num_services = 0;
+
+/* sysctl variables */
+static int sysctl_ip_vs_drop_entry = 0;
+static int sysctl_ip_vs_drop_packet = 0;
+static int sysctl_ip_vs_secure_tcp = 0;
+static int sysctl_ip_vs_amemthresh = 1024;
+static int sysctl_ip_vs_am_droprate = 10;
+int sysctl_ip_vs_cache_bypass = 0;
+int sysctl_ip_vs_expire_nodest_conn = 0;
+int sysctl_ip_vs_expire_quiescent_template = 0;
+int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
+int sysctl_ip_vs_nat_icmp_send = 0;
+
+
+#ifdef CONFIG_IP_VS_DEBUG
+static int sysctl_ip_vs_debug_level = 0;
+
+int ip_vs_get_debug_level(void)
+{
+ return sysctl_ip_vs_debug_level;
+}
+#endif
+
+/*
+ * update_defense_level is called from keventd and from sysctl.
+ */
+static void update_defense_level(void)
+{
+ struct sysinfo i;
+ static int old_secure_tcp = 0;
+ int availmem;
+ int nomem;
+ int to_change = -1;
+
+ /* we only count free and buffered memory (in pages) */
+ si_meminfo(&i);
+ availmem = i.freeram + i.bufferram;
+ /* however in linux 2.5 the i.bufferram is the total page cache size,
+ so we need to adjust it */
+ /* si_swapinfo(&i); */
+ /* availmem = availmem - (i.totalswap - i.freeswap); */
+
+ nomem = (availmem < sysctl_ip_vs_amemthresh);
+
+ /* drop_entry */
+ spin_lock(&__ip_vs_dropentry_lock);
+ switch (sysctl_ip_vs_drop_entry) {
+ case 0:
+ atomic_set(&ip_vs_dropentry, 0);
+ break;
+ case 1:
+ if (nomem) {
+ atomic_set(&ip_vs_dropentry, 1);
+ sysctl_ip_vs_drop_entry = 2;
+ } else {
+ atomic_set(&ip_vs_dropentry, 0);
+ }
+ break;
+ case 2:
+ if (nomem) {
+ atomic_set(&ip_vs_dropentry, 1);
+ } else {
+ atomic_set(&ip_vs_dropentry, 0);
+ sysctl_ip_vs_drop_entry = 1;
+ }
+ break;
+ case 3:
+ atomic_set(&ip_vs_dropentry, 1);
+ break;
+ }
+ spin_unlock(&__ip_vs_dropentry_lock);
+
+ /* drop_packet */
+ spin_lock(&__ip_vs_droppacket_lock);
+ switch (sysctl_ip_vs_drop_packet) {
+ case 0:
+ ip_vs_drop_rate = 0;
+ break;
+ case 1:
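+ /* the further available memory falls below amemthresh, the
+ higher the drop rate: one packet of every
+ amemthresh/(amemthresh-availmem) is dropped */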
+ if (nomem) {
+ ip_vs_drop_rate = ip_vs_drop_counter
+ = sysctl_ip_vs_amemthresh /
+ (sysctl_ip_vs_amemthresh-availmem);
+ sysctl_ip_vs_drop_packet = 2;
+ } else {
+ ip_vs_drop_rate = 0;
+ }
+ break;
+ case 2:
+ if (nomem) {
+ ip_vs_drop_rate = ip_vs_drop_counter
+ = sysctl_ip_vs_amemthresh /
+ (sysctl_ip_vs_amemthresh-availmem);
+ } else {
+ ip_vs_drop_rate = 0;
+ sysctl_ip_vs_drop_packet = 1;
+ }
+ break;
+ case 3:
+ ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
+ break;
+ }
+ spin_unlock(&__ip_vs_droppacket_lock);
+
+ /* secure_tcp */
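+ /* to_change: 1 = switch the protocols to the secure_tcp
+ timeout tables, 0 = switch back to the normal tables,
+ -1 = no change */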
+ write_lock(&__ip_vs_securetcp_lock);
+ switch (sysctl_ip_vs_secure_tcp) {
+ case 0:
+ if (old_secure_tcp >= 2)
+ to_change = 0;
+ break;
+ case 1:
+ if (nomem) {
+ if (old_secure_tcp < 2)
+ to_change = 1;
+ sysctl_ip_vs_secure_tcp = 2;
+ } else {
+ if (old_secure_tcp >= 2)
+ to_change = 0;
+ }
+ break;
+ case 2:
+ if (nomem) {
+ if (old_secure_tcp < 2)
+ to_change = 1;
+ } else {
+ if (old_secure_tcp >= 2)
+ to_change = 0;
+ sysctl_ip_vs_secure_tcp = 1;
+ }
+ break;
+ case 3:
+ if (old_secure_tcp < 2)
+ to_change = 1;
+ break;
+ }
+ old_secure_tcp = sysctl_ip_vs_secure_tcp;
+ if (to_change >= 0)
+ ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
+ write_unlock(&__ip_vs_securetcp_lock);
+}
+
+
+/*
+ * Timer for checking the defense
+ */
+#define DEFENSE_TIMER_PERIOD 1*HZ
+static void defense_work_handler(void *data);
+static DECLARE_WORK(defense_work, defense_work_handler, NULL);
+
+static void defense_work_handler(void *data)
+{
+ update_defense_level();
+ if (atomic_read(&ip_vs_dropentry))
+ ip_vs_random_dropentry();
+
+ schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
+}
+
+int
+ip_vs_use_count_inc(void)
+{
+ return try_module_get(THIS_MODULE);
+}
+
+void
+ip_vs_use_count_dec(void)
+{
+ module_put(THIS_MODULE);
+}
+
+
+/*
+ * Hash table: for virtual service lookups
+ */
+#define IP_VS_SVC_TAB_BITS 8
+#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
+#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
+
+/* the service table hashed by <protocol, addr, port> */
+static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
+/* the service table hashed by fwmark */
+static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
+
+/*
+ * Hash table: for real service lookups
+ */
+#define IP_VS_RTAB_BITS 4
+#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
+#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
+
+static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
+
+/*
+ * Trash for destinations
+ */
+static LIST_HEAD(ip_vs_dest_trash);
+
+/*
+ * FTP & NULL virtual service counters
+ */
+static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
+static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
+
+
+/*
+ * Returns hash value for virtual service
+ */
+static __inline__ unsigned
+ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port)
+{
+ register unsigned porth = ntohs(port);
+
+ return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
+ & IP_VS_SVC_TAB_MASK;
+}
+
+/*
+ * Returns hash value of fwmark for virtual service lookup
+ */
+static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
+{
+ return fwmark & IP_VS_SVC_TAB_MASK;
+}
+
+/*
+ * Hashes a service in the ip_vs_svc_table by <proto,addr,port>
+ * or in the ip_vs_svc_fwm_table by fwmark.
+ * Should be called with locked tables.
+ */
+static int ip_vs_svc_hash(struct ip_vs_service *svc)
+{
+ unsigned hash;
+
+ if (svc->flags & IP_VS_SVC_F_HASHED) {
+ IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
+ "called from %p\n", __builtin_return_address(0));
+ return 0;
+ }
+
+ if (svc->fwmark == 0) {
+ /*
+ * Hash it by <protocol,addr,port> in ip_vs_svc_table
+ */
+ hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
+ list_add(&svc->s_list, &ip_vs_svc_table[hash]);
+ } else {
+ /*
+ * Hash it by fwmark in ip_vs_svc_fwm_table
+ */
+ hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
+ list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
+ }
+
+ svc->flags |= IP_VS_SVC_F_HASHED;
+ /* increase its refcnt because it is referenced by the svc table */
+ atomic_inc(&svc->refcnt);
+ return 1;
+}
+
+
+/*
+ * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
+ * Should be called with locked tables.
+ */
+static int ip_vs_svc_unhash(struct ip_vs_service *svc)
+{
+ if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
+ IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
+ "called from %p\n", __builtin_return_address(0));
+ return 0;
+ }
+
+ if (svc->fwmark == 0) {
+ /* Remove it from the ip_vs_svc_table table */
+ list_del(&svc->s_list);
+ } else {
+ /* Remove it from the ip_vs_svc_fwm_table table */
+ list_del(&svc->f_list);
+ }
+
+ svc->flags &= ~IP_VS_SVC_F_HASHED;
+ atomic_dec(&svc->refcnt);
+ return 1;
+}
+
+
+/*
+ * Get service by {proto,addr,port} in the service table.
+ */
+static __inline__ struct ip_vs_service *
+__ip_vs_service_get(__u16 protocol, __u32 vaddr, __u16 vport)
+{
+ unsigned hash;
+ struct ip_vs_service *svc;
+
+ /* Check for "full" addressed entries */
+ hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
+
+ list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
+ if ((svc->addr == vaddr)
+ && (svc->port == vport)
+ && (svc->protocol == protocol)) {
+ /* HIT */
+ atomic_inc(&svc->usecnt);
+ return svc;
+ }
+ }
+
+ return NULL;
+}
+
+
+/*
+ * Get service by {fwmark} in the service table.
+ */
+static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
+{
+ unsigned hash;
+ struct ip_vs_service *svc;
+
+ /* Check for fwmark addressed entries */
+ hash = ip_vs_svc_fwm_hashkey(fwmark);
+
+ list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
+ if (svc->fwmark == fwmark) {
+ /* HIT */
+ atomic_inc(&svc->usecnt);
+ return svc;
+ }
+ }
+
+ return NULL;
+}
+
+struct ip_vs_service *
+ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
+{
+ struct ip_vs_service *svc;
+
+ read_lock(&__ip_vs_svc_lock);
+
+ /*
+ * Check the table hashed by fwmark first
+ */
+ if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
+ goto out;
+
+ /*
+ * Check the table hashed by <protocol,addr,port>
+ * for "full" addressed entries
+ */
+ svc = __ip_vs_service_get(protocol, vaddr, vport);
+
+ if (svc == NULL
+ && protocol == IPPROTO_TCP
+ && atomic_read(&ip_vs_ftpsvc_counter)
+ && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
+ /*
+ * Check if ftp service entry exists, the packet
+ * might belong to FTP data connections.
+ */
+ svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
+ }
+
+ if (svc == NULL
+ && atomic_read(&ip_vs_nullsvc_counter)) {
+ /*
+ * Check if the catch-all port (port zero) exists
+ */
+ svc = __ip_vs_service_get(protocol, vaddr, 0);
+ }
+
+ out:
+ read_unlock(&__ip_vs_svc_lock);
+
+ IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
+ fwmark, ip_vs_proto_name(protocol),
+ NIPQUAD(vaddr), ntohs(vport),
+ svc?"hit":"not hit");
+
+ return svc;
+}
+
+
+static inline void
+__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
+{
+ atomic_inc(&svc->refcnt);
+ dest->svc = svc;
+}
+
+static inline void
+__ip_vs_unbind_svc(struct ip_vs_dest *dest)
+{
+ struct ip_vs_service *svc = dest->svc;
+
+ dest->svc = NULL;
+ if (atomic_dec_and_test(&svc->refcnt))
+ kfree(svc);
+}
+
+
+/*
+ * Returns hash value for real service
+ */
+static __inline__ unsigned ip_vs_rs_hashkey(__u32 addr, __u16 port)
+{
+ register unsigned porth = ntohs(port);
+
+ return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
+ & IP_VS_RTAB_MASK;
+}
+
+/*
+ * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
+ * should be called with locked tables.
+ */
+static int ip_vs_rs_hash(struct ip_vs_dest *dest)
+{
+ unsigned hash;
+
+ if (!list_empty(&dest->d_list)) {
+ return 0;
+ }
+
+ /*
+ * Hash by proto,addr,port,
+ * which are the parameters of the real service.
+ */
+ hash = ip_vs_rs_hashkey(dest->addr, dest->port);
+ list_add(&dest->d_list, &ip_vs_rtable[hash]);
+
+ return 1;
+}
+
+/*
+ * UNhashes ip_vs_dest from ip_vs_rtable.
+ * should be called with locked tables.
+ */
+static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
+{
+ /*
+ * Remove it from the ip_vs_rtable table.
+ */
+ if (!list_empty(&dest->d_list)) {
+ list_del(&dest->d_list);
+ INIT_LIST_HEAD(&dest->d_list);
+ }
+
+ return 1;
+}
+
+/*
+ * Lookup real service by <proto,addr,port> in the real service table.
+ */
+struct ip_vs_dest *
+ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport)
+{
+ unsigned hash;
+ struct ip_vs_dest *dest;
+
+ /*
+ * Check for "full" addressed entries
+ * Return the first found entry
+ */
+ hash = ip_vs_rs_hashkey(daddr, dport);
+
+ read_lock(&__ip_vs_rs_lock);
+ list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
+ if ((dest->addr == daddr)
+ && (dest->port == dport)
+ && ((dest->protocol == protocol) ||
+ dest->vfwmark)) {
+ /* HIT */
+ read_unlock(&__ip_vs_rs_lock);
+ return dest;
+ }
+ }
+ read_unlock(&__ip_vs_rs_lock);
+
+ return NULL;
+}
+
+/*
+ * Lookup destination by {addr,port} in the given service
+ */
+static struct ip_vs_dest *
+ip_vs_lookup_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
+{
+ struct ip_vs_dest *dest;
+
+ /*
+ * Find the destination for the given service
+ */
+ list_for_each_entry(dest, &svc->destinations, n_list) {
+ if ((dest->addr == daddr) && (dest->port == dport)) {
+ /* HIT */
+ return dest;
+ }
+ }
+
+ return NULL;
+}
+
+
+/*
+ * Lookup dest by {svc,addr,port} in the destination trash.
+ * The destination trash is used to hold the destinations that are removed
+ * from the service table but are still referenced by some conn entries.
+ * The reason for the destination trash is that when a dest is
+ * temporarily down (taken down either by the administrator or by a
+ * monitor program), it can be picked back from the trash, the
+ * remaining connections to it can continue, and its counters remain
+ * useful for scheduling.
+ */
+static struct ip_vs_dest *
+ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
+{
+ struct ip_vs_dest *dest, *nxt;
+
+ /*
+ * Find the destination in trash
+ */
+ list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
+ IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
+ "refcnt=%d\n",
+ dest->vfwmark,
+ NIPQUAD(dest->addr), ntohs(dest->port),
+ atomic_read(&dest->refcnt));
+ if (dest->addr == daddr &&
+ dest->port == dport &&
+ dest->vfwmark == svc->fwmark &&
+ dest->protocol == svc->protocol &&
+ (svc->fwmark ||
+ (dest->vaddr == svc->addr &&
+ dest->vport == svc->port))) {
+ /* HIT */
+ return dest;
+ }
+
+ /*
+ * Try to purge the destination from trash if not referenced
+ */
+ if (atomic_read(&dest->refcnt) == 1) {
+ IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
+ "from trash\n",
+ dest->vfwmark,
+ NIPQUAD(dest->addr), ntohs(dest->port));
+ list_del(&dest->n_list);
+ ip_vs_dst_reset(dest);
+ __ip_vs_unbind_svc(dest);
+ kfree(dest);
+ }
+ }
+
+ return NULL;
+}
+
+
+/*
+ * Clean up all the destinations in the trash
+ * Called by the ip_vs_control_cleanup()
+ *
+ * When ip_vs_control_cleanup is activated by ipvs module exit,
+ * the service tables must have been flushed and all the connections
+ * must have expired, and the refcnt of each destination in the trash
+ * must be 1, so we simply release them here.
+ */
+static void ip_vs_trash_cleanup(void)
+{
+ struct ip_vs_dest *dest, *nxt;
+
+ list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
+ list_del(&dest->n_list);
+ ip_vs_dst_reset(dest);
+ __ip_vs_unbind_svc(dest);
+ kfree(dest);
+ }
+}
+
+
+static void
+ip_vs_zero_stats(struct ip_vs_stats *stats)
+{
+ spin_lock_bh(&stats->lock);
+ memset(stats, 0, (char *)&stats->lock - (char *)stats);
+ spin_unlock_bh(&stats->lock);
+ ip_vs_zero_estimator(stats);
+}
+
+/*
+ * Update a destination in the given service
+ */
+static void
+__ip_vs_update_dest(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
+{
+ int conn_flags;
+
+ /* set the weight and the flags */
+ atomic_set(&dest->weight, udest->weight);
+ conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
+
+ /* check if local node and update the flags */
+ if (inet_addr_type(udest->addr) == RTN_LOCAL) {
+ conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
+ | IP_VS_CONN_F_LOCALNODE;
+ }
+
+ /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
+ if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
+ conn_flags |= IP_VS_CONN_F_NOOUTPUT;
+ } else {
+ /*
+ * Put the real service in ip_vs_rtable if not present.
+ * For now only for NAT!
+ */
+ write_lock_bh(&__ip_vs_rs_lock);
+ ip_vs_rs_hash(dest);
+ write_unlock_bh(&__ip_vs_rs_lock);
+ }
+ atomic_set(&dest->conn_flags, conn_flags);
+
+ /* bind the service */
+ if (!dest->svc) {
+ __ip_vs_bind_svc(dest, svc);
+ } else {
+ if (dest->svc != svc) {
+ __ip_vs_unbind_svc(dest);
+ ip_vs_zero_stats(&dest->stats);
+ __ip_vs_bind_svc(dest, svc);
+ }
+ }
+
+ /* set the dest status flags */
+ dest->flags |= IP_VS_DEST_F_AVAILABLE;
+
+ if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
+ dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+ dest->u_threshold = udest->u_threshold;
+ dest->l_threshold = udest->l_threshold;
+}
+
+
+/*
+ * Create a destination for the given service
+ */
+static int
+ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
+ struct ip_vs_dest **dest_p)
+{
+ struct ip_vs_dest *dest;
+ unsigned atype;
+
+ EnterFunction(2);
+
+ atype = inet_addr_type(udest->addr);
+ if (atype != RTN_LOCAL && atype != RTN_UNICAST)
+ return -EINVAL;
+
+ dest = kmalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
+ if (dest == NULL) {
+ IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
+ return -ENOMEM;
+ }
+ memset(dest, 0, sizeof(struct ip_vs_dest));
+
+ dest->protocol = svc->protocol;
+ dest->vaddr = svc->addr;
+ dest->vport = svc->port;
+ dest->vfwmark = svc->fwmark;
+ dest->addr = udest->addr;
+ dest->port = udest->port;
+
+ atomic_set(&dest->activeconns, 0);
+ atomic_set(&dest->inactconns, 0);
+ atomic_set(&dest->persistconns, 0);
+ atomic_set(&dest->refcnt, 0);
+
+ INIT_LIST_HEAD(&dest->d_list);
+ spin_lock_init(&dest->dst_lock);
+ spin_lock_init(&dest->stats.lock);
+ __ip_vs_update_dest(svc, dest, udest);
+ ip_vs_new_estimator(&dest->stats);
+
+ *dest_p = dest;
+
+ LeaveFunction(2);
+ return 0;
+}
+
+
+/*
+ * Add a destination into an existing service
+ */
+static int
+ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
+{
+ struct ip_vs_dest *dest;
+ __u32 daddr = udest->addr;
+ __u16 dport = udest->port;
+ int ret;
+
+ EnterFunction(2);
+
+ if (udest->weight < 0) {
+ IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
+ return -ERANGE;
+ }
+
+ if (udest->l_threshold > udest->u_threshold) {
+ IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
+ "upper threshold\n");
+ return -ERANGE;
+ }
+
+ /*
+ * Check if the dest already exists in the list
+ */
+ dest = ip_vs_lookup_dest(svc, daddr, dport);
+ if (dest != NULL) {
+ IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
+ return -EEXIST;
+ }
+
+ /*
+ * Check if the dest already exists in the trash and
+ * is from the same service
+ */
+ dest = ip_vs_trash_get_dest(svc, daddr, dport);
+ if (dest != NULL) {
+ IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
+ "refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
+ NIPQUAD(daddr), ntohs(dport),
+ atomic_read(&dest->refcnt),
+ dest->vfwmark,
+ NIPQUAD(dest->vaddr),
+ ntohs(dest->vport));
+ __ip_vs_update_dest(svc, dest, udest);
+
+ /*
+ * Get the destination from the trash
+ */
+ list_del(&dest->n_list);
+
+ ip_vs_new_estimator(&dest->stats);
+
+ write_lock_bh(&__ip_vs_svc_lock);
+
+ /*
+ * Wait until all other svc users go away.
+ */
+ IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+
+ list_add(&dest->n_list, &svc->destinations);
+ svc->num_dests++;
+
+ /* call the update_service function of its scheduler */
+ svc->scheduler->update_service(svc);
+
+ write_unlock_bh(&__ip_vs_svc_lock);
+ return 0;
+ }
+
+ /*
+ * Allocate and initialize the dest structure
+ */
+ ret = ip_vs_new_dest(svc, udest, &dest);
+ if (ret) {
+ return ret;
+ }
+
+ /*
+ * Add the dest entry into the list
+ */
+ atomic_inc(&dest->refcnt);
+
+ write_lock_bh(&__ip_vs_svc_lock);
+
+ /*
+ * Wait until all other svc users go away.
+ */
+ IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+
+ list_add(&dest->n_list, &svc->destinations);
+ svc->num_dests++;
+
+ /* call the update_service function of its scheduler */
+ svc->scheduler->update_service(svc);
+
+ write_unlock_bh(&__ip_vs_svc_lock);
+
+ LeaveFunction(2);
+
+ return 0;
+}
+
+
+/*
+ * Edit a destination in the given service
+ */
+static int
+ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
+{
+ struct ip_vs_dest *dest;
+ __u32 daddr = udest->addr;
+ __u16 dport = udest->port;
+
+ EnterFunction(2);
+
+ if (udest->weight < 0) {
+ IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
+ return -ERANGE;
+ }
+
+ if (udest->l_threshold > udest->u_threshold) {
+ IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
+ "upper threshold\n");
+ return -ERANGE;
+ }
+
+ /*
+ * Lookup the destination list
+ */
+ dest = ip_vs_lookup_dest(svc, daddr, dport);
+ if (dest == NULL) {
+ IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
+ return -ENOENT;
+ }
+
+ __ip_vs_update_dest(svc, dest, udest);
+
+ write_lock_bh(&__ip_vs_svc_lock);
+
+ /* Wait until all other svc users go away */
+ IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+
+ /* call the update_service, because server weight may be changed */
+ svc->scheduler->update_service(svc);
+
+ write_unlock_bh(&__ip_vs_svc_lock);
+
+ LeaveFunction(2);
+
+ return 0;
+}
+
+
+/*
+ * Delete a destination (must be already unlinked from the service)
+ */
+static void __ip_vs_del_dest(struct ip_vs_dest *dest)
+{
+ ip_vs_kill_estimator(&dest->stats);
+
+ /*
+ * Remove it from the doubly-linked list of real services (ip_vs_rtable).
+ */
+ write_lock_bh(&__ip_vs_rs_lock);
+ ip_vs_rs_unhash(dest);
+ write_unlock_bh(&__ip_vs_rs_lock);
+
+ /*
+ * Decrease the refcnt of the dest, and free the dest
+ * if nobody refers to it (refcnt=0). Otherwise, throw
+ * the destination into the trash.
+ */
+ if (atomic_dec_and_test(&dest->refcnt)) {
+ ip_vs_dst_reset(dest);
+ /* simply decrease svc->refcnt here, let the caller check
+ and release the service if nobody refers to it.
+ Only user context can release destination and service,
+ and only one user context can update virtual service at a
+ time, so the operation here is OK */
+ atomic_dec(&dest->svc->refcnt);
+ kfree(dest);
+ } else {
+ IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n",
+ NIPQUAD(dest->addr), ntohs(dest->port),
+ atomic_read(&dest->refcnt));
+ list_add(&dest->n_list, &ip_vs_dest_trash);
+ atomic_inc(&dest->refcnt);
+ }
+}
+
+
+/*
+ * Unlink a destination from the given service
+ */
+static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest,
+ int svcupd)
+{
+ dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
+
+ /*
+ * Remove it from the doubly-linked destination list of the service.
+ */
+ list_del(&dest->n_list);
+ svc->num_dests--;
+ if (svcupd) {
+ /*
+ * Call the update_service function of its scheduler
+ */
+ svc->scheduler->update_service(svc);
+ }
+}
+
+
+/*
+ * Delete a destination server in the given service
+ */
+static int
+ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
+{
+ struct ip_vs_dest *dest;
+ __u32 daddr = udest->addr;
+ __u16 dport = udest->port;
+
+ EnterFunction(2);
+
+ dest = ip_vs_lookup_dest(svc, daddr, dport);
+ if (dest == NULL) {
+ IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
+ return -ENOENT;
+ }
+
+ write_lock_bh(&__ip_vs_svc_lock);
+
+ /*
+ * Wait until all other svc users go away.
+ */
+ IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+
+ /*
+ * Unlink dest from the service
+ */
+ __ip_vs_unlink_dest(svc, dest, 1);
+
+ write_unlock_bh(&__ip_vs_svc_lock);
+
+ /*
+ * Delete the destination
+ */
+ __ip_vs_del_dest(dest);
+
+ LeaveFunction(2);
+
+ return 0;
+}
+
+
+/*
+ * Add a service into the service hash table
+ */
+static int
+ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
+{
+ int ret = 0;
+ struct ip_vs_scheduler *sched = NULL;
+ struct ip_vs_service *svc = NULL;
+
+ /* increase the module use count */
+ ip_vs_use_count_inc();
+
+ /* Lookup the scheduler by 'u->sched_name' */
+ sched = ip_vs_scheduler_get(u->sched_name);
+ if (sched == NULL) {
+ IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
+ u->sched_name);
+ ret = -ENOENT;
+ goto out_mod_dec;
+ }
+
+ svc = (struct ip_vs_service *)
+ kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
+ if (svc == NULL) {
+ IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
+ ret = -ENOMEM;
+ goto out_err;
+ }
+ memset(svc, 0, sizeof(struct ip_vs_service));
+
+ /* I'm the first user of the service */
+ atomic_set(&svc->usecnt, 1);
+ atomic_set(&svc->refcnt, 0);
+
+ svc->protocol = u->protocol;
+ svc->addr = u->addr;
+ svc->port = u->port;
+ svc->fwmark = u->fwmark;
+ svc->flags = u->flags;
+ svc->timeout = u->timeout * HZ;
+ svc->netmask = u->netmask;
+
+ INIT_LIST_HEAD(&svc->destinations);
+ rwlock_init(&svc->sched_lock);
+ spin_lock_init(&svc->stats.lock);
+
+ /* Bind the scheduler */
+ ret = ip_vs_bind_scheduler(svc, sched);
+ if (ret)
+ goto out_err;
+ sched = NULL;
+
+ /* Update the virtual service counters */
+ if (svc->port == FTPPORT)
+ atomic_inc(&ip_vs_ftpsvc_counter);
+ else if (svc->port == 0)
+ atomic_inc(&ip_vs_nullsvc_counter);
+
+ ip_vs_new_estimator(&svc->stats);
+ ip_vs_num_services++;
+
+ /* Hash the service into the service table */
+ write_lock_bh(&__ip_vs_svc_lock);
+ ip_vs_svc_hash(svc);
+ write_unlock_bh(&__ip_vs_svc_lock);
+
+ *svc_p = svc;
+ return 0;
+
+ out_err:
+ if (svc != NULL) {
+ if (svc->scheduler)
+ ip_vs_unbind_scheduler(svc);
+ if (svc->inc) {
+ local_bh_disable();
+ ip_vs_app_inc_put(svc->inc);
+ local_bh_enable();
+ }
+ kfree(svc);
+ }
+ ip_vs_scheduler_put(sched);
+
+ out_mod_dec:
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+
+ return ret;
+}
+
+
+/*
+ * Edit a service and bind it with a new scheduler
+ */
+static int
+ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
+{
+ struct ip_vs_scheduler *sched, *old_sched;
+ int ret = 0;
+
+ /*
+ * Lookup the scheduler, by 'u->sched_name'
+ */
+ sched = ip_vs_scheduler_get(u->sched_name);
+ if (sched == NULL) {
+ IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
+ u->sched_name);
+ return -ENOENT;
+ }
+ old_sched = sched;
+
+ write_lock_bh(&__ip_vs_svc_lock);
+
+ /*
+ * Wait until all other svc users go away.
+ */
+ IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+
+ /*
+ * Set the flags and timeout value
+ */
+ svc->flags = u->flags | IP_VS_SVC_F_HASHED;
+ svc->timeout = u->timeout * HZ;
+ svc->netmask = u->netmask;
+
+ old_sched = svc->scheduler;
+ if (sched != old_sched) {
+ /*
+ * Unbind the old scheduler
+ */
+ if ((ret = ip_vs_unbind_scheduler(svc))) {
+ old_sched = sched;
+ goto out;
+ }
+
+ /*
+ * Bind the new scheduler
+ */
+ if ((ret = ip_vs_bind_scheduler(svc, sched))) {
+ /*
+ * If ip_vs_bind_scheduler fails, restore the old
+ * scheduler; the most likely reason for failure is
+ * lack of memory.
+ *
+ * It is an open question whether the old scheduler
+ * can always be restored. TODO: if at some point it
+ * cannot be restored, we must delete the service,
+ * otherwise the system may crash.
+ */
+ ip_vs_bind_scheduler(svc, old_sched);
+ old_sched = sched;
+ goto out;
+ }
+ }
+
+ out:
+ write_unlock_bh(&__ip_vs_svc_lock);
+
+ if (old_sched)
+ ip_vs_scheduler_put(old_sched);
+
+ return ret;
+}
+
+
+/*
+ * Delete a service from the service list
+ * - The service must be unlinked, unlocked and not referenced!
+ * - We are called under _bh lock
+ */
+static void __ip_vs_del_service(struct ip_vs_service *svc)
+{
+ struct ip_vs_dest *dest, *nxt;
+ struct ip_vs_scheduler *old_sched;
+
+ ip_vs_num_services--;
+ ip_vs_kill_estimator(&svc->stats);
+
+ /* Unbind scheduler */
+ old_sched = svc->scheduler;
+ ip_vs_unbind_scheduler(svc);
+ if (old_sched)
+ ip_vs_scheduler_put(old_sched);
+
+ /* Unbind app inc */
+ if (svc->inc) {
+ ip_vs_app_inc_put(svc->inc);
+ svc->inc = NULL;
+ }
+
+ /*
+ * Unlink the whole destination list
+ */
+ list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
+ __ip_vs_unlink_dest(svc, dest, 0);
+ __ip_vs_del_dest(dest);
+ }
+
+ /*
+ * Update the virtual service counters
+ */
+ if (svc->port == FTPPORT)
+ atomic_dec(&ip_vs_ftpsvc_counter);
+ else if (svc->port == 0)
+ atomic_dec(&ip_vs_nullsvc_counter);
+
+ /*
+ * Free the service if nobody refers to it
+ */
+ if (atomic_read(&svc->refcnt) == 0)
+ kfree(svc);
+
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+}
+
+/*
+ * Delete a service from the service list
+ */
+static int ip_vs_del_service(struct ip_vs_service *svc)
+{
+ if (svc == NULL)
+ return -EEXIST;
+
+ /*
+ * Unhash it from the service table
+ */
+ write_lock_bh(&__ip_vs_svc_lock);
+
+ ip_vs_svc_unhash(svc);
+
+ /*
+ * Wait until all the svc users go away.
+ */
+ IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+
+ __ip_vs_del_service(svc);
+
+ write_unlock_bh(&__ip_vs_svc_lock);
+
+ return 0;
+}
+
+
+/*
+ * Flush all the virtual services
+ */
+static int ip_vs_flush(void)
+{
+ int idx;
+ struct ip_vs_service *svc, *nxt;
+
+ /*
+ * Flush the service table hashed by <protocol,addr,port>
+ */
+ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
+ write_lock_bh(&__ip_vs_svc_lock);
+ ip_vs_svc_unhash(svc);
+ /*
+ * Wait until all the svc users go away.
+ */
+ IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
+ __ip_vs_del_service(svc);
+ write_unlock_bh(&__ip_vs_svc_lock);
+ }
+ }
+
+ /*
+ * Flush the service table hashed by fwmark
+ */
+ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ list_for_each_entry_safe(svc, nxt,
+ &ip_vs_svc_fwm_table[idx], f_list) {
+ write_lock_bh(&__ip_vs_svc_lock);
+ ip_vs_svc_unhash(svc);
+ /*
+ * Wait until all the svc users go away.
+ */
+ IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
+ __ip_vs_del_service(svc);
+ write_unlock_bh(&__ip_vs_svc_lock);
+ }
+ }
+
+ return 0;
+}
+
+
+/*
+ * Zero counters in a service or all services
+ */
+static int ip_vs_zero_service(struct ip_vs_service *svc)
+{
+ struct ip_vs_dest *dest;
+
+ write_lock_bh(&__ip_vs_svc_lock);
+ list_for_each_entry(dest, &svc->destinations, n_list) {
+ ip_vs_zero_stats(&dest->stats);
+ }
+ ip_vs_zero_stats(&svc->stats);
+ write_unlock_bh(&__ip_vs_svc_lock);
+ return 0;
+}
+
+static int ip_vs_zero_all(void)
+{
+ int idx;
+ struct ip_vs_service *svc;
+
+ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+ ip_vs_zero_service(svc);
+ }
+ }
+
+ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+ ip_vs_zero_service(svc);
+ }
+ }
+
+ ip_vs_zero_stats(&ip_vs_stats);
+ return 0;
+}
+
+
+static int
+proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int *valp = table->data;
+ int val = *valp;
+ int rc;
+
+ rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+ if (write && (*valp != val)) {
+ if ((*valp < 0) || (*valp > 3)) {
+ /* Restore the correct value */
+ *valp = val;
+ } else {
+ local_bh_disable();
+ update_defense_level();
+ local_bh_enable();
+ }
+ }
+ return rc;
+}
+
+
+static int
+proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int *valp = table->data;
+ int val[2];
+ int rc;
+
+ /* backup the value first */
+ memcpy(val, valp, sizeof(val));
+
+ rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+ if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
+ /* Restore the correct value */
+ memcpy(valp, val, sizeof(val));
+ }
+ return rc;
+}
+
+
+/*
+ * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
+ */
+
+static struct ctl_table vs_vars[] = {
+ {
+ .ctl_name = NET_IPV4_VS_AMEMTHRESH,
+ .procname = "amemthresh",
+ .data = &sysctl_ip_vs_amemthresh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#ifdef CONFIG_IP_VS_DEBUG
+ {
+ .ctl_name = NET_IPV4_VS_DEBUG_LEVEL,
+ .procname = "debug_level",
+ .data = &sysctl_ip_vs_debug_level,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+ {
+ .ctl_name = NET_IPV4_VS_AMDROPRATE,
+ .procname = "am_droprate",
+ .data = &sysctl_ip_vs_am_droprate,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_VS_DROP_ENTRY,
+ .procname = "drop_entry",
+ .data = &sysctl_ip_vs_drop_entry,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_do_defense_mode,
+ },
+ {
+ .ctl_name = NET_IPV4_VS_DROP_PACKET,
+ .procname = "drop_packet",
+ .data = &sysctl_ip_vs_drop_packet,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_do_defense_mode,
+ },
+ {
+ .ctl_name = NET_IPV4_VS_SECURE_TCP,
+ .procname = "secure_tcp",
+ .data = &sysctl_ip_vs_secure_tcp,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_do_defense_mode,
+ },
+#if 0
+ {
+ .ctl_name = NET_IPV4_VS_TO_ES,
+ .procname = "timeout_established",
+ .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_VS_TO_SS,
+ .procname = "timeout_synsent",
+ .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_VS_TO_SR,
+ .procname = "timeout_synrecv",
+ .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_VS_TO_FW,
+ .procname = "timeout_finwait",
+ .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_VS_TO_TW,
+ .procname = "timeout_timewait",
+ .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_VS_TO_CL,
+ .procname = "timeout_close",
+ .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_VS_TO_CW,
+ .procname = "timeout_closewait",
+ .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_VS_TO_LA,
+ .procname = "timeout_lastack",
+ .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_VS_TO_LI,
+ .procname = "timeout_listen",
+ .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_VS_TO_SA,
+ .procname = "timeout_synack",
+ .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_VS_TO_UDP,
+ .procname = "timeout_udp",
+ .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_VS_TO_ICMP,
+ .procname = "timeout_icmp",
+ .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+#endif
+ {
+ .ctl_name = NET_IPV4_VS_CACHE_BYPASS,
+ .procname = "cache_bypass",
+ .data = &sysctl_ip_vs_cache_bypass,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_VS_EXPIRE_NODEST_CONN,
+ .procname = "expire_nodest_conn",
+ .data = &sysctl_ip_vs_expire_nodest_conn,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE,
+ .procname = "expire_quiescent_template",
+ .data = &sysctl_ip_vs_expire_quiescent_template,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_VS_SYNC_THRESHOLD,
+ .procname = "sync_threshold",
+ .data = &sysctl_ip_vs_sync_threshold,
+ .maxlen = sizeof(sysctl_ip_vs_sync_threshold),
+ .mode = 0644,
+ .proc_handler = &proc_do_sync_threshold,
+ },
+ {
+ .ctl_name = NET_IPV4_VS_NAT_ICMP_SEND,
+ .procname = "nat_icmp_send",
+ .data = &sysctl_ip_vs_nat_icmp_send,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table vs_table[] = {
+ {
+ .ctl_name = NET_IPV4_VS,
+ .procname = "vs",
+ .mode = 0555,
+ .child = vs_vars
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table ipv4_table[] = {
+ {
+ .ctl_name = NET_IPV4,
+ .procname = "ipv4",
+ .mode = 0555,
+ .child = vs_table,
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table vs_root_table[] = {
+ {
+ .ctl_name = CTL_NET,
+ .procname = "net",
+ .mode = 0555,
+ .child = ipv4_table,
+ },
+ { .ctl_name = 0 }
+};
+
+static struct ctl_table_header * sysctl_header;
+
+#ifdef CONFIG_PROC_FS
+
+struct ip_vs_iter {
+ struct list_head *table;
+ int bucket;
+};
+
+/*
+ * Write the contents of the VS rule table to a PROCfs file.
+ * (It is kept just for backward compatibility)
+ */
+static inline const char *ip_vs_fwd_name(unsigned flags)
+{
+ switch (flags & IP_VS_CONN_F_FWD_MASK) {
+ case IP_VS_CONN_F_LOCALNODE:
+ return "Local";
+ case IP_VS_CONN_F_TUNNEL:
+ return "Tunnel";
+ case IP_VS_CONN_F_DROUTE:
+ return "Route";
+ default:
+ return "Masq";
+ }
+}
+
+
+/* Get the Nth entry in the two lists */
+static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
+{
+ struct ip_vs_iter *iter = seq->private;
+ int idx;
+ struct ip_vs_service *svc;
+
+ /* look in hash by protocol */
+ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+ if (pos-- == 0){
+ iter->table = ip_vs_svc_table;
+ iter->bucket = idx;
+ return svc;
+ }
+ }
+ }
+
+ /* keep looking in fwmark */
+ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+ if (pos-- == 0) {
+ iter->table = ip_vs_svc_fwm_table;
+ iter->bucket = idx;
+ return svc;
+ }
+ }
+ }
+
+ return NULL;
+}
+
+static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
+{
+
+ read_lock_bh(&__ip_vs_svc_lock);
+ return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+
+static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct list_head *e;
+ struct ip_vs_iter *iter;
+ struct ip_vs_service *svc;
+
+ ++*pos;
+ if (v == SEQ_START_TOKEN)
+ return ip_vs_info_array(seq,0);
+
+ svc = v;
+ iter = seq->private;
+
+ if (iter->table == ip_vs_svc_table) {
+ /* next service in table hashed by protocol */
+ if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
+ return list_entry(e, struct ip_vs_service, s_list);
+
+
+ while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
+ list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
+ s_list) {
+ return svc;
+ }
+ }
+
+ iter->table = ip_vs_svc_fwm_table;
+ iter->bucket = -1;
+ goto scan_fwmark;
+ }
+
+ /* next service in table hashed by fwmark */
+ if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
+ return list_entry(e, struct ip_vs_service, f_list);
+
+ scan_fwmark:
+ while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
+ list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
+ f_list)
+ return svc;
+ }
+
+ return NULL;
+}
+
+static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
+{
+ read_unlock_bh(&__ip_vs_svc_lock);
+}
+
+
+static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq,
+ "IP Virtual Server version %d.%d.%d (size=%d)\n",
+ NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
+ seq_puts(seq,
+ "Prot LocalAddress:Port Scheduler Flags\n");
+ seq_puts(seq,
+ " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
+ } else {
+ const struct ip_vs_service *svc = v;
+ const struct ip_vs_iter *iter = seq->private;
+ const struct ip_vs_dest *dest;
+
+ if (iter->table == ip_vs_svc_table)
+ seq_printf(seq, "%s %08X:%04X %s ",
+ ip_vs_proto_name(svc->protocol),
+ ntohl(svc->addr),
+ ntohs(svc->port),
+ svc->scheduler->name);
+ else
+ seq_printf(seq, "FWM %08X %s ",
+ svc->fwmark, svc->scheduler->name);
+
+ if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+ seq_printf(seq, "persistent %d %08X\n",
+ svc->timeout,
+ ntohl(svc->netmask));
+ else
+ seq_putc(seq, '\n');
+
+ list_for_each_entry(dest, &svc->destinations, n_list) {
+ seq_printf(seq,
+ " -> %08X:%04X %-7s %-6d %-10d %-10d\n",
+ ntohl(dest->addr), ntohs(dest->port),
+ ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
+ atomic_read(&dest->weight),
+ atomic_read(&dest->activeconns),
+ atomic_read(&dest->inactconns));
+ }
+ }
+ return 0;
+}
+
+static struct seq_operations ip_vs_info_seq_ops = {
+ .start = ip_vs_info_seq_start,
+ .next = ip_vs_info_seq_next,
+ .stop = ip_vs_info_seq_stop,
+ .show = ip_vs_info_seq_show,
+};
+
+static int ip_vs_info_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int rc = -ENOMEM;
+ struct ip_vs_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+ if (!s)
+ goto out;
+
+ rc = seq_open(file, &ip_vs_info_seq_ops);
+ if (rc)
+ goto out_kfree;
+
+ seq = file->private_data;
+ seq->private = s;
+ memset(s, 0, sizeof(*s));
+out:
+ return rc;
+out_kfree:
+ kfree(s);
+ goto out;
+}
+
+static struct file_operations ip_vs_info_fops = {
+ .owner = THIS_MODULE,
+ .open = ip_vs_info_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+#endif
+
+struct ip_vs_stats ip_vs_stats;
+
+#ifdef CONFIG_PROC_FS
+static int ip_vs_stats_show(struct seq_file *seq, void *v)
+{
+
+/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
+ seq_puts(seq,
+ " Total Incoming Outgoing Incoming Outgoing\n");
+ seq_printf(seq,
+ " Conns Packets Packets Bytes Bytes\n");
+
+ spin_lock_bh(&ip_vs_stats.lock);
+ seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
+ ip_vs_stats.inpkts, ip_vs_stats.outpkts,
+ (unsigned long long) ip_vs_stats.inbytes,
+ (unsigned long long) ip_vs_stats.outbytes);
+
+/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
+ seq_puts(seq,
+ " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
+ seq_printf(seq,"%8X %8X %8X %16X %16X\n",
+ ip_vs_stats.cps,
+ ip_vs_stats.inpps,
+ ip_vs_stats.outpps,
+ ip_vs_stats.inbps,
+ ip_vs_stats.outbps);
+ spin_unlock_bh(&ip_vs_stats.lock);
+
+ return 0;
+}
+
+static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, ip_vs_stats_show, NULL);
+}
+
+static struct file_operations ip_vs_stats_fops = {
+ .owner = THIS_MODULE,
+ .open = ip_vs_stats_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+#endif
+
+/*
+ * Set timeout values for tcp tcpfin udp in the timeout_table.
+ */
+static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
+{
+ IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
+ u->tcp_timeout,
+ u->tcp_fin_timeout,
+ u->udp_timeout);
+
+#ifdef CONFIG_IP_VS_PROTO_TCP
+ if (u->tcp_timeout) {
+ ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
+ = u->tcp_timeout * HZ;
+ }
+
+ if (u->tcp_fin_timeout) {
+ ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
+ = u->tcp_fin_timeout * HZ;
+ }
+#endif
+
+#ifdef CONFIG_IP_VS_PROTO_UDP
+ if (u->udp_timeout) {
+ ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
+ = u->udp_timeout * HZ;
+ }
+#endif
+ return 0;
+}
+
+
+#define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
+#define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user))
+#define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \
+ sizeof(struct ip_vs_dest_user))
+#define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
+#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user))
+#define MAX_ARG_LEN SVCDEST_ARG_LEN
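+
+/*
+ * For the *DEST commands the argument is a struct ip_vs_service_user
+ * immediately followed by a struct ip_vs_dest_user (SVCDEST_ARG_LEN),
+ * which is why do_ip_vs_set_ctl() below can simply take
+ * udest = (struct ip_vs_dest_user *)(usvc + 1).
+ */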
+
+static unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
+ [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN,
+ [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN,
+ [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN,
+ [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0,
+ [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN,
+ [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN,
+ [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN,
+ [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN,
+ [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN,
+ [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN,
+ [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN,
+};
+
+static int
+do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+{
+ int ret;
+ unsigned char arg[MAX_ARG_LEN];
+ struct ip_vs_service_user *usvc;
+ struct ip_vs_service *svc;
+ struct ip_vs_dest_user *udest;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (len != set_arglen[SET_CMDID(cmd)]) {
+ IP_VS_ERR("set_ctl: len %u != %u\n",
+ len, set_arglen[SET_CMDID(cmd)]);
+ return -EINVAL;
+ }
+
+ if (copy_from_user(arg, user, len) != 0)
+ return -EFAULT;
+
+ /* increase the module use count */
+ ip_vs_use_count_inc();
+
+ if (down_interruptible(&__ip_vs_mutex)) {
+ ret = -ERESTARTSYS;
+ goto out_dec;
+ }
+
+ if (cmd == IP_VS_SO_SET_FLUSH) {
+ /* Flush the virtual service */
+ ret = ip_vs_flush();
+ goto out_unlock;
+ } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
+ /* Set timeout values for (tcp tcpfin udp) */
+ ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
+ goto out_unlock;
+ } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
+ struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
+ ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
+ goto out_unlock;
+ } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
+ struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
+ ret = stop_sync_thread(dm->state);
+ goto out_unlock;
+ }
+
+ usvc = (struct ip_vs_service_user *)arg;
+ udest = (struct ip_vs_dest_user *)(usvc + 1);
+
+ if (cmd == IP_VS_SO_SET_ZERO) {
+ /* if no service address is set, zero counters in all */
+ if (!usvc->fwmark && !usvc->addr && !usvc->port) {
+ ret = ip_vs_zero_all();
+ goto out_unlock;
+ }
+ }
+
+ /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
+ if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
+ IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
+ usvc->protocol, NIPQUAD(usvc->addr),
+ ntohs(usvc->port), usvc->sched_name);
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ /* Lookup the exact service by <protocol, addr, port> or fwmark */
+ if (usvc->fwmark == 0)
+ svc = __ip_vs_service_get(usvc->protocol,
+ usvc->addr, usvc->port);
+ else
+ svc = __ip_vs_svc_fwm_get(usvc->fwmark);
+
+ if (cmd != IP_VS_SO_SET_ADD
+ && (svc == NULL || svc->protocol != usvc->protocol)) {
+ ret = -ESRCH;
+ goto out_unlock;
+ }
+
+ switch (cmd) {
+ case IP_VS_SO_SET_ADD:
+ if (svc != NULL)
+ ret = -EEXIST;
+ else
+ ret = ip_vs_add_service(usvc, &svc);
+ break;
+ case IP_VS_SO_SET_EDIT:
+ ret = ip_vs_edit_service(svc, usvc);
+ break;
+ case IP_VS_SO_SET_DEL:
+ ret = ip_vs_del_service(svc);
+ if (!ret)
+ goto out_unlock;
+ break;
+ case IP_VS_SO_SET_ZERO:
+ ret = ip_vs_zero_service(svc);
+ break;
+ case IP_VS_SO_SET_ADDDEST:
+ ret = ip_vs_add_dest(svc, udest);
+ break;
+ case IP_VS_SO_SET_EDITDEST:
+ ret = ip_vs_edit_dest(svc, udest);
+ break;
+ case IP_VS_SO_SET_DELDEST:
+ ret = ip_vs_del_dest(svc, udest);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ if (svc)
+ ip_vs_service_put(svc);
+
+ out_unlock:
+ up(&__ip_vs_mutex);
+ out_dec:
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+
+ return ret;
+}
+
+
+static void
+ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
+{
+ spin_lock_bh(&src->lock);
+ memcpy(dst, src, (char*)&src->lock - (char*)src);
+ spin_unlock_bh(&src->lock);
+}
+
+static void
+ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
+{
+ dst->protocol = src->protocol;
+ dst->addr = src->addr;
+ dst->port = src->port;
+ dst->fwmark = src->fwmark;
+ strcpy(dst->sched_name, src->scheduler->name);
+ dst->flags = src->flags;
+ dst->timeout = src->timeout / HZ;
+ dst->netmask = src->netmask;
+ dst->num_dests = src->num_dests;
+ ip_vs_copy_stats(&dst->stats, &src->stats);
+}
+
+static inline int
+__ip_vs_get_service_entries(const struct ip_vs_get_services *get,
+ struct ip_vs_get_services __user *uptr)
+{
+ int idx, count=0;
+ struct ip_vs_service *svc;
+ struct ip_vs_service_entry entry;
+ int ret = 0;
+
+ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+ if (count >= get->num_services)
+ goto out;
+ ip_vs_copy_service(&entry, svc);
+ if (copy_to_user(&uptr->entrytable[count],
+ &entry, sizeof(entry))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ count++;
+ }
+ }
+
+ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+ if (count >= get->num_services)
+ goto out;
+ ip_vs_copy_service(&entry, svc);
+ if (copy_to_user(&uptr->entrytable[count],
+ &entry, sizeof(entry))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ count++;
+ }
+ }
+ out:
+ return ret;
+}
+
+static inline int
+__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
+ struct ip_vs_get_dests __user *uptr)
+{
+ struct ip_vs_service *svc;
+ int ret = 0;
+
+ if (get->fwmark)
+ svc = __ip_vs_svc_fwm_get(get->fwmark);
+ else
+ svc = __ip_vs_service_get(get->protocol,
+ get->addr, get->port);
+ if (svc) {
+ int count = 0;
+ struct ip_vs_dest *dest;
+ struct ip_vs_dest_entry entry;
+
+ list_for_each_entry(dest, &svc->destinations, n_list) {
+ if (count >= get->num_dests)
+ break;
+
+ entry.addr = dest->addr;
+ entry.port = dest->port;
+ entry.conn_flags = atomic_read(&dest->conn_flags);
+ entry.weight = atomic_read(&dest->weight);
+ entry.u_threshold = dest->u_threshold;
+ entry.l_threshold = dest->l_threshold;
+ entry.activeconns = atomic_read(&dest->activeconns);
+ entry.inactconns = atomic_read(&dest->inactconns);
+ entry.persistconns = atomic_read(&dest->persistconns);
+ ip_vs_copy_stats(&entry.stats, &dest->stats);
+ if (copy_to_user(&uptr->entrytable[count],
+ &entry, sizeof(entry))) {
+ ret = -EFAULT;
+ break;
+ }
+ count++;
+ }
+ ip_vs_service_put(svc);
+ } else
+ ret = -ESRCH;
+ return ret;
+}
+
+static inline void
+__ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
+{
+#ifdef CONFIG_IP_VS_PROTO_TCP
+ u->tcp_timeout =
+ ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
+ u->tcp_fin_timeout =
+ ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+ u->udp_timeout =
+ ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
+#endif
+}
+
+
+#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
+#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo))
+#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services))
+#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry))
+#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests))
+#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
+#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2)
+
+static unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
+ [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64,
+ [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN,
+ [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN,
+ [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN,
+ [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN,
+ [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN,
+ [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN,
+};
+
+static int
+do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+{
+ unsigned char arg[128];
+ int ret = 0;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (*len < get_arglen[GET_CMDID(cmd)]) {
+ IP_VS_ERR("get_ctl: len %u < %u\n",
+ *len, get_arglen[GET_CMDID(cmd)]);
+ return -EINVAL;
+ }
+
+ if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
+ return -EFAULT;
+
+ if (down_interruptible(&__ip_vs_mutex))
+ return -ERESTARTSYS;
+
+ switch (cmd) {
+ case IP_VS_SO_GET_VERSION:
+ {
+ char buf[64];
+
+ sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
+ NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
+ if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
+ ret = -EFAULT;
+ goto out;
+ }
+ *len = strlen(buf)+1;
+ }
+ break;
+
+ case IP_VS_SO_GET_INFO:
+ {
+ struct ip_vs_getinfo info;
+ info.version = IP_VS_VERSION_CODE;
+ info.size = IP_VS_CONN_TAB_SIZE;
+ info.num_services = ip_vs_num_services;
+ if (copy_to_user(user, &info, sizeof(info)) != 0)
+ ret = -EFAULT;
+ }
+ break;
+
+ case IP_VS_SO_GET_SERVICES:
+ {
+ struct ip_vs_get_services *get;
+ int size;
+
+ get = (struct ip_vs_get_services *)arg;
+ size = sizeof(*get) +
+ sizeof(struct ip_vs_service_entry) * get->num_services;
+ if (*len != size) {
+ IP_VS_ERR("length: %u != %u\n", *len, size);
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = __ip_vs_get_service_entries(get, user);
+ }
+ break;
+
+ case IP_VS_SO_GET_SERVICE:
+ {
+ struct ip_vs_service_entry *entry;
+ struct ip_vs_service *svc;
+
+ entry = (struct ip_vs_service_entry *)arg;
+ if (entry->fwmark)
+ svc = __ip_vs_svc_fwm_get(entry->fwmark);
+ else
+ svc = __ip_vs_service_get(entry->protocol,
+ entry->addr, entry->port);
+ if (svc) {
+ ip_vs_copy_service(entry, svc);
+ if (copy_to_user(user, entry, sizeof(*entry)) != 0)
+ ret = -EFAULT;
+ ip_vs_service_put(svc);
+ } else
+ ret = -ESRCH;
+ }
+ break;
+
+ case IP_VS_SO_GET_DESTS:
+ {
+ struct ip_vs_get_dests *get;
+ int size;
+
+ get = (struct ip_vs_get_dests *)arg;
+ size = sizeof(*get) +
+ sizeof(struct ip_vs_dest_entry) * get->num_dests;
+ if (*len != size) {
+ IP_VS_ERR("length: %u != %u\n", *len, size);
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = __ip_vs_get_dest_entries(get, user);
+ }
+ break;
+
+ case IP_VS_SO_GET_TIMEOUT:
+ {
+ struct ip_vs_timeout_user t;
+
+ __ip_vs_get_timeouts(&t);
+ if (copy_to_user(user, &t, sizeof(t)) != 0)
+ ret = -EFAULT;
+ }
+ break;
+
+ case IP_VS_SO_GET_DAEMON:
+ {
+ struct ip_vs_daemon_user d[2];
+
+ memset(&d, 0, sizeof(d));
+ if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
+ d[0].state = IP_VS_STATE_MASTER;
+ strcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn);
+ d[0].syncid = ip_vs_master_syncid;
+ }
+ if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
+ d[1].state = IP_VS_STATE_BACKUP;
+ strcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn);
+ d[1].syncid = ip_vs_backup_syncid;
+ }
+ if (copy_to_user(user, &d, sizeof(d)) != 0)
+ ret = -EFAULT;
+ }
+ break;
+
+ default:
+ ret = -EINVAL;
+ }
+
+ out:
+ up(&__ip_vs_mutex);
+ return ret;
+}
+
+
+static struct nf_sockopt_ops ip_vs_sockopts = {
+ .pf = PF_INET,
+ .set_optmin = IP_VS_BASE_CTL,
+ .set_optmax = IP_VS_SO_SET_MAX+1,
+ .set = do_ip_vs_set_ctl,
+ .get_optmin = IP_VS_BASE_CTL,
+ .get_optmax = IP_VS_SO_GET_MAX+1,
+ .get = do_ip_vs_get_ctl,
+};
+
+
+int ip_vs_control_init(void)
+{
+ int ret;
+ int idx;
+
+ EnterFunction(2);
+
+ ret = nf_register_sockopt(&ip_vs_sockopts);
+ if (ret) {
+ IP_VS_ERR("cannot register sockopt.\n");
+ return ret;
+ }
+
+ proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops);
+ proc_net_fops_create("ip_vs_stats",0, &ip_vs_stats_fops);
+
+ sysctl_header = register_sysctl_table(vs_root_table, 0);
+
+ /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
+ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
+ INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
+ }
+ for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
+ INIT_LIST_HEAD(&ip_vs_rtable[idx]);
+ }
+
+ memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
+ spin_lock_init(&ip_vs_stats.lock);
+ ip_vs_new_estimator(&ip_vs_stats);
+
+ /* Hook the defense timer */
+ schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
+
+ LeaveFunction(2);
+ return 0;
+}
+
+
+void ip_vs_control_cleanup(void)
+{
+ EnterFunction(2);
+ ip_vs_trash_cleanup();
+ cancel_rearming_delayed_work(&defense_work);
+ ip_vs_kill_estimator(&ip_vs_stats);
+ unregister_sysctl_table(sysctl_header);
+ proc_net_remove("ip_vs_stats");
+ proc_net_remove("ip_vs");
+ nf_unregister_sockopt(&ip_vs_sockopts);
+ LeaveFunction(2);
+}
diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c
new file mode 100644
index 00000000000..f3bc320dce9
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_dh.c
@@ -0,0 +1,258 @@
+/*
+ * IPVS: Destination Hashing scheduling module
+ *
+ * Version: $Id: ip_vs_dh.c,v 1.5 2002/09/15 08:14:08 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@gnuchina.org>
+ *
+ * Inspired by the consistent hashing scheduler patch from
+ * Thomas Proell <proellt@gmx.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The dh algorithm selects a server by the hash key of the destination IP
+ * address. The pseudo code is as follows:
+ *
+ * n <- servernode[dest_ip];
+ * if (n is dead) OR
+ * (n is overloaded) OR (n.weight <= 0) then
+ * return NULL;
+ *
+ * return n;
+ *
+ * Note that servernode is a 256-bucket hash table that maps the hash
+ * index derived from the packet destination IP address to the current
+ * server array. If the dh scheduler is used in a cache cluster, it is
+ * good to combine it with the cache_bypass feature. When the statically
+ * assigned server is dead or overloaded, the load balancer can bypass
+ * the cache server and send requests to the original server directly.
+ *
+ */
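+
+/*
+ * For example, with the default 8-bit table this gives 256 buckets;
+ * every packet whose destination IP hashes to the same bucket is sent
+ * to the same real server, and the mapping changes only when the
+ * destination list of the service changes (ip_vs_dh_update_svc below
+ * reassigns the buckets by walking the destination list).
+ */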
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * IPVS DH bucket
+ */
+struct ip_vs_dh_bucket {
+ struct ip_vs_dest *dest; /* real server (cache) */
+};
+
+/*
+ * for IPVS DH entry hash table
+ */
+#ifndef CONFIG_IP_VS_DH_TAB_BITS
+#define CONFIG_IP_VS_DH_TAB_BITS 8
+#endif
+#define IP_VS_DH_TAB_BITS CONFIG_IP_VS_DH_TAB_BITS
+#define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS)
+#define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1)
+
+
+/*
+ * Returns hash value for IPVS DH entry
+ */
+static inline unsigned ip_vs_dh_hashkey(__u32 addr)
+{
+ return (ntohl(addr)*2654435761UL) & IP_VS_DH_TAB_MASK;
+}
+
+
+/*
+ * Get ip_vs_dest associated with supplied parameters.
+ */
+static inline struct ip_vs_dest *
+ip_vs_dh_get(struct ip_vs_dh_bucket *tbl, __u32 addr)
+{
+ return (tbl[ip_vs_dh_hashkey(addr)]).dest;
+}
+
+
+/*
+ * Assign all the hash buckets of the specified table with the service.
+ */
+static int
+ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc)
+{
+ int i;
+ struct ip_vs_dh_bucket *b;
+ struct list_head *p;
+ struct ip_vs_dest *dest;
+
+ b = tbl;
+ p = &svc->destinations;
+ for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
+ if (list_empty(p)) {
+ b->dest = NULL;
+ } else {
+ if (p == &svc->destinations)
+ p = p->next;
+
+ dest = list_entry(p, struct ip_vs_dest, n_list);
+ atomic_inc(&dest->refcnt);
+ b->dest = dest;
+
+ p = p->next;
+ }
+ b++;
+ }
+ return 0;
+}
+
+
+/*
+ * Flush all the hash buckets of the specified table.
+ */
+static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl)
+{
+ int i;
+ struct ip_vs_dh_bucket *b;
+
+ b = tbl;
+ for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
+ if (b->dest) {
+ atomic_dec(&b->dest->refcnt);
+ b->dest = NULL;
+ }
+ b++;
+ }
+}
+
+
+static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_dh_bucket *tbl;
+
+ /* allocate the DH table for this service */
+ tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE,
+ GFP_ATOMIC);
+ if (tbl == NULL) {
+ IP_VS_ERR("ip_vs_dh_init_svc(): no memory\n");
+ return -ENOMEM;
+ }
+ svc->sched_data = tbl;
+ IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for "
+ "current service\n",
+ sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
+
+ /* assign the hash buckets with the updated service */
+ ip_vs_dh_assign(tbl, svc);
+
+ return 0;
+}
+
+
+static int ip_vs_dh_done_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_dh_bucket *tbl = svc->sched_data;
+
+ /* got to clean up hash buckets here */
+ ip_vs_dh_flush(tbl);
+
+ /* release the table itself */
+ kfree(svc->sched_data);
+ IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n",
+ sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
+
+ return 0;
+}
+
+
+static int ip_vs_dh_update_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_dh_bucket *tbl = svc->sched_data;
+
+ /* got to clean up hash buckets here */
+ ip_vs_dh_flush(tbl);
+
+ /* assign the hash buckets with the updated service */
+ ip_vs_dh_assign(tbl, svc);
+
+ return 0;
+}
+
+
+/*
+ * If the IP_VS_DEST_F_OVERLOAD flag is set in the dest flags,
+ * consider that the server is overloaded here.
+ */
+static inline int is_overloaded(struct ip_vs_dest *dest)
+{
+ return dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
+
+/*
+ * Destination hashing scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+ struct ip_vs_dest *dest;
+ struct ip_vs_dh_bucket *tbl;
+ struct iphdr *iph = skb->nh.iph;
+
+ IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n");
+
+ tbl = (struct ip_vs_dh_bucket *)svc->sched_data;
+ dest = ip_vs_dh_get(tbl, iph->daddr);
+ if (!dest
+ || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
+ || atomic_read(&dest->weight) <= 0
+ || is_overloaded(dest)) {
+ return NULL;
+ }
+
+ IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u "
+ "--> server %u.%u.%u.%u:%d\n",
+ NIPQUAD(iph->daddr),
+ NIPQUAD(dest->addr),
+ ntohs(dest->port));
+
+ return dest;
+}
+
+
+/*
+ * IPVS DH Scheduler structure
+ */
+static struct ip_vs_scheduler ip_vs_dh_scheduler =
+{
+ .name = "dh",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .init_service = ip_vs_dh_init_svc,
+ .done_service = ip_vs_dh_done_svc,
+ .update_service = ip_vs_dh_update_svc,
+ .schedule = ip_vs_dh_schedule,
+};
+
+
+static int __init ip_vs_dh_init(void)
+{
+ INIT_LIST_HEAD(&ip_vs_dh_scheduler.n_list);
+ return register_ip_vs_scheduler(&ip_vs_dh_scheduler);
+}
+
+
+static void __exit ip_vs_dh_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_dh_scheduler);
+}
+
+
+module_init(ip_vs_dh_init);
+module_exit(ip_vs_dh_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c
new file mode 100644
index 00000000000..67b3e2fc1fa
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_est.c
@@ -0,0 +1,200 @@
+/*
+ * ip_vs_est.c: simple rate estimator for IPVS
+ *
+ * Version: $Id: ip_vs_est.c,v 1.4 2002/11/30 01:50:35 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+#include <net/ip_vs.h>
+
+/*
+ This code estimates the rate over a short interval (such as 8
+ seconds) for virtual services and real servers. To measure the rate
+ over a long interval, it is easy to implement a user-level daemon
+ that periodically reads these statistical counters and computes the
+ rate.
+
+ Currently, the measurement is driven by a slow timer handler. Hopefully
+ this measurement will not introduce too much load.
+
+ We measure the rate over the last 8 seconds, every 2 seconds:
+
+ avgrate = avgrate*(1-W) + rate*W
+
+ where W = 2^(-2)
+
+ NOTES.
+
+ * The stored value for the average bps is scaled by 2^5, so that the
+ maximal rate is ~2.15Gbits/s; average pps and cps are scaled by 2^10.
+
+ * A lot of code is taken from net/sched/estimator.c
+ */
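+
+/*
+ * For example: the timer below fires every 2 seconds, so a steady
+ * 1000 pkt/s input gives a per-tick delta of 2000 packets and
+ * rate = 2000<<9 = 1000<<10; the scaled average e->inpps converges
+ * to that value and the reported s->inpps = (e->inpps+0x1FF)>>10
+ * comes back to 1000.
+ */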
+
+
+struct ip_vs_estimator
+{
+ struct ip_vs_estimator *next;
+ struct ip_vs_stats *stats;
+
+ u32 last_conns;
+ u32 last_inpkts;
+ u32 last_outpkts;
+ u64 last_inbytes;
+ u64 last_outbytes;
+
+ u32 cps;
+ u32 inpps;
+ u32 outpps;
+ u32 inbps;
+ u32 outbps;
+};
+
+
+static struct ip_vs_estimator *est_list = NULL;
+static DEFINE_RWLOCK(est_lock);
+static struct timer_list est_timer;
+
+static void estimation_timer(unsigned long arg)
+{
+ struct ip_vs_estimator *e;
+ struct ip_vs_stats *s;
+ u32 n_conns;
+ u32 n_inpkts, n_outpkts;
+ u64 n_inbytes, n_outbytes;
+ u32 rate;
+
+ read_lock(&est_lock);
+ for (e = est_list; e; e = e->next) {
+ s = e->stats;
+
+ spin_lock(&s->lock);
+ n_conns = s->conns;
+ n_inpkts = s->inpkts;
+ n_outpkts = s->outpkts;
+ n_inbytes = s->inbytes;
+ n_outbytes = s->outbytes;
+
+ /* scaled by 2^10, but divided by the 2-second interval */
+ rate = (n_conns - e->last_conns)<<9;
+ e->last_conns = n_conns;
+ e->cps += ((long)rate - (long)e->cps)>>2;
+ s->cps = (e->cps+0x1FF)>>10;
+
+ rate = (n_inpkts - e->last_inpkts)<<9;
+ e->last_inpkts = n_inpkts;
+ e->inpps += ((long)rate - (long)e->inpps)>>2;
+ s->inpps = (e->inpps+0x1FF)>>10;
+
+ rate = (n_outpkts - e->last_outpkts)<<9;
+ e->last_outpkts = n_outpkts;
+ e->outpps += ((long)rate - (long)e->outpps)>>2;
+ s->outpps = (e->outpps+0x1FF)>>10;
+
+ rate = (n_inbytes - e->last_inbytes)<<4;
+ e->last_inbytes = n_inbytes;
+ e->inbps += ((long)rate - (long)e->inbps)>>2;
+ s->inbps = (e->inbps+0xF)>>5;
+
+ rate = (n_outbytes - e->last_outbytes)<<4;
+ e->last_outbytes = n_outbytes;
+ e->outbps += ((long)rate - (long)e->outbps)>>2;
+ s->outbps = (e->outbps+0xF)>>5;
+ spin_unlock(&s->lock);
+ }
+ read_unlock(&est_lock);
+ mod_timer(&est_timer, jiffies + 2*HZ);
+}
+
+int ip_vs_new_estimator(struct ip_vs_stats *stats)
+{
+ struct ip_vs_estimator *est;
+
+ est = kmalloc(sizeof(*est), GFP_KERNEL);
+ if (est == NULL)
+ return -ENOMEM;
+
+ memset(est, 0, sizeof(*est));
+ est->stats = stats;
+ est->last_conns = stats->conns;
+ est->cps = stats->cps<<10;
+
+ est->last_inpkts = stats->inpkts;
+ est->inpps = stats->inpps<<10;
+
+ est->last_outpkts = stats->outpkts;
+ est->outpps = stats->outpps<<10;
+
+ est->last_inbytes = stats->inbytes;
+ est->inbps = stats->inbps<<5;
+
+ est->last_outbytes = stats->outbytes;
+ est->outbps = stats->outbps<<5;
+
+ write_lock_bh(&est_lock);
+ est->next = est_list;
+ if (est->next == NULL) {
+ init_timer(&est_timer);
+ est_timer.expires = jiffies + 2*HZ;
+ est_timer.function = estimation_timer;
+ add_timer(&est_timer);
+ }
+ est_list = est;
+ write_unlock_bh(&est_lock);
+ return 0;
+}
+
+void ip_vs_kill_estimator(struct ip_vs_stats *stats)
+{
+ struct ip_vs_estimator *est, **pest;
+ int killed = 0;
+
+ write_lock_bh(&est_lock);
+ pest = &est_list;
+ while ((est=*pest) != NULL) {
+ if (est->stats != stats) {
+ pest = &est->next;
+ continue;
+ }
+ *pest = est->next;
+ kfree(est);
+ killed++;
+ }
+ if (killed && est_list == NULL)
+ del_timer_sync(&est_timer);
+ write_unlock_bh(&est_lock);
+}
+
+void ip_vs_zero_estimator(struct ip_vs_stats *stats)
+{
+ struct ip_vs_estimator *e;
+
+ write_lock_bh(&est_lock);
+ for (e = est_list; e; e = e->next) {
+ if (e->stats != stats)
+ continue;
+
+ /* set counters zero */
+ e->last_conns = 0;
+ e->last_inpkts = 0;
+ e->last_outpkts = 0;
+ e->last_inbytes = 0;
+ e->last_outbytes = 0;
+ e->cps = 0;
+ e->inpps = 0;
+ e->outpps = 0;
+ e->inbps = 0;
+ e->outbps = 0;
+ }
+ write_unlock_bh(&est_lock);
+}
diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c
new file mode 100644
index 00000000000..a19a33ceb81
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_ftp.c
@@ -0,0 +1,400 @@
+/*
+ * ip_vs_ftp.c: IPVS ftp application module
+ *
+ * Version: $Id: ip_vs_ftp.c,v 1.13 2002/09/15 08:14:08 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * Changes:
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference
+ * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp.
+ *
+ * IP_MASQ_FTP ftp masquerading module
+ *
+ * Version: @(#)ip_masq_ftp.c 0.04 02/05/96
+ *
+ * Author: Wouter Gadeyne
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+
+#include <net/ip_vs.h>
+
+
+#define SERVER_STRING "227 Entering Passive Mode ("
+#define CLIENT_STRING "PORT "
+
+
+/*
+ * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by the helper.
+ * The first port is set to the default port.
+ */
+static int ports[IP_VS_APP_MAX_PORTS] = {21, 0};
+module_param_array(ports, int, NULL, 0);
+
+/*
+ * Debug level
+ */
+#ifdef CONFIG_IP_VS_DEBUG
+static int debug=0;
+module_param(debug, int, 0);
+#endif
+
+
+/* Dummy variable */
+static int ip_vs_ftp_pasv;
+
+
+static int
+ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
+{
+ return 0;
+}
+
+
+static int
+ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
+{
+ return 0;
+}
+
+
+/*
+ * Get <addr,port> from the string "xxx,xxx,xxx,xxx,ppp,ppp", starting
+ * with the "pattern" and terminated with the "term" character.
+ * <addr,port> is in network order.
+ */
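+/*
+ * For example, "192,168,0,9,4,21" denotes host 192.168.0.9 and port
+ * 4*256+21 = 1045: the six decimal fields are the four address octets
+ * followed by the high and low bytes of the port.  *start and *end are
+ * set to bracket the numeric substring so that the caller can rewrite
+ * it in place.
+ */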
+static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
+ const char *pattern, size_t plen, char term,
+ __u32 *addr, __u16 *port,
+ char **start, char **end)
+{
+ unsigned char p[6];
+ int i = 0;
+
+ if (data_limit - data < plen) {
+ /* check if there is partial match */
+ if (strnicmp(data, pattern, data_limit - data) == 0)
+ return -1;
+ else
+ return 0;
+ }
+
+ if (strnicmp(data, pattern, plen) != 0) {
+ return 0;
+ }
+ *start = data + plen;
+
+ for (data = *start; *data != term; data++) {
+ if (data == data_limit)
+ return -1;
+ }
+ *end = data;
+
+ memset(p, 0, sizeof(p));
+ for (data = *start; data != *end; data++) {
+ if (*data >= '0' && *data <= '9') {
+ p[i] = p[i]*10 + *data - '0';
+ } else if (*data == ',' && i < 5) {
+ i++;
+ } else {
+ /* unexpected character */
+ return -1;
+ }
+ }
+
+ if (i != 5)
+ return -1;
+
+ *addr = (p[3]<<24) | (p[2]<<16) | (p[1]<<8) | p[0];
+ *port = (p[5]<<8) | p[4];
+ return 1;
+}
+
+
+/*
+ * Look at outgoing ftp packets to catch the response to a PASV command
+ * from the server (inside-to-outside).
+ * When we see one, we build a connection entry with the client address,
+ * client port 0 (unknown at the moment), the server address and the
+ * server port. Mark the current connection entry as a control channel
+ * of the new entry. All this work is done just so that the data
+ * connection can be scheduled to the right server later.
+ *
+ * The outgoing packet should be something like
+ * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
+ * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
+ */
+static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
+ struct sk_buff **pskb, int *diff)
+{
+ struct iphdr *iph;
+ struct tcphdr *th;
+ char *data, *data_limit;
+ char *start, *end;
+ __u32 from;
+ __u16 port;
+ struct ip_vs_conn *n_cp;
+ char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */
+ unsigned buf_len;
+ int ret;
+
+ *diff = 0;
+
+ /* Only useful for established sessions */
+ if (cp->state != IP_VS_TCP_S_ESTABLISHED)
+ return 1;
+
+ /* Linear packets are much easier to deal with. */
+ if (!ip_vs_make_skb_writable(pskb, (*pskb)->len))
+ return 0;
+
+ if (cp->app_data == &ip_vs_ftp_pasv) {
+ iph = (*pskb)->nh.iph;
+ th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
+ data = (char *)th + (th->doff << 2);
+ data_limit = (*pskb)->tail;
+
+ if (ip_vs_ftp_get_addrport(data, data_limit,
+ SERVER_STRING,
+ sizeof(SERVER_STRING)-1, ')',
+ &from, &port,
+ &start, &end) != 1)
+ return 1;
+
+ IP_VS_DBG(1-debug, "PASV response (%u.%u.%u.%u:%d) -> "
+ "%u.%u.%u.%u:%d detected\n",
+ NIPQUAD(from), ntohs(port), NIPQUAD(cp->caddr), 0);
+
+ /*
+		 * Now update or create a connection entry for it
+ */
+ n_cp = ip_vs_conn_out_get(iph->protocol, from, port,
+ cp->caddr, 0);
+ if (!n_cp) {
+ n_cp = ip_vs_conn_new(IPPROTO_TCP,
+ cp->caddr, 0,
+ cp->vaddr, port,
+ from, port,
+ IP_VS_CONN_F_NO_CPORT,
+ cp->dest);
+ if (!n_cp)
+ return 0;
+
+ /* add its controller */
+ ip_vs_control_add(n_cp, cp);
+ }
+
+ /*
+ * Replace the old passive address with the new one
+ */
+ from = n_cp->vaddr;
+ port = n_cp->vport;
+ sprintf(buf,"%d,%d,%d,%d,%d,%d", NIPQUAD(from),
+ port&255, (port>>8)&255);
+ buf_len = strlen(buf);
+
+ /*
+ * Calculate required delta-offset to keep TCP happy
+ */
+ *diff = buf_len - (end-start);
+
+ if (*diff == 0) {
+ /* simply replace it with new passive address */
+ memcpy(start, buf, buf_len);
+ ret = 1;
+ } else {
+ ret = !ip_vs_skb_replace(*pskb, GFP_ATOMIC, start,
+ end-start, buf, buf_len);
+ }
+
+ cp->app_data = NULL;
+ ip_vs_tcp_conn_listen(n_cp);
+ ip_vs_conn_put(n_cp);
+ return ret;
+ }
+ return 1;
+}
+
+
+/*
+ * Look at incoming ftp packets to catch the PASV/PORT command
+ * (outside-to-inside).
+ *
+ * The incoming packet having the PORT command should be something like
+ * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n".
+ * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number.
+ * In this case, we create a connection entry using the client address and
+ * port, so that the active ftp data connection from the server can reach
+ * the client.
+ */
+static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
+ struct sk_buff **pskb, int *diff)
+{
+ struct iphdr *iph;
+ struct tcphdr *th;
+ char *data, *data_start, *data_limit;
+ char *start, *end;
+ __u32 to;
+ __u16 port;
+ struct ip_vs_conn *n_cp;
+
+ /* no diff required for incoming packets */
+ *diff = 0;
+
+ /* Only useful for established sessions */
+ if (cp->state != IP_VS_TCP_S_ESTABLISHED)
+ return 1;
+
+ /* Linear packets are much easier to deal with. */
+ if (!ip_vs_make_skb_writable(pskb, (*pskb)->len))
+ return 0;
+
+ /*
+ * Detecting whether it is passive
+ */
+ iph = (*pskb)->nh.iph;
+ th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
+
+	/* Since there may be options in the TCP header and the data
+	   offset (HLEN) gives the header length in 32-bit words, the
+	   application data starts at th + HLEN*4 */
+ data = data_start = (char *)th + (th->doff << 2);
+ data_limit = (*pskb)->tail;
+
+ while (data <= data_limit - 6) {
+ if (strnicmp(data, "PASV\r\n", 6) == 0) {
+ /* Passive mode on */
+ IP_VS_DBG(1-debug, "got PASV at %zd of %zd\n",
+ data - data_start,
+ data_limit - data_start);
+ cp->app_data = &ip_vs_ftp_pasv;
+ return 1;
+ }
+ data++;
+ }
+
+ /*
+	 * To support the virtual FTP server, the scenario is as follows:
+	 *       FTP client ----> Load Balancer ----> FTP server
+	 * First detect the port number in the application data,
+	 * then create a new connection entry for the incoming data
+	 * connection.
+ */
+ if (ip_vs_ftp_get_addrport(data_start, data_limit,
+ CLIENT_STRING, sizeof(CLIENT_STRING)-1,
+ '\r', &to, &port,
+ &start, &end) != 1)
+ return 1;
+
+ IP_VS_DBG(1-debug, "PORT %u.%u.%u.%u:%d detected\n",
+ NIPQUAD(to), ntohs(port));
+
+ /* Passive mode off */
+ cp->app_data = NULL;
+
+ /*
+ * Now update or create a connection entry for it
+ */
+ IP_VS_DBG(1-debug, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n",
+ ip_vs_proto_name(iph->protocol),
+ NIPQUAD(to), ntohs(port), NIPQUAD(cp->vaddr), 0);
+
+ n_cp = ip_vs_conn_in_get(iph->protocol,
+ to, port,
+ cp->vaddr, htons(ntohs(cp->vport)-1));
+ if (!n_cp) {
+ n_cp = ip_vs_conn_new(IPPROTO_TCP,
+ to, port,
+ cp->vaddr, htons(ntohs(cp->vport)-1),
+ cp->daddr, htons(ntohs(cp->dport)-1),
+ 0,
+ cp->dest);
+ if (!n_cp)
+ return 0;
+
+ /* add its controller */
+ ip_vs_control_add(n_cp, cp);
+ }
+
+ /*
+	 *	Move the new connection to the listen state
+ */
+ ip_vs_tcp_conn_listen(n_cp);
+ ip_vs_conn_put(n_cp);
+
+ return 1;
+}
+
+
+static struct ip_vs_app ip_vs_ftp = {
+ .name = "ftp",
+ .type = IP_VS_APP_TYPE_FTP,
+ .protocol = IPPROTO_TCP,
+ .module = THIS_MODULE,
+ .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list),
+ .init_conn = ip_vs_ftp_init_conn,
+ .done_conn = ip_vs_ftp_done_conn,
+ .bind_conn = NULL,
+ .unbind_conn = NULL,
+ .pkt_out = ip_vs_ftp_out,
+ .pkt_in = ip_vs_ftp_in,
+};
+
+
+/*
+ * ip_vs_ftp initialization
+ */
+static int __init ip_vs_ftp_init(void)
+{
+ int i, ret;
+ struct ip_vs_app *app = &ip_vs_ftp;
+
+ ret = register_ip_vs_app(app);
+ if (ret)
+ return ret;
+
+ for (i=0; i<IP_VS_APP_MAX_PORTS; i++) {
+ if (!ports[i])
+ continue;
+ ret = register_ip_vs_app_inc(app, app->protocol, ports[i]);
+ if (ret)
+ break;
+ IP_VS_DBG(1-debug, "%s: loaded support on port[%d] = %d\n",
+ app->name, i, ports[i]);
+ }
+
+ if (ret)
+ unregister_ip_vs_app(app);
+
+ return ret;
+}
+
+
+/*
+ * ip_vs_ftp finish.
+ */
+static void __exit ip_vs_ftp_exit(void)
+{
+ unregister_ip_vs_app(&ip_vs_ftp);
+}
+
+
+module_init(ip_vs_ftp_init);
+module_exit(ip_vs_ftp_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
new file mode 100644
index 00000000000..c035838b780
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -0,0 +1,624 @@
+/*
+ * IPVS: Locality-Based Least-Connection scheduling module
+ *
+ * Version: $Id: ip_vs_lblc.c,v 1.10 2002/09/15 08:14:08 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@gnuchina.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Martin Hamilton : fixed the terrible locking bugs
+ * *lock(tbl->lock) ==> *lock(&tbl->lock)
+ *     Wensong Zhang           :    fixed the uninitialized tbl->lock bug
+ * Wensong Zhang : added doing full expiration check to
+ * collect stale entries of 24+ hours when
+ * no partial expire check in a half hour
+ * Julian Anastasov : replaced del_timer call with del_timer_sync
+ * to avoid the possible race between timer
+ * handler and del_timer thread in SMP
+ *
+ */
+
+/*
+ * The lblc algorithm is as follows (pseudo code):
+ *
+ * if cachenode[dest_ip] is null then
+ * n, cachenode[dest_ip] <- {weighted least-conn node};
+ * else
+ * n <- cachenode[dest_ip];
+ * if (n is dead) OR
+ * (n.conns>n.weight AND
+ * there is a node m with m.conns<m.weight/2) then
+ * n, cachenode[dest_ip] <- {weighted least-conn node};
+ *
+ * return n;
+ *
+ * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
+ * me to write this module.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+/* for sysctl */
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * It is for garbage collection of stale IPVS lblc entries,
+ * when the table is full.
+ */
+#define CHECK_EXPIRE_INTERVAL (60*HZ)
+#define ENTRY_TIMEOUT (6*60*HZ)
+
+/*
+ * It is for full expiration check.
+ * When there is no partial expiration check (garbage collection)
+ * in a half hour, do a full expiration check to collect stale
+ * entries that haven't been touched for a day.
+ */
+#define COUNT_FOR_FULL_EXPIRATION 30
+static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;
+
+
+/*
+ * for IPVS lblc entry hash table
+ */
+#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
+#define CONFIG_IP_VS_LBLC_TAB_BITS 10
+#endif
+#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS
+#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS)
+#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1)
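+
+/*
+ * Example: with the default CONFIG_IP_VS_LBLC_TAB_BITS of 10 this gives
+ * 1024 hash buckets, and ip_vs_lblc_init_svc() below caps the table at
+ * 16*1024 = 16384 entries before the garbage collector starts trimming.
+ */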
+
+
+/*
+ * IPVS lblc entry represents an association between destination
+ * IP address and its destination server
+ */
+struct ip_vs_lblc_entry {
+ struct list_head list;
+ __u32 addr; /* destination IP address */
+ struct ip_vs_dest *dest; /* real server (cache) */
+ unsigned long lastuse; /* last used time */
+};
+
+
+/*
+ * IPVS lblc hash table
+ */
+struct ip_vs_lblc_table {
+ rwlock_t lock; /* lock for this table */
+ struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
+ atomic_t entries; /* number of entries */
+ int max_size; /* maximum size of entries */
+ struct timer_list periodic_timer; /* collect stale entries */
+ int rover; /* rover for expire check */
+ int counter; /* counter for no expire */
+};
+
+
+/*
+ * IPVS LBLC sysctl table
+ */
+
+static ctl_table vs_vars_table[] = {
+ {
+ .ctl_name = NET_IPV4_VS_LBLC_EXPIRE,
+ .procname = "lblc_expiration",
+ .data = &sysctl_ip_vs_lblc_expiration,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table vs_table[] = {
+ {
+ .ctl_name = NET_IPV4_VS,
+ .procname = "vs",
+ .mode = 0555,
+ .child = vs_vars_table
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table ipv4_table[] = {
+ {
+ .ctl_name = NET_IPV4,
+ .procname = "ipv4",
+ .mode = 0555,
+ .child = vs_table
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table lblc_root_table[] = {
+ {
+ .ctl_name = CTL_NET,
+ .procname = "net",
+ .mode = 0555,
+ .child = ipv4_table
+ },
+ { .ctl_name = 0 }
+};
+
+static struct ctl_table_header * sysctl_header;
+
+/*
+ * new/free an ip_vs_lblc_entry, which is a mapping of a destination
+ * IP address to a server.
+ */
+static inline struct ip_vs_lblc_entry *
+ip_vs_lblc_new(__u32 daddr, struct ip_vs_dest *dest)
+{
+ struct ip_vs_lblc_entry *en;
+
+ en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC);
+ if (en == NULL) {
+ IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&en->list);
+ en->addr = daddr;
+
+ atomic_inc(&dest->refcnt);
+ en->dest = dest;
+
+ return en;
+}
+
+
+static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
+{
+ list_del(&en->list);
+ /*
+	 * We don't kfree dest because it is referred to either by its
+	 * service or by the trash dest list.
+ */
+ atomic_dec(&en->dest->refcnt);
+ kfree(en);
+}
+
+
+/*
+ *	Returns hash value for IPVS LBLC entry
+ *	(multiplicative hashing with a constant derived from the golden ratio)
+ */
+static inline unsigned ip_vs_lblc_hashkey(__u32 addr)
+{
+ return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
+}
+
+
+/*
+ * Hash an entry in the ip_vs_lblc_table.
+ * returns bool success.
+ */
+static int
+ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
+{
+ unsigned hash;
+
+ if (!list_empty(&en->list)) {
+ IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, "
+ "called from %p\n", __builtin_return_address(0));
+ return 0;
+ }
+
+ /*
+ * Hash by destination IP address
+ */
+ hash = ip_vs_lblc_hashkey(en->addr);
+
+ write_lock(&tbl->lock);
+ list_add(&en->list, &tbl->bucket[hash]);
+ atomic_inc(&tbl->entries);
+ write_unlock(&tbl->lock);
+
+ return 1;
+}
+
+
+#if 0000
+/*
+ * Unhash ip_vs_lblc_entry from ip_vs_lblc_table.
+ * returns bool success.
+ */
+static int ip_vs_lblc_unhash(struct ip_vs_lblc_table *tbl,
+ struct ip_vs_lblc_entry *en)
+{
+ if (list_empty(&en->list)) {
+ IP_VS_ERR("ip_vs_lblc_unhash(): request for not hashed entry, "
+ "called from %p\n", __builtin_return_address(0));
+ return 0;
+ }
+
+ /*
+ * Remove it from the table
+ */
+ write_lock(&tbl->lock);
+ list_del(&en->list);
+ INIT_LIST_HEAD(&en->list);
+ write_unlock(&tbl->lock);
+
+ return 1;
+}
+#endif
+
+
+/*
+ * Get ip_vs_lblc_entry associated with supplied parameters.
+ */
+static inline struct ip_vs_lblc_entry *
+ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __u32 addr)
+{
+ unsigned hash;
+ struct ip_vs_lblc_entry *en;
+
+ hash = ip_vs_lblc_hashkey(addr);
+
+ read_lock(&tbl->lock);
+
+ list_for_each_entry(en, &tbl->bucket[hash], list) {
+ if (en->addr == addr) {
+ /* HIT */
+ read_unlock(&tbl->lock);
+ return en;
+ }
+ }
+
+ read_unlock(&tbl->lock);
+
+ return NULL;
+}
+
+
+/*
+ * Flush all the entries of the specified table.
+ */
+static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
+{
+ int i;
+ struct ip_vs_lblc_entry *en, *nxt;
+
+ for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
+ write_lock(&tbl->lock);
+ list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
+ ip_vs_lblc_free(en);
+ atomic_dec(&tbl->entries);
+ }
+ write_unlock(&tbl->lock);
+ }
+}
+
+
+static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl)
+{
+ unsigned long now = jiffies;
+ int i, j;
+ struct ip_vs_lblc_entry *en, *nxt;
+
+ for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
+ j = (j + 1) & IP_VS_LBLC_TAB_MASK;
+
+ write_lock(&tbl->lock);
+ list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+ if (time_before(now,
+ en->lastuse + sysctl_ip_vs_lblc_expiration))
+ continue;
+
+ ip_vs_lblc_free(en);
+ atomic_dec(&tbl->entries);
+ }
+ write_unlock(&tbl->lock);
+ }
+ tbl->rover = j;
+}
+
+
+/*
+ * Periodical timer handler for IPVS lblc table
+ * It is used to collect stale entries when the number of entries
+ * exceeds the maximum size of the table.
+ *
+ *     Fixme: we probably need a more complicated algorithm to collect
+ * entries that have not been used for a long time even
+ * if the number of entries doesn't exceed the maximum size
+ * of the table.
+ * The full expiration check is for this purpose now.
+ */
+static void ip_vs_lblc_check_expire(unsigned long data)
+{
+ struct ip_vs_lblc_table *tbl;
+ unsigned long now = jiffies;
+ int goal;
+ int i, j;
+ struct ip_vs_lblc_entry *en, *nxt;
+
+ tbl = (struct ip_vs_lblc_table *)data;
+
+ if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
+ /* do full expiration check */
+ ip_vs_lblc_full_check(tbl);
+ tbl->counter = 1;
+ goto out;
+ }
+
+ if (atomic_read(&tbl->entries) <= tbl->max_size) {
+ tbl->counter++;
+ goto out;
+ }
+
+ goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
+ if (goal > tbl->max_size/2)
+ goal = tbl->max_size/2;
+
+ for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
+ j = (j + 1) & IP_VS_LBLC_TAB_MASK;
+
+ write_lock(&tbl->lock);
+ list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+ if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
+ continue;
+
+ ip_vs_lblc_free(en);
+ atomic_dec(&tbl->entries);
+ goal--;
+ }
+ write_unlock(&tbl->lock);
+ if (goal <= 0)
+ break;
+ }
+ tbl->rover = j;
+
+ out:
+ mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
+}
+
+
+static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
+{
+ int i;
+ struct ip_vs_lblc_table *tbl;
+
+ /*
+ * Allocate the ip_vs_lblc_table for this service
+ */
+ tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC);
+ if (tbl == NULL) {
+ IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n");
+ return -ENOMEM;
+ }
+ svc->sched_data = tbl;
+ IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
+ "current service\n",
+ sizeof(struct ip_vs_lblc_table));
+
+ /*
+ * Initialize the hash buckets
+ */
+ for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
+ INIT_LIST_HEAD(&tbl->bucket[i]);
+ }
+ rwlock_init(&tbl->lock);
+ tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
+ tbl->rover = 0;
+ tbl->counter = 1;
+
+ /*
+ * Hook periodic timer for garbage collection
+ */
+ init_timer(&tbl->periodic_timer);
+ tbl->periodic_timer.data = (unsigned long)tbl;
+ tbl->periodic_timer.function = ip_vs_lblc_check_expire;
+ tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
+ add_timer(&tbl->periodic_timer);
+
+ return 0;
+}
+
+
+static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_lblc_table *tbl = svc->sched_data;
+
+ /* remove periodic timer */
+ del_timer_sync(&tbl->periodic_timer);
+
+ /* got to clean up table entries here */
+ ip_vs_lblc_flush(tbl);
+
+ /* release the table itself */
+ kfree(svc->sched_data);
+ IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
+ sizeof(struct ip_vs_lblc_table));
+
+ return 0;
+}
+
+
+static int ip_vs_lblc_update_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static inline struct ip_vs_dest *
+__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
+{
+ struct ip_vs_dest *dest, *least;
+ int loh, doh;
+
+ /*
+ * We think the overhead of processing active connections is fifty
+	 * times higher than that of inactive connections on average. (This
+	 * factor of fifty might not be accurate; we will change it later.) We
+ * use the following formula to estimate the overhead:
+ * dest->activeconns*50 + dest->inactconns
+ * and the load:
+ * (dest overhead) / dest->weight
+ *
+ * Remember -- no floats in kernel mode!!!
+ * The comparison of h1*w2 > h2*w1 is equivalent to that of
+ * h1/w1 > h2/w2
+ * if every weight is larger than zero.
+ *
+ * The server with weight=0 is quiesced and will not receive any
+ * new connection.
+ */
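+	/*
+	 * Example: a server with activeconns=10, inactconns=100, weight=2
+	 * has overhead 10*50+100 = 600, while one with activeconns=3,
+	 * inactconns=20, weight=1 has overhead 170.  Comparing
+	 * 600*1 > 170*2 (600 > 340) switches to the second server, which
+	 * indeed has the smaller overhead/weight ratio (170 vs 300).
+	 */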
+ list_for_each_entry(dest, &svc->destinations, n_list) {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+ if (atomic_read(&dest->weight) > 0) {
+ least = dest;
+ loh = atomic_read(&least->activeconns) * 50
+ + atomic_read(&least->inactconns);
+ goto nextstage;
+ }
+ }
+ return NULL;
+
+ /*
+ * Find the destination with the least load.
+ */
+ nextstage:
+ list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+
+ doh = atomic_read(&dest->activeconns) * 50
+ + atomic_read(&dest->inactconns);
+ if (loh * atomic_read(&dest->weight) >
+ doh * atomic_read(&least->weight)) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ NIPQUAD(least->addr), ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight), loh);
+
+ return least;
+}
+
+
+/*
+ * If this destination server is overloaded and there is a less loaded
+ * server, then return true.
+ */
+static inline int
+is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
+{
+ if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
+ struct ip_vs_dest *d;
+
+ list_for_each_entry(d, &svc->destinations, n_list) {
+ if (atomic_read(&d->activeconns)*2
+ < atomic_read(&d->weight)) {
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
+
+
+/*
+ * Locality-Based (weighted) Least-Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+ struct ip_vs_dest *dest;
+ struct ip_vs_lblc_table *tbl;
+ struct ip_vs_lblc_entry *en;
+ struct iphdr *iph = skb->nh.iph;
+
+ IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");
+
+ tbl = (struct ip_vs_lblc_table *)svc->sched_data;
+ en = ip_vs_lblc_get(tbl, iph->daddr);
+ if (en == NULL) {
+ dest = __ip_vs_wlc_schedule(svc, iph);
+ if (dest == NULL) {
+ IP_VS_DBG(1, "no destination available\n");
+ return NULL;
+ }
+ en = ip_vs_lblc_new(iph->daddr, dest);
+ if (en == NULL) {
+ return NULL;
+ }
+ ip_vs_lblc_hash(tbl, en);
+ } else {
+ dest = en->dest;
+ if (!(dest->flags & IP_VS_DEST_F_AVAILABLE)
+ || atomic_read(&dest->weight) <= 0
+ || is_overloaded(dest, svc)) {
+ dest = __ip_vs_wlc_schedule(svc, iph);
+ if (dest == NULL) {
+ IP_VS_DBG(1, "no destination available\n");
+ return NULL;
+ }
+ atomic_dec(&en->dest->refcnt);
+ atomic_inc(&dest->refcnt);
+ en->dest = dest;
+ }
+ }
+ en->lastuse = jiffies;
+
+ IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u "
+ "--> server %u.%u.%u.%u:%d\n",
+ NIPQUAD(en->addr),
+ NIPQUAD(dest->addr),
+ ntohs(dest->port));
+
+ return dest;
+}
+
+
+/*
+ * IPVS LBLC Scheduler structure
+ */
+static struct ip_vs_scheduler ip_vs_lblc_scheduler =
+{
+ .name = "lblc",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .init_service = ip_vs_lblc_init_svc,
+ .done_service = ip_vs_lblc_done_svc,
+ .update_service = ip_vs_lblc_update_svc,
+ .schedule = ip_vs_lblc_schedule,
+};
+
+
+static int __init ip_vs_lblc_init(void)
+{
+ INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list);
+ sysctl_header = register_sysctl_table(lblc_root_table, 0);
+ return register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
+}
+
+
+static void __exit ip_vs_lblc_cleanup(void)
+{
+ unregister_sysctl_table(sysctl_header);
+ unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
+}
+
+
+module_init(ip_vs_lblc_init);
+module_exit(ip_vs_lblc_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
new file mode 100644
index 00000000000..22b5dd55d27
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -0,0 +1,888 @@
+/*
+ * IPVS: Locality-Based Least-Connection with Replication scheduler
+ *
+ * Version: $Id: ip_vs_lblcr.c,v 1.11 2002/09/15 08:14:08 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@gnuchina.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Julian Anastasov : Added the missing (dest->weight>0)
+ * condition in the ip_vs_dest_set_max.
+ *
+ */
+
+/*
+ * The lblc/r algorithm is as follows (pseudo code):
+ *
+ * if serverSet[dest_ip] is null then
+ * n, serverSet[dest_ip] <- {weighted least-conn node};
+ * else
+ * n <- {least-conn (alive) node in serverSet[dest_ip]};
+ * if (n is null) OR
+ * (n.conns>n.weight AND
+ * there is a node m with m.conns<m.weight/2) then
+ * n <- {weighted least-conn node};
+ * add n to serverSet[dest_ip];
+ * if |serverSet[dest_ip]| > 1 AND
+ * now - serverSet[dest_ip].lastMod > T then
+ * m <- {most conn node in serverSet[dest_ip]};
+ * remove m from serverSet[dest_ip];
+ * if serverSet[dest_ip] changed then
+ * serverSet[dest_ip].lastMod <- now;
+ *
+ * return n;
+ *
+ */
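+
+/*
+ * Example: if requests for one destination IP keep overloading the single
+ * cached server, another node is inserted into serverSet[dest_ip] and the
+ * load is spread; once the set has more than one member and has not been
+ * modified for longer than the expiration sysctl, its busiest member is
+ * dropped so the set can shrink back.
+ */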
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+/* for sysctl */
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+/* for proc_net_create/proc_net_remove */
+#include <linux/proc_fs.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * It is for garbage collection of stale IPVS lblcr entries,
+ * when the table is full.
+ */
+#define CHECK_EXPIRE_INTERVAL (60*HZ)
+#define ENTRY_TIMEOUT (6*60*HZ)
+
+/*
+ * It is for full expiration check.
+ * When there is no partial expiration check (garbage collection)
+ * in a half hour, do a full expiration check to collect stale
+ * entries that haven't been touched for a day.
+ */
+#define COUNT_FOR_FULL_EXPIRATION 30
+static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ;
+
+
+/*
+ * for IPVS lblcr entry hash table
+ */
+#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS
+#define CONFIG_IP_VS_LBLCR_TAB_BITS 10
+#endif
+#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS
+#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS)
+#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1)
+
+
+/*
+ * IPVS destination set structure and operations
+ */
+struct ip_vs_dest_list {
+ struct ip_vs_dest_list *next; /* list link */
+ struct ip_vs_dest *dest; /* destination server */
+};
+
+struct ip_vs_dest_set {
+ atomic_t size; /* set size */
+ unsigned long lastmod; /* last modified time */
+ struct ip_vs_dest_list *list; /* destination list */
+ rwlock_t lock; /* lock for this list */
+};
+
+
+static struct ip_vs_dest_list *
+ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
+{
+ struct ip_vs_dest_list *e;
+
+ for (e=set->list; e!=NULL; e=e->next) {
+ if (e->dest == dest)
+ /* already existed */
+ return NULL;
+ }
+
+ e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC);
+ if (e == NULL) {
+ IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
+ return NULL;
+ }
+
+ atomic_inc(&dest->refcnt);
+ e->dest = dest;
+
+ /* link it to the list */
+ write_lock(&set->lock);
+ e->next = set->list;
+ set->list = e;
+ atomic_inc(&set->size);
+ write_unlock(&set->lock);
+
+ set->lastmod = jiffies;
+ return e;
+}
+
+static void
+ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
+{
+ struct ip_vs_dest_list *e, **ep;
+
+ write_lock(&set->lock);
+ for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
+ if (e->dest == dest) {
+ /* HIT */
+ *ep = e->next;
+ atomic_dec(&set->size);
+ set->lastmod = jiffies;
+ atomic_dec(&e->dest->refcnt);
+ kfree(e);
+ break;
+ }
+ ep = &e->next;
+ }
+ write_unlock(&set->lock);
+}
+
+static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
+{
+ struct ip_vs_dest_list *e, **ep;
+
+ write_lock(&set->lock);
+ for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
+ *ep = e->next;
+ /*
+		 * We don't kfree dest because it is referred to either
+ * by its service or by the trash dest list.
+ */
+ atomic_dec(&e->dest->refcnt);
+ kfree(e);
+ }
+ write_unlock(&set->lock);
+}
+
+/* get weighted least-connection node in the destination set */
+static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
+{
+ register struct ip_vs_dest_list *e;
+ struct ip_vs_dest *dest, *least;
+ int loh, doh;
+
+ if (set == NULL)
+ return NULL;
+
+ read_lock(&set->lock);
+ /* select the first destination server, whose weight > 0 */
+ for (e=set->list; e!=NULL; e=e->next) {
+ least = e->dest;
+ if (least->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+
+ if ((atomic_read(&least->weight) > 0)
+ && (least->flags & IP_VS_DEST_F_AVAILABLE)) {
+ loh = atomic_read(&least->activeconns) * 50
+ + atomic_read(&least->inactconns);
+ goto nextstage;
+ }
+ }
+ read_unlock(&set->lock);
+ return NULL;
+
+ /* find the destination with the weighted least load */
+ nextstage:
+ for (e=e->next; e!=NULL; e=e->next) {
+ dest = e->dest;
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+
+ doh = atomic_read(&dest->activeconns) * 50
+ + atomic_read(&dest->inactconns);
+ if ((loh * atomic_read(&dest->weight) >
+ doh * atomic_read(&least->weight))
+ && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+ least = dest;
+ loh = doh;
+ }
+ }
+ read_unlock(&set->lock);
+
+ IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ NIPQUAD(least->addr), ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight), loh);
+ return least;
+}
+
+
+/* get weighted most-connection node in the destination set */
+static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
+{
+ register struct ip_vs_dest_list *e;
+ struct ip_vs_dest *dest, *most;
+ int moh, doh;
+
+ if (set == NULL)
+ return NULL;
+
+ read_lock(&set->lock);
+ /* select the first destination server, whose weight > 0 */
+ for (e=set->list; e!=NULL; e=e->next) {
+ most = e->dest;
+ if (atomic_read(&most->weight) > 0) {
+ moh = atomic_read(&most->activeconns) * 50
+ + atomic_read(&most->inactconns);
+ goto nextstage;
+ }
+ }
+ read_unlock(&set->lock);
+ return NULL;
+
+ /* find the destination with the weighted most load */
+ nextstage:
+ for (e=e->next; e!=NULL; e=e->next) {
+ dest = e->dest;
+ doh = atomic_read(&dest->activeconns) * 50
+ + atomic_read(&dest->inactconns);
+ /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
+ if ((moh * atomic_read(&dest->weight) <
+ doh * atomic_read(&most->weight))
+ && (atomic_read(&dest->weight) > 0)) {
+ most = dest;
+ moh = doh;
+ }
+ }
+ read_unlock(&set->lock);
+
+ IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ NIPQUAD(most->addr), ntohs(most->port),
+ atomic_read(&most->activeconns),
+ atomic_read(&most->refcnt),
+ atomic_read(&most->weight), moh);
+ return most;
+}
+
+
+/*
+ * IPVS lblcr entry represents an association between destination
+ * IP address and its destination server set
+ */
+struct ip_vs_lblcr_entry {
+ struct list_head list;
+ __u32 addr; /* destination IP address */
+ struct ip_vs_dest_set set; /* destination server set */
+ unsigned long lastuse; /* last used time */
+};
+
+
+/*
+ * IPVS lblcr hash table
+ */
+struct ip_vs_lblcr_table {
+ rwlock_t lock; /* lock for this table */
+ struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
+ atomic_t entries; /* number of entries */
+ int max_size; /* maximum size of entries */
+ struct timer_list periodic_timer; /* collect stale entries */
+ int rover; /* rover for expire check */
+ int counter; /* counter for no expire */
+};
+
+
+/*
+ * IPVS LBLCR sysctl table
+ */
+
+static ctl_table vs_vars_table[] = {
+ {
+ .ctl_name = NET_IPV4_VS_LBLCR_EXPIRE,
+ .procname = "lblcr_expiration",
+ .data = &sysctl_ip_vs_lblcr_expiration,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table vs_table[] = {
+ {
+ .ctl_name = NET_IPV4_VS,
+ .procname = "vs",
+ .mode = 0555,
+ .child = vs_vars_table
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table ipv4_table[] = {
+ {
+ .ctl_name = NET_IPV4,
+ .procname = "ipv4",
+ .mode = 0555,
+ .child = vs_table
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table lblcr_root_table[] = {
+ {
+ .ctl_name = CTL_NET,
+ .procname = "net",
+ .mode = 0555,
+ .child = ipv4_table
+ },
+ { .ctl_name = 0 }
+};
+
+static struct ctl_table_header * sysctl_header;
+
+/*
+ * new/free an ip_vs_lblcr_entry, which is a mapping of a destination
+ * IP address to a server.
+ */
+static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__u32 daddr)
+{
+ struct ip_vs_lblcr_entry *en;
+
+ en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC);
+ if (en == NULL) {
+ IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&en->list);
+ en->addr = daddr;
+
+	/* initialize its dest set */
+ atomic_set(&(en->set.size), 0);
+ en->set.list = NULL;
+ rwlock_init(&en->set.lock);
+
+ return en;
+}
+
+
+static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
+{
+ list_del(&en->list);
+ ip_vs_dest_set_eraseall(&en->set);
+ kfree(en);
+}
+
+
+/*
+ * Returns hash value for IPVS LBLCR entry
+ */
+static inline unsigned ip_vs_lblcr_hashkey(__u32 addr)
+{
+ return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
+}
+
+
+/*
+ * Hash an entry in the ip_vs_lblcr_table.
+ * returns bool success.
+ */
+static int
+ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
+{
+ unsigned hash;
+
+ if (!list_empty(&en->list)) {
+ IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, "
+ "called from %p\n", __builtin_return_address(0));
+ return 0;
+ }
+
+ /*
+ * Hash by destination IP address
+ */
+ hash = ip_vs_lblcr_hashkey(en->addr);
+
+ write_lock(&tbl->lock);
+ list_add(&en->list, &tbl->bucket[hash]);
+ atomic_inc(&tbl->entries);
+ write_unlock(&tbl->lock);
+
+ return 1;
+}
+
+
+#if 0000
+/*
+ * Unhash ip_vs_lblcr_entry from ip_vs_lblcr_table.
+ * returns bool success.
+ */
+static int ip_vs_lblcr_unhash(struct ip_vs_lblcr_table *tbl,
+ struct ip_vs_lblcr_entry *en)
+{
+ if (list_empty(&en->list)) {
+ IP_VS_ERR("ip_vs_lblcr_unhash(): request for not hashed entry, "
+ "called from %p\n", __builtin_return_address(0));
+ return 0;
+ }
+
+ /*
+ * Remove it from the table
+ */
+ write_lock(&tbl->lock);
+ list_del(&en->list);
+ INIT_LIST_HEAD(&en->list);
+ write_unlock(&tbl->lock);
+
+ return 1;
+}
+#endif
+
+
+/*
+ * Get ip_vs_lblcr_entry associated with supplied parameters.
+ */
+static inline struct ip_vs_lblcr_entry *
+ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __u32 addr)
+{
+ unsigned hash;
+ struct ip_vs_lblcr_entry *en;
+
+ hash = ip_vs_lblcr_hashkey(addr);
+
+ read_lock(&tbl->lock);
+
+ list_for_each_entry(en, &tbl->bucket[hash], list) {
+ if (en->addr == addr) {
+ /* HIT */
+ read_unlock(&tbl->lock);
+ return en;
+ }
+ }
+
+ read_unlock(&tbl->lock);
+
+ return NULL;
+}
+
+
+/*
+ * Flush all the entries of the specified table.
+ */
+static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
+{
+ int i;
+ struct ip_vs_lblcr_entry *en, *nxt;
+
+ for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+ write_lock(&tbl->lock);
+ list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
+ ip_vs_lblcr_free(en);
+ atomic_dec(&tbl->entries);
+ }
+ write_unlock(&tbl->lock);
+ }
+}
+
+
+static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
+{
+ unsigned long now = jiffies;
+ int i, j;
+ struct ip_vs_lblcr_entry *en, *nxt;
+
+ for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+ j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
+
+ write_lock(&tbl->lock);
+ list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+ if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
+ now))
+ continue;
+
+ ip_vs_lblcr_free(en);
+ atomic_dec(&tbl->entries);
+ }
+ write_unlock(&tbl->lock);
+ }
+ tbl->rover = j;
+}
+
+
+/*
+ * Periodical timer handler for IPVS lblcr table
+ * It is used to collect stale entries when the number of entries
+ * exceeds the maximum size of the table.
+ *
+ *     Fixme: we probably need a more complicated algorithm to collect
+ * entries that have not been used for a long time even
+ * if the number of entries doesn't exceed the maximum size
+ * of the table.
+ * The full expiration check is for this purpose now.
+ */
+static void ip_vs_lblcr_check_expire(unsigned long data)
+{
+ struct ip_vs_lblcr_table *tbl;
+ unsigned long now = jiffies;
+ int goal;
+ int i, j;
+ struct ip_vs_lblcr_entry *en, *nxt;
+
+ tbl = (struct ip_vs_lblcr_table *)data;
+
+ if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
+ /* do full expiration check */
+ ip_vs_lblcr_full_check(tbl);
+ tbl->counter = 1;
+ goto out;
+ }
+
+ if (atomic_read(&tbl->entries) <= tbl->max_size) {
+ tbl->counter++;
+ goto out;
+ }
+
+ goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
+ if (goal > tbl->max_size/2)
+ goal = tbl->max_size/2;
+
+ for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+ j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
+
+ write_lock(&tbl->lock);
+ list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+ if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
+ continue;
+
+ ip_vs_lblcr_free(en);
+ atomic_dec(&tbl->entries);
+ goal--;
+ }
+ write_unlock(&tbl->lock);
+ if (goal <= 0)
+ break;
+ }
+ tbl->rover = j;
+
+ out:
+ mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
+}
+
+
+#ifdef CONFIG_IP_VS_LBLCR_DEBUG
+static struct ip_vs_lblcr_table *lblcr_table_list;
+
+/*
+ * /proc/net/ip_vs_lblcr to display the mappings of
+ * destination IP address <==> its serverSet
+ */
+static int
+ip_vs_lblcr_getinfo(char *buffer, char **start, off_t offset, int length)
+{
+ off_t pos=0, begin;
+ int len=0, size;
+ struct ip_vs_lblcr_table *tbl;
+ unsigned long now = jiffies;
+ int i;
+ struct ip_vs_lblcr_entry *en;
+
+ tbl = lblcr_table_list;
+
+ size = sprintf(buffer, "LastTime Dest IP address Server set\n");
+ pos += size;
+ len += size;
+
+ for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+ read_lock_bh(&tbl->lock);
+ list_for_each_entry(en, &tbl->bucket[i], list) {
+ char tbuf[16];
+ struct ip_vs_dest_list *d;
+
+ sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(en->addr));
+ size = sprintf(buffer+len, "%8lu %-16s ",
+ now-en->lastuse, tbuf);
+
+ read_lock(&en->set.lock);
+ for (d=en->set.list; d!=NULL; d=d->next) {
+ size += sprintf(buffer+len+size,
+ "%u.%u.%u.%u ",
+ NIPQUAD(d->dest->addr));
+ }
+ read_unlock(&en->set.lock);
+ size += sprintf(buffer+len+size, "\n");
+ len += size;
+ pos += size;
+ if (pos <= offset)
+ len=0;
+ if (pos >= offset+length) {
+ read_unlock_bh(&tbl->lock);
+ goto done;
+ }
+ }
+ read_unlock_bh(&tbl->lock);
+ }
+
+ done:
+ begin = len - (pos - offset);
+ *start = buffer + begin;
+ len -= begin;
+ if(len>length)
+ len = length;
+ return len;
+}
+#endif
+
+
+static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
+{
+ int i;
+ struct ip_vs_lblcr_table *tbl;
+
+ /*
+ * Allocate the ip_vs_lblcr_table for this service
+ */
+ tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC);
+ if (tbl == NULL) {
+ IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
+ return -ENOMEM;
+ }
+ svc->sched_data = tbl;
+ IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
+ "current service\n",
+ sizeof(struct ip_vs_lblcr_table));
+
+ /*
+ * Initialize the hash buckets
+ */
+ for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+ INIT_LIST_HEAD(&tbl->bucket[i]);
+ }
+ rwlock_init(&tbl->lock);
+ tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
+ tbl->rover = 0;
+ tbl->counter = 1;
+
+ /*
+ * Hook periodic timer for garbage collection
+ */
+ init_timer(&tbl->periodic_timer);
+ tbl->periodic_timer.data = (unsigned long)tbl;
+ tbl->periodic_timer.function = ip_vs_lblcr_check_expire;
+ tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
+ add_timer(&tbl->periodic_timer);
+
+#ifdef CONFIG_IP_VS_LBLCR_DEBUG
+ lblcr_table_list = tbl;
+#endif
+ return 0;
+}
+
+
+static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_lblcr_table *tbl = svc->sched_data;
+
+ /* remove periodic timer */
+ del_timer_sync(&tbl->periodic_timer);
+
+ /* got to clean up table entries here */
+ ip_vs_lblcr_flush(tbl);
+
+ /* release the table itself */
+ kfree(svc->sched_data);
+ IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
+ sizeof(struct ip_vs_lblcr_table));
+
+ return 0;
+}
+
+
+static int ip_vs_lblcr_update_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static inline struct ip_vs_dest *
+__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
+{
+ struct ip_vs_dest *dest, *least;
+ int loh, doh;
+
+ /*
+ * We think the overhead of processing active connections is fifty
+	 * times higher than that of inactive connections on average. (This
+	 * factor of fifty might not be accurate; we will change it later.) We
+ * use the following formula to estimate the overhead:
+ * dest->activeconns*50 + dest->inactconns
+ * and the load:
+ * (dest overhead) / dest->weight
+ *
+ * Remember -- no floats in kernel mode!!!
+ * The comparison of h1*w2 > h2*w1 is equivalent to that of
+ * h1/w1 > h2/w2
+ * if every weight is larger than zero.
+ *
+ * The server with weight=0 is quiesced and will not receive any
+ * new connection.
+ */
+ list_for_each_entry(dest, &svc->destinations, n_list) {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+
+ if (atomic_read(&dest->weight) > 0) {
+ least = dest;
+ loh = atomic_read(&least->activeconns) * 50
+ + atomic_read(&least->inactconns);
+ goto nextstage;
+ }
+ }
+ return NULL;
+
+ /*
+ * Find the destination with the least load.
+ */
+ nextstage:
+ list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+
+ doh = atomic_read(&dest->activeconns) * 50
+ + atomic_read(&dest->inactconns);
+ if (loh * atomic_read(&dest->weight) >
+ doh * atomic_read(&least->weight)) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ NIPQUAD(least->addr), ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight), loh);
+
+ return least;
+}
+
+
+/*
+ * If this destination server is overloaded and there is a less loaded
+ * server, then return true.
+ */
+static inline int
+is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
+{
+ if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
+ struct ip_vs_dest *d;
+
+ list_for_each_entry(d, &svc->destinations, n_list) {
+ if (atomic_read(&d->activeconns)*2
+ < atomic_read(&d->weight)) {
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
+
+
+/*
+ *	Locality-Based (weighted) Least-Connection with Replication scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+ struct ip_vs_dest *dest;
+ struct ip_vs_lblcr_table *tbl;
+ struct ip_vs_lblcr_entry *en;
+ struct iphdr *iph = skb->nh.iph;
+
+ IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
+
+ tbl = (struct ip_vs_lblcr_table *)svc->sched_data;
+ en = ip_vs_lblcr_get(tbl, iph->daddr);
+ if (en == NULL) {
+ dest = __ip_vs_wlc_schedule(svc, iph);
+ if (dest == NULL) {
+ IP_VS_DBG(1, "no destination available\n");
+ return NULL;
+ }
+ en = ip_vs_lblcr_new(iph->daddr);
+ if (en == NULL) {
+ return NULL;
+ }
+ ip_vs_dest_set_insert(&en->set, dest);
+ ip_vs_lblcr_hash(tbl, en);
+ } else {
+ dest = ip_vs_dest_set_min(&en->set);
+ if (!dest || is_overloaded(dest, svc)) {
+ dest = __ip_vs_wlc_schedule(svc, iph);
+ if (dest == NULL) {
+ IP_VS_DBG(1, "no destination available\n");
+ return NULL;
+ }
+ ip_vs_dest_set_insert(&en->set, dest);
+ }
+ if (atomic_read(&en->set.size) > 1 &&
+ jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) {
+ struct ip_vs_dest *m;
+ m = ip_vs_dest_set_max(&en->set);
+ if (m)
+ ip_vs_dest_set_erase(&en->set, m);
+ }
+ }
+ en->lastuse = jiffies;
+
+ IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
+ "--> server %u.%u.%u.%u:%d\n",
+ NIPQUAD(en->addr),
+ NIPQUAD(dest->addr),
+ ntohs(dest->port));
+
+ return dest;
+}
+
+
+/*
+ * IPVS LBLCR Scheduler structure
+ */
+static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
+{
+ .name = "lblcr",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .init_service = ip_vs_lblcr_init_svc,
+ .done_service = ip_vs_lblcr_done_svc,
+ .update_service = ip_vs_lblcr_update_svc,
+ .schedule = ip_vs_lblcr_schedule,
+};
+
+
+static int __init ip_vs_lblcr_init(void)
+{
+ INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list);
+ sysctl_header = register_sysctl_table(lblcr_root_table, 0);
+#ifdef CONFIG_IP_VS_LBLCR_DEBUG
+ proc_net_create("ip_vs_lblcr", 0, ip_vs_lblcr_getinfo);
+#endif
+ return register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
+}
+
+
+static void __exit ip_vs_lblcr_cleanup(void)
+{
+#ifdef CONFIG_IP_VS_LBLCR_DEBUG
+ proc_net_remove("ip_vs_lblcr");
+#endif
+ unregister_sysctl_table(sysctl_header);
+ unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
+}
+
+
+module_init(ip_vs_lblcr_init);
+module_exit(ip_vs_lblcr_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c
new file mode 100644
index 00000000000..d88fef90a64
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_lc.c
@@ -0,0 +1,123 @@
+/*
+ * IPVS: Least-Connection Scheduling module
+ *
+ * Version: $Id: ip_vs_lc.c,v 1.10 2003/04/18 09:03:16 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Wensong Zhang : added the ip_vs_lc_update_svc
+ * Wensong Zhang : added any dest with weight=0 is quiesced
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+static int ip_vs_lc_init_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static int ip_vs_lc_done_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static int ip_vs_lc_update_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static inline unsigned int
+ip_vs_lc_dest_overhead(struct ip_vs_dest *dest)
+{
+ /*
+ * We think the overhead of processing active connections is 256
+	 * times higher than that of inactive connections on average. (This
+	 * factor of 256 might not be accurate; we will change it later.) We
+ * use the following formula to estimate the overhead now:
+ * dest->activeconns*256 + dest->inactconns
+ */
+ return (atomic_read(&dest->activeconns) << 8) +
+ atomic_read(&dest->inactconns);
+}
+
+
+/*
+ * Least Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+ struct ip_vs_dest *dest, *least = NULL;
+ unsigned int loh = 0, doh;
+
+ IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n");
+
+ /*
+	 * Simply select the server with the smallest value of
+	 *        (activeconns<<8) + inactconns,
+	 * except those whose weight is equal to zero.
+	 * If the weight is equal to zero, it means that the server is
+	 * quiesced: the existing connections to the server still get
+	 * served, but no new connection is assigned to the server.
+ */
+
+ list_for_each_entry(dest, &svc->destinations, n_list) {
+ if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
+ atomic_read(&dest->weight) == 0)
+ continue;
+ doh = ip_vs_lc_dest_overhead(dest);
+ if (!least || doh < loh) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ if (least)
+ IP_VS_DBG(6, "LC: server %u.%u.%u.%u:%u activeconns %d inactconns %d\n",
+ NIPQUAD(least->addr), ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->inactconns));
+
+ return least;
+}
+
+
+static struct ip_vs_scheduler ip_vs_lc_scheduler = {
+ .name = "lc",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .init_service = ip_vs_lc_init_svc,
+ .done_service = ip_vs_lc_done_svc,
+ .update_service = ip_vs_lc_update_svc,
+ .schedule = ip_vs_lc_schedule,
+};
+
+
+static int __init ip_vs_lc_init(void)
+{
+ INIT_LIST_HEAD(&ip_vs_lc_scheduler.n_list);
+	return register_ip_vs_scheduler(&ip_vs_lc_scheduler);
+}
+
+static void __exit ip_vs_lc_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_lc_scheduler);
+}
+
+module_init(ip_vs_lc_init);
+module_exit(ip_vs_lc_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c
new file mode 100644
index 00000000000..bc2a9e5f2a7
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_nq.c
@@ -0,0 +1,161 @@
+/*
+ * IPVS: Never Queue scheduling module
+ *
+ * Version: $Id: ip_vs_nq.c,v 1.2 2003/06/08 09:31:19 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The NQ algorithm adopts a two-speed model. When there is an idle server
+ * available, the job will be sent to the idle server, instead of waiting
+ * for a fast one. When there is no idle server available, the job will be
+ * sent to the server that minimizes its expected delay (the Shortest
+ * Expected Delay scheduling algorithm).
+ *
+ * See the following paper for more information:
+ * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
+ * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
+ * pages 986-994, 1988.
+ *
+ * Thanks must go to Marko Buuri <marko@buuri.name> for talking NQ to me.
+ *
+ * The difference between NQ and SED is that NQ can improve overall
+ * system utilization.
+ *
+ */
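+
+/*
+ * Example: with three servers whose activeconns are {0, 5, 2} and weights
+ * {1, 3, 2}, the idle first server is picked straight away.  Were no
+ * server idle, the remaining two would be compared on
+ * (activeconns+1)/weight, i.e. 6/3 = 2 against 3/2 = 1.5, and the third
+ * server would win.
+ */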
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+static int
+ip_vs_nq_init_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static int
+ip_vs_nq_done_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static int
+ip_vs_nq_update_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static inline unsigned int
+ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
+{
+ /*
+	 * We only use the active connection number in the cost
+	 * calculation here; the +1 accounts for the connection that
+	 * is about to be scheduled.
+ */
+ return atomic_read(&dest->activeconns) + 1;
+}
+
+
+/*
+ *	Never Queue scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+ struct ip_vs_dest *dest, *least = NULL;
+ unsigned int loh = 0, doh;
+
+ IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n");
+
+ /*
+ * We calculate the load of each dest server as follows:
+ * (server expected overhead) / dest->weight
+ *
+ * Remember -- no floats in kernel mode!!!
+ * The comparison of h1*w2 > h2*w1 is equivalent to that of
+ * h1/w1 > h2/w2
+ * if every weight is larger than zero.
+ *
+ * The server with weight=0 is quiesced and will not receive any
+ * new connections.
+ */
+
+ list_for_each_entry(dest, &svc->destinations, n_list) {
+
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD ||
+ !atomic_read(&dest->weight))
+ continue;
+
+ doh = ip_vs_nq_dest_overhead(dest);
+
+ /* return the server directly if it is idle */
+ if (atomic_read(&dest->activeconns) == 0) {
+ least = dest;
+ loh = doh;
+ goto out;
+ }
+
+ if (!least ||
+ (loh * atomic_read(&dest->weight) >
+ doh * atomic_read(&least->weight))) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ if (!least)
+ return NULL;
+
+ out:
+ IP_VS_DBG(6, "NQ: server %u.%u.%u.%u:%u "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ NIPQUAD(least->addr), ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight), loh);
+
+ return least;
+}
+
+
+static struct ip_vs_scheduler ip_vs_nq_scheduler =
+{
+ .name = "nq",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .init_service = ip_vs_nq_init_svc,
+ .done_service = ip_vs_nq_done_svc,
+ .update_service = ip_vs_nq_update_svc,
+ .schedule = ip_vs_nq_schedule,
+};
+
+
+static int __init ip_vs_nq_init(void)
+{
+ INIT_LIST_HEAD(&ip_vs_nq_scheduler.n_list);
+ return register_ip_vs_scheduler(&ip_vs_nq_scheduler);
+}
+
+static void __exit ip_vs_nq_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_nq_scheduler);
+}
+
+module_init(ip_vs_nq_init);
+module_exit(ip_vs_nq_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c
new file mode 100644
index 00000000000..253c46252bd
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto.c
@@ -0,0 +1,244 @@
+/*
+ * ip_vs_proto.c: transport protocol load balancing support for IPVS
+ *
+ * Version: $Id: ip_vs_proto.c,v 1.2 2003/04/18 09:03:16 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <asm/system.h>
+#include <linux/stat.h>
+#include <linux/proc_fs.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * IPVS protocols can only be registered/unregistered when the ipvs
+ * module is loaded/unloaded, so no lock is needed in accessing the
+ * ipvs protocol table.
+ */
+
+#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */
+#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1))
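+
+/*
+ * Example: IPPROTO_TCP (6) and IPPROTO_UDP (17) hash to buckets 6 and 17,
+ * while IPPROTO_ESP (50) and IPPROTO_AH (51) wrap around to buckets 18
+ * and 19 respectively.
+ */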
+
+static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
+
+
+/*
+ * register an ipvs protocol
+ */
+static int register_ip_vs_protocol(struct ip_vs_protocol *pp)
+{
+ unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
+
+ pp->next = ip_vs_proto_table[hash];
+ ip_vs_proto_table[hash] = pp;
+
+ if (pp->init != NULL)
+ pp->init(pp);
+
+ return 0;
+}
+
+
+/*
+ * unregister an ipvs protocol
+ */
+static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
+{
+ struct ip_vs_protocol **pp_p;
+ unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
+
+ pp_p = &ip_vs_proto_table[hash];
+ for (; *pp_p; pp_p = &(*pp_p)->next) {
+ if (*pp_p == pp) {
+ *pp_p = pp->next;
+ if (pp->exit != NULL)
+ pp->exit(pp);
+ return 0;
+ }
+ }
+
+ return -ESRCH;
+}
+
+
+/*
+ * get ip_vs_protocol object by its proto.
+ */
+struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
+{
+ struct ip_vs_protocol *pp;
+ unsigned hash = IP_VS_PROTO_HASH(proto);
+
+ for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) {
+ if (pp->protocol == proto)
+ return pp;
+ }
+
+ return NULL;
+}
+
+
+/*
+ * Propagate event for state change to all protocols
+ */
+void ip_vs_protocol_timeout_change(int flags)
+{
+ struct ip_vs_protocol *pp;
+ int i;
+
+ for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
+ for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) {
+ if (pp->timeout_change)
+ pp->timeout_change(pp, flags);
+ }
+ }
+}
+
+
+int *
+ip_vs_create_timeout_table(int *table, int size)
+{
+ int *t;
+
+ t = kmalloc(size, GFP_ATOMIC);
+ if (t == NULL)
+ return NULL;
+ memcpy(t, table, size);
+ return t;
+}
+
+
+/*
+ * Set timeout value for state specified by name
+ */
+int
+ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to)
+{
+ int i;
+
+ if (!table || !name || !to)
+ return -EINVAL;
+
+ for (i = 0; i < num; i++) {
+ if (strcmp(names[i], name))
+ continue;
+ table[i] = to * HZ;
+ return 0;
+ }
+ return -ENOENT;
+}
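+
+/*
+ * Example: a protocol handler whose state table contains a state named
+ * "ESTABLISHED" can set its timeout to 15 minutes with
+ *	ip_vs_set_state_timeout(table, num, names, "ESTABLISHED", 15*60);
+ * the value is passed in seconds and stored internally in jiffies.
+ */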
+
+
+const char * ip_vs_state_name(__u16 proto, int state)
+{
+ struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
+
+ if (pp == NULL || pp->state_name == NULL)
+ return "ERR!";
+ return pp->state_name(state);
+}
+
+
+void
+ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
+ const struct sk_buff *skb,
+ int offset,
+ const char *msg)
+{
+ char buf[128];
+ struct iphdr _iph, *ih;
+
+ ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
+ if (ih == NULL)
+ sprintf(buf, "%s TRUNCATED", pp->name);
+ else if (ih->frag_off & __constant_htons(IP_OFFSET))
+ sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
+ pp->name, NIPQUAD(ih->saddr),
+ NIPQUAD(ih->daddr));
+ else {
+		__u16 _ports[2], *pptr;
+ pptr = skb_header_pointer(skb, offset + ih->ihl*4,
+ sizeof(_ports), _ports);
+ if (pptr == NULL)
+ sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u",
+ pp->name,
+ NIPQUAD(ih->saddr),
+ NIPQUAD(ih->daddr));
+ else
+ sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u",
+ pp->name,
+ NIPQUAD(ih->saddr),
+ ntohs(pptr[0]),
+ NIPQUAD(ih->daddr),
+ ntohs(pptr[1]));
+ }
+
+ printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
+}
+
+
+int ip_vs_protocol_init(void)
+{
+ char protocols[64];
+#define REGISTER_PROTOCOL(p) \
+ do { \
+ register_ip_vs_protocol(p); \
+ strcat(protocols, ", "); \
+ strcat(protocols, (p)->name); \
+ } while (0)
+
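+	/*
+	 * Each REGISTER_PROTOCOL() appends ", <name>"; the message below
+	 * prints &protocols[2] to skip the leading separator, and clearing
+	 * protocols[2] keeps it an empty string if nothing is registered.
+	 */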
+ protocols[0] = '\0';
+ protocols[2] = '\0';
+#ifdef CONFIG_IP_VS_PROTO_TCP
+ REGISTER_PROTOCOL(&ip_vs_protocol_tcp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+ REGISTER_PROTOCOL(&ip_vs_protocol_udp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_ICMP
+ REGISTER_PROTOCOL(&ip_vs_protocol_icmp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_AH
+ REGISTER_PROTOCOL(&ip_vs_protocol_ah);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_ESP
+ REGISTER_PROTOCOL(&ip_vs_protocol_esp);
+#endif
+ IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]);
+
+ return 0;
+}
+
+
+void ip_vs_protocol_cleanup(void)
+{
+ struct ip_vs_protocol *pp;
+ int i;
+
+ /* unregister all the ipvs protocols */
+ for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
+ while ((pp = ip_vs_proto_table[i]) != NULL)
+ unregister_ip_vs_protocol(pp);
+ }
+}
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c
new file mode 100644
index 00000000000..453e94a0bbd
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto_ah.c
@@ -0,0 +1,177 @@
+/*
+ * ip_vs_proto_ah.c: AH IPSec load balancing support for IPVS
+ *
+ * Version: $Id: ip_vs_proto_ah.c,v 1.1 2003/07/04 15:04:37 wensong Exp $
+ *
+ * Authors: Julian Anastasov <ja@ssi.bg>, February 2002
+ * Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation;
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+
+/* TODO:
+
+struct isakmp_hdr {
+ __u8 icookie[8];
+ __u8 rcookie[8];
+ __u8 np;
+ __u8 version;
+ __u8 xchgtype;
+ __u8 flags;
+ __u32 msgid;
+ __u32 length;
+};
+
+*/
+
+#define PORT_ISAKMP 500
+
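+/*
+ * AH packets carry no port numbers, so the lookups below reuse the IPVS
+ * connection entry created for the related ISAKMP exchange (UDP port 500)
+ * between the same pair of addresses.
+ */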
+
+static struct ip_vs_conn *
+ah_conn_in_get(const struct sk_buff *skb,
+ struct ip_vs_protocol *pp,
+ const struct iphdr *iph,
+ unsigned int proto_off,
+ int inverse)
+{
+ struct ip_vs_conn *cp;
+
+ if (likely(!inverse)) {
+ cp = ip_vs_conn_in_get(IPPROTO_UDP,
+ iph->saddr,
+ __constant_htons(PORT_ISAKMP),
+ iph->daddr,
+ __constant_htons(PORT_ISAKMP));
+ } else {
+ cp = ip_vs_conn_in_get(IPPROTO_UDP,
+ iph->daddr,
+ __constant_htons(PORT_ISAKMP),
+ iph->saddr,
+ __constant_htons(PORT_ISAKMP));
+ }
+
+ if (!cp) {
+ /*
+ * We are not sure if the packet is from our
+ * service, so our conn_schedule hook should return NF_ACCEPT
+ */
+ IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet "
+ "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
+ inverse ? "ICMP+" : "",
+ pp->name,
+ NIPQUAD(iph->saddr),
+ NIPQUAD(iph->daddr));
+ }
+
+ return cp;
+}
+
+
+static struct ip_vs_conn *
+ah_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
+ const struct iphdr *iph, unsigned int proto_off, int inverse)
+{
+ struct ip_vs_conn *cp;
+
+ if (likely(!inverse)) {
+ cp = ip_vs_conn_out_get(IPPROTO_UDP,
+ iph->saddr,
+ __constant_htons(PORT_ISAKMP),
+ iph->daddr,
+ __constant_htons(PORT_ISAKMP));
+ } else {
+ cp = ip_vs_conn_out_get(IPPROTO_UDP,
+ iph->daddr,
+ __constant_htons(PORT_ISAKMP),
+ iph->saddr,
+ __constant_htons(PORT_ISAKMP));
+ }
+
+ if (!cp) {
+ IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet "
+ "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
+ inverse ? "ICMP+" : "",
+ pp->name,
+ NIPQUAD(iph->saddr),
+ NIPQUAD(iph->daddr));
+ }
+
+ return cp;
+}
+
+
+static int
+ah_conn_schedule(struct sk_buff *skb,
+ struct ip_vs_protocol *pp,
+ int *verdict, struct ip_vs_conn **cpp)
+{
+ /*
+	 *	AH is handled only as related traffic; pass the packet on to the IP stack.
+ */
+ *verdict = NF_ACCEPT;
+ return 0;
+}
+
+
+static void
+ah_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
+ int offset, const char *msg)
+{
+ char buf[256];
+ struct iphdr _iph, *ih;
+
+ ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
+ if (ih == NULL)
+ sprintf(buf, "%s TRUNCATED", pp->name);
+ else
+ sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
+ pp->name, NIPQUAD(ih->saddr),
+ NIPQUAD(ih->daddr));
+
+ printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
+}
+
+
+static void ah_init(struct ip_vs_protocol *pp)
+{
+ /* nothing to do now */
+}
+
+
+static void ah_exit(struct ip_vs_protocol *pp)
+{
+ /* nothing to do now */
+}
+
+
+struct ip_vs_protocol ip_vs_protocol_ah = {
+ .name = "AH",
+ .protocol = IPPROTO_AH,
+ .dont_defrag = 1,
+ .init = ah_init,
+ .exit = ah_exit,
+ .conn_schedule = ah_conn_schedule,
+ .conn_in_get = ah_conn_in_get,
+ .conn_out_get = ah_conn_out_get,
+ .snat_handler = NULL,
+ .dnat_handler = NULL,
+ .csum_check = NULL,
+ .state_transition = NULL,
+ .register_app = NULL,
+ .unregister_app = NULL,
+ .app_conn_bind = NULL,
+ .debug_packet = ah_debug_packet,
+ .timeout_change = NULL, /* ISAKMP */
+ .set_state_timeout = NULL,
+};
diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c
new file mode 100644
index 00000000000..478e5c7c7e8
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto_esp.c
@@ -0,0 +1,175 @@
+/*
+ * ip_vs_proto_esp.c: ESP IPSec load balancing support for IPVS
+ *
+ * Version: $Id: ip_vs_proto_esp.c,v 1.1 2003/07/04 15:04:37 wensong Exp $
+ *
+ * Authors: Julian Anastasov <ja@ssi.bg>, February 2002
+ * Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation;
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+
+/* TODO:
+
+struct isakmp_hdr {
+ __u8 icookie[8];
+ __u8 rcookie[8];
+ __u8 np;
+ __u8 version;
+ __u8 xchgtype;
+ __u8 flags;
+ __u32 msgid;
+ __u32 length;
+};
+
+*/
+
+#define PORT_ISAKMP 500
+
+
+static struct ip_vs_conn *
+esp_conn_in_get(const struct sk_buff *skb,
+ struct ip_vs_protocol *pp,
+ const struct iphdr *iph,
+ unsigned int proto_off,
+ int inverse)
+{
+ struct ip_vs_conn *cp;
+
+ if (likely(!inverse)) {
+ cp = ip_vs_conn_in_get(IPPROTO_UDP,
+ iph->saddr,
+ __constant_htons(PORT_ISAKMP),
+ iph->daddr,
+ __constant_htons(PORT_ISAKMP));
+ } else {
+ cp = ip_vs_conn_in_get(IPPROTO_UDP,
+ iph->daddr,
+ __constant_htons(PORT_ISAKMP),
+ iph->saddr,
+ __constant_htons(PORT_ISAKMP));
+ }
+
+ if (!cp) {
+ /*
+ * We are not sure if the packet is from our
+ * service, so our conn_schedule hook should return NF_ACCEPT
+ */
+ IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet "
+ "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
+ inverse ? "ICMP+" : "",
+ pp->name,
+ NIPQUAD(iph->saddr),
+ NIPQUAD(iph->daddr));
+ }
+
+ return cp;
+}
+
+
+static struct ip_vs_conn *
+esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
+ const struct iphdr *iph, unsigned int proto_off, int inverse)
+{
+ struct ip_vs_conn *cp;
+
+ if (likely(!inverse)) {
+ cp = ip_vs_conn_out_get(IPPROTO_UDP,
+ iph->saddr,
+ __constant_htons(PORT_ISAKMP),
+ iph->daddr,
+ __constant_htons(PORT_ISAKMP));
+ } else {
+ cp = ip_vs_conn_out_get(IPPROTO_UDP,
+ iph->daddr,
+ __constant_htons(PORT_ISAKMP),
+ iph->saddr,
+ __constant_htons(PORT_ISAKMP));
+ }
+
+ if (!cp) {
+ IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet "
+ "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
+ inverse ? "ICMP+" : "",
+ pp->name,
+ NIPQUAD(iph->saddr),
+ NIPQUAD(iph->daddr));
+ }
+
+ return cp;
+}
+
+
+static int
+esp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ int *verdict, struct ip_vs_conn **cpp)
+{
+ /*
+	 *	ESP is handled only as related traffic; pass the packet on to the IP stack.
+ */
+ *verdict = NF_ACCEPT;
+ return 0;
+}
+
+
+static void
+esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
+ int offset, const char *msg)
+{
+ char buf[256];
+ struct iphdr _iph, *ih;
+
+ ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
+ if (ih == NULL)
+ sprintf(buf, "%s TRUNCATED", pp->name);
+ else
+ sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
+ pp->name, NIPQUAD(ih->saddr),
+ NIPQUAD(ih->daddr));
+
+ printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
+}
+
+
+static void esp_init(struct ip_vs_protocol *pp)
+{
+ /* nothing to do now */
+}
+
+
+static void esp_exit(struct ip_vs_protocol *pp)
+{
+ /* nothing to do now */
+}
+
+
+struct ip_vs_protocol ip_vs_protocol_esp = {
+ .name = "ESP",
+ .protocol = IPPROTO_ESP,
+ .dont_defrag = 1,
+ .init = esp_init,
+ .exit = esp_exit,
+ .conn_schedule = esp_conn_schedule,
+ .conn_in_get = esp_conn_in_get,
+ .conn_out_get = esp_conn_out_get,
+ .snat_handler = NULL,
+ .dnat_handler = NULL,
+ .csum_check = NULL,
+ .state_transition = NULL,
+ .register_app = NULL,
+ .unregister_app = NULL,
+ .app_conn_bind = NULL,
+ .debug_packet = esp_debug_packet,
+ .timeout_change = NULL, /* ISAKMP */
+};
diff --git a/net/ipv4/ipvs/ip_vs_proto_icmp.c b/net/ipv4/ipvs/ip_vs_proto_icmp.c
new file mode 100644
index 00000000000..191e94aa1c1
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto_icmp.c
@@ -0,0 +1,182 @@
+/*
+ * ip_vs_proto_icmp.c: ICMP load balancing support for IP Virtual Server
+ *
+ * Authors: Julian Anastasov <ja@ssi.bg>, March 2002
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation;
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/icmp.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+
+static int icmp_timeouts[1] = { 1*60*HZ };
+
+static char * icmp_state_name_table[1] = { "ICMP" };
+
+static struct ip_vs_conn *
+icmp_conn_in_get(const struct sk_buff *skb,
+ struct ip_vs_protocol *pp,
+ const struct iphdr *iph,
+ unsigned int proto_off,
+ int inverse)
+{
+#if 0
+ struct ip_vs_conn *cp;
+
+ if (likely(!inverse)) {
+ cp = ip_vs_conn_in_get(iph->protocol,
+ iph->saddr, 0,
+ iph->daddr, 0);
+ } else {
+ cp = ip_vs_conn_in_get(iph->protocol,
+ iph->daddr, 0,
+ iph->saddr, 0);
+ }
+
+ return cp;
+
+#else
+ return NULL;
+#endif
+}
+
+static struct ip_vs_conn *
+icmp_conn_out_get(const struct sk_buff *skb,
+ struct ip_vs_protocol *pp,
+ const struct iphdr *iph,
+ unsigned int proto_off,
+ int inverse)
+{
+#if 0
+ struct ip_vs_conn *cp;
+
+ if (likely(!inverse)) {
+ cp = ip_vs_conn_out_get(iph->protocol,
+ iph->saddr, 0,
+ iph->daddr, 0);
+ } else {
+ cp = ip_vs_conn_out_get(IPPROTO_UDP,
+ iph->daddr, 0,
+ iph->saddr, 0);
+ }
+
+ return cp;
+#else
+ return NULL;
+#endif
+}
+
+static int
+icmp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ int *verdict, struct ip_vs_conn **cpp)
+{
+ *verdict = NF_ACCEPT;
+ return 0;
+}
+
+static int
+icmp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
+{
+ if (!(skb->nh.iph->frag_off & __constant_htons(IP_OFFSET))) {
+ if (skb->ip_summed != CHECKSUM_UNNECESSARY) {
+ if (ip_vs_checksum_complete(skb, skb->nh.iph->ihl * 4)) {
+ IP_VS_DBG_RL_PKT(0, pp, skb, 0, "Failed checksum for");
+ return 0;
+ }
+ }
+ }
+ return 1;
+}
+
+static void
+icmp_debug_packet(struct ip_vs_protocol *pp,
+ const struct sk_buff *skb,
+ int offset,
+ const char *msg)
+{
+ char buf[256];
+ struct iphdr _iph, *ih;
+
+ ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
+ if (ih == NULL)
+ sprintf(buf, "%s TRUNCATED", pp->name);
+ else if (ih->frag_off & __constant_htons(IP_OFFSET))
+ sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
+ pp->name, NIPQUAD(ih->saddr),
+ NIPQUAD(ih->daddr));
+ else {
+ struct icmphdr _icmph, *ic;
+
+ ic = skb_header_pointer(skb, offset + ih->ihl*4,
+ sizeof(_icmph), &_icmph);
+ if (ic == NULL)
+ sprintf(buf, "%s TRUNCATED to %u bytes\n",
+ pp->name, skb->len - offset);
+ else
+ sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u T:%d C:%d",
+ pp->name, NIPQUAD(ih->saddr),
+ NIPQUAD(ih->daddr),
+ ic->type, ic->code);
+ }
+ printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
+}
+
+static int
+icmp_state_transition(struct ip_vs_conn *cp, int direction,
+ const struct sk_buff *skb,
+ struct ip_vs_protocol *pp)
+{
+ cp->timeout = pp->timeout_table[IP_VS_ICMP_S_NORMAL];
+ return 1;
+}
+
+static int
+icmp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
+{
+ int num;
+ char **names;
+
+ num = IP_VS_ICMP_S_LAST;
+ names = icmp_state_name_table;
+ return ip_vs_set_state_timeout(pp->timeout_table, num, names, sname, to);
+}
+
+
+static void icmp_init(struct ip_vs_protocol *pp)
+{
+ pp->timeout_table = icmp_timeouts;
+}
+
+static void icmp_exit(struct ip_vs_protocol *pp)
+{
+}
+
+struct ip_vs_protocol ip_vs_protocol_icmp = {
+ .name = "ICMP",
+ .protocol = IPPROTO_ICMP,
+ .dont_defrag = 0,
+ .init = icmp_init,
+ .exit = icmp_exit,
+ .conn_schedule = icmp_conn_schedule,
+ .conn_in_get = icmp_conn_in_get,
+ .conn_out_get = icmp_conn_out_get,
+ .snat_handler = NULL,
+ .dnat_handler = NULL,
+ .csum_check = icmp_csum_check,
+ .state_transition = icmp_state_transition,
+ .register_app = NULL,
+ .unregister_app = NULL,
+ .app_conn_bind = NULL,
+ .debug_packet = icmp_debug_packet,
+ .timeout_change = NULL,
+ .set_state_timeout = icmp_set_state_timeout,
+};
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
new file mode 100644
index 00000000000..e65de675da7
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -0,0 +1,640 @@
+/*
+ * ip_vs_proto_tcp.c: TCP load balancing support for IPVS
+ *
+ * Version: $Id: ip_vs_proto_tcp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/ip.h>
+#include <linux/tcp.h> /* for tcphdr */
+#include <net/ip.h>
+#include <net/tcp.h> /* for csum_tcpudp_magic */
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+
+static struct ip_vs_conn *
+tcp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
+ const struct iphdr *iph, unsigned int proto_off, int inverse)
+{
+ __u16 _ports[2], *pptr;
+
+ pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
+ if (pptr == NULL)
+ return NULL;
+
+ if (likely(!inverse)) {
+ return ip_vs_conn_in_get(iph->protocol,
+ iph->saddr, pptr[0],
+ iph->daddr, pptr[1]);
+ } else {
+ return ip_vs_conn_in_get(iph->protocol,
+ iph->daddr, pptr[1],
+ iph->saddr, pptr[0]);
+ }
+}
+
+static struct ip_vs_conn *
+tcp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
+ const struct iphdr *iph, unsigned int proto_off, int inverse)
+{
+ __u16 _ports[2], *pptr;
+
+ pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
+ if (pptr == NULL)
+ return NULL;
+
+ if (likely(!inverse)) {
+ return ip_vs_conn_out_get(iph->protocol,
+ iph->saddr, pptr[0],
+ iph->daddr, pptr[1]);
+ } else {
+ return ip_vs_conn_out_get(iph->protocol,
+ iph->daddr, pptr[1],
+ iph->saddr, pptr[0]);
+ }
+}
+
+
+static int
+tcp_conn_schedule(struct sk_buff *skb,
+ struct ip_vs_protocol *pp,
+ int *verdict, struct ip_vs_conn **cpp)
+{
+ struct ip_vs_service *svc;
+ struct tcphdr _tcph, *th;
+
+ th = skb_header_pointer(skb, skb->nh.iph->ihl*4,
+ sizeof(_tcph), &_tcph);
+ if (th == NULL) {
+ *verdict = NF_DROP;
+ return 0;
+ }
+
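+	/*
+	 *	Only a SYN destined for a configured virtual service
+	 *	triggers scheduling of a new connection entry.
+	 */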
+ if (th->syn &&
+ (svc = ip_vs_service_get(skb->nfmark, skb->nh.iph->protocol,
+ skb->nh.iph->daddr, th->dest))) {
+ if (ip_vs_todrop()) {
+ /*
+ * It seems that we are very loaded.
+ * We have to drop this packet :(
+ */
+ ip_vs_service_put(svc);
+ *verdict = NF_DROP;
+ return 0;
+ }
+
+ /*
+ * Let the virtual server select a real server for the
+ * incoming connection, and create a connection entry.
+ */
+ *cpp = ip_vs_schedule(svc, skb);
+ if (!*cpp) {
+ *verdict = ip_vs_leave(svc, skb, pp);
+ return 0;
+ }
+ ip_vs_service_put(svc);
+ }
+ return 1;
+}
+
+
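+/*
+ *	Incremental checksum update: fold the IP address and port
+ *	differences into the existing TCP checksum (RFC 1624 style).
+ */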
+static inline void
+tcp_fast_csum_update(struct tcphdr *tcph, u32 oldip, u32 newip,
+ u16 oldport, u16 newport)
+{
+ tcph->check =
+ ip_vs_check_diff(~oldip, newip,
+ ip_vs_check_diff(oldport ^ 0xFFFF,
+ newport, tcph->check));
+}
+
+
+static int
+tcp_snat_handler(struct sk_buff **pskb,
+ struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+{
+ struct tcphdr *tcph;
+ unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4;
+
+ /* csum_check requires unshared skb */
+ if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph)))
+ return 0;
+
+ if (unlikely(cp->app != NULL)) {
+ /* Some checks before mangling */
+ if (pp->csum_check && !pp->csum_check(*pskb, pp))
+ return 0;
+
+ /* Call application helper if needed */
+ if (!ip_vs_app_pkt_out(cp, pskb))
+ return 0;
+ }
+
+ tcph = (void *)(*pskb)->nh.iph + tcphoff;
+ tcph->source = cp->vport;
+
+ /* Adjust TCP checksums */
+ if (!cp->app) {
+ /* Only port and addr are changed, do fast csum update */
+ tcp_fast_csum_update(tcph, cp->daddr, cp->vaddr,
+ cp->dport, cp->vport);
+ if ((*pskb)->ip_summed == CHECKSUM_HW)
+ (*pskb)->ip_summed = CHECKSUM_NONE;
+ } else {
+ /* full checksum calculation */
+ tcph->check = 0;
+ (*pskb)->csum = skb_checksum(*pskb, tcphoff,
+ (*pskb)->len - tcphoff, 0);
+ tcph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr,
+ (*pskb)->len - tcphoff,
+ cp->protocol,
+ (*pskb)->csum);
+ IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
+ pp->name, tcph->check,
+ (char*)&(tcph->check) - (char*)tcph);
+ }
+ return 1;
+}
+
+
+static int
+tcp_dnat_handler(struct sk_buff **pskb,
+ struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+{
+ struct tcphdr *tcph;
+ unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4;
+
+ /* csum_check requires unshared skb */
+ if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph)))
+ return 0;
+
+ if (unlikely(cp->app != NULL)) {
+ /* Some checks before mangling */
+ if (pp->csum_check && !pp->csum_check(*pskb, pp))
+ return 0;
+
+ /*
+ * Attempt ip_vs_app call.
+ * It will fix ip_vs_conn and iph ack_seq stuff
+ */
+ if (!ip_vs_app_pkt_in(cp, pskb))
+ return 0;
+ }
+
+ tcph = (void *)(*pskb)->nh.iph + tcphoff;
+ tcph->dest = cp->dport;
+
+ /*
+ * Adjust TCP checksums
+ */
+ if (!cp->app) {
+ /* Only port and addr are changed, do fast csum update */
+ tcp_fast_csum_update(tcph, cp->vaddr, cp->daddr,
+ cp->vport, cp->dport);
+ if ((*pskb)->ip_summed == CHECKSUM_HW)
+ (*pskb)->ip_summed = CHECKSUM_NONE;
+ } else {
+ /* full checksum calculation */
+ tcph->check = 0;
+ (*pskb)->csum = skb_checksum(*pskb, tcphoff,
+ (*pskb)->len - tcphoff, 0);
+ tcph->check = csum_tcpudp_magic(cp->caddr, cp->daddr,
+ (*pskb)->len - tcphoff,
+ cp->protocol,
+ (*pskb)->csum);
+ (*pskb)->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+ return 1;
+}
+
+
+static int
+tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
+{
+ unsigned int tcphoff = skb->nh.iph->ihl*4;
+
+ switch (skb->ip_summed) {
+ case CHECKSUM_NONE:
+ skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
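+		/* fall through and verify the checksum just computed */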
+ case CHECKSUM_HW:
+ if (csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr,
+ skb->len - tcphoff,
+ skb->nh.iph->protocol, skb->csum)) {
+ IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+ "Failed checksum for");
+ return 0;
+ }
+ break;
+ default:
+ /* CHECKSUM_UNNECESSARY */
+ break;
+ }
+
+ return 1;
+}
+
+
+#define TCP_DIR_INPUT 0
+#define TCP_DIR_OUTPUT 4
+#define TCP_DIR_INPUT_ONLY 8
+
+static int tcp_state_off[IP_VS_DIR_LAST] = {
+ [IP_VS_DIR_INPUT] = TCP_DIR_INPUT,
+ [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT,
+ [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY,
+};
+
+/*
+ * Timeout table[state]
+ */
+static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
+ [IP_VS_TCP_S_NONE] = 2*HZ,
+ [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ,
+ [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ,
+ [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ,
+ [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ,
+ [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ,
+ [IP_VS_TCP_S_CLOSE] = 10*HZ,
+ [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
+ [IP_VS_TCP_S_LAST_ACK] = 30*HZ,
+ [IP_VS_TCP_S_LISTEN] = 2*60*HZ,
+ [IP_VS_TCP_S_SYNACK] = 120*HZ,
+ [IP_VS_TCP_S_LAST] = 2*HZ,
+};
+
+
+#if 0
+
+/* FIXME: This is going to die */
+
+static int tcp_timeouts_dos[IP_VS_TCP_S_LAST+1] = {
+ [IP_VS_TCP_S_NONE] = 2*HZ,
+ [IP_VS_TCP_S_ESTABLISHED] = 8*60*HZ,
+ [IP_VS_TCP_S_SYN_SENT] = 60*HZ,
+ [IP_VS_TCP_S_SYN_RECV] = 10*HZ,
+ [IP_VS_TCP_S_FIN_WAIT] = 60*HZ,
+ [IP_VS_TCP_S_TIME_WAIT] = 60*HZ,
+ [IP_VS_TCP_S_CLOSE] = 10*HZ,
+ [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
+ [IP_VS_TCP_S_LAST_ACK] = 30*HZ,
+ [IP_VS_TCP_S_LISTEN] = 2*60*HZ,
+ [IP_VS_TCP_S_SYNACK] = 100*HZ,
+ [IP_VS_TCP_S_LAST] = 2*HZ,
+};
+
+#endif
+
+static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
+ [IP_VS_TCP_S_NONE] = "NONE",
+ [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED",
+ [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT",
+ [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV",
+ [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT",
+ [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT",
+ [IP_VS_TCP_S_CLOSE] = "CLOSE",
+ [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT",
+ [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK",
+ [IP_VS_TCP_S_LISTEN] = "LISTEN",
+ [IP_VS_TCP_S_SYNACK] = "SYNACK",
+ [IP_VS_TCP_S_LAST] = "BUG!",
+};
+
+#define sNO IP_VS_TCP_S_NONE
+#define sES IP_VS_TCP_S_ESTABLISHED
+#define sSS IP_VS_TCP_S_SYN_SENT
+#define sSR IP_VS_TCP_S_SYN_RECV
+#define sFW IP_VS_TCP_S_FIN_WAIT
+#define sTW IP_VS_TCP_S_TIME_WAIT
+#define sCL IP_VS_TCP_S_CLOSE
+#define sCW IP_VS_TCP_S_CLOSE_WAIT
+#define sLA IP_VS_TCP_S_LAST_ACK
+#define sLI IP_VS_TCP_S_LISTEN
+#define sSA IP_VS_TCP_S_SYNACK
+
+struct tcp_states_t {
+ int next_state[IP_VS_TCP_S_LAST];
+};
+
+static const char * tcp_state_name(int state)
+{
+ if (state >= IP_VS_TCP_S_LAST)
+ return "ERR!";
+ return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
+}
+
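+/*
+ *	The tables below are indexed as
+ *	tcp_state_table[state_off + event].next_state[current_state],
+ *	with event rows in syn/fin/ack/rst order and columns following
+ *	the sNO..sSA legend above.  E.g. an incoming SYN moves a
+ *	connection in sNO to sSR.
+ */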
+static struct tcp_states_t tcp_states [] = {
+/* INPUT */
+/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
+/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
+/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
+/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
+
+/* OUTPUT */
+/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
+/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
+/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
+/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
+
+/* INPUT-ONLY */
+/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
+/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
+/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
+/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
+};
+
+static struct tcp_states_t tcp_states_dos [] = {
+/* INPUT */
+/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
+/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
+/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
+/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
+
+/* OUTPUT */
+/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
+/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
+/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
+/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
+
+/* INPUT-ONLY */
+/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
+/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
+/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
+/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
+};
+
+static struct tcp_states_t *tcp_state_table = tcp_states;
+
+
+static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
+{
+ int on = (flags & 1); /* secure_tcp */
+
+ /*
+ ** FIXME: change secure_tcp to independent sysctl var
+ ** or make it per-service or per-app because it is valid
+ ** for most if not for all of the applications. Something
+ ** like "capabilities" (flags) for each object.
+ */
+ tcp_state_table = (on? tcp_states_dos : tcp_states);
+}
+
+static int
+tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
+{
+ return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
+ tcp_state_name_table, sname, to);
+}
+
+static inline int tcp_state_idx(struct tcphdr *th)
+{
+ if (th->rst)
+ return 3;
+ if (th->syn)
+ return 0;
+ if (th->fin)
+ return 1;
+ if (th->ack)
+ return 2;
+ return -1;
+}
+
+static inline void
+set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
+ int direction, struct tcphdr *th)
+{
+ int state_idx;
+ int new_state = IP_VS_TCP_S_CLOSE;
+ int state_off = tcp_state_off[direction];
+
+ /*
+ * Update state offset to INPUT_ONLY if necessary
+ * or delete NO_OUTPUT flag if output packet detected
+ */
+ if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
+ if (state_off == TCP_DIR_OUTPUT)
+ cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
+ else
+ state_off = TCP_DIR_INPUT_ONLY;
+ }
+
+ if ((state_idx = tcp_state_idx(th)) < 0) {
+ IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
+ goto tcp_state_out;
+ }
+
+ new_state = tcp_state_table[state_off+state_idx].next_state[cp->state];
+
+ tcp_state_out:
+ if (new_state != cp->state) {
+ struct ip_vs_dest *dest = cp->dest;
+
+ IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->"
+ "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n",
+ pp->name,
+ (state_off==TCP_DIR_OUTPUT)?"output ":"input ",
+ th->syn? 'S' : '.',
+ th->fin? 'F' : '.',
+ th->ack? 'A' : '.',
+ th->rst? 'R' : '.',
+ NIPQUAD(cp->daddr), ntohs(cp->dport),
+ NIPQUAD(cp->caddr), ntohs(cp->cport),
+ tcp_state_name(cp->state),
+ tcp_state_name(new_state),
+ atomic_read(&cp->refcnt));
+ if (dest) {
+ if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
+ (new_state != IP_VS_TCP_S_ESTABLISHED)) {
+ atomic_dec(&dest->activeconns);
+ atomic_inc(&dest->inactconns);
+ cp->flags |= IP_VS_CONN_F_INACTIVE;
+ } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
+ (new_state == IP_VS_TCP_S_ESTABLISHED)) {
+ atomic_inc(&dest->activeconns);
+ atomic_dec(&dest->inactconns);
+ cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+ }
+ }
+ }
+
+ cp->timeout = pp->timeout_table[cp->state = new_state];
+}
+
+
+/*
+ * Handle state transitions
+ */
+static int
+tcp_state_transition(struct ip_vs_conn *cp, int direction,
+ const struct sk_buff *skb,
+ struct ip_vs_protocol *pp)
+{
+ struct tcphdr _tcph, *th;
+
+ th = skb_header_pointer(skb, skb->nh.iph->ihl*4,
+ sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ return 0;
+
+ spin_lock(&cp->lock);
+ set_tcp_state(pp, cp, direction, th);
+ spin_unlock(&cp->lock);
+
+ return 1;
+}
+
+
+/*
+ * Hash table for TCP application incarnations
+ */
+#define TCP_APP_TAB_BITS 4
+#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS)
+#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1)
+
+static struct list_head tcp_apps[TCP_APP_TAB_SIZE];
+static DEFINE_SPINLOCK(tcp_app_lock);
+
+static inline __u16 tcp_app_hashkey(__u16 port)
+{
+ return ((port >> TCP_APP_TAB_BITS) ^ port) & TCP_APP_TAB_MASK;
+}
+
+
+static int tcp_register_app(struct ip_vs_app *inc)
+{
+ struct ip_vs_app *i;
+ __u16 hash, port = inc->port;
+ int ret = 0;
+
+ hash = tcp_app_hashkey(port);
+
+ spin_lock_bh(&tcp_app_lock);
+ list_for_each_entry(i, &tcp_apps[hash], p_list) {
+ if (i->port == port) {
+ ret = -EEXIST;
+ goto out;
+ }
+ }
+ list_add(&inc->p_list, &tcp_apps[hash]);
+ atomic_inc(&ip_vs_protocol_tcp.appcnt);
+
+ out:
+ spin_unlock_bh(&tcp_app_lock);
+ return ret;
+}
+
+
+static void
+tcp_unregister_app(struct ip_vs_app *inc)
+{
+ spin_lock_bh(&tcp_app_lock);
+ atomic_dec(&ip_vs_protocol_tcp.appcnt);
+ list_del(&inc->p_list);
+ spin_unlock_bh(&tcp_app_lock);
+}
+
+
+static int
+tcp_app_conn_bind(struct ip_vs_conn *cp)
+{
+ int hash;
+ struct ip_vs_app *inc;
+ int result = 0;
+
+ /* Default binding: bind app only for NAT */
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+ return 0;
+
+ /* Lookup application incarnations and bind the right one */
+ hash = tcp_app_hashkey(cp->vport);
+
+ spin_lock(&tcp_app_lock);
+ list_for_each_entry(inc, &tcp_apps[hash], p_list) {
+ if (inc->port == cp->vport) {
+ if (unlikely(!ip_vs_app_inc_get(inc)))
+ break;
+ spin_unlock(&tcp_app_lock);
+
+ IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->"
+ "%u.%u.%u.%u:%u to app %s on port %u\n",
+ __FUNCTION__,
+ NIPQUAD(cp->caddr), ntohs(cp->cport),
+ NIPQUAD(cp->vaddr), ntohs(cp->vport),
+ inc->name, ntohs(inc->port));
+ cp->app = inc;
+ if (inc->init_conn)
+ result = inc->init_conn(inc, cp);
+ goto out;
+ }
+ }
+ spin_unlock(&tcp_app_lock);
+
+ out:
+ return result;
+}
+
+
+/*
+ * Set LISTEN timeout. (ip_vs_conn_put will setup timer)
+ */
+void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
+{
+ spin_lock(&cp->lock);
+ cp->state = IP_VS_TCP_S_LISTEN;
+ cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN];
+ spin_unlock(&cp->lock);
+}
+
+
+static void tcp_init(struct ip_vs_protocol *pp)
+{
+ IP_VS_INIT_HASH_TABLE(tcp_apps);
+ pp->timeout_table = tcp_timeouts;
+}
+
+
+static void tcp_exit(struct ip_vs_protocol *pp)
+{
+}
+
+
+struct ip_vs_protocol ip_vs_protocol_tcp = {
+ .name = "TCP",
+ .protocol = IPPROTO_TCP,
+ .dont_defrag = 0,
+ .appcnt = ATOMIC_INIT(0),
+ .init = tcp_init,
+ .exit = tcp_exit,
+ .register_app = tcp_register_app,
+ .unregister_app = tcp_unregister_app,
+ .conn_schedule = tcp_conn_schedule,
+ .conn_in_get = tcp_conn_in_get,
+ .conn_out_get = tcp_conn_out_get,
+ .snat_handler = tcp_snat_handler,
+ .dnat_handler = tcp_dnat_handler,
+ .csum_check = tcp_csum_check,
+ .state_name = tcp_state_name,
+ .state_transition = tcp_state_transition,
+ .app_conn_bind = tcp_app_conn_bind,
+ .debug_packet = ip_vs_tcpudp_debug_packet,
+ .timeout_change = tcp_timeout_change,
+ .set_state_timeout = tcp_set_state_timeout,
+};
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c
new file mode 100644
index 00000000000..8ae5f2e0aef
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto_udp.c
@@ -0,0 +1,427 @@
+/*
+ * ip_vs_proto_udp.c: UDP load balancing support for IPVS
+ *
+ * Version: $Id: ip_vs_proto_udp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+
+static struct ip_vs_conn *
+udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
+ const struct iphdr *iph, unsigned int proto_off, int inverse)
+{
+ struct ip_vs_conn *cp;
+ __u16 _ports[2], *pptr;
+
+ pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
+ if (pptr == NULL)
+ return NULL;
+
+ if (likely(!inverse)) {
+ cp = ip_vs_conn_in_get(iph->protocol,
+ iph->saddr, pptr[0],
+ iph->daddr, pptr[1]);
+ } else {
+ cp = ip_vs_conn_in_get(iph->protocol,
+ iph->daddr, pptr[1],
+ iph->saddr, pptr[0]);
+ }
+
+ return cp;
+}
+
+
+static struct ip_vs_conn *
+udp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
+ const struct iphdr *iph, unsigned int proto_off, int inverse)
+{
+ struct ip_vs_conn *cp;
+ __u16 _ports[2], *pptr;
+
+ pptr = skb_header_pointer(skb, skb->nh.iph->ihl*4,
+ sizeof(_ports), _ports);
+ if (pptr == NULL)
+ return NULL;
+
+ if (likely(!inverse)) {
+ cp = ip_vs_conn_out_get(iph->protocol,
+ iph->saddr, pptr[0],
+ iph->daddr, pptr[1]);
+ } else {
+ cp = ip_vs_conn_out_get(iph->protocol,
+ iph->daddr, pptr[1],
+ iph->saddr, pptr[0]);
+ }
+
+ return cp;
+}
+
+
+static int
+udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ int *verdict, struct ip_vs_conn **cpp)
+{
+ struct ip_vs_service *svc;
+ struct udphdr _udph, *uh;
+
+ uh = skb_header_pointer(skb, skb->nh.iph->ihl*4,
+ sizeof(_udph), &_udph);
+ if (uh == NULL) {
+ *verdict = NF_DROP;
+ return 0;
+ }
+
+ if ((svc = ip_vs_service_get(skb->nfmark, skb->nh.iph->protocol,
+ skb->nh.iph->daddr, uh->dest))) {
+ if (ip_vs_todrop()) {
+ /*
+ * It seems that we are very loaded.
+ * We have to drop this packet :(
+ */
+ ip_vs_service_put(svc);
+ *verdict = NF_DROP;
+ return 0;
+ }
+
+ /*
+ * Let the virtual server select a real server for the
+ * incoming connection, and create a connection entry.
+ */
+ *cpp = ip_vs_schedule(svc, skb);
+ if (!*cpp) {
+ *verdict = ip_vs_leave(svc, skb, pp);
+ return 0;
+ }
+ ip_vs_service_put(svc);
+ }
+ return 1;
+}
+
+
+static inline void
+udp_fast_csum_update(struct udphdr *uhdr, u32 oldip, u32 newip,
+ u16 oldport, u16 newport)
+{
+ uhdr->check =
+ ip_vs_check_diff(~oldip, newip,
+ ip_vs_check_diff(oldport ^ 0xFFFF,
+ newport, uhdr->check));
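+	/*
+	 * A UDP checksum of 0 means "no checksum" (RFC 768),
+	 * so a checksum that computes to 0 is sent as 0xFFFF.
+	 */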
+ if (!uhdr->check)
+ uhdr->check = 0xFFFF;
+}
+
+static int
+udp_snat_handler(struct sk_buff **pskb,
+ struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+{
+ struct udphdr *udph;
+ unsigned int udphoff = (*pskb)->nh.iph->ihl * 4;
+
+ /* csum_check requires unshared skb */
+ if (!ip_vs_make_skb_writable(pskb, udphoff+sizeof(*udph)))
+ return 0;
+
+ if (unlikely(cp->app != NULL)) {
+ /* Some checks before mangling */
+ if (pp->csum_check && !pp->csum_check(*pskb, pp))
+ return 0;
+
+ /*
+ * Call application helper if needed
+ */
+ if (!ip_vs_app_pkt_out(cp, pskb))
+ return 0;
+ }
+
+ udph = (void *)(*pskb)->nh.iph + udphoff;
+ udph->source = cp->vport;
+
+ /*
+ * Adjust UDP checksums
+ */
+ if (!cp->app && (udph->check != 0)) {
+ /* Only port and addr are changed, do fast csum update */
+ udp_fast_csum_update(udph, cp->daddr, cp->vaddr,
+ cp->dport, cp->vport);
+ if ((*pskb)->ip_summed == CHECKSUM_HW)
+ (*pskb)->ip_summed = CHECKSUM_NONE;
+ } else {
+ /* full checksum calculation */
+ udph->check = 0;
+ (*pskb)->csum = skb_checksum(*pskb, udphoff,
+ (*pskb)->len - udphoff, 0);
+ udph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr,
+ (*pskb)->len - udphoff,
+ cp->protocol,
+ (*pskb)->csum);
+ if (udph->check == 0)
+ udph->check = 0xFFFF;
+ IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
+ pp->name, udph->check,
+ (char*)&(udph->check) - (char*)udph);
+ }
+ return 1;
+}
+
+
+static int
+udp_dnat_handler(struct sk_buff **pskb,
+ struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+{
+ struct udphdr *udph;
+ unsigned int udphoff = (*pskb)->nh.iph->ihl * 4;
+
+ /* csum_check requires unshared skb */
+ if (!ip_vs_make_skb_writable(pskb, udphoff+sizeof(*udph)))
+ return 0;
+
+ if (unlikely(cp->app != NULL)) {
+ /* Some checks before mangling */
+ if (pp->csum_check && !pp->csum_check(*pskb, pp))
+ return 0;
+
+ /*
+ * Attempt ip_vs_app call.
+ * It will fix ip_vs_conn
+ */
+ if (!ip_vs_app_pkt_in(cp, pskb))
+ return 0;
+ }
+
+ udph = (void *)(*pskb)->nh.iph + udphoff;
+ udph->dest = cp->dport;
+
+ /*
+ * Adjust UDP checksums
+ */
+ if (!cp->app && (udph->check != 0)) {
+ /* Only port and addr are changed, do fast csum update */
+ udp_fast_csum_update(udph, cp->vaddr, cp->daddr,
+ cp->vport, cp->dport);
+ if ((*pskb)->ip_summed == CHECKSUM_HW)
+ (*pskb)->ip_summed = CHECKSUM_NONE;
+ } else {
+ /* full checksum calculation */
+ udph->check = 0;
+ (*pskb)->csum = skb_checksum(*pskb, udphoff,
+ (*pskb)->len - udphoff, 0);
+ udph->check = csum_tcpudp_magic(cp->caddr, cp->daddr,
+ (*pskb)->len - udphoff,
+ cp->protocol,
+ (*pskb)->csum);
+ if (udph->check == 0)
+ udph->check = 0xFFFF;
+ (*pskb)->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+ return 1;
+}
+
+
+static int
+udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
+{
+ struct udphdr _udph, *uh;
+ unsigned int udphoff = skb->nh.iph->ihl*4;
+
+ uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph);
+ if (uh == NULL)
+ return 0;
+
+ if (uh->check != 0) {
+ switch (skb->ip_summed) {
+ case CHECKSUM_NONE:
+ skb->csum = skb_checksum(skb, udphoff,
+ skb->len - udphoff, 0);
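+			/* fall through and verify the checksum just computed */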
+ case CHECKSUM_HW:
+ if (csum_tcpudp_magic(skb->nh.iph->saddr,
+ skb->nh.iph->daddr,
+ skb->len - udphoff,
+ skb->nh.iph->protocol,
+ skb->csum)) {
+ IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+ "Failed checksum for");
+ return 0;
+ }
+ break;
+ default:
+ /* CHECKSUM_UNNECESSARY */
+ break;
+ }
+ }
+ return 1;
+}
+
+
+/*
+ * Note: the caller guarantees that only one of register_app,
+ * unregister_app or app_conn_bind is called each time.
+ */
+
+#define UDP_APP_TAB_BITS 4
+#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS)
+#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1)
+
+static struct list_head udp_apps[UDP_APP_TAB_SIZE];
+static DEFINE_SPINLOCK(udp_app_lock);
+
+static inline __u16 udp_app_hashkey(__u16 port)
+{
+ return ((port >> UDP_APP_TAB_BITS) ^ port) & UDP_APP_TAB_MASK;
+}
+
+
+static int udp_register_app(struct ip_vs_app *inc)
+{
+ struct ip_vs_app *i;
+ __u16 hash, port = inc->port;
+ int ret = 0;
+
+ hash = udp_app_hashkey(port);
+
+ spin_lock_bh(&udp_app_lock);
+ list_for_each_entry(i, &udp_apps[hash], p_list) {
+ if (i->port == port) {
+ ret = -EEXIST;
+ goto out;
+ }
+ }
+ list_add(&inc->p_list, &udp_apps[hash]);
+ atomic_inc(&ip_vs_protocol_udp.appcnt);
+
+ out:
+ spin_unlock_bh(&udp_app_lock);
+ return ret;
+}
+
+
+static void
+udp_unregister_app(struct ip_vs_app *inc)
+{
+ spin_lock_bh(&udp_app_lock);
+ atomic_dec(&ip_vs_protocol_udp.appcnt);
+ list_del(&inc->p_list);
+ spin_unlock_bh(&udp_app_lock);
+}
+
+
+static int udp_app_conn_bind(struct ip_vs_conn *cp)
+{
+ int hash;
+ struct ip_vs_app *inc;
+ int result = 0;
+
+ /* Default binding: bind app only for NAT */
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+ return 0;
+
+ /* Lookup application incarnations and bind the right one */
+ hash = udp_app_hashkey(cp->vport);
+
+ spin_lock(&udp_app_lock);
+ list_for_each_entry(inc, &udp_apps[hash], p_list) {
+ if (inc->port == cp->vport) {
+ if (unlikely(!ip_vs_app_inc_get(inc)))
+ break;
+ spin_unlock(&udp_app_lock);
+
+ IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->"
+ "%u.%u.%u.%u:%u to app %s on port %u\n",
+ __FUNCTION__,
+ NIPQUAD(cp->caddr), ntohs(cp->cport),
+ NIPQUAD(cp->vaddr), ntohs(cp->vport),
+ inc->name, ntohs(inc->port));
+ cp->app = inc;
+ if (inc->init_conn)
+ result = inc->init_conn(inc, cp);
+ goto out;
+ }
+ }
+ spin_unlock(&udp_app_lock);
+
+ out:
+ return result;
+}
+
+
+static int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
+ [IP_VS_UDP_S_NORMAL] = 5*60*HZ,
+ [IP_VS_UDP_S_LAST] = 2*HZ,
+};
+
+static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = {
+ [IP_VS_UDP_S_NORMAL] = "UDP",
+ [IP_VS_UDP_S_LAST] = "BUG!",
+};
+
+
+static int
+udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
+{
+ return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST,
+ udp_state_name_table, sname, to);
+}
+
+static const char * udp_state_name(int state)
+{
+ if (state >= IP_VS_UDP_S_LAST)
+ return "ERR!";
+ return udp_state_name_table[state] ? udp_state_name_table[state] : "?";
+}
+
+static int
+udp_state_transition(struct ip_vs_conn *cp, int direction,
+ const struct sk_buff *skb,
+ struct ip_vs_protocol *pp)
+{
+ cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL];
+ return 1;
+}
+
+static void udp_init(struct ip_vs_protocol *pp)
+{
+ IP_VS_INIT_HASH_TABLE(udp_apps);
+ pp->timeout_table = udp_timeouts;
+}
+
+static void udp_exit(struct ip_vs_protocol *pp)
+{
+}
+
+
+struct ip_vs_protocol ip_vs_protocol_udp = {
+ .name = "UDP",
+ .protocol = IPPROTO_UDP,
+ .dont_defrag = 0,
+ .init = udp_init,
+ .exit = udp_exit,
+ .conn_schedule = udp_conn_schedule,
+ .conn_in_get = udp_conn_in_get,
+ .conn_out_get = udp_conn_out_get,
+ .snat_handler = udp_snat_handler,
+ .dnat_handler = udp_dnat_handler,
+ .csum_check = udp_csum_check,
+ .state_transition = udp_state_transition,
+ .state_name = udp_state_name,
+ .register_app = udp_register_app,
+ .unregister_app = udp_unregister_app,
+ .app_conn_bind = udp_app_conn_bind,
+ .debug_packet = ip_vs_tcpudp_debug_packet,
+ .timeout_change = NULL,
+ .set_state_timeout = udp_set_state_timeout,
+};
diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c
new file mode 100644
index 00000000000..b23bab231ca
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_rr.c
@@ -0,0 +1,118 @@
+/*
+ * IPVS: Round-Robin Scheduling module
+ *
+ * Version: $Id: ip_vs_rr.c,v 1.9 2002/09/15 08:14:08 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Peter Kese <peter.kese@ijs.si>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes/Changes:
+ * Wensong Zhang : changed the ip_vs_rr_schedule to return dest
+ * Julian Anastasov : fixed the NULL pointer access bug in debugging
+ *	Wensong Zhang		:	changed some cosmetic things for debugging
+ * Wensong Zhang : changed for the d-linked destination list
+ * Wensong Zhang : added the ip_vs_rr_update_svc
+ * Wensong Zhang : added any dest with weight=0 is quiesced
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
+{
+ svc->sched_data = &svc->destinations;
+ return 0;
+}
+
+
+static int ip_vs_rr_done_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
+{
+ svc->sched_data = &svc->destinations;
+ return 0;
+}
+
+
+/*
+ * Round-Robin Scheduling
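+ *
+ * The service's sched_data keeps the list position of the previous pick,
+ * so each call resumes the walk from there, skipping overloaded and
+ * weight-0 (quiesced) destinations.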
+ */
+static struct ip_vs_dest *
+ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+ struct list_head *p, *q;
+ struct ip_vs_dest *dest;
+
+ IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n");
+
+ write_lock(&svc->sched_lock);
+ p = (struct list_head *)svc->sched_data;
+ p = p->next;
+ q = p;
+ do {
+ /* skip list head */
+ if (q == &svc->destinations) {
+ q = q->next;
+ continue;
+ }
+
+ dest = list_entry(q, struct ip_vs_dest, n_list);
+ if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+ atomic_read(&dest->weight) > 0)
+ /* HIT */
+ goto out;
+ q = q->next;
+ } while (q != p);
+ write_unlock(&svc->sched_lock);
+ return NULL;
+
+ out:
+ svc->sched_data = q;
+ write_unlock(&svc->sched_lock);
+ IP_VS_DBG(6, "RR: server %u.%u.%u.%u:%u "
+ "activeconns %d refcnt %d weight %d\n",
+ NIPQUAD(dest->addr), ntohs(dest->port),
+ atomic_read(&dest->activeconns),
+ atomic_read(&dest->refcnt), atomic_read(&dest->weight));
+
+ return dest;
+}
+
+
+static struct ip_vs_scheduler ip_vs_rr_scheduler = {
+ .name = "rr", /* name */
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .init_service = ip_vs_rr_init_svc,
+ .done_service = ip_vs_rr_done_svc,
+ .update_service = ip_vs_rr_update_svc,
+ .schedule = ip_vs_rr_schedule,
+};
+
+static int __init ip_vs_rr_init(void)
+{
+ INIT_LIST_HEAD(&ip_vs_rr_scheduler.n_list);
+ return register_ip_vs_scheduler(&ip_vs_rr_scheduler);
+}
+
+static void __exit ip_vs_rr_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_rr_scheduler);
+}
+
+module_init(ip_vs_rr_init);
+module_exit(ip_vs_rr_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_sched.c b/net/ipv4/ipvs/ip_vs_sched.c
new file mode 100644
index 00000000000..0f7c56a225b
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_sched.c
@@ -0,0 +1,251 @@
+/*
+ * IPVS An implementation of the IP virtual server support for the
+ * LINUX operating system. IPVS is now implemented as a module
+ * over the Netfilter framework. IPVS can be used to build a
+ * high-performance and highly available server based on a
+ * cluster of servers.
+ *
+ * Version: $Id: ip_vs_sched.c,v 1.13 2003/05/10 03:05:23 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Peter Kese <peter.kese@ijs.si>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <asm/string.h>
+#include <linux/kmod.h>
+
+#include <net/ip_vs.h>
+
+/*
+ * IPVS scheduler list
+ */
+static LIST_HEAD(ip_vs_schedulers);
+
+/* lock for service table */
+static DEFINE_RWLOCK(__ip_vs_sched_lock);
+
+
+/*
+ * Bind a service with a scheduler
+ */
+int ip_vs_bind_scheduler(struct ip_vs_service *svc,
+ struct ip_vs_scheduler *scheduler)
+{
+ int ret;
+
+ if (svc == NULL) {
+ IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n");
+ return -EINVAL;
+ }
+ if (scheduler == NULL) {
+ IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n");
+ return -EINVAL;
+ }
+
+ svc->scheduler = scheduler;
+
+ if (scheduler->init_service) {
+ ret = scheduler->init_service(svc);
+ if (ret) {
+ IP_VS_ERR("ip_vs_bind_scheduler(): init error\n");
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+
+/*
+ * Unbind a service with its scheduler
+ */
+int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
+{
+ struct ip_vs_scheduler *sched;
+
+ if (svc == NULL) {
+ IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n");
+ return -EINVAL;
+ }
+
+ sched = svc->scheduler;
+ if (sched == NULL) {
+ IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n");
+ return -EINVAL;
+ }
+
+ if (sched->done_service) {
+ if (sched->done_service(svc) != 0) {
+ IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n");
+ return -EINVAL;
+ }
+ }
+
+ svc->scheduler = NULL;
+ return 0;
+}
+
+
+/*
+ * Get scheduler in the scheduler list by name
+ */
+static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
+{
+ struct ip_vs_scheduler *sched;
+
+ IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n",
+ sched_name);
+
+ read_lock_bh(&__ip_vs_sched_lock);
+
+ list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
+ /*
+ * Test and get the modules atomically
+ */
+ if (sched->module && !try_module_get(sched->module)) {
+ /*
+			 * This scheduler has just been deleted
+ */
+ continue;
+ }
+ if (strcmp(sched_name, sched->name)==0) {
+ /* HIT */
+ read_unlock_bh(&__ip_vs_sched_lock);
+ return sched;
+ }
+ if (sched->module)
+ module_put(sched->module);
+ }
+
+ read_unlock_bh(&__ip_vs_sched_lock);
+ return NULL;
+}
+
+
+/*
+ * Lookup scheduler and try to load it if it doesn't exist
+ */
+struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
+{
+ struct ip_vs_scheduler *sched;
+
+ /*
+ * Search for the scheduler by sched_name
+ */
+ sched = ip_vs_sched_getbyname(sched_name);
+
+ /*
+ * If scheduler not found, load the module and search again
+ */
+ if (sched == NULL) {
+ request_module("ip_vs_%s", sched_name);
+ sched = ip_vs_sched_getbyname(sched_name);
+ }
+
+ return sched;
+}
+
+void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
+{
+ if (scheduler->module)
+ module_put(scheduler->module);
+}
+
+
+/*
+ * Register a scheduler in the scheduler list
+ */
+int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
+{
+ struct ip_vs_scheduler *sched;
+
+ if (!scheduler) {
+ IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n");
+ return -EINVAL;
+ }
+
+ if (!scheduler->name) {
+ IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n");
+ return -EINVAL;
+ }
+
+ /* increase the module use count */
+ ip_vs_use_count_inc();
+
+ /*
+ * Make sure that the scheduler with this name doesn't exist
+ * in the scheduler list.
+ */
+ sched = ip_vs_sched_getbyname(scheduler->name);
+ if (sched) {
+ ip_vs_scheduler_put(sched);
+ ip_vs_use_count_dec();
+ IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
+			  "already exists in the system\n", scheduler->name);
+ return -EINVAL;
+ }
+
+ write_lock_bh(&__ip_vs_sched_lock);
+
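+	/*
+	 *	An unlinked scheduler has n_list pointing back to itself
+	 *	(callers run INIT_LIST_HEAD before registering).
+	 */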
+ if (scheduler->n_list.next != &scheduler->n_list) {
+ write_unlock_bh(&__ip_vs_sched_lock);
+ ip_vs_use_count_dec();
+ IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
+ "already linked\n", scheduler->name);
+ return -EINVAL;
+ }
+
+ /*
+ * Add it into the d-linked scheduler list
+ */
+ list_add(&scheduler->n_list, &ip_vs_schedulers);
+ write_unlock_bh(&__ip_vs_sched_lock);
+
+ IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name);
+
+ return 0;
+}
+
+
+/*
+ * Unregister a scheduler from the scheduler list
+ */
+int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
+{
+ if (!scheduler) {
+ IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n");
+ return -EINVAL;
+ }
+
+ write_lock_bh(&__ip_vs_sched_lock);
+ if (scheduler->n_list.next == &scheduler->n_list) {
+ write_unlock_bh(&__ip_vs_sched_lock);
+ IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler "
+ "is not in the list. failed\n", scheduler->name);
+ return -EINVAL;
+ }
+
+ /*
+ * Remove it from the d-linked scheduler list
+ */
+ list_del(&scheduler->n_list);
+ write_unlock_bh(&__ip_vs_sched_lock);
+
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+
+ IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name);
+
+ return 0;
+}
diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c
new file mode 100644
index 00000000000..ff366f7390d
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_sed.c
@@ -0,0 +1,163 @@
+/*
+ * IPVS: Shortest Expected Delay scheduling module
+ *
+ * Version: $Id: ip_vs_sed.c,v 1.1 2003/05/10 03:06:08 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The SED algorithm attempts to minimize each job's expected delay until
+ * completion. The expected delay that the job will experience is
+ * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of
+ * jobs on the ith server and Ui is the fixed service rate (weight) of
+ * the ith server. The SED algorithm adopts a greedy policy in which each
+ * job does what is in its own best interest, i.e. joins the queue that
+ * would minimize its expected delay of completion.
+ *
+ * See the following paper for more information:
+ * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
+ * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
+ * pages 986-994, 1988.
+ *
+ * Thanks must go to Marko Buuri <marko@buuri.name> for discussing SED with me.
+ *
+ * The difference between SED and WLC is that SED includes the incoming
+ * job in the cost function (the increment of 1). SED may outperform
+ * WLC when scheduling big jobs on highly heterogeneous systems
+ * (where the server weights vary a lot).
+ *
+ */
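+
+/*
+ * For example, with two servers of weight 1 and 3 carrying 0 and 2
+ * active connections, SED rates them (0+1)/1 = 1 and (2+1)/3 = 1, i.e.
+ * equally, while WLC (0/1 vs 2/3) would prefer the idle server.
+ */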
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+static int
+ip_vs_sed_init_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static int
+ip_vs_sed_done_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static int
+ip_vs_sed_update_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static inline unsigned int
+ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
+{
+ /*
+ * We only use the active connection number in the cost
+ * calculation here.
+ */
+ return atomic_read(&dest->activeconns) + 1;
+}
+
+
+/*
+ *	Shortest Expected Delay scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+ struct ip_vs_dest *dest, *least;
+ unsigned int loh, doh;
+
+ IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n");
+
+ /*
+ * We calculate the load of each dest server as follows:
+ * (server expected overhead) / dest->weight
+ *
+ * Remember -- no floats in kernel mode!!!
+ * The comparison of h1*w2 > h2*w1 is equivalent to that of
+ * h1/w1 > h2/w2
+ * if every weight is larger than zero.
+ *
+ * The server with weight=0 is quiesced and will not receive any
+ * new connections.
+ */
+
+ list_for_each_entry(dest, &svc->destinations, n_list) {
+ if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+ atomic_read(&dest->weight) > 0) {
+ least = dest;
+ loh = ip_vs_sed_dest_overhead(least);
+ goto nextstage;
+ }
+ }
+ return NULL;
+
+ /*
+ * Find the destination with the least load.
+ */
+ nextstage:
+ list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+ doh = ip_vs_sed_dest_overhead(dest);
+ if (loh * atomic_read(&dest->weight) >
+ doh * atomic_read(&least->weight)) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ IP_VS_DBG(6, "SED: server %u.%u.%u.%u:%u "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ NIPQUAD(least->addr), ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight), loh);
+
+ return least;
+}
+
+
+static struct ip_vs_scheduler ip_vs_sed_scheduler =
+{
+ .name = "sed",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .init_service = ip_vs_sed_init_svc,
+ .done_service = ip_vs_sed_done_svc,
+ .update_service = ip_vs_sed_update_svc,
+ .schedule = ip_vs_sed_schedule,
+};
+
+
+static int __init ip_vs_sed_init(void)
+{
+ INIT_LIST_HEAD(&ip_vs_sed_scheduler.n_list);
+ return register_ip_vs_scheduler(&ip_vs_sed_scheduler);
+}
+
+static void __exit ip_vs_sed_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_sed_scheduler);
+}
+
+module_init(ip_vs_sed_init);
+module_exit(ip_vs_sed_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c
new file mode 100644
index 00000000000..6f7c50e44a3
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_sh.c
@@ -0,0 +1,255 @@
+/*
+ * IPVS: Source Hashing scheduling module
+ *
+ * Version: $Id: ip_vs_sh.c,v 1.5 2002/09/15 08:14:08 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@gnuchina.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The sh algorithm selects a server by the hash key of the source IP
+ * address. The pseudo code is as follows:
+ *
+ * n <- servernode[src_ip];
+ * if (n is dead) OR
+ * (n is overloaded) or (n.weight <= 0) then
+ * return NULL;
+ *
+ * return n;
+ *
+ * Note that servernode is a 256-bucket hash table that maps the hash
+ * index derived from the packet source IP address to the current server
+ * array. If the sh scheduler is used in a cache cluster, it is good to
+ * combine it with the cache_bypass feature. When the statically assigned
+ * server is dead or overloaded, the load balancer can bypass the cache
+ * server and send requests to the original server directly.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * IPVS SH bucket
+ */
+struct ip_vs_sh_bucket {
+ struct ip_vs_dest *dest; /* real server (cache) */
+};
+
+/*
+ * for IPVS SH entry hash table
+ */
+#ifndef CONFIG_IP_VS_SH_TAB_BITS
+#define CONFIG_IP_VS_SH_TAB_BITS 8
+#endif
+#define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS
+#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS)
+#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1)
+
+
+/*
+ * Returns hash value for IPVS SH entry
+ */
+static inline unsigned ip_vs_sh_hashkey(__u32 addr)
+{
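+	/*
+	 * Knuth-style multiplicative hashing: the constant is close to
+	 * 2^32 times the golden ratio conjugate.
+	 */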
+ return (ntohl(addr)*2654435761UL) & IP_VS_SH_TAB_MASK;
+}
+
+
+/*
+ * Get ip_vs_dest associated with supplied parameters.
+ */
+static inline struct ip_vs_dest *
+ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __u32 addr)
+{
+ return (tbl[ip_vs_sh_hashkey(addr)]).dest;
+}
+
+
+/*
+ * Assign all the hash buckets of the specified table with the service.
+ */
+static int
+ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc)
+{
+ int i;
+ struct ip_vs_sh_bucket *b;
+ struct list_head *p;
+ struct ip_vs_dest *dest;
+
+ b = tbl;
+ p = &svc->destinations;
+ for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
+ if (list_empty(p)) {
+ b->dest = NULL;
+ } else {
+ if (p == &svc->destinations)
+ p = p->next;
+
+ dest = list_entry(p, struct ip_vs_dest, n_list);
+ atomic_inc(&dest->refcnt);
+ b->dest = dest;
+
+ p = p->next;
+ }
+ b++;
+ }
+ return 0;
+}
+
+
+/*
+ * Flush all the hash buckets of the specified table.
+ */
+static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl)
+{
+ int i;
+ struct ip_vs_sh_bucket *b;
+
+ b = tbl;
+ for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
+ if (b->dest) {
+ atomic_dec(&b->dest->refcnt);
+ b->dest = NULL;
+ }
+ b++;
+ }
+}
+
+
+static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_sh_bucket *tbl;
+
+ /* allocate the SH table for this service */
+ tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE,
+ GFP_ATOMIC);
+ if (tbl == NULL) {
+ IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n");
+ return -ENOMEM;
+ }
+ svc->sched_data = tbl;
+ IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
+ "current service\n",
+ sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
+
+ /* assign the hash buckets with the updated service */
+ ip_vs_sh_assign(tbl, svc);
+
+ return 0;
+}
+
+
+static int ip_vs_sh_done_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_sh_bucket *tbl = svc->sched_data;
+
+ /* got to clean up hash buckets here */
+ ip_vs_sh_flush(tbl);
+
+ /* release the table itself */
+ kfree(svc->sched_data);
+ IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",
+ sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
+
+ return 0;
+}
+
+
+static int ip_vs_sh_update_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_sh_bucket *tbl = svc->sched_data;
+
+ /* got to clean up hash buckets here */
+ ip_vs_sh_flush(tbl);
+
+ /* assign the hash buckets with the updated service */
+ ip_vs_sh_assign(tbl, svc);
+
+ return 0;
+}
+
+
+/*
+ *	If IP_VS_DEST_F_OVERLOAD is set in the dest flags, consider
+ *	the server overloaded here.
+ */
+static inline int is_overloaded(struct ip_vs_dest *dest)
+{
+ return dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
+
+/*
+ * Source Hashing scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+ struct ip_vs_dest *dest;
+ struct ip_vs_sh_bucket *tbl;
+ struct iphdr *iph = skb->nh.iph;
+
+ IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
+
+ tbl = (struct ip_vs_sh_bucket *)svc->sched_data;
+ dest = ip_vs_sh_get(tbl, iph->saddr);
+ if (!dest
+ || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
+ || atomic_read(&dest->weight) <= 0
+ || is_overloaded(dest)) {
+ return NULL;
+ }
+
+ IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u "
+ "--> server %u.%u.%u.%u:%d\n",
+ NIPQUAD(iph->saddr),
+ NIPQUAD(dest->addr),
+ ntohs(dest->port));
+
+ return dest;
+}
+
+
+/*
+ * IPVS SH Scheduler structure
+ */
+static struct ip_vs_scheduler ip_vs_sh_scheduler =
+{
+ .name = "sh",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .init_service = ip_vs_sh_init_svc,
+ .done_service = ip_vs_sh_done_svc,
+ .update_service = ip_vs_sh_update_svc,
+ .schedule = ip_vs_sh_schedule,
+};
+
+
+static int __init ip_vs_sh_init(void)
+{
+ INIT_LIST_HEAD(&ip_vs_sh_scheduler.n_list);
+ return register_ip_vs_scheduler(&ip_vs_sh_scheduler);
+}
+
+
+static void __exit ip_vs_sh_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_sh_scheduler);
+}
+
+
+module_init(ip_vs_sh_init);
+module_exit(ip_vs_sh_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
new file mode 100644
index 00000000000..25c479550a3
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -0,0 +1,892 @@
+/*
+ * IPVS An implementation of the IP virtual server support for the
+ * LINUX operating system. IPVS is now implemented as a module
+ * over the NetFilter framework. IPVS can be used to build a
+ * high-performance and highly available server based on a
+ * cluster of servers.
+ *
+ * Version: $Id: ip_vs_sync.c,v 1.13 2003/06/08 09:31:19 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * ip_vs_sync: sync connection info from master load balancer to backups
+ * through multicast
+ *
+ * Changes:
+ * Alexandre Cassen : Added master & backup support at a time.
+ * Alexandre Cassen : Added SyncID support for incoming sync
+ * messages filtering.
+ * Justin Ossevoort : Fix endian problem on sync message size.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/net.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/igmp.h> /* for ip_mc_join_group */
+
+#include <net/ip.h>
+#include <net/sock.h>
+#include <asm/uaccess.h> /* for get_fs and set_fs */
+
+#include <net/ip_vs.h>
+
+#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */
+#define IP_VS_SYNC_PORT 8848 /* multicast port */
+
+
+/*
+ * IPVS sync connection entry
+ */
+struct ip_vs_sync_conn {
+ __u8 reserved;
+
+ /* Protocol, addresses and port numbers */
+ __u8 protocol; /* Which protocol (TCP/UDP) */
+ __u16 cport;
+ __u16 vport;
+ __u16 dport;
+ __u32 caddr; /* client address */
+ __u32 vaddr; /* virtual address */
+ __u32 daddr; /* destination address */
+
+ /* Flags and state transition */
+ __u16 flags; /* status flags */
+ __u16 state; /* state info */
+
+ /* The sequence options start here */
+};
+
+struct ip_vs_sync_conn_options {
+ struct ip_vs_seq in_seq; /* incoming seq. struct */
+ struct ip_vs_seq out_seq; /* outgoing seq. struct */
+};
+
+#define IP_VS_SYNC_CONN_TIMEOUT (3*60*HZ)
+#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn))
+#define FULL_CONN_SIZE \
+(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options))
+
+
+/*
+  The master multicasts messages to the backup load balancers in the
+ following format.
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Count Conns | SyncID | Size |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | |
+ | IPVS Sync Connection (1) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | . |
+ | . |
+ | . |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | |
+ | IPVS Sync Connection (n) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*/
+
+#define SYNC_MESG_HEADER_LEN 4
+
+struct ip_vs_sync_mesg {
+ __u8 nr_conns;
+ __u8 syncid;
+ __u16 size;
+
+ /* ip_vs_sync_conn entries start here */
+};
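+
+/*
+ * Example (illustrative, not from the original source): a message
+ * carrying two entries without sequence options is sent with
+ * nr_conns = 2, syncid set to the configured master SyncID, and
+ * size equal to SYNC_MESG_HEADER_LEN + 2*SIMPLE_CONN_SIZE
+ * (converted to network byte order on the wire).
+ */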
+
+/* the maximum length of sync (sending/receiving) message */
+static int sync_send_mesg_maxlen;
+static int sync_recv_mesg_maxlen;
+
+struct ip_vs_sync_buff {
+ struct list_head list;
+ unsigned long firstuse;
+
+ /* pointers for the message data */
+ struct ip_vs_sync_mesg *mesg;
+ unsigned char *head;
+ unsigned char *end;
+};
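+
+/*
+ * Buffer layout (illustrative note): 'mesg' points at the message
+ * header, 'head' at the next free byte for a conn entry, and 'end'
+ * just past the usable area (mesg + sync_send_mesg_maxlen).
+ */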
+
+
+/* the sync_buff list head and the lock */
+static LIST_HEAD(ip_vs_sync_queue);
+static DEFINE_SPINLOCK(ip_vs_sync_lock);
+
+/* current sync_buff for accepting new conn entries */
+static struct ip_vs_sync_buff *curr_sb = NULL;
+static DEFINE_SPINLOCK(curr_sb_lock);
+
+/* ipvs sync daemon state */
+volatile int ip_vs_sync_state = IP_VS_STATE_NONE;
+volatile int ip_vs_master_syncid = 0;
+volatile int ip_vs_backup_syncid = 0;
+
+/* multicast interface name */
+char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
+char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
+
+/* multicast addr */
+static struct sockaddr_in mcast_addr;
+
+
+static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
+{
+ spin_lock(&ip_vs_sync_lock);
+ list_add_tail(&sb->list, &ip_vs_sync_queue);
+ spin_unlock(&ip_vs_sync_lock);
+}
+
+static inline struct ip_vs_sync_buff * sb_dequeue(void)
+{
+ struct ip_vs_sync_buff *sb;
+
+ spin_lock_bh(&ip_vs_sync_lock);
+ if (list_empty(&ip_vs_sync_queue)) {
+ sb = NULL;
+ } else {
+ sb = list_entry(ip_vs_sync_queue.next,
+ struct ip_vs_sync_buff,
+ list);
+ list_del(&sb->list);
+ }
+ spin_unlock_bh(&ip_vs_sync_lock);
+
+ return sb;
+}
+
+static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
+{
+ struct ip_vs_sync_buff *sb;
+
+ if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
+ return NULL;
+
+ if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
+ kfree(sb);
+ return NULL;
+ }
+ sb->mesg->nr_conns = 0;
+ sb->mesg->syncid = ip_vs_master_syncid;
+ sb->mesg->size = 4;
+ sb->head = (unsigned char *)sb->mesg + 4;
+ sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
+ sb->firstuse = jiffies;
+ return sb;
+}
+
+static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
+{
+ kfree(sb->mesg);
+ kfree(sb);
+}
+
+/*
+ * Get the current sync buffer if it has been created for more
+ * than the specified time or the specified time is zero.
+ */
+static inline struct ip_vs_sync_buff *
+get_curr_sync_buff(unsigned long time)
+{
+ struct ip_vs_sync_buff *sb;
+
+ spin_lock_bh(&curr_sb_lock);
+ if (curr_sb && (time == 0 ||
+ time_before(jiffies - curr_sb->firstuse, time))) {
+ sb = curr_sb;
+ curr_sb = NULL;
+ } else
+ sb = NULL;
+ spin_unlock_bh(&curr_sb_lock);
+ return sb;
+}
+
+
+/*
+ *      Add ip_vs_conn information to the current sync_buff.
+ * Called by ip_vs_in.
+ */
+void ip_vs_sync_conn(struct ip_vs_conn *cp)
+{
+ struct ip_vs_sync_mesg *m;
+ struct ip_vs_sync_conn *s;
+ int len;
+
+ spin_lock(&curr_sb_lock);
+ if (!curr_sb) {
+ if (!(curr_sb=ip_vs_sync_buff_create())) {
+ spin_unlock(&curr_sb_lock);
+ IP_VS_ERR("ip_vs_sync_buff_create failed.\n");
+ return;
+ }
+ }
+
+ len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
+ SIMPLE_CONN_SIZE;
+ m = curr_sb->mesg;
+ s = (struct ip_vs_sync_conn *)curr_sb->head;
+
+ /* copy members */
+ s->protocol = cp->protocol;
+ s->cport = cp->cport;
+ s->vport = cp->vport;
+ s->dport = cp->dport;
+ s->caddr = cp->caddr;
+ s->vaddr = cp->vaddr;
+ s->daddr = cp->daddr;
+ s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
+ s->state = htons(cp->state);
+ if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
+ struct ip_vs_sync_conn_options *opt =
+ (struct ip_vs_sync_conn_options *)&s[1];
+ memcpy(opt, &cp->in_seq, sizeof(*opt));
+ }
+
+ m->nr_conns++;
+ m->size += len;
+ curr_sb->head += len;
+
+	/* check if there is space for the next one */
+ if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) {
+ sb_queue_tail(curr_sb);
+ curr_sb = NULL;
+ }
+ spin_unlock(&curr_sb_lock);
+
+	/* synchronize its controller if it has one */
+ if (cp->control)
+ ip_vs_sync_conn(cp->control);
+}
+
+
+/*
+ * Process received multicast message and create the corresponding
+ * ip_vs_conn entries.
+ */
+static void ip_vs_process_message(const char *buffer, const size_t buflen)
+{
+ struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
+ struct ip_vs_sync_conn *s;
+ struct ip_vs_sync_conn_options *opt;
+ struct ip_vs_conn *cp;
+ char *p;
+ int i;
+
+ /* Convert size back to host byte order */
+ m->size = ntohs(m->size);
+
+ if (buflen != m->size) {
+ IP_VS_ERR("bogus message\n");
+ return;
+ }
+
+ /* SyncID sanity check */
+ if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
+ IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n",
+ m->syncid);
+ return;
+ }
+
+ p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
+ for (i=0; i<m->nr_conns; i++) {
+ s = (struct ip_vs_sync_conn *)p;
+ cp = ip_vs_conn_in_get(s->protocol,
+ s->caddr, s->cport,
+ s->vaddr, s->vport);
+ if (!cp) {
+ cp = ip_vs_conn_new(s->protocol,
+ s->caddr, s->cport,
+ s->vaddr, s->vport,
+ s->daddr, s->dport,
+ ntohs(s->flags), NULL);
+ if (!cp) {
+ IP_VS_ERR("ip_vs_conn_new failed\n");
+ return;
+ }
+ cp->state = ntohs(s->state);
+ } else if (!cp->dest) {
+ /* it is an entry created by the synchronization */
+ cp->state = ntohs(s->state);
+ cp->flags = ntohs(s->flags) | IP_VS_CONN_F_HASHED;
+ } /* Note that we don't touch its state and flags
+ if it is a normal entry. */
+
+ if (ntohs(s->flags) & IP_VS_CONN_F_SEQ_MASK) {
+ opt = (struct ip_vs_sync_conn_options *)&s[1];
+ memcpy(&cp->in_seq, opt, sizeof(*opt));
+ p += FULL_CONN_SIZE;
+ } else
+ p += SIMPLE_CONN_SIZE;
+
+ atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
+ cp->timeout = IP_VS_SYNC_CONN_TIMEOUT;
+ ip_vs_conn_put(cp);
+
+ if (p > buffer+buflen) {
+ IP_VS_ERR("bogus message\n");
+ return;
+ }
+ }
+}
+
+
+/*
+ *      Set up loopback of outgoing multicasts on a sending socket
+ */
+static void set_mcast_loop(struct sock *sk, u_char loop)
+{
+ struct inet_sock *inet = inet_sk(sk);
+
+ /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
+ lock_sock(sk);
+ inet->mc_loop = loop ? 1 : 0;
+ release_sock(sk);
+}
+
+/*
+ * Specify TTL for outgoing multicasts on a sending socket
+ */
+static void set_mcast_ttl(struct sock *sk, u_char ttl)
+{
+ struct inet_sock *inet = inet_sk(sk);
+
+ /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
+ lock_sock(sk);
+ inet->mc_ttl = ttl;
+ release_sock(sk);
+}
+
+/*
+ *      Specify the default interface for outgoing multicasts
+ */
+static int set_mcast_if(struct sock *sk, char *ifname)
+{
+ struct net_device *dev;
+ struct inet_sock *inet = inet_sk(sk);
+
+ if ((dev = __dev_get_by_name(ifname)) == NULL)
+ return -ENODEV;
+
+ if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
+ return -EINVAL;
+
+ lock_sock(sk);
+ inet->mc_index = dev->ifindex;
+ /* inet->mc_addr = 0; */
+ release_sock(sk);
+
+ return 0;
+}
+
+
+/*
+ * Set the maximum length of sync message according to the
+ * specified interface's MTU.
+ */
+static int set_sync_mesg_maxlen(int sync_state)
+{
+ struct net_device *dev;
+ int num;
+
+ if (sync_state == IP_VS_STATE_MASTER) {
+ if ((dev = __dev_get_by_name(ip_vs_master_mcast_ifn)) == NULL)
+ return -ENODEV;
+
+ num = (dev->mtu - sizeof(struct iphdr) -
+ sizeof(struct udphdr) -
+ SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
+ sync_send_mesg_maxlen =
+ SYNC_MESG_HEADER_LEN + SIMPLE_CONN_SIZE * num;
+ IP_VS_DBG(7, "setting the maximum length of sync sending "
+ "message %d.\n", sync_send_mesg_maxlen);
+ } else if (sync_state == IP_VS_STATE_BACKUP) {
+ if ((dev = __dev_get_by_name(ip_vs_backup_mcast_ifn)) == NULL)
+ return -ENODEV;
+
+ sync_recv_mesg_maxlen = dev->mtu -
+ sizeof(struct iphdr) - sizeof(struct udphdr);
+ IP_VS_DBG(7, "setting the maximum length of sync receiving "
+ "message %d.\n", sync_recv_mesg_maxlen);
+ }
+
+ return 0;
+}
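+
+/*
+ * Worked example (illustrative, assuming a 1500 byte MTU and a 24 byte
+ * struct ip_vs_sync_conn): num = (1500 - 20 - 8 - 4 - 20) / 24 = 60,
+ * so sync_send_mesg_maxlen = 4 + 24*60 = 1444 bytes, which leaves room
+ * for the IP and UDP headers plus a 20 byte safety margin.
+ */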
+
+
+/*
+ * Join a multicast group.
+ *      The group is specified by a class D multicast address (224.0.0.0/4)
+ * in the in_addr structure passed in as a parameter.
+ */
+static int
+join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
+{
+ struct ip_mreqn mreq;
+ struct net_device *dev;
+ int ret;
+
+ memset(&mreq, 0, sizeof(mreq));
+ memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
+
+ if ((dev = __dev_get_by_name(ifname)) == NULL)
+ return -ENODEV;
+ if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
+ return -EINVAL;
+
+ mreq.imr_ifindex = dev->ifindex;
+
+ lock_sock(sk);
+ ret = ip_mc_join_group(sk, &mreq);
+ release_sock(sk);
+
+ return ret;
+}
+
+
+static int bind_mcastif_addr(struct socket *sock, char *ifname)
+{
+ struct net_device *dev;
+ u32 addr;
+ struct sockaddr_in sin;
+
+ if ((dev = __dev_get_by_name(ifname)) == NULL)
+ return -ENODEV;
+
+ addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
+ if (!addr)
+		IP_VS_ERR("You probably need to specify an IP address on "
+			  "the multicast interface.\n");
+
+ IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n",
+ ifname, NIPQUAD(addr));
+
+ /* Now bind the socket with the address of multicast interface */
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = addr;
+ sin.sin_port = 0;
+
+ return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
+}
+
+/*
+ * Set up sending multicast socket over UDP
+ */
+static struct socket * make_send_sock(void)
+{
+ struct socket *sock;
+
+ /* First create a socket */
+ if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) {
+ IP_VS_ERR("Error during creation of socket; terminating\n");
+ return NULL;
+ }
+
+ if (set_mcast_if(sock->sk, ip_vs_master_mcast_ifn) < 0) {
+ IP_VS_ERR("Error setting outbound mcast interface\n");
+ goto error;
+ }
+
+ set_mcast_loop(sock->sk, 0);
+ set_mcast_ttl(sock->sk, 1);
+
+ if (bind_mcastif_addr(sock, ip_vs_master_mcast_ifn) < 0) {
+ IP_VS_ERR("Error binding address of the mcast interface\n");
+ goto error;
+ }
+
+ if (sock->ops->connect(sock,
+ (struct sockaddr*)&mcast_addr,
+ sizeof(struct sockaddr), 0) < 0) {
+ IP_VS_ERR("Error connecting to the multicast addr\n");
+ goto error;
+ }
+
+ return sock;
+
+ error:
+ sock_release(sock);
+ return NULL;
+}
+
+
+/*
+ * Set up receiving multicast socket over UDP
+ */
+static struct socket * make_receive_sock(void)
+{
+ struct socket *sock;
+
+ /* First create a socket */
+ if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) {
+ IP_VS_ERR("Error during creation of socket; terminating\n");
+ return NULL;
+ }
+
+	/* it is equivalent to the SO_REUSEADDR option in user-space */
+ sock->sk->sk_reuse = 1;
+
+ if (sock->ops->bind(sock,
+ (struct sockaddr*)&mcast_addr,
+ sizeof(struct sockaddr)) < 0) {
+ IP_VS_ERR("Error binding to the multicast addr\n");
+ goto error;
+ }
+
+ /* join the multicast group */
+ if (join_mcast_group(sock->sk,
+ (struct in_addr*)&mcast_addr.sin_addr,
+ ip_vs_backup_mcast_ifn) < 0) {
+ IP_VS_ERR("Error joining to the multicast group\n");
+ goto error;
+ }
+
+ return sock;
+
+ error:
+ sock_release(sock);
+ return NULL;
+}
+
+
+static int
+ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
+{
+ struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
+ struct kvec iov;
+ int len;
+
+ EnterFunction(7);
+ iov.iov_base = (void *)buffer;
+ iov.iov_len = length;
+
+ len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
+
+ LeaveFunction(7);
+ return len;
+}
+
+static void
+ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
+{
+ int msize;
+
+ msize = msg->size;
+
+ /* Put size in network byte order */
+ msg->size = htons(msg->size);
+
+ if (ip_vs_send_async(sock, (char *)msg, msize) != msize)
+ IP_VS_ERR("ip_vs_send_async error\n");
+}
+
+static int
+ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
+{
+ struct msghdr msg = {NULL,};
+ struct kvec iov;
+ int len;
+
+ EnterFunction(7);
+
+ /* Receive a packet */
+ iov.iov_base = buffer;
+ iov.iov_len = (size_t)buflen;
+
+ len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0);
+
+ if (len < 0)
+ return -1;
+
+ LeaveFunction(7);
+ return len;
+}
+
+
+static DECLARE_WAIT_QUEUE_HEAD(sync_wait);
+static pid_t sync_master_pid = 0;
+static pid_t sync_backup_pid = 0;
+
+static DECLARE_WAIT_QUEUE_HEAD(stop_sync_wait);
+static int stop_master_sync = 0;
+static int stop_backup_sync = 0;
+
+static void sync_master_loop(void)
+{
+ struct socket *sock;
+ struct ip_vs_sync_buff *sb;
+
+ /* create the sending multicast socket */
+ sock = make_send_sock();
+ if (!sock)
+ return;
+
+ IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, "
+ "syncid = %d\n",
+ ip_vs_master_mcast_ifn, ip_vs_master_syncid);
+
+ for (;;) {
+ while ((sb=sb_dequeue())) {
+ ip_vs_send_sync_msg(sock, sb->mesg);
+ ip_vs_sync_buff_release(sb);
+ }
+
+ /* check if entries stay in curr_sb for 2 seconds */
+ if ((sb = get_curr_sync_buff(2*HZ))) {
+ ip_vs_send_sync_msg(sock, sb->mesg);
+ ip_vs_sync_buff_release(sb);
+ }
+
+ if (stop_master_sync)
+ break;
+
+ ssleep(1);
+ }
+
+ /* clean up the sync_buff queue */
+ while ((sb=sb_dequeue())) {
+ ip_vs_sync_buff_release(sb);
+ }
+
+ /* clean up the current sync_buff */
+ if ((sb = get_curr_sync_buff(0))) {
+ ip_vs_sync_buff_release(sb);
+ }
+
+ /* release the sending multicast socket */
+ sock_release(sock);
+}
+
+
+static void sync_backup_loop(void)
+{
+ struct socket *sock;
+ char *buf;
+ int len;
+
+ if (!(buf = kmalloc(sync_recv_mesg_maxlen, GFP_ATOMIC))) {
+ IP_VS_ERR("sync_backup_loop: kmalloc error\n");
+ return;
+ }
+
+ /* create the receiving multicast socket */
+ sock = make_receive_sock();
+ if (!sock)
+ goto out;
+
+ IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, "
+ "syncid = %d\n",
+ ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);
+
+ for (;;) {
+ /* do you have data now? */
+ while (!skb_queue_empty(&(sock->sk->sk_receive_queue))) {
+ if ((len =
+ ip_vs_receive(sock, buf,
+ sync_recv_mesg_maxlen)) <= 0) {
+ IP_VS_ERR("receiving message error\n");
+ break;
+ }
+			/* disable bottom halves, because data shared
+			   with softirq is accessed while getting/creating
+			   conns */
+ local_bh_disable();
+ ip_vs_process_message(buf, len);
+ local_bh_enable();
+ }
+
+ if (stop_backup_sync)
+ break;
+
+ ssleep(1);
+ }
+
+ /* release the sending multicast socket */
+ sock_release(sock);
+
+ out:
+ kfree(buf);
+}
+
+
+static void set_sync_pid(int sync_state, pid_t sync_pid)
+{
+ if (sync_state == IP_VS_STATE_MASTER)
+ sync_master_pid = sync_pid;
+ else if (sync_state == IP_VS_STATE_BACKUP)
+ sync_backup_pid = sync_pid;
+}
+
+static void set_stop_sync(int sync_state, int set)
+{
+ if (sync_state == IP_VS_STATE_MASTER)
+ stop_master_sync = set;
+ else if (sync_state == IP_VS_STATE_BACKUP)
+ stop_backup_sync = set;
+ else {
+ stop_master_sync = set;
+ stop_backup_sync = set;
+ }
+}
+
+static int sync_thread(void *startup)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ mm_segment_t oldmm;
+ int state;
+ const char *name;
+
+ /* increase the module use count */
+ ip_vs_use_count_inc();
+
+ if (ip_vs_sync_state & IP_VS_STATE_MASTER && !sync_master_pid) {
+ state = IP_VS_STATE_MASTER;
+ name = "ipvs_syncmaster";
+ } else if (ip_vs_sync_state & IP_VS_STATE_BACKUP && !sync_backup_pid) {
+ state = IP_VS_STATE_BACKUP;
+ name = "ipvs_syncbackup";
+ } else {
+ IP_VS_BUG();
+ ip_vs_use_count_dec();
+ return -EINVAL;
+ }
+
+ daemonize(name);
+
+ oldmm = get_fs();
+ set_fs(KERNEL_DS);
+
+ /* Block all signals */
+ spin_lock_irq(&current->sighand->siglock);
+ siginitsetinv(&current->blocked, 0);
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+
+ /* set the maximum length of sync message */
+ set_sync_mesg_maxlen(state);
+
+ /* set up multicast address */
+ mcast_addr.sin_family = AF_INET;
+ mcast_addr.sin_port = htons(IP_VS_SYNC_PORT);
+ mcast_addr.sin_addr.s_addr = htonl(IP_VS_SYNC_GROUP);
+
+ add_wait_queue(&sync_wait, &wait);
+
+ set_sync_pid(state, current->pid);
+ complete((struct completion *)startup);
+
+ /* processing master/backup loop here */
+ if (state == IP_VS_STATE_MASTER)
+ sync_master_loop();
+ else if (state == IP_VS_STATE_BACKUP)
+ sync_backup_loop();
+ else IP_VS_BUG();
+
+ remove_wait_queue(&sync_wait, &wait);
+
+ /* thread exits */
+ set_sync_pid(state, 0);
+ IP_VS_INFO("sync thread stopped!\n");
+
+ set_fs(oldmm);
+
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+
+ set_stop_sync(state, 0);
+ wake_up(&stop_sync_wait);
+
+ return 0;
+}
+
+
+static int fork_sync_thread(void *startup)
+{
+ pid_t pid;
+
+	/* fork the sync thread here, so that the parent of the sync
+	   thread becomes the init process after this thread exits. */
+ repeat:
+ if ((pid = kernel_thread(sync_thread, startup, 0)) < 0) {
+ IP_VS_ERR("could not create sync_thread due to %d... "
+ "retrying.\n", pid);
+ ssleep(1);
+ goto repeat;
+ }
+
+ return 0;
+}
+
+
+int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
+{
+ DECLARE_COMPLETION(startup);
+ pid_t pid;
+
+ if ((state == IP_VS_STATE_MASTER && sync_master_pid) ||
+ (state == IP_VS_STATE_BACKUP && sync_backup_pid))
+ return -EEXIST;
+
+ IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid);
+	IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
+ sizeof(struct ip_vs_sync_conn));
+
+ ip_vs_sync_state |= state;
+ if (state == IP_VS_STATE_MASTER) {
+ strcpy(ip_vs_master_mcast_ifn, mcast_ifn);
+ ip_vs_master_syncid = syncid;
+ } else {
+ strcpy(ip_vs_backup_mcast_ifn, mcast_ifn);
+ ip_vs_backup_syncid = syncid;
+ }
+
+ repeat:
+ if ((pid = kernel_thread(fork_sync_thread, &startup, 0)) < 0) {
+ IP_VS_ERR("could not create fork_sync_thread due to %d... "
+ "retrying.\n", pid);
+ ssleep(1);
+ goto repeat;
+ }
+
+ wait_for_completion(&startup);
+
+ return 0;
+}
+
+
+int stop_sync_thread(int state)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ if ((state == IP_VS_STATE_MASTER && !sync_master_pid) ||
+ (state == IP_VS_STATE_BACKUP && !sync_backup_pid))
+ return -ESRCH;
+
+ IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid);
+ IP_VS_INFO("stopping sync thread %d ...\n",
+ (state == IP_VS_STATE_MASTER) ? sync_master_pid : sync_backup_pid);
+
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ add_wait_queue(&stop_sync_wait, &wait);
+ set_stop_sync(state, 1);
+ ip_vs_sync_state -= state;
+ wake_up(&sync_wait);
+ schedule();
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&stop_sync_wait, &wait);
+
+ /* Note: no need to reap the sync thread, because its parent
+ process is the init process */
+
+ if ((state == IP_VS_STATE_MASTER && stop_master_sync) ||
+ (state == IP_VS_STATE_BACKUP && stop_backup_sync))
+ IP_VS_BUG();
+
+ return 0;
+}
diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c
new file mode 100644
index 00000000000..8a9d913261d
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_wlc.c
@@ -0,0 +1,151 @@
+/*
+ * IPVS: Weighted Least-Connection Scheduling module
+ *
+ * Version: $Id: ip_vs_wlc.c,v 1.13 2003/04/18 09:03:16 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Peter Kese <peter.kese@ijs.si>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest
+ * Wensong Zhang : changed to use the inactconns in scheduling
+ *     Wensong Zhang            :     changed some cosmetic things for debugging
+ * Wensong Zhang : changed for the d-linked destination list
+ * Wensong Zhang : added the ip_vs_wlc_update_svc
+ * Wensong Zhang : added any dest with weight=0 is quiesced
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+static int
+ip_vs_wlc_init_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static int
+ip_vs_wlc_done_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static int
+ip_vs_wlc_update_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static inline unsigned int
+ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest)
+{
+ /*
+ * We think the overhead of processing active connections is 256
+	 * times higher than that of inactive connections on average. (This
+ * 256 times might not be accurate, we will change it later) We
+ * use the following formula to estimate the overhead now:
+ * dest->activeconns*256 + dest->inactconns
+ */
+ return (atomic_read(&dest->activeconns) << 8) +
+ atomic_read(&dest->inactconns);
+}
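+
+/*
+ * Example (illustrative, not from the original source): a destination
+ * with 3 active and 10 inactive connections is charged an overhead of
+ * 3*256 + 10 = 778.
+ */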
+
+
+/*
+ * Weighted Least Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+ struct ip_vs_dest *dest, *least;
+ unsigned int loh, doh;
+
+ IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n");
+
+ /*
+ * We calculate the load of each dest server as follows:
+ * (dest overhead) / dest->weight
+ *
+ * Remember -- no floats in kernel mode!!!
+ * The comparison of h1*w2 > h2*w1 is equivalent to that of
+ * h1/w1 > h2/w2
+ * if every weight is larger than zero.
+ *
+ * The server with weight=0 is quiesced and will not receive any
+ * new connections.
+ */
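+	/*
+	 * Illustrative example (not from the original source): suppose
+	 * the current pick has loh = 778 with weight 2 and a candidate
+	 * has doh = 500 with weight 1.  Then 778*1 = 778 is not greater
+	 * than 500*2 = 1000, so the current pick is kept: it carries
+	 * 389 overhead per unit of weight versus 500 for the candidate.
+	 */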
+
+ list_for_each_entry(dest, &svc->destinations, n_list) {
+ if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+ atomic_read(&dest->weight) > 0) {
+ least = dest;
+ loh = ip_vs_wlc_dest_overhead(least);
+ goto nextstage;
+ }
+ }
+ return NULL;
+
+ /*
+ * Find the destination with the least load.
+ */
+ nextstage:
+ list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+ doh = ip_vs_wlc_dest_overhead(dest);
+ if (loh * atomic_read(&dest->weight) >
+ doh * atomic_read(&least->weight)) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ IP_VS_DBG(6, "WLC: server %u.%u.%u.%u:%u "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ NIPQUAD(least->addr), ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight), loh);
+
+ return least;
+}
+
+
+static struct ip_vs_scheduler ip_vs_wlc_scheduler =
+{
+ .name = "wlc",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .init_service = ip_vs_wlc_init_svc,
+ .done_service = ip_vs_wlc_done_svc,
+ .update_service = ip_vs_wlc_update_svc,
+ .schedule = ip_vs_wlc_schedule,
+};
+
+
+static int __init ip_vs_wlc_init(void)
+{
+ INIT_LIST_HEAD(&ip_vs_wlc_scheduler.n_list);
+ return register_ip_vs_scheduler(&ip_vs_wlc_scheduler);
+}
+
+static void __exit ip_vs_wlc_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler);
+}
+
+module_init(ip_vs_wlc_init);
+module_exit(ip_vs_wlc_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c
new file mode 100644
index 00000000000..749fa044eca
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_wrr.c
@@ -0,0 +1,235 @@
+/*
+ * IPVS: Weighted Round-Robin Scheduling module
+ *
+ * Version: $Id: ip_vs_wrr.c,v 1.12 2002/09/15 08:14:08 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest
+ *     Wensong Zhang            :     changed some cosmetic things for debugging
+ * Wensong Zhang : changed for the d-linked destination list
+ * Wensong Zhang : added the ip_vs_wrr_update_svc
+ * Julian Anastasov : fixed the bug of returning destination
+ * with weight 0 when all weights are zero
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+/*
+ * current destination pointer for weighted round-robin scheduling
+ */
+struct ip_vs_wrr_mark {
+ struct list_head *cl; /* current list head */
+ int cw; /* current weight */
+ int mw; /* maximum weight */
+ int di; /* decreasing interval */
+};
+
+
+/*
+ * Get the gcd of server weights
+ */
+static int gcd(int a, int b)
+{
+ int c;
+
+ while ((c = a % b)) {
+ a = b;
+ b = c;
+ }
+ return b;
+}
+
+static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc)
+{
+ struct ip_vs_dest *dest;
+ int weight;
+ int g = 0;
+
+ list_for_each_entry(dest, &svc->destinations, n_list) {
+ weight = atomic_read(&dest->weight);
+ if (weight > 0) {
+ if (g > 0)
+ g = gcd(weight, g);
+ else
+ g = weight;
+ }
+ }
+ return g ? g : 1;
+}
+
+
+/*
+ * Get the maximum weight of the service destinations.
+ */
+static int ip_vs_wrr_max_weight(struct ip_vs_service *svc)
+{
+ struct ip_vs_dest *dest;
+ int weight = 0;
+
+ list_for_each_entry(dest, &svc->destinations, n_list) {
+ if (atomic_read(&dest->weight) > weight)
+ weight = atomic_read(&dest->weight);
+ }
+
+ return weight;
+}
+
+
+static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_wrr_mark *mark;
+
+ /*
+ * Allocate the mark variable for WRR scheduling
+ */
+ mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC);
+ if (mark == NULL) {
+ IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n");
+ return -ENOMEM;
+ }
+ mark->cl = &svc->destinations;
+ mark->cw = 0;
+ mark->mw = ip_vs_wrr_max_weight(svc);
+ mark->di = ip_vs_wrr_gcd_weight(svc);
+ svc->sched_data = mark;
+
+ return 0;
+}
+
+
+static int ip_vs_wrr_done_svc(struct ip_vs_service *svc)
+{
+ /*
+ * Release the mark variable
+ */
+ kfree(svc->sched_data);
+
+ return 0;
+}
+
+
+static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_wrr_mark *mark = svc->sched_data;
+
+ mark->cl = &svc->destinations;
+ mark->mw = ip_vs_wrr_max_weight(svc);
+ mark->di = ip_vs_wrr_gcd_weight(svc);
+ if (mark->cw > mark->mw)
+ mark->cw = 0;
+ return 0;
+}
+
+
+/*
+ * Weighted Round-Robin Scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+ struct ip_vs_dest *dest;
+ struct ip_vs_wrr_mark *mark = svc->sched_data;
+ struct list_head *p;
+
+ IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n");
+
+ /*
+	 * This loop will always terminate, because mark->cw is in (0, max_weight]
+ * and at least one server has its weight equal to max_weight.
+ */
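+	/*
+	 * Illustrative example (not from the original source): with
+	 * destinations A, B, C of weights 4, 3 and 2, di = gcd = 1 and
+	 * mw = 4, so successive calls pick A | A B | A B C | A B C as
+	 * cw steps through 4, 3, 2, 1 -- 4 A's, 3 B's and 2 C's per cycle.
+	 */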
+ write_lock(&svc->sched_lock);
+ p = mark->cl;
+ while (1) {
+ if (mark->cl == &svc->destinations) {
+ /* it is at the head of the destination list */
+
+ if (mark->cl == mark->cl->next) {
+ /* no dest entry */
+ dest = NULL;
+ goto out;
+ }
+
+ mark->cl = svc->destinations.next;
+ mark->cw -= mark->di;
+ if (mark->cw <= 0) {
+ mark->cw = mark->mw;
+ /*
+ * Still zero, which means no available servers.
+ */
+ if (mark->cw == 0) {
+ mark->cl = &svc->destinations;
+ IP_VS_INFO("ip_vs_wrr_schedule(): "
+ "no available servers\n");
+ dest = NULL;
+ goto out;
+ }
+ }
+ } else
+ mark->cl = mark->cl->next;
+
+ if (mark->cl != &svc->destinations) {
+ /* not at the head of the list */
+ dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
+ if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+ atomic_read(&dest->weight) >= mark->cw) {
+ /* got it */
+ break;
+ }
+ }
+
+ if (mark->cl == p && mark->cw == mark->di) {
+ /* back to the start, and no dest is found.
+ It is only possible when all dests are OVERLOADED */
+ dest = NULL;
+ goto out;
+ }
+ }
+
+ IP_VS_DBG(6, "WRR: server %u.%u.%u.%u:%u "
+ "activeconns %d refcnt %d weight %d\n",
+ NIPQUAD(dest->addr), ntohs(dest->port),
+ atomic_read(&dest->activeconns),
+ atomic_read(&dest->refcnt),
+ atomic_read(&dest->weight));
+
+ out:
+ write_unlock(&svc->sched_lock);
+ return dest;
+}
+
+
+static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
+ .name = "wrr",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .init_service = ip_vs_wrr_init_svc,
+ .done_service = ip_vs_wrr_done_svc,
+ .update_service = ip_vs_wrr_update_svc,
+ .schedule = ip_vs_wrr_schedule,
+};
+
+static int __init ip_vs_wrr_init(void)
+{
+ INIT_LIST_HEAD(&ip_vs_wrr_scheduler.n_list);
+	return register_ip_vs_scheduler(&ip_vs_wrr_scheduler);
+}
+
+static void __exit ip_vs_wrr_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler);
+}
+
+module_init(ip_vs_wrr_init);
+module_exit(ip_vs_wrr_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c
new file mode 100644
index 00000000000..faa6176bbeb
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_xmit.c
@@ -0,0 +1,563 @@
+/*
+ * ip_vs_xmit.c: various packet transmitters for IPVS
+ *
+ * Version: $Id: ip_vs_xmit.c,v 1.2 2002/11/30 01:50:35 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/ip.h>
+#include <linux/tcp.h> /* for tcphdr */
+#include <net/tcp.h> /* for csum_tcpudp_magic */
+#include <net/udp.h>
+#include <net/icmp.h> /* for icmp_send */
+#include <net/route.h> /* for ip_route_output */
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * Destination cache to speed up outgoing route lookup
+ */
+static inline void
+__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
+{
+ struct dst_entry *old_dst;
+
+ old_dst = dest->dst_cache;
+ dest->dst_cache = dst;
+ dest->dst_rtos = rtos;
+ dst_release(old_dst);
+}
+
+static inline struct dst_entry *
+__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
+{
+ struct dst_entry *dst = dest->dst_cache;
+
+ if (!dst)
+ return NULL;
+ if ((dst->obsolete || rtos != dest->dst_rtos) &&
+ dst->ops->check(dst, cookie) == NULL) {
+ dest->dst_cache = NULL;
+ dst_release(dst);
+ return NULL;
+ }
+ dst_hold(dst);
+ return dst;
+}
+
+static inline struct rtable *
+__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
+{
+ struct rtable *rt; /* Route to the other host */
+ struct ip_vs_dest *dest = cp->dest;
+
+ if (dest) {
+ spin_lock(&dest->dst_lock);
+ if (!(rt = (struct rtable *)
+ __ip_vs_dst_check(dest, rtos, 0))) {
+ struct flowi fl = {
+ .oif = 0,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = dest->addr,
+ .saddr = 0,
+ .tos = rtos, } },
+ };
+
+ if (ip_route_output_key(&rt, &fl)) {
+ spin_unlock(&dest->dst_lock);
+ IP_VS_DBG_RL("ip_route_output error, "
+ "dest: %u.%u.%u.%u\n",
+ NIPQUAD(dest->addr));
+ return NULL;
+ }
+ __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst));
+ IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n",
+ NIPQUAD(dest->addr),
+ atomic_read(&rt->u.dst.__refcnt), rtos);
+ }
+ spin_unlock(&dest->dst_lock);
+ } else {
+ struct flowi fl = {
+ .oif = 0,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = cp->daddr,
+ .saddr = 0,
+ .tos = rtos, } },
+ };
+
+ if (ip_route_output_key(&rt, &fl)) {
+ IP_VS_DBG_RL("ip_route_output error, dest: "
+ "%u.%u.%u.%u\n", NIPQUAD(cp->daddr));
+ return NULL;
+ }
+ }
+
+ return rt;
+}
+
+
+/*
+ * Release dest->dst_cache before a dest is removed
+ */
+void
+ip_vs_dst_reset(struct ip_vs_dest *dest)
+{
+ struct dst_entry *old_dst;
+
+ old_dst = dest->dst_cache;
+ dest->dst_cache = NULL;
+ dst_release(old_dst);
+}
+
+#define IP_VS_XMIT(skb, rt) \
+do { \
+ nf_reset_debug(skb); \
+ (skb)->nfcache |= NFC_IPVS_PROPERTY; \
+ (skb)->ip_summed = CHECKSUM_NONE; \
+ NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \
+ (rt)->u.dst.dev, dst_output); \
+} while (0)
+
+
+/*
+ * NULL transmitter (do nothing except return NF_ACCEPT)
+ */
+int
+ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp)
+{
+ /* we do not touch skb and do not need pskb ptr */
+ return NF_ACCEPT;
+}
+
+
+/*
+ * Bypass transmitter
+ *      Let packets bypass the destination when the destination is not
+ *      available; this is typically only used in a transparent cache cluster.
+ */
+int
+ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp)
+{
+ struct rtable *rt; /* Route to the other host */
+ struct iphdr *iph = skb->nh.iph;
+ u8 tos = iph->tos;
+ int mtu;
+ struct flowi fl = {
+ .oif = 0,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = iph->daddr,
+ .saddr = 0,
+ .tos = RT_TOS(tos), } },
+ };
+
+ EnterFunction(10);
+
+ if (ip_route_output_key(&rt, &fl)) {
+ IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
+ "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
+ goto tx_error_icmp;
+ }
+
+ /* MTU checking */
+ mtu = dst_mtu(&rt->u.dst);
+ if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
+ ip_rt_put(rt);
+ icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+ IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
+ goto tx_error;
+ }
+
+ /*
+ * Call ip_send_check because we are not sure it is called
+ * after ip_defrag. Is copy-on-write needed?
+ */
+ if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
+ ip_rt_put(rt);
+ return NF_STOLEN;
+ }
+ ip_send_check(skb->nh.iph);
+
+ /* drop old route */
+ dst_release(skb->dst);
+ skb->dst = &rt->u.dst;
+
+ /* Another hack: avoid icmp_send in ip_fragment */
+ skb->local_df = 1;
+
+ IP_VS_XMIT(skb, rt);
+
+ LeaveFunction(10);
+ return NF_STOLEN;
+
+ tx_error_icmp:
+ dst_link_failure(skb);
+ tx_error:
+ kfree_skb(skb);
+ LeaveFunction(10);
+ return NF_STOLEN;
+}
+
+
+/*
+ * NAT transmitter (only for outside-to-inside nat forwarding)
+ * Not used for related ICMP
+ */
+int
+ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp)
+{
+ struct rtable *rt; /* Route to the other host */
+ int mtu;
+ struct iphdr *iph = skb->nh.iph;
+
+ EnterFunction(10);
+
+ /* check if it is a connection of no-client-port */
+ if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
+ __u16 _pt, *p;
+ p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
+ if (p == NULL)
+ goto tx_error;
+ ip_vs_conn_fill_cport(cp, *p);
+ IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
+ }
+
+ if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+ goto tx_error_icmp;
+
+ /* MTU checking */
+ mtu = dst_mtu(&rt->u.dst);
+ if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
+ ip_rt_put(rt);
+ icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+ IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
+ goto tx_error;
+ }
+
+ /* copy-on-write the packet before mangling it */
+ if (!ip_vs_make_skb_writable(&skb, sizeof(struct iphdr)))
+ goto tx_error_put;
+
+ if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
+ goto tx_error_put;
+
+ /* drop old route */
+ dst_release(skb->dst);
+ skb->dst = &rt->u.dst;
+
+ /* mangle the packet */
+ if (pp->dnat_handler && !pp->dnat_handler(&skb, pp, cp))
+ goto tx_error;
+ skb->nh.iph->daddr = cp->daddr;
+ ip_send_check(skb->nh.iph);
+
+ IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
+
+ /* FIXME: when application helper enlarges the packet and the length
+ is larger than the MTU of outgoing device, there will be still
+ MTU problem. */
+
+ /* Another hack: avoid icmp_send in ip_fragment */
+ skb->local_df = 1;
+
+ IP_VS_XMIT(skb, rt);
+
+ LeaveFunction(10);
+ return NF_STOLEN;
+
+ tx_error_icmp:
+ dst_link_failure(skb);
+ tx_error:
+ LeaveFunction(10);
+ kfree_skb(skb);
+ return NF_STOLEN;
+ tx_error_put:
+ ip_rt_put(rt);
+ goto tx_error;
+}
+
+
+/*
+ * IP Tunneling transmitter
+ *
+ * This function encapsulates the packet in a new IP packet, its
+ * destination will be set to cp->daddr. Most code of this function
+ * is taken from ipip.c.
+ *
+ * It is used in VS/TUN cluster. The load balancer selects a real
+ * server from a cluster based on a scheduling algorithm,
+ * encapsulates the request packet and forwards it to the selected
+ * server. For example, all real servers are configured with
+ * "ifconfig tunl0 <Virtual IP Address> up". When the server receives
+ *   the encapsulated packet, it will decapsulate the packet, process
+ *   the request and return the response packets directly to the client
+ *   without passing through the load balancer. This can greatly increase
+ *   the scalability of the virtual server.
+ *
+ * Used for ANY protocol
+ */
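+
+/*
+ * Illustrative on-wire layout after encapsulation (not part of the
+ * original source):
+ *
+ *   | outer IP: load balancer -> real server, proto IPPROTO_IPIP |
+ *   | inner IP: client -> virtual IP address                     |
+ *   | original transport payload (TCP/UDP/...)                   |
+ */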
+int
+ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp)
+{
+ struct rtable *rt; /* Route to the other host */
+ struct net_device *tdev; /* Device to other host */
+ struct iphdr *old_iph = skb->nh.iph;
+ u8 tos = old_iph->tos;
+ u16 df = old_iph->frag_off;
+ struct iphdr *iph; /* Our new IP header */
+ int max_headroom; /* The extra header space needed */
+ int mtu;
+
+ EnterFunction(10);
+
+ if (skb->protocol != __constant_htons(ETH_P_IP)) {
+ IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
+ "ETH_P_IP: %d, skb protocol: %d\n",
+ __constant_htons(ETH_P_IP), skb->protocol);
+ goto tx_error;
+ }
+
+ if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
+ goto tx_error_icmp;
+
+ tdev = rt->u.dst.dev;
+
+ mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
+ if (mtu < 68) {
+ ip_rt_put(rt);
+ IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
+ goto tx_error;
+ }
+ if (skb->dst)
+ skb->dst->ops->update_pmtu(skb->dst, mtu);
+
+ df |= (old_iph->frag_off&__constant_htons(IP_DF));
+
+ if ((old_iph->frag_off&__constant_htons(IP_DF))
+ && mtu < ntohs(old_iph->tot_len)) {
+ icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+ ip_rt_put(rt);
+ IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
+ goto tx_error;
+ }
+
+ /*
+ * Okay, now see if we can stuff it in the buffer as-is.
+ */
+ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
+
+ if (skb_headroom(skb) < max_headroom
+ || skb_cloned(skb) || skb_shared(skb)) {
+ struct sk_buff *new_skb =
+ skb_realloc_headroom(skb, max_headroom);
+ if (!new_skb) {
+ ip_rt_put(rt);
+ kfree_skb(skb);
+ IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
+ return NF_STOLEN;
+ }
+ kfree_skb(skb);
+ skb = new_skb;
+ old_iph = skb->nh.iph;
+ }
+
+ skb->h.raw = (void *) old_iph;
+
+ /* fix old IP header checksum */
+ ip_send_check(old_iph);
+
+ skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
+ /* drop old route */
+ dst_release(skb->dst);
+ skb->dst = &rt->u.dst;
+
+ /*
+ * Push down and install the IPIP header.
+ */
+ iph = skb->nh.iph;
+ iph->version = 4;
+ iph->ihl = sizeof(struct iphdr)>>2;
+ iph->frag_off = df;
+ iph->protocol = IPPROTO_IPIP;
+ iph->tos = tos;
+ iph->daddr = rt->rt_dst;
+ iph->saddr = rt->rt_src;
+ iph->ttl = old_iph->ttl;
+ iph->tot_len = htons(skb->len);
+ ip_select_ident(iph, &rt->u.dst, NULL);
+ ip_send_check(iph);
+
+ /* Another hack: avoid icmp_send in ip_fragment */
+ skb->local_df = 1;
+
+ IP_VS_XMIT(skb, rt);
+
+ LeaveFunction(10);
+
+ return NF_STOLEN;
+
+ tx_error_icmp:
+ dst_link_failure(skb);
+ tx_error:
+ kfree_skb(skb);
+ LeaveFunction(10);
+ return NF_STOLEN;
+}
+
+
+/*
+ * Direct Routing transmitter
+ * Used for ANY protocol
+ */
+int
+ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp)
+{
+ struct rtable *rt; /* Route to the other host */
+ struct iphdr *iph = skb->nh.iph;
+ int mtu;
+
+ EnterFunction(10);
+
+ if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+ goto tx_error_icmp;
+
+ /* MTU checking */
+ mtu = dst_mtu(&rt->u.dst);
+ if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) {
+ icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+ ip_rt_put(rt);
+ IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
+ goto tx_error;
+ }
+
+ /*
+ * Call ip_send_check because we are not sure it is called
+ * after ip_defrag. Is copy-on-write needed?
+ */
+ if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
+ ip_rt_put(rt);
+ return NF_STOLEN;
+ }
+ ip_send_check(skb->nh.iph);
+
+ /* drop old route */
+ dst_release(skb->dst);
+ skb->dst = &rt->u.dst;
+
+ /* Another hack: avoid icmp_send in ip_fragment */
+ skb->local_df = 1;
+
+ IP_VS_XMIT(skb, rt);
+
+ LeaveFunction(10);
+ return NF_STOLEN;
+
+ tx_error_icmp:
+ dst_link_failure(skb);
+ tx_error:
+ kfree_skb(skb);
+ LeaveFunction(10);
+ return NF_STOLEN;
+}
+
+
+/*
+ * ICMP packet transmitter
+ * called by the ip_vs_in_icmp
+ */
+int
+ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp, int offset)
+{
+ struct rtable *rt; /* Route to the other host */
+ int mtu;
+ int rc;
+
+ EnterFunction(10);
+
+ /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
+ forwarded directly here, because there is no need to
+ translate address/port back */
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
+ if (cp->packet_xmit)
+ rc = cp->packet_xmit(skb, cp, pp);
+ else
+ rc = NF_ACCEPT;
+ /* do not touch skb anymore */
+ atomic_inc(&cp->in_pkts);
+ __ip_vs_conn_put(cp);
+ goto out;
+ }
+
+ /*
+ * mangle and send the packet here (only for VS/NAT)
+ */
+
+ if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(skb->nh.iph->tos))))
+ goto tx_error_icmp;
+
+ /* MTU checking */
+ mtu = dst_mtu(&rt->u.dst);
+ if ((skb->len > mtu) && (skb->nh.iph->frag_off&__constant_htons(IP_DF))) {
+ ip_rt_put(rt);
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+ IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
+ goto tx_error;
+ }
+
+ /* copy-on-write the packet before mangling it */
+ if (!ip_vs_make_skb_writable(&skb, offset))
+ goto tx_error_put;
+
+ if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
+ goto tx_error_put;
+
+ /* drop the old route when skb is not shared */
+ dst_release(skb->dst);
+ skb->dst = &rt->u.dst;
+
+ ip_vs_nat_icmp(skb, pp, cp, 0);
+
+ /* Another hack: avoid icmp_send in ip_fragment */
+ skb->local_df = 1;
+
+ IP_VS_XMIT(skb, rt);
+
+ rc = NF_STOLEN;
+ goto out;
+
+ tx_error_icmp:
+ dst_link_failure(skb);
+ tx_error:
+ dev_kfree_skb(skb);
+ rc = NF_STOLEN;
+ out:
+ LeaveFunction(10);
+ return rc;
+ tx_error_put:
+ ip_rt_put(rt);
+ goto tx_error;
+}
diff --git a/net/ipv4/multipath.c b/net/ipv4/multipath.c
new file mode 100644
index 00000000000..4e9ca7c7640
--- /dev/null
+++ b/net/ipv4/multipath.c
@@ -0,0 +1,55 @@
+/* multipath.c: IPV4 multipath algorithm support.
+ *
+ * Copyright (C) 2004, 2005 Einar Lueck <elueck@de.ibm.com>
+ * Copyright (C) 2005 David S. Miller <davem@davemloft.net>
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/spinlock.h>
+
+#include <net/ip_mp_alg.h>
+
+static DEFINE_SPINLOCK(alg_table_lock);
+struct ip_mp_alg_ops *ip_mp_alg_table[IP_MP_ALG_MAX + 1];
+
+int multipath_alg_register(struct ip_mp_alg_ops *ops, enum ip_mp_alg n)
+{
+ struct ip_mp_alg_ops **slot;
+ int err;
+
+ if (n < IP_MP_ALG_NONE || n > IP_MP_ALG_MAX ||
+ !ops->mp_alg_select_route)
+ return -EINVAL;
+
+ spin_lock(&alg_table_lock);
+ slot = &ip_mp_alg_table[n];
+ if (*slot != NULL) {
+ err = -EBUSY;
+ } else {
+ *slot = ops;
+ err = 0;
+ }
+ spin_unlock(&alg_table_lock);
+
+ return err;
+}
+EXPORT_SYMBOL(multipath_alg_register);
+
+void multipath_alg_unregister(struct ip_mp_alg_ops *ops, enum ip_mp_alg n)
+{
+ struct ip_mp_alg_ops **slot;
+
+ if (n < IP_MP_ALG_NONE || n > IP_MP_ALG_MAX)
+ return;
+
+ spin_lock(&alg_table_lock);
+ slot = &ip_mp_alg_table[n];
+ if (*slot == ops)
+ *slot = NULL;
+ spin_unlock(&alg_table_lock);
+
+ synchronize_net();
+}
+EXPORT_SYMBOL(multipath_alg_unregister);
diff --git a/net/ipv4/multipath_drr.c b/net/ipv4/multipath_drr.c
new file mode 100644
index 00000000000..9349686131f
--- /dev/null
+++ b/net/ipv4/multipath_drr.c
@@ -0,0 +1,265 @@
+/*
+ * Device round robin policy for multipath.
+ *
+ *
+ * Version: $Id: multipath_drr.c,v 1.1.2.1 2004/09/16 07:42:34 elueck Exp $
+ *
+ * Authors: Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <linux/notifier.h>
+#include <linux/if_arp.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/ipip.h>
+#include <net/checksum.h>
+#include <net/ip_mp_alg.h>
+
+struct multipath_device {
+ int ifi; /* interface index of device */
+ atomic_t usecount;
+ int allocated;
+};
+
+#define MULTIPATH_MAX_DEVICECANDIDATES 10
+
+static struct multipath_device state[MULTIPATH_MAX_DEVICECANDIDATES];
+static DEFINE_SPINLOCK(state_lock);
+static struct rtable *last_selection = NULL;
+
+static inline int __multipath_findslot(void)
+{
+ int i;
+
+ for (i = 0; i < MULTIPATH_MAX_DEVICECANDIDATES; i++) {
+ if (state[i].allocated == 0)
+ return i;
+ }
+ return -1;
+}
+
+static inline int __multipath_finddev(int ifindex)
+{
+ int i;
+
+ for (i = 0; i < MULTIPATH_MAX_DEVICECANDIDATES; i++) {
+ if (state[i].allocated != 0 &&
+ state[i].ifi == ifindex)
+ return i;
+ }
+ return -1;
+}
+
+static int drr_dev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = ptr;
+ int devidx;
+
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ case NETDEV_DOWN:
+ spin_lock_bh(&state_lock);
+
+ devidx = __multipath_finddev(dev->ifindex);
+ if (devidx != -1) {
+ state[devidx].allocated = 0;
+ state[devidx].ifi = 0;
+ atomic_set(&state[devidx].usecount, 0);
+ }
+
+ spin_unlock_bh(&state_lock);
+ break;
+	}
+
+ return NOTIFY_DONE;
+}
+
+struct notifier_block drr_dev_notifier = {
+ .notifier_call = drr_dev_event,
+};
+
+static void drr_remove(struct rtable *rt)
+{
+ if (last_selection == rt)
+ last_selection = NULL;
+}
+
+static void drr_safe_inc(atomic_t *usecount)
+{
+ int n;
+
+ atomic_inc(usecount);
+
+ n = atomic_read(usecount);
+ if (n <= 0) {
+ int i;
+
+ spin_lock_bh(&state_lock);
+
+ for (i = 0; i < MULTIPATH_MAX_DEVICECANDIDATES; i++)
+ atomic_set(&state[i].usecount, 0);
+
+ spin_unlock_bh(&state_lock);
+ }
+}
+
+static void drr_select_route(const struct flowi *flp,
+ struct rtable *first, struct rtable **rp)
+{
+ struct rtable *nh, *result, *cur_min;
+ int min_usecount = -1;
+ int devidx = -1;
+ int cur_min_devidx = -1;
+
+ /* if necessary and possible utilize the old alternative */
+ if ((flp->flags & FLOWI_FLAG_MULTIPATHOLDROUTE) != 0 &&
+ last_selection != NULL) {
+ result = last_selection;
+ *rp = result;
+ return;
+ }
+
+ /* 1. make sure all alt. nexthops have the same GC related data */
+ /* 2. determine the new candidate to be returned */
+ result = NULL;
+ cur_min = NULL;
+ for (nh = rcu_dereference(first); nh;
+ nh = rcu_dereference(nh->u.rt_next)) {
+ if ((nh->u.dst.flags & DST_BALANCED) != 0 &&
+ multipath_comparekeys(&nh->fl, flp)) {
+ int nh_ifidx = nh->u.dst.dev->ifindex;
+
+ nh->u.dst.lastuse = jiffies;
+ nh->u.dst.__use++;
+ if (result != NULL)
+ continue;
+
+ /* search for the output interface */
+
+			/* this is not SMP safe; only add/remove are
+			 * SMP safe, since incorrect usecount updates
+			 * have little impact
+ */
+ devidx = __multipath_finddev(nh_ifidx);
+ if (devidx == -1) {
+ /* add the interface to the array
+ * SMP safe
+ */
+ spin_lock_bh(&state_lock);
+
+ /* due to SMP: search again */
+ devidx = __multipath_finddev(nh_ifidx);
+ if (devidx == -1) {
+ /* add entry for device */
+ devidx = __multipath_findslot();
+ if (devidx == -1) {
+ /* unlikely but possible */
+ continue;
+ }
+
+ state[devidx].allocated = 1;
+ state[devidx].ifi = nh_ifidx;
+ atomic_set(&state[devidx].usecount, 0);
+ min_usecount = 0;
+ }
+
+ spin_unlock_bh(&state_lock);
+ }
+
+ if (min_usecount == 0) {
+ /* if the device has not been used it is
+ * the primary target
+ */
+ drr_safe_inc(&state[devidx].usecount);
+ result = nh;
+ } else {
+ int count =
+ atomic_read(&state[devidx].usecount);
+
+ if (min_usecount == -1 ||
+ count < min_usecount) {
+ cur_min = nh;
+ cur_min_devidx = devidx;
+ min_usecount = count;
+ }
+ }
+ }
+ }
+
+ if (!result) {
+ if (cur_min) {
+ drr_safe_inc(&state[cur_min_devidx].usecount);
+ result = cur_min;
+ } else {
+ result = first;
+ }
+ }
+
+ *rp = result;
+ last_selection = result;
+}
+
+static struct ip_mp_alg_ops drr_ops = {
+ .mp_alg_select_route = drr_select_route,
+ .mp_alg_remove = drr_remove,
+};
+
+static int __init drr_init(void)
+{
+ int err = register_netdevice_notifier(&drr_dev_notifier);
+
+ if (err)
+ return err;
+
+ err = multipath_alg_register(&drr_ops, IP_MP_ALG_RR);
+ if (err)
+ goto fail;
+
+ return 0;
+
+fail:
+ unregister_netdevice_notifier(&drr_dev_notifier);
+ return err;
+}
+
+static void __exit drr_exit(void)
+{
+ unregister_netdevice_notifier(&drr_dev_notifier);
+ multipath_alg_unregister(&drr_ops, IP_MP_ALG_DRR);
+}
+
+module_init(drr_init);
+module_exit(drr_exit);
diff --git a/net/ipv4/multipath_random.c b/net/ipv4/multipath_random.c
new file mode 100644
index 00000000000..805a16e47de
--- /dev/null
+++ b/net/ipv4/multipath_random.c
@@ -0,0 +1,128 @@
+/*
+ * Random policy for multipath.
+ *
+ *
+ * Version: $Id: multipath_random.c,v 1.1.2.3 2004/09/21 08:42:11 elueck Exp $
+ *
+ * Authors: Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <linux/notifier.h>
+#include <linux/if_arp.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/ipip.h>
+#include <net/checksum.h>
+#include <net/ip_mp_alg.h>
+
+#define MULTIPATH_MAX_CANDIDATES 40
+
+/* interface to random number generation */
+static unsigned int RANDOM_SEED = 93186752;
+
+static inline unsigned int random(unsigned int ubound)
+{
+ static unsigned int a = 1588635695,
+ q = 2,
+ r = 1117695901;
+
+ RANDOM_SEED = a*(RANDOM_SEED % q) - r*(RANDOM_SEED / q);
+
+ return RANDOM_SEED % ubound;
+}
+
+
+static void random_select_route(const struct flowi *flp,
+ struct rtable *first,
+ struct rtable **rp)
+{
+ struct rtable *rt;
+ struct rtable *decision;
+ unsigned char candidate_count = 0;
+
+	/* count all candidates */
+ for (rt = rcu_dereference(first); rt;
+ rt = rcu_dereference(rt->u.rt_next)) {
+ if ((rt->u.dst.flags & DST_BALANCED) != 0 &&
+ multipath_comparekeys(&rt->fl, flp))
+ ++candidate_count;
+ }
+
+ /* choose a random candidate */
+ decision = first;
+ if (candidate_count > 1) {
+ unsigned char i = 0;
+ unsigned char candidate_no = (unsigned char)
+ random(candidate_count);
+
+ /* find chosen candidate and adjust GC data for all candidates
+ * to ensure they stay in cache
+ */
+ for (rt = first; rt; rt = rt->u.rt_next) {
+ if ((rt->u.dst.flags & DST_BALANCED) != 0 &&
+ multipath_comparekeys(&rt->fl, flp)) {
+ rt->u.dst.lastuse = jiffies;
+
+ if (i == candidate_no)
+ decision = rt;
+
+ if (i >= candidate_count)
+ break;
+
+ i++;
+ }
+ }
+ }
+
+ decision->u.dst.__use++;
+ *rp = decision;
+}
+
+static struct ip_mp_alg_ops random_ops = {
+ .mp_alg_select_route = random_select_route,
+};
+
+static int __init random_init(void)
+{
+ return multipath_alg_register(&random_ops, IP_MP_ALG_RANDOM);
+}
+
+static void __exit random_exit(void)
+{
+ multipath_alg_unregister(&random_ops, IP_MP_ALG_RANDOM);
+}
+
+module_init(random_init);
+module_exit(random_exit);
diff --git a/net/ipv4/multipath_rr.c b/net/ipv4/multipath_rr.c
new file mode 100644
index 00000000000..554a8256816
--- /dev/null
+++ b/net/ipv4/multipath_rr.c
@@ -0,0 +1,115 @@
+/*
+ * Round robin policy for multipath.
+ *
+ *
+ * Version: $Id: multipath_rr.c,v 1.1.2.2 2004/09/16 07:42:34 elueck Exp $
+ *
+ * Authors: Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <linux/notifier.h>
+#include <linux/if_arp.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/ipip.h>
+#include <net/checksum.h>
+#include <net/ip_mp_alg.h>
+
+#define MULTIPATH_MAX_CANDIDATES 40
+
+static struct rtable* last_used = NULL;
+
+static void rr_remove(struct rtable *rt)
+{
+ if (last_used == rt)
+ last_used = NULL;
+}
+
+static void rr_select_route(const struct flowi *flp,
+ struct rtable *first, struct rtable **rp)
+{
+ struct rtable *nh, *result, *min_use_cand = NULL;
+ int min_use = -1;
+
+ /* if necessary and possible utilize the old alternative */
+ if ((flp->flags & FLOWI_FLAG_MULTIPATHOLDROUTE) != 0 &&
+ last_used != NULL) {
+ result = last_used;
+ goto out;
+ }
+
+ /* 1. make sure all alt. nexthops have the same GC related data
+ * 2. determine the new candidate to be returned
+ */
+ result = NULL;
+ for (nh = rcu_dereference(first); nh;
+ nh = rcu_dereference(nh->u.rt_next)) {
+ if ((nh->u.dst.flags & DST_BALANCED) != 0 &&
+ multipath_comparekeys(&nh->fl, flp)) {
+ nh->u.dst.lastuse = jiffies;
+
+ if (min_use == -1 || nh->u.dst.__use < min_use) {
+ min_use = nh->u.dst.__use;
+ min_use_cand = nh;
+ }
+ }
+ }
+ result = min_use_cand;
+ if (!result)
+ result = first;
+
+out:
+ last_used = result;
+ result->u.dst.__use++;
+ *rp = result;
+}
+
+static struct ip_mp_alg_ops rr_ops = {
+ .mp_alg_select_route = rr_select_route,
+ .mp_alg_remove = rr_remove,
+};
+
+static int __init rr_init(void)
+{
+ return multipath_alg_register(&rr_ops, IP_MP_ALG_RR);
+}
+
+static void __exit rr_exit(void)
+{
+ multipath_alg_unregister(&rr_ops, IP_MP_ALG_RR);
+}
+
+module_init(rr_init);
+module_exit(rr_exit);
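Despite the file name, rr_select_route() above approximates round robin by scanning the eligible next hops for the one with the smallest __use counter and bumping that counter on the winner. A small user-space sketch of that scan, with a hypothetical struct hop standing in for struct rtable:

#include <stdio.h>

struct hop { const char *name; int use; struct hop *next; };

/* Keep the hop with the smallest use counter; fall back to the list head
 * if nothing qualifies, mirroring the "if (!result) result = first" path. */
static struct hop *pick_least_used(struct hop *first)
{
        struct hop *h, *best = NULL;
        int min_use = -1;

        for (h = first; h; h = h->next) {
                if (min_use == -1 || h->use < min_use) {
                        min_use = h->use;
                        best = h;
                }
        }

        if (!best)
                best = first;
        best->use++;            /* the caller's __use bump */
        return best;
}

int main(void)
{
        struct hop c = { "gw-c", 1, NULL };
        struct hop b = { "gw-b", 4, &c };
        struct hop a = { "gw-a", 2, &b };

        printf("selected %s\n", pick_least_used(&a)->name);  /* gw-c */
        return 0;
}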
diff --git a/net/ipv4/multipath_wrandom.c b/net/ipv4/multipath_wrandom.c
new file mode 100644
index 00000000000..10b23e1bece
--- /dev/null
+++ b/net/ipv4/multipath_wrandom.c
@@ -0,0 +1,344 @@
+/*
+ * Weighted random policy for multipath.
+ *
+ *
+ * Version: $Id: multipath_wrandom.c,v 1.1.2.3 2004/09/22 07:51:40 elueck Exp $
+ *
+ * Authors: Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <linux/notifier.h>
+#include <linux/if_arp.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/ipip.h>
+#include <net/checksum.h>
+#include <net/ip_fib.h>
+#include <net/ip_mp_alg.h>
+
+#define MULTIPATH_STATE_SIZE 15
+
+struct multipath_candidate {
+ struct multipath_candidate *next;
+ int power;
+ struct rtable *rt;
+};
+
+struct multipath_dest {
+ struct list_head list;
+
+ const struct fib_nh *nh_info;
+ __u32 netmask;
+ __u32 network;
+ unsigned char prefixlen;
+
+ struct rcu_head rcu;
+};
+
+struct multipath_bucket {
+ struct list_head head;
+ spinlock_t lock;
+};
+
+struct multipath_route {
+ struct list_head list;
+
+ int oif;
+ __u32 gw;
+ struct list_head dests;
+
+ struct rcu_head rcu;
+};
+
+/* state: primarily weight per route information */
+static struct multipath_bucket state[MULTIPATH_STATE_SIZE];
+
+/* interface to random number generation */
+static unsigned int RANDOM_SEED = 93186752;
+
+static inline unsigned int random(unsigned int ubound)
+{
+ static unsigned int a = 1588635695,
+ q = 2,
+ r = 1117695901;
+ RANDOM_SEED = a*(RANDOM_SEED % q) - r*(RANDOM_SEED / q);
+ return RANDOM_SEED % ubound;
+}
+
+static unsigned char __multipath_lookup_weight(const struct flowi *fl,
+ const struct rtable *rt)
+{
+ const int state_idx = rt->idev->dev->ifindex % MULTIPATH_STATE_SIZE;
+ struct multipath_route *r;
+ struct multipath_route *target_route = NULL;
+ struct multipath_dest *d;
+ int weight = 1;
+
+ /* lookup the weight information for a certain route */
+ rcu_read_lock();
+
+ /* find state entry for gateway or add one if necessary */
+ list_for_each_entry_rcu(r, &state[state_idx].head, list) {
+ if (r->gw == rt->rt_gateway &&
+ r->oif == rt->idev->dev->ifindex) {
+ target_route = r;
+ break;
+ }
+ }
+
+ if (!target_route) {
+ /* this should not happen... but we are prepared */
+ printk( KERN_CRIT"%s: missing state for gateway: %u and " \
+ "device %d\n", __FUNCTION__, rt->rt_gateway,
+ rt->idev->dev->ifindex);
+ goto out;
+ }
+
+ /* find state entry for destination */
+ list_for_each_entry_rcu(d, &target_route->dests, list) {
+ __u32 targetnetwork = fl->fl4_dst &
+ (0xFFFFFFFF >> (32 - d->prefixlen));
+
+ if ((targetnetwork & d->netmask) == d->network) {
+ weight = d->nh_info->nh_weight;
+ goto out;
+ }
+ }
+
+out:
+ rcu_read_unlock();
+ return weight;
+}
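The destination scan in __multipath_lookup_weight() boils down to a prefix match: mask the flow's destination address and compare it with the stored network. A host-byte-order sketch of that comparison (the kernel code works on network-byte-order __u32 values, which is why its mask expression looks different); prefix_match() is a hypothetical helper, not part of the patch:

#include <stdio.h>
#include <stdint.h>

/* An address belongs to network/prefixlen when its top prefixlen bits
 * equal the network's top bits. */
static int prefix_match(uint32_t addr, uint32_t network, unsigned prefixlen)
{
        uint32_t mask = prefixlen ? ~UINT32_C(0) << (32 - prefixlen) : 0;

        return (addr & mask) == (network & mask);
}

int main(void)
{
        uint32_t net  = (10u << 24) | (1u << 16);        /* 10.1.0.0 */
        uint32_t addr = (10u << 24) | (1u << 16) | 42u;  /* 10.1.0.42 */

        printf("match /16: %d\n", prefix_match(addr, net, 16));  /* 1 */
        printf("match /24: %d\n", prefix_match(addr, net, 24));  /* 1 */
        printf("match /32: %d\n", prefix_match(addr, net, 32));  /* 0 */
        return 0;
}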
+
+static void wrandom_init_state(void)
+{
+ int i;
+
+ for (i = 0; i < MULTIPATH_STATE_SIZE; ++i) {
+ INIT_LIST_HEAD(&state[i].head);
+ spin_lock_init(&state[i].lock);
+ }
+}
+
+static void wrandom_select_route(const struct flowi *flp,
+ struct rtable *first,
+ struct rtable **rp)
+{
+ struct rtable *rt;
+ struct rtable *decision;
+ struct multipath_candidate *first_mpc = NULL;
+ struct multipath_candidate *mpc, *last_mpc = NULL;
+ int power = 0;
+ int last_power;
+ int selector;
+ const size_t size_mpc = sizeof(struct multipath_candidate);
+
+ /* collect all candidates and identify their weights */
+ for (rt = rcu_dereference(first); rt;
+ rt = rcu_dereference(rt->u.rt_next)) {
+ if ((rt->u.dst.flags & DST_BALANCED) != 0 &&
+ multipath_comparekeys(&rt->fl, flp)) {
+ struct multipath_candidate* mpc =
+ (struct multipath_candidate*)
+ kmalloc(size_mpc, GFP_KERNEL);
+
+ if (!mpc)
+ return;
+
+ power += __multipath_lookup_weight(flp, rt) * 10000;
+
+ mpc->power = power;
+ mpc->rt = rt;
+ mpc->next = NULL;
+
+ if (!first_mpc)
+ first_mpc = mpc;
+ else
+ last_mpc->next = mpc;
+
+ last_mpc = mpc;
+ }
+ }
+
+ /* choose a weighted random candidate */
+ decision = first;
+ selector = random(power);
+ last_power = 0;
+
+ /* select candidate, adjust GC data and cleanup local state */
+ decision = first;
+ last_mpc = NULL;
+ for (mpc = first_mpc; mpc; mpc = mpc->next) {
+ mpc->rt->u.dst.lastuse = jiffies;
+ if (last_power <= selector && selector < mpc->power)
+ decision = mpc->rt;
+
+ last_power = mpc->power;
+ if (last_mpc)
+ kfree(last_mpc);
+
+ last_mpc = mpc;
+ }
+
+ if (last_mpc) {
+ /* concurrent __multipath_flush may lead to !last_mpc */
+ kfree(last_mpc);
+ }
+
+ decision->u.dst.__use++;
+ *rp = decision;
+}
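wrandom_select_route() accumulates a running power per candidate and then takes the first candidate whose cumulative value exceeds a uniformly drawn selector, i.e. roulette-wheel selection in which each weight is scaled by 10000. A minimal user-space sketch of that selection, assuming fixed integer weights in an array rather than the kmalloc'd candidate chain:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Roulette-wheel selection: candidate i owns the interval
 * [prefix[i-1], prefix[i]) of the total weight; a uniform draw over the
 * total therefore picks i with probability weight[i] / total. */
static int pick_weighted(const int *weight, int n)
{
        int i, total = 0, selector, running = 0;

        for (i = 0; i < n; i++)
                total += weight[i];
        if (total <= 0)
                return 0;

        selector = rand() % total;
        for (i = 0; i < n; i++) {
                running += weight[i];
                if (selector < running)
                        return i;
        }
        return n - 1;           /* not reached when weights are consistent */
}

int main(void)
{
        int weight[3] = { 1, 3, 6 };    /* roughly 10%, 30%, 60% of picks */

        srand((unsigned)time(NULL));
        printf("picked path %d\n", pick_weighted(weight, 3));
        return 0;
}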
+
+static void wrandom_set_nhinfo(__u32 network,
+ __u32 netmask,
+ unsigned char prefixlen,
+ const struct fib_nh *nh)
+{
+ const int state_idx = nh->nh_oif % MULTIPATH_STATE_SIZE;
+ struct multipath_route *r, *target_route = NULL;
+ struct multipath_dest *d, *target_dest = NULL;
+
+ /* store the weight information for a certain route */
+ spin_lock(&state[state_idx].lock);
+
+ /* find state entry for gateway or add one if necessary */
+ list_for_each_entry_rcu(r, &state[state_idx].head, list) {
+ if (r->gw == nh->nh_gw && r->oif == nh->nh_oif) {
+ target_route = r;
+ break;
+ }
+ }
+
+ if (!target_route) {
+ const size_t size_rt = sizeof(struct multipath_route);
+ target_route = (struct multipath_route *)
+ kmalloc(size_rt, GFP_KERNEL);
+
+ target_route->gw = nh->nh_gw;
+ target_route->oif = nh->nh_oif;
+ memset(&target_route->rcu, 0, sizeof(struct rcu_head));
+ INIT_LIST_HEAD(&target_route->dests);
+
+ list_add_rcu(&target_route->list, &state[state_idx].head);
+ }
+
+ /* find state entry for destination or add one if necessary */
+ list_for_each_entry_rcu(d, &target_route->dests, list) {
+ if (d->nh_info == nh) {
+ target_dest = d;
+ break;
+ }
+ }
+
+ if (!target_dest) {
+ const size_t size_dst = sizeof(struct multipath_dest);
+ target_dest = (struct multipath_dest*)
+ kmalloc(size_dst, GFP_KERNEL);
+
+ target_dest->nh_info = nh;
+ target_dest->network = network;
+ target_dest->netmask = netmask;
+ target_dest->prefixlen = prefixlen;
+ memset(&target_dest->rcu, 0, sizeof(struct rcu_head));
+
+ list_add_rcu(&target_dest->list, &target_route->dests);
+ }
+ /* else: we already stored this info for another destination =>
+ * we are finished
+ */
+
+ spin_unlock(&state[state_idx].lock);
+}
+
+static void __multipath_free(struct rcu_head *head)
+{
+ struct multipath_route *rt = container_of(head, struct multipath_route,
+ rcu);
+ kfree(rt);
+}
+
+static void __multipath_free_dst(struct rcu_head *head)
+{
+ struct multipath_dest *dst = container_of(head,
+ struct multipath_dest,
+ rcu);
+ kfree(dst);
+}
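Both RCU callbacks above recover the enclosing structure from the embedded rcu_head via container_of(). A stand-alone sketch of that pointer arithmetic, with a user-space re-implementation of the macro and hypothetical struct names; call_rcu() itself is only mimicked by invoking the callback directly:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

/* User-space stand-in for the kernel's container_of(): given a pointer to
 * a member, recover a pointer to the structure that embeds it. */
#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct callback_head { void (*func)(struct callback_head *); };

struct route_entry {
        int oif;
        struct callback_head rcu;       /* embedded callback anchor */
};

static void free_route(struct callback_head *head)
{
        struct route_entry *r = container_of(head, struct route_entry, rcu);

        printf("freeing route for oif %d\n", r->oif);
        free(r);
}

int main(void)
{
        struct route_entry *r = malloc(sizeof(*r));

        if (!r)
                return 1;
        r->oif = 3;
        r->rcu.func = free_route;
        r->rcu.func(&r->rcu);   /* what call_rcu() would invoke later */
        return 0;
}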
+
+static void wrandom_flush(void)
+{
+ int i;
+
+ /* defer deletion of all entries */
+ for (i = 0; i < MULTIPATH_STATE_SIZE; ++i) {
+ struct multipath_route *r;
+
+ spin_lock(&state[i].lock);
+ list_for_each_entry_rcu(r, &state[i].head, list) {
+ struct multipath_dest *d;
+ list_for_each_entry_rcu(d, &r->dests, list) {
+ list_del_rcu(&d->list);
+ call_rcu(&d->rcu,
+ __multipath_free_dst);
+ }
+ list_del_rcu(&r->list);
+ call_rcu(&r->rcu,
+ __multipath_free);
+ }
+
+ spin_unlock(&state[i].lock);
+ }
+}
+
+static struct ip_mp_alg_ops wrandom_ops = {
+ .mp_alg_select_route = wrandom_select_route,
+ .mp_alg_flush = wrandom_flush,
+ .mp_alg_set_nhinfo = wrandom_set_nhinfo,
+};
+
+static int __init wrandom_init(void)
+{
+ wrandom_init_state();
+
+ return multipath_alg_register(&wrandom_ops, IP_MP_ALG_WRANDOM);
+}
+
+static void __exit wrandom_exit(void)
+{
+ multipath_alg_unregister(&wrandom_ops, IP_MP_ALG_WRANDOM);
+}
+
+module_init(wrandom_init);
+module_exit(wrandom_exit);
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
new file mode 100644
index 00000000000..46d4cb1c06f
--- /dev/null
+++ b/net/ipv4/netfilter/Kconfig
@@ -0,0 +1,696 @@
+#
+# IP netfilter configuration
+#
+
+menu "IP: Netfilter Configuration"
+ depends on INET && NETFILTER
+
+# connection tracking, helpers and protocols
+config IP_NF_CONNTRACK
+ tristate "Connection tracking (required for masq/NAT)"
+ ---help---
+ Connection tracking keeps a record of what packets have passed
+ through your machine, in order to figure out how they are grouped
+ into connections.
+
+ This is required to do Masquerading or other kinds of Network
+ Address Translation (except for Fast NAT). It can also be used to
+ enhance packet filtering (see `Connection state match support'
+ below).
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_CT_ACCT
+ bool "Connection tracking flow accounting"
+ depends on IP_NF_CONNTRACK
+ help
+ If this option is enabled, the connection tracking code will
+ keep per-flow packet and byte counters.
+
+ Those counters can be used for flow-based accounting or the
+ `connbytes' match.
+
+ If unsure, say `N'.
+
+config IP_NF_CONNTRACK_MARK
+ bool 'Connection mark tracking support'
+ help
+ This option enables support for connection marks, used by the
+ `CONNMARK' target and `connmark' match. Similar to the mark value
+ of packets, but this mark value is kept in the conntrack session
+ instead of the individual packets.
+
+config IP_NF_CT_PROTO_SCTP
+ tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)'
+ depends on IP_NF_CONNTRACK && EXPERIMENTAL
+ help
+ With this option enabled, the connection tracking code will
+ be able to do state tracking on SCTP connections.
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/modules.txt>. If unsure, say `N'.
+
+config IP_NF_FTP
+ tristate "FTP protocol support"
+ depends on IP_NF_CONNTRACK
+ help
+ Tracking FTP connections is problematic: special helpers are
+ required for tracking them, and for doing masquerading and other
+ forms of Network Address Translation on them.
+
+ To compile it as a module, choose M here. If unsure, say Y.
+
+config IP_NF_IRC
+ tristate "IRC protocol support"
+ depends on IP_NF_CONNTRACK
+ ---help---
+ There is a commonly-used extension to IRC called
+ Direct Client-to-Client Protocol (DCC). This enables users to send
+ files to each other, and also chat to each other without the need
+ of a server. DCC Sending is used anywhere you send files over IRC,
+ and DCC Chat is most commonly used by Eggdrop bots. If you are
+ using NAT, this extension will enable you to send files and initiate
+ chats. Note that you do NOT need this extension to get files, to
+ have others initiate chats, or for anything else in IRC.
+
+ To compile it as a module, choose M here. If unsure, say Y.
+
+config IP_NF_TFTP
+ tristate "TFTP protocol support"
+ depends on IP_NF_CONNTRACK
+ help
+ TFTP connection tracking helper; whether it is required depends
+ on how restrictive your ruleset is.
+ If you are using a tftp client behind -j SNAT or -j MASQUERADE,
+ you will need this.
+
+ To compile it as a module, choose M here. If unsure, say Y.
+
+config IP_NF_AMANDA
+ tristate "Amanda backup protocol support"
+ depends on IP_NF_CONNTRACK
+ help
+ If you are running the Amanda backup package <http://www.amanda.org/>
+ on this machine or machines that will be MASQUERADED through this
+ machine, then you may want to enable this feature. This allows the
+ connection tracking and NAT code to permit the sub-channels that
+ Amanda requires for communicating the backup data, messages and
+ index.
+
+ To compile it as a module, choose M here. If unsure, say Y.
+
+config IP_NF_QUEUE
+ tristate "Userspace queueing via NETLINK"
+ help
+ Netfilter has the ability to queue packets to user space: the
+ netlink device can be used to access them using this driver.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_IPTABLES
+ tristate "IP tables support (required for filtering/masq/NAT)"
+ help
+ iptables is a general, extensible packet identification framework.
+ The packet filtering and full NAT (masquerading, port forwarding,
+ etc) subsystems now use this: say `Y' or `M' here if you want to use
+ either of those.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+# The matches.
+config IP_NF_MATCH_LIMIT
+ tristate "limit match support"
+ depends on IP_NF_IPTABLES
+ help
+ limit matching allows you to control the rate at which a rule can be
+ matched: mainly useful in combination with the LOG target ("LOG
+ target support", below) and to avoid some Denial of Service attacks.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_MATCH_IPRANGE
+ tristate "IP range match support"
+ depends on IP_NF_IPTABLES
+ help
+ This option makes it possible to match IP addresses against ranges
+ of IP addresses.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_MATCH_MAC
+ tristate "MAC address match support"
+ depends on IP_NF_IPTABLES
+ help
+ MAC matching allows you to match packets based on the source
+ Ethernet address of the packet.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_MATCH_PKTTYPE
+ tristate "Packet type match support"
+ depends on IP_NF_IPTABLES
+ help
+ Packet type matching allows you to match a packet by
+ its "class", eg. BROADCAST, MULTICAST, ...
+
+ Typical usage:
+ iptables -A INPUT -m pkttype --pkt-type broadcast -j LOG
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_MATCH_MARK
+ tristate "netfilter MARK match support"
+ depends on IP_NF_IPTABLES
+ help
+ Netfilter mark matching allows you to match packets based on the
+ `nfmark' value in the packet. This can be set by the MARK target
+ (see below).
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_MATCH_MULTIPORT
+ tristate "Multiple port match support"
+ depends on IP_NF_IPTABLES
+ help
+ Multiport matching allows you to match TCP or UDP packets based on
+ a series of source or destination ports: normally a rule can only
+ match a single range of ports.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_MATCH_TOS
+ tristate "TOS match support"
+ depends on IP_NF_IPTABLES
+ help
+ TOS matching allows you to match packets based on the Type Of
+ Service fields of the IP packet.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_MATCH_RECENT
+ tristate "recent match support"
+ depends on IP_NF_IPTABLES
+ help
+ This match is used for creating one or more lists of recently
+ used addresses and then matching against those lists.
+
+ Short options are available by using 'iptables -m recent -h'
+ Official Website: <http://snowman.net/projects/ipt_recent/>
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_MATCH_ECN
+ tristate "ECN match support"
+ depends on IP_NF_IPTABLES
+ help
+ This option adds an `ECN' match, which allows you to match against
+ the IPv4 and TCP header ECN fields.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_MATCH_DSCP
+ tristate "DSCP match support"
+ depends on IP_NF_IPTABLES
+ help
+ This option adds a `DSCP' match, which allows you to match against
+ the IPv4 header DSCP field (DSCP codepoint).
+
+ The DSCP codepoint can have any value between 0x0 and 0x3f.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_MATCH_AH_ESP
+ tristate "AH/ESP match support"
+ depends on IP_NF_IPTABLES
+ help
+ These two match extensions (`ah' and `esp') allow you to match a
+ range of SPIs inside AH or ESP headers of IPSec packets.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_MATCH_LENGTH
+ tristate "LENGTH match support"
+ depends on IP_NF_IPTABLES
+ help
+ This option allows you to match the length of a packet against a
+ specific value or range of values.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_MATCH_TTL
+ tristate "TTL match support"
+ depends on IP_NF_IPTABLES
+ help
+ This adds the CONFIG_IP_NF_MATCH_TTL option, which enables the user
+ to match packets by their TTL value.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_MATCH_TCPMSS
+ tristate "tcpmss match support"
+ depends on IP_NF_IPTABLES
+ help
+ This option adds a `tcpmss' match, which allows you to examine the
+ MSS value of TCP SYN packets, which controls the maximum packet size
+ for that connection.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_MATCH_HELPER
+ tristate "Helper match support"
+ depends on IP_NF_CONNTRACK && IP_NF_IPTABLES
+ help
+ Helper matching allows you to match packets in dynamic connections
+ tracked by a conntrack helper, e.g. ip_conntrack_ftp.
+
+ To compile it as a module, choose M here. If unsure, say Y.
+
+config IP_NF_MATCH_STATE
+ tristate "Connection state match support"
+ depends on IP_NF_CONNTRACK && IP_NF_IPTABLES
+ help
+ Connection state matching allows you to match packets based on their
+ relationship to a tracked connection (ie. previous packets). This
+ is a powerful tool for packet classification.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_MATCH_CONNTRACK
+ tristate "Connection tracking match support"
+ depends on IP_NF_CONNTRACK && IP_NF_IPTABLES
+ help
+ This is a general conntrack match module, a superset of the state match.
+
+ It allows matching on additional conntrack information, which is
+ useful in complex configurations, such as NAT gateways with multiple
+ internet links or tunnels.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_MATCH_OWNER
+ tristate "Owner match support"
+ depends on IP_NF_IPTABLES
+ help
+ Packet owner matching allows you to match locally-generated packets
+ based on who created them: the user, group, process or session.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_MATCH_PHYSDEV
+ tristate "Physdev match support"
+ depends on IP_NF_IPTABLES && BRIDGE_NETFILTER
+ help
+ Physdev packet matching matches against the physical bridge ports
+ the IP packet arrived on or will leave by.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_MATCH_ADDRTYPE
+ tristate 'address type match support'
+ depends on IP_NF_IPTABLES
+ help
+ This option allows you to match what routing thinks of an address,
+ eg. UNICAST, LOCAL, BROADCAST, ...
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/modules.txt>. If unsure, say `N'.
+
+config IP_NF_MATCH_REALM
+ tristate 'realm match support'
+ depends on IP_NF_IPTABLES
+ select NET_CLS_ROUTE
+ help
+ This option adds a `realm' match, which allows you to use the realm
+ key from the routing subsystem inside iptables.
+
+ This match pretty much resembles the CONFIG_NET_CLS_ROUTE4 option
+ in the tc world.
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/modules.txt>. If unsure, say `N'.
+
+config IP_NF_MATCH_SCTP
+ tristate 'SCTP protocol match support'
+ depends on IP_NF_IPTABLES
+ help
+ With this option enabled, you will be able to use the iptables
+ `sctp' match in order to match on SCTP source/destination ports
+ and SCTP chunk types.
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/modules.txt>. If unsure, say `N'.
+
+config IP_NF_MATCH_COMMENT
+ tristate 'comment match support'
+ depends on IP_NF_IPTABLES
+ help
+ This option adds a `comment' dummy-match, which allows you to put
+ comments in your iptables ruleset.
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/modules.txt>. If unsure, say `N'.
+
+config IP_NF_MATCH_CONNMARK
+ tristate 'Connection mark match support'
+ depends on IP_NF_CONNTRACK_MARK && IP_NF_IPTABLES
+ help
+ This option adds a `connmark' match, which allows you to match the
+ connection mark value previously set for the session by `CONNMARK'.
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/modules.txt>. The module will be called
+ ipt_connmark.o. If unsure, say `N'.
+
+config IP_NF_MATCH_HASHLIMIT
+ tristate 'hashlimit match support'
+ depends on IP_NF_IPTABLES
+ help
+ This option adds a new iptables `hashlimit' match.
+
+ As opposed to `limit', this match dynamically creates a hash table
+ of limit buckets, based on your selection of source/destination
+ IP addresses and/or ports.
+
+ It enables you to express policies like `10kpps for any given
+ destination IP' or `500pps from any given source IP' with a single
+ iptables rule.
+
+# `filter', generic and specific targets
+config IP_NF_FILTER
+ tristate "Packet filtering"
+ depends on IP_NF_IPTABLES
+ help
+ Packet filtering defines a table `filter', which has a series of
+ rules for simple packet filtering at local input, forwarding and
+ local output. See the man page for iptables(8).
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_TARGET_REJECT
+ tristate "REJECT target support"
+ depends on IP_NF_FILTER
+ help
+ The REJECT target allows a filtering rule to specify that an ICMP
+ error should be issued in response to an incoming packet, rather
+ than silently being dropped.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_TARGET_LOG
+ tristate "LOG target support"
+ depends on IP_NF_IPTABLES
+ help
+ This option adds a `LOG' target, which allows you to create rules in
+ any iptables table which records the packet header to the syslog.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_TARGET_ULOG
+ tristate "ULOG target support"
+ depends on IP_NF_IPTABLES
+ ---help---
+ This option adds a `ULOG' target, which allows you to create rules in
+ any iptables table. The packet is passed to a userspace logging
+ daemon using netlink multicast sockets, unlike the LOG target,
+ whose output can only be viewed through syslog.
+
+ The appropriate userspace logging daemon (ulogd) may be obtained from
+ <http://www.gnumonks.org/projects/ulogd/>
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_TARGET_TCPMSS
+ tristate "TCPMSS target support"
+ depends on IP_NF_IPTABLES
+ ---help---
+ This option adds a `TCPMSS' target, which allows you to alter the
+ MSS value of TCP SYN packets, to control the maximum size for that
+ connection (usually limiting it to your outgoing interface's MTU
+ minus 40).
+
+ This is used to overcome criminally braindead ISPs or servers which
+ block ICMP Fragmentation Needed packets. The symptoms of this
+ problem are that everything works fine from your Linux
+ firewall/router, but machines behind it can never exchange large
+ packets:
+ 1) Web browsers connect, then hang with no data received.
+ 2) Small mail works fine, but large emails hang.
+ 3) ssh works fine, but scp hangs after initial handshaking.
+
+ Workaround: activate this option and add a rule to your firewall
+ configuration like:
+
+ iptables -A FORWARD -p tcp --tcp-flags SYN,RST SYN \
+ -j TCPMSS --clamp-mss-to-pmtu
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+# NAT + specific targets
+config IP_NF_NAT
+ tristate "Full NAT"
+ depends on IP_NF_IPTABLES && IP_NF_CONNTRACK
+ help
+ The Full NAT option allows masquerading, port forwarding and other
+ forms of full Network Address Port Translation. It is controlled by
+ the `nat' table in iptables: see the man page for iptables(8).
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_NAT_NEEDED
+ bool
+ depends on IP_NF_NAT != n
+ default y
+
+config IP_NF_TARGET_MASQUERADE
+ tristate "MASQUERADE target support"
+ depends on IP_NF_NAT
+ help
+ Masquerading is a special case of NAT: all outgoing connections are
+ changed to seem to come from a particular interface's address, and
+ if the interface goes down, those connections are lost. This is
+ only useful for dialup accounts with dynamic IP address (ie. your IP
+ address will be different on next dialup).
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_TARGET_REDIRECT
+ tristate "REDIRECT target support"
+ depends on IP_NF_NAT
+ help
+ REDIRECT is a special case of NAT: all incoming connections are
+ mapped onto the incoming interface's address, causing the packets to
+ come to the local machine instead of passing through. This is
+ useful for transparent proxies.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_TARGET_NETMAP
+ tristate "NETMAP target support"
+ depends on IP_NF_NAT
+ help
+ NETMAP is an implementation of static 1:1 NAT mapping of network
+ addresses. It maps the network address part, while keeping the host
+ address part intact. It is similar to Fast NAT, except that
+ Netfilter's connection tracking doesn't work well with Fast NAT.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_TARGET_SAME
+ tristate "SAME target support"
+ depends on IP_NF_NAT
+ help
+ This option adds a `SAME' target, which works like the standard SNAT
+ target, but attempts to give clients the same IP for all connections.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_NAT_SNMP_BASIC
+ tristate "Basic SNMP-ALG support (EXPERIMENTAL)"
+ depends on EXPERIMENTAL && IP_NF_NAT
+ ---help---
+
+ This module implements an Application Layer Gateway (ALG) for
+ SNMP payloads. In conjunction with NAT, it allows a network
+ management system to access multiple private networks with
+ conflicting addresses. It works by modifying IP addresses
+ inside SNMP payloads to match IP-layer NAT mapping.
+
+ This is the "basic" form of SNMP-ALG, as described in RFC 2962
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_NAT_IRC
+ tristate
+ depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
+ default IP_NF_NAT if IP_NF_IRC=y
+ default m if IP_NF_IRC=m
+
+# If they want FTP, set to $CONFIG_IP_NF_NAT (m or y),
+# or $CONFIG_IP_NF_FTP (m or y), whichever is weaker. Argh.
+config IP_NF_NAT_FTP
+ tristate
+ depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
+ default IP_NF_NAT if IP_NF_FTP=y
+ default m if IP_NF_FTP=m
+
+config IP_NF_NAT_TFTP
+ tristate
+ depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
+ default IP_NF_NAT if IP_NF_TFTP=y
+ default m if IP_NF_TFTP=m
+
+config IP_NF_NAT_AMANDA
+ tristate
+ depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
+ default IP_NF_NAT if IP_NF_AMANDA=y
+ default m if IP_NF_AMANDA=m
+
+# mangle + specific targets
+config IP_NF_MANGLE
+ tristate "Packet mangling"
+ depends on IP_NF_IPTABLES
+ help
+ This option adds a `mangle' table to iptables: see the man page for
+ iptables(8). This table is used for various packet alterations
+ which can affect how the packet is routed.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_TARGET_TOS
+ tristate "TOS target support"
+ depends on IP_NF_MANGLE
+ help
+ This option adds a `TOS' target, which allows you to create rules in
+ the `mangle' table which alter the Type Of Service field of an IP
+ packet prior to routing.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_TARGET_ECN
+ tristate "ECN target support"
+ depends on IP_NF_MANGLE
+ ---help---
+ This option adds an `ECN' target, which can be used in the iptables mangle
+ table.
+
+ You can use this target to remove the ECN bits from the IPv4 header of
+ an IP packet. This is particularly useful, if you need to work around
+ existing ECN blackholes on the internet, but don't want to disable
+ ECN support in general.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_TARGET_DSCP
+ tristate "DSCP target support"
+ depends on IP_NF_MANGLE
+ help
+ This option adds a `DSCP' target, which allows you to alter the
+ value of the IPv4 header DSCP field (DSCP codepoint).
+
+ The DSCP codepoint can have any value between 0x0 and 0x3f.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_TARGET_MARK
+ tristate "MARK target support"
+ depends on IP_NF_MANGLE
+ help
+ This option adds a `MARK' target, which allows you to create rules
+ in the `mangle' table which alter the netfilter mark (nfmark) field
+ associated with the packet prior to routing. This can change
+ the routing method (see `Use netfilter MARK value as routing
+ key') and can also be used by other subsystems to change their
+ behavior.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_TARGET_CLASSIFY
+ tristate "CLASSIFY target support"
+ depends on IP_NF_MANGLE
+ help
+ This option adds a `CLASSIFY' target, which enables the user to set
+ the priority of a packet. Some qdiscs can use this value for
+ classification, among these are:
+
+ atm, cbq, dsmark, pfifo_fast, htb, prio
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_TARGET_CONNMARK
+ tristate 'CONNMARK target support'
+ depends on IP_NF_CONNTRACK_MARK && IP_NF_MANGLE
+ help
+ This option adds a `CONNMARK' target, which allows one to manipulate
+ the connection mark value. Similar to the MARK target, but
+ affects the connection mark value rather than the packet mark value.
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/modules.txt>. The module will be called
+ ipt_CONNMARK.o. If unsure, say `N'.
+
+config IP_NF_TARGET_CLUSTERIP
+ tristate "CLUSTERIP target support (EXPERIMENTAL)"
+ depends on IP_NF_CONNTRACK_MARK && IP_NF_IPTABLES && EXPERIMENTAL
+ help
+ The CLUSTERIP target allows you to build load-balancing clusters of
+ network servers without having a dedicated load-balancing
+ router/server/switch.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+# raw + specific targets
+config IP_NF_RAW
+ tristate 'raw table support (required for NOTRACK/TRACE)'
+ depends on IP_NF_IPTABLES
+ help
+ This option adds a `raw' table to iptables. This table is the very
+ first in the netfilter framework and hooks in at the PREROUTING
+ and OUTPUT chains.
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/modules.txt>. If unsure, say `N'.
+
+config IP_NF_TARGET_NOTRACK
+ tristate 'NOTRACK target support'
+ depends on IP_NF_RAW
+ depends on IP_NF_CONNTRACK
+ help
+ The NOTRACK target allows rules to specify that selected
+ packets should *not* enter the conntrack/NAT
+ subsystem, with all the consequences (no ICMP error tracking,
+ no protocol helpers for the selected packets).
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/modules.txt>. If unsure, say `N'.
+
+
+# ARP tables
+config IP_NF_ARPTABLES
+ tristate "ARP tables support"
+ help
+ arptables is a general, extensible packet identification framework.
+ The ARP packet filtering and mangling (manipulation) subsystems
+ use this: say Y or M here if you want to use either of those.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_ARPFILTER
+ tristate "ARP packet filtering"
+ depends on IP_NF_ARPTABLES
+ help
+ ARP packet filtering defines a table `filter', which has a series of
+ rules for simple ARP packet filtering at local input and
+ local output. On a bridge, you can also specify filtering rules
+ for forwarded ARP packets. See the man page for arptables(8).
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_ARP_MANGLE
+ tristate "ARP payload mangling"
+ depends on IP_NF_ARPTABLES
+ help
+ Allows altering the ARP packet payload: source and destination
+ hardware and network addresses.
+
+endmenu
+
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
new file mode 100644
index 00000000000..45796d5924d
--- /dev/null
+++ b/net/ipv4/netfilter/Makefile
@@ -0,0 +1,89 @@
+#
+# Makefile for the netfilter modules on top of IPv4.
+#
+
+# objects for the standalone - connection tracking / NAT
+ip_conntrack-objs := ip_conntrack_standalone.o ip_conntrack_core.o ip_conntrack_proto_generic.o ip_conntrack_proto_tcp.o ip_conntrack_proto_udp.o ip_conntrack_proto_icmp.o
+iptable_nat-objs := ip_nat_standalone.o ip_nat_rule.o ip_nat_core.o ip_nat_helper.o ip_nat_proto_unknown.o ip_nat_proto_tcp.o ip_nat_proto_udp.o ip_nat_proto_icmp.o
+
+# connection tracking
+obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o
+
+# SCTP protocol connection tracking
+obj-$(CONFIG_IP_NF_CT_PROTO_SCTP) += ip_conntrack_proto_sctp.o
+
+# connection tracking helpers
+obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o
+obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o
+obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o
+obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o
+
+# NAT helpers
+obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o
+obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o
+obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o
+obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o
+
+# generic IP tables
+obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
+
+# the three instances of ip_tables
+obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
+obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o
+obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o
+obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
+
+# matches
+obj-$(CONFIG_IP_NF_MATCH_HELPER) += ipt_helper.o
+obj-$(CONFIG_IP_NF_MATCH_LIMIT) += ipt_limit.o
+obj-$(CONFIG_IP_NF_MATCH_HASHLIMIT) += ipt_hashlimit.o
+obj-$(CONFIG_IP_NF_MATCH_SCTP) += ipt_sctp.o
+obj-$(CONFIG_IP_NF_MATCH_MARK) += ipt_mark.o
+obj-$(CONFIG_IP_NF_MATCH_MAC) += ipt_mac.o
+obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o
+obj-$(CONFIG_IP_NF_MATCH_PKTTYPE) += ipt_pkttype.o
+obj-$(CONFIG_IP_NF_MATCH_MULTIPORT) += ipt_multiport.o
+obj-$(CONFIG_IP_NF_MATCH_OWNER) += ipt_owner.o
+obj-$(CONFIG_IP_NF_MATCH_TOS) += ipt_tos.o
+obj-$(CONFIG_IP_NF_MATCH_RECENT) += ipt_recent.o
+obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o
+obj-$(CONFIG_IP_NF_MATCH_DSCP) += ipt_dscp.o
+obj-$(CONFIG_IP_NF_MATCH_AH_ESP) += ipt_ah.o ipt_esp.o
+obj-$(CONFIG_IP_NF_MATCH_LENGTH) += ipt_length.o
+obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o
+obj-$(CONFIG_IP_NF_MATCH_STATE) += ipt_state.o
+obj-$(CONFIG_IP_NF_MATCH_CONNMARK) += ipt_connmark.o
+obj-$(CONFIG_IP_NF_MATCH_CONNTRACK) += ipt_conntrack.o
+obj-$(CONFIG_IP_NF_MATCH_TCPMSS) += ipt_tcpmss.o
+obj-$(CONFIG_IP_NF_MATCH_REALM) += ipt_realm.o
+obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o
+obj-$(CONFIG_IP_NF_MATCH_PHYSDEV) += ipt_physdev.o
+obj-$(CONFIG_IP_NF_MATCH_COMMENT) += ipt_comment.o
+
+# targets
+obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
+obj-$(CONFIG_IP_NF_TARGET_TOS) += ipt_TOS.o
+obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
+obj-$(CONFIG_IP_NF_TARGET_DSCP) += ipt_DSCP.o
+obj-$(CONFIG_IP_NF_TARGET_MARK) += ipt_MARK.o
+obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
+obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o
+obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o
+obj-$(CONFIG_IP_NF_TARGET_SAME) += ipt_SAME.o
+obj-$(CONFIG_IP_NF_TARGET_CLASSIFY) += ipt_CLASSIFY.o
+obj-$(CONFIG_IP_NF_NAT_SNMP_BASIC) += ip_nat_snmp_basic.o
+obj-$(CONFIG_IP_NF_TARGET_LOG) += ipt_LOG.o
+obj-$(CONFIG_IP_NF_TARGET_CONNMARK) += ipt_CONNMARK.o
+obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
+obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o
+obj-$(CONFIG_IP_NF_TARGET_NOTRACK) += ipt_NOTRACK.o
+obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
+
+# generic ARP tables
+obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o
+obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o
+
+# just filtering instance of ARP tables for now
+obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o
+
+obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
new file mode 100644
index 00000000000..df79f5ed6a0
--- /dev/null
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -0,0 +1,1333 @@
+/*
+ * Packet matching code for ARP packets.
+ *
+ * Based heavily, if not almost entirely, upon ip_tables.c framework.
+ *
+ * Some ARP specific bits are:
+ *
+ * Copyright (C) 2002 David S. Miller (davem@redhat.com)
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/kmod.h>
+#include <linux/vmalloc.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <asm/uaccess.h>
+#include <asm/semaphore.h>
+
+#include <linux/netfilter_arp/arp_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("David S. Miller <davem@redhat.com>");
+MODULE_DESCRIPTION("arptables core");
+
+/*#define DEBUG_ARP_TABLES*/
+/*#define DEBUG_ARP_TABLES_USER*/
+
+#ifdef DEBUG_ARP_TABLES
+#define dprintf(format, args...) printk(format , ## args)
+#else
+#define dprintf(format, args...)
+#endif
+
+#ifdef DEBUG_ARP_TABLES_USER
+#define duprintf(format, args...) printk(format , ## args)
+#else
+#define duprintf(format, args...)
+#endif
+
+#ifdef CONFIG_NETFILTER_DEBUG
+#define ARP_NF_ASSERT(x) \
+do { \
+ if (!(x)) \
+ printk("ARP_NF_ASSERT: %s:%s:%u\n", \
+ __FUNCTION__, __FILE__, __LINE__); \
+} while(0)
+#else
+#define ARP_NF_ASSERT(x)
+#endif
+#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))
+
+static DECLARE_MUTEX(arpt_mutex);
+
+#define ASSERT_READ_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0)
+#define ASSERT_WRITE_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0)
+#include <linux/netfilter_ipv4/lockhelp.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+struct arpt_table_info {
+ unsigned int size;
+ unsigned int number;
+ unsigned int initial_entries;
+ unsigned int hook_entry[NF_ARP_NUMHOOKS];
+ unsigned int underflow[NF_ARP_NUMHOOKS];
+ char entries[0] __attribute__((aligned(SMP_CACHE_BYTES)));
+};
+
+static LIST_HEAD(arpt_target);
+static LIST_HEAD(arpt_tables);
+#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
+
+#ifdef CONFIG_SMP
+#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
+#else
+#define TABLE_OFFSET(t,p) 0
+#endif
+
+static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
+ char *hdr_addr, int len)
+{
+ int i, ret;
+
+ if (len > ARPT_DEV_ADDR_LEN_MAX)
+ len = ARPT_DEV_ADDR_LEN_MAX;
+
+ ret = 0;
+ for (i = 0; i < len; i++)
+ ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i];
+
+ return (ret != 0);
+}
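arp_devaddr_compare() uses a branch-free, masked comparison: it ORs together (addr XOR want) AND mask for every byte, so the accumulator is zero exactly when all masked bytes agree. A user-space sketch of the same trick, with made-up MAC-sized buffers:

#include <stdio.h>

/* Accumulate the masked difference over every byte; the result is zero
 * only if all masked bytes are equal. */
static int masked_mismatch(const unsigned char *addr,
                           const unsigned char *want,
                           const unsigned char *mask, int len)
{
        int i, ret = 0;

        for (i = 0; i < len; i++)
                ret |= (addr[i] ^ want[i]) & mask[i];

        return ret != 0;        /* non-zero => some masked byte differs */
}

int main(void)
{
        unsigned char hw[6]   = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };
        unsigned char want[6] = { 0x00, 0x11, 0x22, 0x00, 0x00, 0x00 };
        unsigned char mask[6] = { 0xff, 0xff, 0xff, 0x00, 0x00, 0x00 };

        printf("mismatch: %d\n", masked_mismatch(hw, want, mask, 6)); /* 0 */
        return 0;
}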
+
+/* Returns whether packet matches rule or not. */
+static inline int arp_packet_match(const struct arphdr *arphdr,
+ struct net_device *dev,
+ const char *indev,
+ const char *outdev,
+ const struct arpt_arp *arpinfo)
+{
+ char *arpptr = (char *)(arphdr + 1);
+ char *src_devaddr, *tgt_devaddr;
+ u32 src_ipaddr, tgt_ipaddr;
+ int i, ret;
+
+#define FWINV(bool,invflg) ((bool) ^ !!(arpinfo->invflags & invflg))
+
+ if (FWINV((arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop,
+ ARPT_INV_ARPOP)) {
+ dprintf("ARP operation field mismatch.\n");
+ dprintf("ar_op: %04x info->arpop: %04x info->arpop_mask: %04x\n",
+ arphdr->ar_op, arpinfo->arpop, arpinfo->arpop_mask);
+ return 0;
+ }
+
+ if (FWINV((arphdr->ar_hrd & arpinfo->arhrd_mask) != arpinfo->arhrd,
+ ARPT_INV_ARPHRD)) {
+ dprintf("ARP hardware address format mismatch.\n");
+ dprintf("ar_hrd: %04x info->arhrd: %04x info->arhrd_mask: %04x\n",
+ arphdr->ar_hrd, arpinfo->arhrd, arpinfo->arhrd_mask);
+ return 0;
+ }
+
+ if (FWINV((arphdr->ar_pro & arpinfo->arpro_mask) != arpinfo->arpro,
+ ARPT_INV_ARPPRO)) {
+ dprintf("ARP protocol address format mismatch.\n");
+ dprintf("ar_pro: %04x info->arpro: %04x info->arpro_mask: %04x\n",
+ arphdr->ar_pro, arpinfo->arpro, arpinfo->arpro_mask);
+ return 0;
+ }
+
+ if (FWINV((arphdr->ar_hln & arpinfo->arhln_mask) != arpinfo->arhln,
+ ARPT_INV_ARPHLN)) {
+ dprintf("ARP hardware address length mismatch.\n");
+ dprintf("ar_hln: %02x info->arhln: %02x info->arhln_mask: %02x\n",
+ arphdr->ar_hln, arpinfo->arhln, arpinfo->arhln_mask);
+ return 0;
+ }
+
+ src_devaddr = arpptr;
+ arpptr += dev->addr_len;
+ memcpy(&src_ipaddr, arpptr, sizeof(u32));
+ arpptr += sizeof(u32);
+ tgt_devaddr = arpptr;
+ arpptr += dev->addr_len;
+ memcpy(&tgt_ipaddr, arpptr, sizeof(u32));
+
+ if (FWINV(arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr, dev->addr_len),
+ ARPT_INV_SRCDEVADDR) ||
+ FWINV(arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr, dev->addr_len),
+ ARPT_INV_TGTDEVADDR)) {
+ dprintf("Source or target device address mismatch.\n");
+
+ return 0;
+ }
+
+ if (FWINV((src_ipaddr & arpinfo->smsk.s_addr) != arpinfo->src.s_addr,
+ ARPT_INV_SRCIP) ||
+ FWINV(((tgt_ipaddr & arpinfo->tmsk.s_addr) != arpinfo->tgt.s_addr),
+ ARPT_INV_TGTIP)) {
+ dprintf("Source or target IP address mismatch.\n");
+
+ dprintf("SRC: %u.%u.%u.%u. Mask: %u.%u.%u.%u. Target: %u.%u.%u.%u.%s\n",
+ NIPQUAD(src_ipaddr),
+ NIPQUAD(arpinfo->smsk.s_addr),
+ NIPQUAD(arpinfo->src.s_addr),
+ arpinfo->invflags & ARPT_INV_SRCIP ? " (INV)" : "");
+ dprintf("TGT: %u.%u.%u.%u Mask: %u.%u.%u.%u Target: %u.%u.%u.%u.%s\n",
+ NIPQUAD(tgt_ipaddr),
+ NIPQUAD(arpinfo->tmsk.s_addr),
+ NIPQUAD(arpinfo->tgt.s_addr),
+ arpinfo->invflags & ARPT_INV_TGTIP ? " (INV)" : "");
+ return 0;
+ }
+
+ /* Look for ifname matches. */
+ for (i = 0, ret = 0; i < IFNAMSIZ; i++) {
+ ret |= (indev[i] ^ arpinfo->iniface[i])
+ & arpinfo->iniface_mask[i];
+ }
+
+ if (FWINV(ret != 0, ARPT_INV_VIA_IN)) {
+ dprintf("VIA in mismatch (%s vs %s).%s\n",
+ indev, arpinfo->iniface,
+ arpinfo->invflags&ARPT_INV_VIA_IN ?" (INV)":"");
+ return 0;
+ }
+
+ for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) {
+ unsigned long odev;
+ memcpy(&odev, outdev + i*sizeof(unsigned long),
+ sizeof(unsigned long));
+ ret |= (odev
+ ^ ((const unsigned long *)arpinfo->outiface)[i])
+ & ((const unsigned long *)arpinfo->outiface_mask)[i];
+ }
+
+ if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) {
+ dprintf("VIA out mismatch (%s vs %s).%s\n",
+ outdev, arpinfo->outiface,
+ arpinfo->invflags&ARPT_INV_VIA_OUT ?" (INV)":"");
+ return 0;
+ }
+
+ return 1;
+}
+
+static inline int arp_checkentry(const struct arpt_arp *arp)
+{
+ if (arp->flags & ~ARPT_F_MASK) {
+ duprintf("Unknown flag bits set: %08X\n",
+ arp->flags & ~ARPT_F_MASK);
+ return 0;
+ }
+ if (arp->invflags & ~ARPT_INV_MASK) {
+ duprintf("Unknown invflag bits set: %08X\n",
+ arp->invflags & ~ARPT_INV_MASK);
+ return 0;
+ }
+
+ return 1;
+}
+
+static unsigned int arpt_error(struct sk_buff **pskb,
+ unsigned int hooknum,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *targinfo,
+ void *userinfo)
+{
+ if (net_ratelimit())
+ printk("arp_tables: error: '%s'\n", (char *)targinfo);
+
+ return NF_DROP;
+}
+
+static inline struct arpt_entry *get_entry(void *base, unsigned int offset)
+{
+ return (struct arpt_entry *)(base + offset);
+}
+
+unsigned int arpt_do_table(struct sk_buff **pskb,
+ unsigned int hook,
+ const struct net_device *in,
+ const struct net_device *out,
+ struct arpt_table *table,
+ void *userdata)
+{
+ static const char nulldevname[IFNAMSIZ];
+ unsigned int verdict = NF_DROP;
+ struct arphdr *arp;
+ int hotdrop = 0;
+ struct arpt_entry *e, *back;
+ const char *indev, *outdev;
+ void *table_base;
+
+ /* ARP header, plus 2 device addresses, plus 2 IP addresses. */
+ if (!pskb_may_pull((*pskb), (sizeof(struct arphdr) +
+ (2 * (*pskb)->dev->addr_len) +
+ (2 * sizeof(u32)))))
+ return NF_DROP;
+
+ indev = in ? in->name : nulldevname;
+ outdev = out ? out->name : nulldevname;
+
+ read_lock_bh(&table->lock);
+ table_base = (void *)table->private->entries
+ + TABLE_OFFSET(table->private,
+ smp_processor_id());
+ e = get_entry(table_base, table->private->hook_entry[hook]);
+ back = get_entry(table_base, table->private->underflow[hook]);
+
+ arp = (*pskb)->nh.arph;
+ do {
+ if (arp_packet_match(arp, (*pskb)->dev, indev, outdev, &e->arp)) {
+ struct arpt_entry_target *t;
+ int hdr_len;
+
+ hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
+ (2 * (*pskb)->dev->addr_len);
+ ADD_COUNTER(e->counters, hdr_len, 1);
+
+ t = arpt_get_target(e);
+
+ /* Standard target? */
+ if (!t->u.kernel.target->target) {
+ int v;
+
+ v = ((struct arpt_standard_target *)t)->verdict;
+ if (v < 0) {
+ /* Pop from stack? */
+ if (v != ARPT_RETURN) {
+ verdict = (unsigned)(-v) - 1;
+ break;
+ }
+ e = back;
+ back = get_entry(table_base,
+ back->comefrom);
+ continue;
+ }
+ if (table_base + v
+ != (void *)e + e->next_offset) {
+ /* Save old back ptr in next entry */
+ struct arpt_entry *next
+ = (void *)e + e->next_offset;
+ next->comefrom =
+ (void *)back - table_base;
+
+ /* set back pointer to next entry */
+ back = next;
+ }
+
+ e = get_entry(table_base, v);
+ } else {
+ /* Targets which reenter must return
+ * abs. verdicts
+ */
+ verdict = t->u.kernel.target->target(pskb,
+ hook,
+ in, out,
+ t->data,
+ userdata);
+
+ /* Target might have changed stuff. */
+ arp = (*pskb)->nh.arph;
+
+ if (verdict == ARPT_CONTINUE)
+ e = (void *)e + e->next_offset;
+ else
+ /* Verdict */
+ break;
+ }
+ } else {
+ e = (void *)e + e->next_offset;
+ }
+ } while (!hotdrop);
+ read_unlock_bh(&table->lock);
+
+ if (hotdrop)
+ return NF_DROP;
+ else
+ return verdict;
+}
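The standard-target handling in arpt_do_table() relies on the usual *tables verdict encoding: non-negative values are jump offsets into the table, while netfilter verdicts are stored as -verdict - 1 so they occupy the negative range. A small sketch of the encode/decode pair, assuming the conventional NF_DROP == 0 and NF_ACCEPT == 1 values from the netfilter headers:

#include <stdio.h>

/* Assumed to mirror the netfilter verdict numbers of this era. */
#define NF_DROP   0
#define NF_ACCEPT 1

static int encode(int nf_verdict)      { return -nf_verdict - 1; }
static unsigned decode(int v)          { return (unsigned)(-v) - 1; }

int main(void)
{
        int v_drop   = encode(NF_DROP);    /* -1 */
        int v_accept = encode(NF_ACCEPT);  /* -2 */

        printf("DROP   -> %d -> %u\n", v_drop,   decode(v_drop));
        printf("ACCEPT -> %d -> %u\n", v_accept, decode(v_accept));
        return 0;
}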
+
+static inline void *find_inlist_lock_noload(struct list_head *head,
+ const char *name,
+ int *error,
+ struct semaphore *mutex)
+{
+ void *ret;
+
+ *error = down_interruptible(mutex);
+ if (*error != 0)
+ return NULL;
+
+ ret = list_named_find(head, name);
+ if (!ret) {
+ *error = -ENOENT;
+ up(mutex);
+ }
+ return ret;
+}
+
+#ifndef CONFIG_KMOD
+#define find_inlist_lock(h,n,p,e,m) find_inlist_lock_noload((h),(n),(e),(m))
+#else
+static void *
+find_inlist_lock(struct list_head *head,
+ const char *name,
+ const char *prefix,
+ int *error,
+ struct semaphore *mutex)
+{
+ void *ret;
+
+ ret = find_inlist_lock_noload(head, name, error, mutex);
+ if (!ret) {
+ duprintf("find_inlist: loading `%s%s'.\n", prefix, name);
+ request_module("%s%s", prefix, name);
+ ret = find_inlist_lock_noload(head, name, error, mutex);
+ }
+
+ return ret;
+}
+#endif
+
+static inline struct arpt_table *arpt_find_table_lock(const char *name, int *error, struct semaphore *mutex)
+{
+ return find_inlist_lock(&arpt_tables, name, "arptable_", error, mutex);
+}
+
+static struct arpt_target *arpt_find_target_lock(const char *name, int *error, struct semaphore *mutex)
+{
+ return find_inlist_lock(&arpt_target, name, "arpt_", error, mutex);
+}
+
+/* All zeroes == unconditional rule. */
+static inline int unconditional(const struct arpt_arp *arp)
+{
+ unsigned int i;
+
+ for (i = 0; i < sizeof(*arp)/sizeof(__u32); i++)
+ if (((__u32 *)arp)[i])
+ return 0;
+
+ return 1;
+}
+
+/* Figures out from what hook each rule can be called: returns 0 if
+ * there are loops. Puts hook bitmask in comefrom.
+ */
+static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int valid_hooks)
+{
+ unsigned int hook;
+
+ /* No recursion; use packet counter to save back ptrs (reset
+ * to 0 as we leave), and comefrom to save source hook bitmask.
+ */
+ for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) {
+ unsigned int pos = newinfo->hook_entry[hook];
+ struct arpt_entry *e
+ = (struct arpt_entry *)(newinfo->entries + pos);
+
+ if (!(valid_hooks & (1 << hook)))
+ continue;
+
+ /* Set initial back pointer. */
+ e->counters.pcnt = pos;
+
+ for (;;) {
+ struct arpt_standard_target *t
+ = (void *)arpt_get_target(e);
+
+ if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) {
+ printk("arptables: loop hook %u pos %u %08X.\n",
+ hook, pos, e->comefrom);
+ return 0;
+ }
+ e->comefrom
+ |= ((1 << hook) | (1 << NF_ARP_NUMHOOKS));
+
+ /* Unconditional return/END. */
+ if (e->target_offset == sizeof(struct arpt_entry)
+ && (strcmp(t->target.u.user.name,
+ ARPT_STANDARD_TARGET) == 0)
+ && t->verdict < 0
+ && unconditional(&e->arp)) {
+ unsigned int oldpos, size;
+
+ /* Return: backtrack through the last
+ * big jump.
+ */
+ do {
+ e->comefrom ^= (1<<NF_ARP_NUMHOOKS);
+ oldpos = pos;
+ pos = e->counters.pcnt;
+ e->counters.pcnt = 0;
+
+ /* We're at the start. */
+ if (pos == oldpos)
+ goto next;
+
+ e = (struct arpt_entry *)
+ (newinfo->entries + pos);
+ } while (oldpos == pos + e->next_offset);
+
+ /* Move along one */
+ size = e->next_offset;
+ e = (struct arpt_entry *)
+ (newinfo->entries + pos + size);
+ e->counters.pcnt = pos;
+ pos += size;
+ } else {
+ int newpos = t->verdict;
+
+ if (strcmp(t->target.u.user.name,
+ ARPT_STANDARD_TARGET) == 0
+ && newpos >= 0) {
+ /* This is a jump; chase it. */
+ duprintf("Jump rule %u -> %u\n",
+ pos, newpos);
+ } else {
+ /* ... this is a fallthru */
+ newpos = pos + e->next_offset;
+ }
+ e = (struct arpt_entry *)
+ (newinfo->entries + newpos);
+ e->counters.pcnt = pos;
+ pos = newpos;
+ }
+ }
+ next:
+ duprintf("Finished chain %u\n", hook);
+ }
+ return 1;
+}
+
+static inline int standard_check(const struct arpt_entry_target *t,
+ unsigned int max_offset)
+{
+ struct arpt_standard_target *targ = (void *)t;
+
+ /* Check standard info. */
+ if (t->u.target_size
+ != ARPT_ALIGN(sizeof(struct arpt_standard_target))) {
+ duprintf("arpt_standard_check: target size %u != %Zu\n",
+ t->u.target_size,
+ ARPT_ALIGN(sizeof(struct arpt_standard_target)));
+ return 0;
+ }
+
+ if (targ->verdict >= 0
+ && targ->verdict > max_offset - sizeof(struct arpt_entry)) {
+ duprintf("arpt_standard_check: bad verdict (%i)\n",
+ targ->verdict);
+ return 0;
+ }
+
+ if (targ->verdict < -NF_MAX_VERDICT - 1) {
+ duprintf("arpt_standard_check: bad negative verdict (%i)\n",
+ targ->verdict);
+ return 0;
+ }
+ return 1;
+}
+
+static struct arpt_target arpt_standard_target;
+
+static inline int check_entry(struct arpt_entry *e, const char *name, unsigned int size,
+ unsigned int *i)
+{
+ struct arpt_entry_target *t;
+ struct arpt_target *target;
+ int ret;
+
+ if (!arp_checkentry(&e->arp)) {
+ duprintf("arp_tables: arp check failed %p %s.\n", e, name);
+ return -EINVAL;
+ }
+
+ t = arpt_get_target(e);
+ target = arpt_find_target_lock(t->u.user.name, &ret, &arpt_mutex);
+ if (!target) {
+ duprintf("check_entry: `%s' not found\n", t->u.user.name);
+ goto out;
+ }
+ if (!try_module_get((target->me))) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
+ t->u.kernel.target = target;
+ up(&arpt_mutex);
+
+ if (t->u.kernel.target == &arpt_standard_target) {
+ if (!standard_check(t, size)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ } else if (t->u.kernel.target->checkentry
+ && !t->u.kernel.target->checkentry(name, e, t->data,
+ t->u.target_size
+ - sizeof(*t),
+ e->comefrom)) {
+ module_put(t->u.kernel.target->me);
+ duprintf("arp_tables: check failed for `%s'.\n",
+ t->u.kernel.target->name);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ (*i)++;
+ return 0;
+
+out_unlock:
+ up(&arpt_mutex);
+out:
+ return ret;
+}
+
+static inline int check_entry_size_and_hooks(struct arpt_entry *e,
+ struct arpt_table_info *newinfo,
+ unsigned char *base,
+ unsigned char *limit,
+ const unsigned int *hook_entries,
+ const unsigned int *underflows,
+ unsigned int *i)
+{
+ unsigned int h;
+
+ if ((unsigned long)e % __alignof__(struct arpt_entry) != 0
+ || (unsigned char *)e + sizeof(struct arpt_entry) >= limit) {
+ duprintf("Bad offset %p\n", e);
+ return -EINVAL;
+ }
+
+ if (e->next_offset
+ < sizeof(struct arpt_entry) + sizeof(struct arpt_entry_target)) {
+ duprintf("checking: element %p size %u\n",
+ e, e->next_offset);
+ return -EINVAL;
+ }
+
+ /* Check hooks & underflows */
+ for (h = 0; h < NF_ARP_NUMHOOKS; h++) {
+ if ((unsigned char *)e - base == hook_entries[h])
+ newinfo->hook_entry[h] = hook_entries[h];
+ if ((unsigned char *)e - base == underflows[h])
+ newinfo->underflow[h] = underflows[h];
+ }
+
+ /* FIXME: underflows must be unconditional, standard verdicts
+ < 0 (not ARPT_RETURN). --RR */
+
+ /* Clear counters and comefrom */
+ e->counters = ((struct arpt_counters) { 0, 0 });
+ e->comefrom = 0;
+
+ (*i)++;
+ return 0;
+}
+
+static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i)
+{
+ struct arpt_entry_target *t;
+
+ if (i && (*i)-- == 0)
+ return 1;
+
+ t = arpt_get_target(e);
+ if (t->u.kernel.target->destroy)
+ t->u.kernel.target->destroy(t->data,
+ t->u.target_size - sizeof(*t));
+ module_put(t->u.kernel.target->me);
+ return 0;
+}
+
+/* Checks and translates the user-supplied table segment (held in
+ * newinfo).
+ */
+static int translate_table(const char *name,
+ unsigned int valid_hooks,
+ struct arpt_table_info *newinfo,
+ unsigned int size,
+ unsigned int number,
+ const unsigned int *hook_entries,
+ const unsigned int *underflows)
+{
+ unsigned int i;
+ int ret;
+
+ newinfo->size = size;
+ newinfo->number = number;
+
+ /* Init all hooks to impossible value. */
+ for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+ newinfo->hook_entry[i] = 0xFFFFFFFF;
+ newinfo->underflow[i] = 0xFFFFFFFF;
+ }
+
+ duprintf("translate_table: size %u\n", newinfo->size);
+ i = 0;
+
+ /* Walk through entries, checking offsets. */
+ ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ check_entry_size_and_hooks,
+ newinfo,
+ newinfo->entries,
+ newinfo->entries + size,
+ hook_entries, underflows, &i);
+ duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret);
+ if (ret != 0)
+ return ret;
+
+ if (i != number) {
+ duprintf("translate_table: %u not %u entries\n",
+ i, number);
+ return -EINVAL;
+ }
+
+ /* Check hooks all assigned */
+ for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+ /* Only hooks which are valid */
+ if (!(valid_hooks & (1 << i)))
+ continue;
+ if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
+ duprintf("Invalid hook entry %u %u\n",
+ i, hook_entries[i]);
+ return -EINVAL;
+ }
+ if (newinfo->underflow[i] == 0xFFFFFFFF) {
+ duprintf("Invalid underflow %u %u\n",
+ i, underflows[i]);
+ return -EINVAL;
+ }
+ }
+
+ if (!mark_source_chains(newinfo, valid_hooks)) {
+ duprintf("Looping hook\n");
+ return -ELOOP;
+ }
+
+ /* Finally, each sanity check must pass */
+ i = 0;
+ ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ check_entry, name, size, &i);
+
+ if (ret != 0) {
+ ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ cleanup_entry, &i);
+ return ret;
+ }
+
+ /* And one copy for every other CPU */
+ for (i = 1; i < num_possible_cpus(); i++) {
+ memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i,
+ newinfo->entries,
+ SMP_ALIGN(newinfo->size));
+ }
+
+ return ret;
+}
+
+static struct arpt_table_info *replace_table(struct arpt_table *table,
+ unsigned int num_counters,
+ struct arpt_table_info *newinfo,
+ int *error)
+{
+ struct arpt_table_info *oldinfo;
+
+ /* Do the substitution. */
+ write_lock_bh(&table->lock);
+ /* Check inside lock: is the old number correct? */
+ if (num_counters != table->private->number) {
+ duprintf("num_counters != table->private->number (%u/%u)\n",
+ num_counters, table->private->number);
+ write_unlock_bh(&table->lock);
+ *error = -EAGAIN;
+ return NULL;
+ }
+ oldinfo = table->private;
+ table->private = newinfo;
+ newinfo->initial_entries = oldinfo->initial_entries;
+ write_unlock_bh(&table->lock);
+
+ return oldinfo;
+}
+
+/* Gets counters. */
+static inline int add_entry_to_counter(const struct arpt_entry *e,
+ struct arpt_counters total[],
+ unsigned int *i)
+{
+ ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
+
+ (*i)++;
+ return 0;
+}
+
+static void get_counters(const struct arpt_table_info *t,
+ struct arpt_counters counters[])
+{
+ unsigned int cpu;
+ unsigned int i;
+
+ for (cpu = 0; cpu < num_possible_cpus(); cpu++) {
+ i = 0;
+ ARPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu),
+ t->size,
+ add_entry_to_counter,
+ counters,
+ &i);
+ }
+}
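get_counters() folds each CPU's private copy of the rule counters into one snapshot by iterating the table once per CPU and accumulating byte and packet counts per rule. A self-contained sketch of that folding, with made-up CPU and rule dimensions:

#include <stdio.h>

#define NCPUS  4
#define NRULES 3

struct counter { unsigned long bcnt, pcnt; };

/* Per-CPU counter folding: every CPU keeps its own copy of the rule
 * counters, and a snapshot sums them rule by rule. */
static void fold_counters(struct counter percpu[NCPUS][NRULES],
                          struct counter total[NRULES])
{
        int cpu, rule;

        for (rule = 0; rule < NRULES; rule++)
                total[rule].bcnt = total[rule].pcnt = 0;

        for (cpu = 0; cpu < NCPUS; cpu++)
                for (rule = 0; rule < NRULES; rule++) {
                        total[rule].bcnt += percpu[cpu][rule].bcnt;
                        total[rule].pcnt += percpu[cpu][rule].pcnt;
                }
}

int main(void)
{
        struct counter percpu[NCPUS][NRULES] = {
                { { 100, 1 }, { 0, 0 }, { 60, 2 } },
                { {  40, 1 }, { 0, 0 }, {  0, 0 } },
        };
        struct counter total[NRULES];
        int rule;

        fold_counters(percpu, total);
        for (rule = 0; rule < NRULES; rule++)
                printf("rule %d: %lu bytes / %lu packets\n",
                       rule, total[rule].bcnt, total[rule].pcnt);
        return 0;
}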
+
+static int copy_entries_to_user(unsigned int total_size,
+ struct arpt_table *table,
+ void __user *userptr)
+{
+ unsigned int off, num, countersize;
+ struct arpt_entry *e;
+ struct arpt_counters *counters;
+ int ret = 0;
+
+ /* We need atomic snapshot of counters: rest doesn't change
+ * (other than comefrom, which userspace doesn't care
+ * about).
+ */
+ countersize = sizeof(struct arpt_counters) * table->private->number;
+ counters = vmalloc(countersize);
+
+ if (counters == NULL)
+ return -ENOMEM;
+
+ /* First, sum counters... */
+ memset(counters, 0, countersize);
+ write_lock_bh(&table->lock);
+ get_counters(table->private, counters);
+ write_unlock_bh(&table->lock);
+
+ /* ... then copy entire thing from CPU 0... */
+ if (copy_to_user(userptr, table->private->entries, total_size) != 0) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
+
+ /* FIXME: use iterator macros --RR */
+ /* ... then go back and fix counters and names */
+ for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
+ struct arpt_entry_target *t;
+
+ e = (struct arpt_entry *)(table->private->entries + off);
+ if (copy_to_user(userptr + off
+ + offsetof(struct arpt_entry, counters),
+ &counters[num],
+ sizeof(counters[num])) != 0) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
+
+ t = arpt_get_target(e);
+ if (copy_to_user(userptr + off + e->target_offset
+ + offsetof(struct arpt_entry_target,
+ u.user.name),
+ t->u.kernel.target->name,
+ strlen(t->u.kernel.target->name)+1) != 0) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
+ }
+
+ free_counters:
+ vfree(counters);
+ return ret;
+}
+
+static int get_entries(const struct arpt_get_entries *entries,
+ struct arpt_get_entries __user *uptr)
+{
+ int ret;
+ struct arpt_table *t;
+
+ t = arpt_find_table_lock(entries->name, &ret, &arpt_mutex);
+ if (t) {
+ duprintf("t->private->number = %u\n",
+ t->private->number);
+ if (entries->size == t->private->size)
+ ret = copy_entries_to_user(t->private->size,
+ t, uptr->entrytable);
+ else {
+ duprintf("get_entries: I've got %u not %u!\n",
+ t->private->size,
+ entries->size);
+ ret = -EINVAL;
+ }
+ up(&arpt_mutex);
+ } else
+ duprintf("get_entries: Can't find %s!\n",
+ entries->name);
+
+ return ret;
+}
+
+static int do_replace(void __user *user, unsigned int len)
+{
+ int ret;
+ struct arpt_replace tmp;
+ struct arpt_table *t;
+ struct arpt_table_info *newinfo, *oldinfo;
+ struct arpt_counters *counters;
+
+ if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ return -EFAULT;
+
+ /* Hack: Causes ipchains to give correct error msg --RR */
+ if (len != sizeof(tmp) + tmp.size)
+ return -ENOPROTOOPT;
+
+ /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */
+ if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
+ return -ENOMEM;
+
+ newinfo = vmalloc(sizeof(struct arpt_table_info)
+ + SMP_ALIGN(tmp.size) * num_possible_cpus());
+ if (!newinfo)
+ return -ENOMEM;
+
+ if (copy_from_user(newinfo->entries, user + sizeof(tmp),
+ tmp.size) != 0) {
+ ret = -EFAULT;
+ goto free_newinfo;
+ }
+
+ counters = vmalloc(tmp.num_counters * sizeof(struct arpt_counters));
+ if (!counters) {
+ ret = -ENOMEM;
+ goto free_newinfo;
+ }
+ memset(counters, 0, tmp.num_counters * sizeof(struct arpt_counters));
+
+ ret = translate_table(tmp.name, tmp.valid_hooks,
+ newinfo, tmp.size, tmp.num_entries,
+ tmp.hook_entry, tmp.underflow);
+ if (ret != 0)
+ goto free_newinfo_counters;
+
+ duprintf("arp_tables: Translated table\n");
+
+ t = arpt_find_table_lock(tmp.name, &ret, &arpt_mutex);
+ if (!t)
+ goto free_newinfo_counters_untrans;
+
+ /* You lied! */
+ if (tmp.valid_hooks != t->valid_hooks) {
+ duprintf("Valid hook crap: %08X vs %08X\n",
+ tmp.valid_hooks, t->valid_hooks);
+ ret = -EINVAL;
+ goto free_newinfo_counters_untrans_unlock;
+ }
+
+	/* Get a reference in advance; we're not allowed to fail later. */
+ if (!try_module_get(t->me)) {
+ ret = -EBUSY;
+ goto free_newinfo_counters_untrans_unlock;
+ }
+
+ oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret);
+ if (!oldinfo)
+ goto put_module;
+
+ /* Update module usage count based on number of rules */
+ duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n",
+ oldinfo->number, oldinfo->initial_entries, newinfo->number);
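+	/* Invariant: the table module holds one extra reference whenever the
+	 * live ruleset contains more than its built-in (initial) entries.
+	 * We already took one reference above; the two conditionals below
+	 * drop it and/or the old ruleset's reference so the invariant holds
+	 * for the new ruleset.
+	 */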
+ if ((oldinfo->number > oldinfo->initial_entries) ||
+ (newinfo->number <= oldinfo->initial_entries))
+ module_put(t->me);
+ if ((oldinfo->number > oldinfo->initial_entries) &&
+ (newinfo->number <= oldinfo->initial_entries))
+ module_put(t->me);
+
+ /* Get the old counters. */
+ get_counters(oldinfo, counters);
+ /* Decrease module usage counts and free resource */
+ ARPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL);
+ vfree(oldinfo);
+ if (copy_to_user(tmp.counters, counters,
+ sizeof(struct arpt_counters) * tmp.num_counters) != 0)
+ ret = -EFAULT;
+ vfree(counters);
+ up(&arpt_mutex);
+ return ret;
+
+ put_module:
+ module_put(t->me);
+ free_newinfo_counters_untrans_unlock:
+ up(&arpt_mutex);
+ free_newinfo_counters_untrans:
+ ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry, NULL);
+ free_newinfo_counters:
+ vfree(counters);
+ free_newinfo:
+ vfree(newinfo);
+ return ret;
+}
+
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK.
+ */
+static inline int add_counter_to_entry(struct arpt_entry *e,
+ const struct arpt_counters addme[],
+ unsigned int *i)
+{
+
+ ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+ (*i)++;
+ return 0;
+}
+
+static int do_add_counters(void __user *user, unsigned int len)
+{
+ unsigned int i;
+ struct arpt_counters_info tmp, *paddc;
+ struct arpt_table *t;
+ int ret;
+
+ if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ return -EFAULT;
+
+ if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct arpt_counters))
+ return -EINVAL;
+
+ paddc = vmalloc(len);
+ if (!paddc)
+ return -ENOMEM;
+
+ if (copy_from_user(paddc, user, len) != 0) {
+ ret = -EFAULT;
+ goto free;
+ }
+
+ t = arpt_find_table_lock(tmp.name, &ret, &arpt_mutex);
+ if (!t)
+ goto free;
+
+ write_lock_bh(&t->lock);
+ if (t->private->number != paddc->num_counters) {
+ ret = -EINVAL;
+ goto unlock_up_free;
+ }
+
+ i = 0;
+ ARPT_ENTRY_ITERATE(t->private->entries,
+ t->private->size,
+ add_counter_to_entry,
+ paddc->counters,
+ &i);
+ unlock_up_free:
+ write_unlock_bh(&t->lock);
+ up(&arpt_mutex);
+ free:
+ vfree(paddc);
+
+ return ret;
+}
+
+static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+{
+ int ret;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case ARPT_SO_SET_REPLACE:
+ ret = do_replace(user, len);
+ break;
+
+ case ARPT_SO_SET_ADD_COUNTERS:
+ ret = do_add_counters(user, len);
+ break;
+
+ default:
+ duprintf("do_arpt_set_ctl: unknown request %i\n", cmd);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+{
+ int ret;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case ARPT_SO_GET_INFO: {
+ char name[ARPT_TABLE_MAXNAMELEN];
+ struct arpt_table *t;
+
+ if (*len != sizeof(struct arpt_getinfo)) {
+ duprintf("length %u != %Zu\n", *len,
+ sizeof(struct arpt_getinfo));
+ ret = -EINVAL;
+ break;
+ }
+
+ if (copy_from_user(name, user, sizeof(name)) != 0) {
+ ret = -EFAULT;
+ break;
+ }
+ name[ARPT_TABLE_MAXNAMELEN-1] = '\0';
+ t = arpt_find_table_lock(name, &ret, &arpt_mutex);
+ if (t) {
+ struct arpt_getinfo info;
+
+ info.valid_hooks = t->valid_hooks;
+ memcpy(info.hook_entry, t->private->hook_entry,
+ sizeof(info.hook_entry));
+ memcpy(info.underflow, t->private->underflow,
+ sizeof(info.underflow));
+ info.num_entries = t->private->number;
+ info.size = t->private->size;
+ strcpy(info.name, name);
+
+ if (copy_to_user(user, &info, *len) != 0)
+ ret = -EFAULT;
+ else
+ ret = 0;
+
+ up(&arpt_mutex);
+ }
+ }
+ break;
+
+ case ARPT_SO_GET_ENTRIES: {
+ struct arpt_get_entries get;
+
+ if (*len < sizeof(get)) {
+ duprintf("get_entries: %u < %Zu\n", *len, sizeof(get));
+ ret = -EINVAL;
+ } else if (copy_from_user(&get, user, sizeof(get)) != 0) {
+ ret = -EFAULT;
+ } else if (*len != sizeof(struct arpt_get_entries) + get.size) {
+ duprintf("get_entries: %u != %Zu\n", *len,
+ sizeof(struct arpt_get_entries) + get.size);
+ ret = -EINVAL;
+ } else
+ ret = get_entries(&get, user);
+ break;
+ }
+
+ default:
+ duprintf("do_arpt_get_ctl: unknown request %i\n", cmd);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+/* Registration hooks for targets. */
+int arpt_register_target(struct arpt_target *target)
+{
+ int ret;
+
+ ret = down_interruptible(&arpt_mutex);
+ if (ret != 0)
+ return ret;
+
+ if (!list_named_insert(&arpt_target, target)) {
+ duprintf("arpt_register_target: `%s' already in list!\n",
+ target->name);
+ ret = -EINVAL;
+ }
+ up(&arpt_mutex);
+ return ret;
+}
+
+void arpt_unregister_target(struct arpt_target *target)
+{
+ down(&arpt_mutex);
+ LIST_DELETE(&arpt_target, target);
+ up(&arpt_mutex);
+}
+
+int arpt_register_table(struct arpt_table *table,
+ const struct arpt_replace *repl)
+{
+ int ret;
+ struct arpt_table_info *newinfo;
+ static struct arpt_table_info bootstrap
+ = { 0, 0, 0, { 0 }, { 0 }, { } };
+
+ newinfo = vmalloc(sizeof(struct arpt_table_info)
+ + SMP_ALIGN(repl->size) * num_possible_cpus());
+ if (!newinfo) {
+ ret = -ENOMEM;
+ return ret;
+ }
+ memcpy(newinfo->entries, repl->entries, repl->size);
+
+ ret = translate_table(table->name, table->valid_hooks,
+ newinfo, repl->size,
+ repl->num_entries,
+ repl->hook_entry,
+ repl->underflow);
+ duprintf("arpt_register_table: translate table gives %d\n", ret);
+ if (ret != 0) {
+ vfree(newinfo);
+ return ret;
+ }
+
+ ret = down_interruptible(&arpt_mutex);
+ if (ret != 0) {
+ vfree(newinfo);
+ return ret;
+ }
+
+ /* Don't autoload: we'd eat our tail... */
+ if (list_named_find(&arpt_tables, table->name)) {
+ ret = -EEXIST;
+ goto free_unlock;
+ }
+
+ /* Simplifies replace_table code. */
+ table->private = &bootstrap;
+ if (!replace_table(table, 0, newinfo, &ret))
+ goto free_unlock;
+
+ duprintf("table->private->number = %u\n",
+ table->private->number);
+
+ /* save number of initial entries */
+ table->private->initial_entries = table->private->number;
+
+ rwlock_init(&table->lock);
+ list_prepend(&arpt_tables, table);
+
+ unlock:
+ up(&arpt_mutex);
+ return ret;
+
+ free_unlock:
+ vfree(newinfo);
+ goto unlock;
+}
+
+void arpt_unregister_table(struct arpt_table *table)
+{
+ down(&arpt_mutex);
+ LIST_DELETE(&arpt_tables, table);
+ up(&arpt_mutex);
+
+ /* Decrease module usage counts and free resources */
+ ARPT_ENTRY_ITERATE(table->private->entries, table->private->size,
+ cleanup_entry, NULL);
+ vfree(table->private);
+}
+
+/* The built-in targets: standard (NULL) and error. */
+static struct arpt_target arpt_standard_target = {
+ .name = ARPT_STANDARD_TARGET,
+};
+
+static struct arpt_target arpt_error_target = {
+ .name = ARPT_ERROR_TARGET,
+ .target = arpt_error,
+};
+
+static struct nf_sockopt_ops arpt_sockopts = {
+ .pf = PF_INET,
+ .set_optmin = ARPT_BASE_CTL,
+ .set_optmax = ARPT_SO_SET_MAX+1,
+ .set = do_arpt_set_ctl,
+ .get_optmin = ARPT_BASE_CTL,
+ .get_optmax = ARPT_SO_GET_MAX+1,
+ .get = do_arpt_get_ctl,
+};
+
+#ifdef CONFIG_PROC_FS
+static inline int print_name(const struct arpt_table *t,
+ off_t start_offset, char *buffer, int length,
+ off_t *pos, unsigned int *count)
+{
+ if ((*count)++ >= start_offset) {
+ unsigned int namelen;
+
+ namelen = sprintf(buffer + *pos, "%s\n", t->name);
+ if (*pos + namelen > length) {
+ /* Stop iterating */
+ return 1;
+ }
+ *pos += namelen;
+ }
+ return 0;
+}
+
+static int arpt_get_tables(char *buffer, char **start, off_t offset, int length)
+{
+ off_t pos = 0;
+ unsigned int count = 0;
+
+ if (down_interruptible(&arpt_mutex) != 0)
+ return 0;
+
+ LIST_FIND(&arpt_tables, print_name, struct arpt_table *,
+ offset, buffer, length, &pos, &count);
+
+ up(&arpt_mutex);
+
+ /* `start' hack - see fs/proc/generic.c line ~105 */
+ *start=(char *)((unsigned long)count-offset);
+ return pos;
+}
+#endif /*CONFIG_PROC_FS*/
+
+static int __init init(void)
+{
+ int ret;
+
+	/* No one else will be downing the sem now, so we won't sleep */
+ down(&arpt_mutex);
+ list_append(&arpt_target, &arpt_standard_target);
+ list_append(&arpt_target, &arpt_error_target);
+ up(&arpt_mutex);
+
+ /* Register setsockopt */
+ ret = nf_register_sockopt(&arpt_sockopts);
+ if (ret < 0) {
+ duprintf("Unable to register sockopts.\n");
+ return ret;
+ }
+
+#ifdef CONFIG_PROC_FS
+ {
+ struct proc_dir_entry *proc;
+
+ proc = proc_net_create("arp_tables_names", 0, arpt_get_tables);
+ if (!proc) {
+ nf_unregister_sockopt(&arpt_sockopts);
+ return -ENOMEM;
+ }
+ proc->owner = THIS_MODULE;
+ }
+#endif
+
+ printk("arp_tables: (C) 2002 David S. Miller\n");
+ return 0;
+}
+
+static void __exit fini(void)
+{
+ nf_unregister_sockopt(&arpt_sockopts);
+#ifdef CONFIG_PROC_FS
+ proc_net_remove("arp_tables_names");
+#endif
+}
+
+EXPORT_SYMBOL(arpt_register_table);
+EXPORT_SYMBOL(arpt_unregister_table);
+EXPORT_SYMBOL(arpt_do_table);
+EXPORT_SYMBOL(arpt_register_target);
+EXPORT_SYMBOL(arpt_unregister_target);
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
new file mode 100644
index 00000000000..3e592ec8648
--- /dev/null
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -0,0 +1,104 @@
+/* module that allows mangling of the arp payload */
+#include <linux/module.h>
+#include <linux/netfilter_arp/arpt_mangle.h>
+#include <net/sock.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
+MODULE_DESCRIPTION("arptables arp payload mangle target");
+
+static unsigned int
+target(struct sk_buff **pskb, unsigned int hooknum, const struct net_device *in,
+ const struct net_device *out, const void *targinfo, void *userinfo)
+{
+ const struct arpt_mangle *mangle = targinfo;
+ struct arphdr *arp;
+ unsigned char *arpptr;
+ int pln, hln;
+
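+	/* We are about to rewrite the ARP payload in place, so make sure we
+	 * hold a private, writable copy of the skb first.
+	 */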
+ if (skb_shared(*pskb) || skb_cloned(*pskb)) {
+ struct sk_buff *nskb;
+
+ nskb = skb_copy(*pskb, GFP_ATOMIC);
+ if (!nskb)
+ return NF_DROP;
+ if ((*pskb)->sk)
+ skb_set_owner_w(nskb, (*pskb)->sk);
+ kfree_skb(*pskb);
+ *pskb = nskb;
+ }
+
+ arp = (*pskb)->nh.arph;
+ arpptr = (*pskb)->nh.raw + sizeof(*arp);
+ pln = arp->ar_pln;
+ hln = arp->ar_hln;
+ /* We assume that pln and hln were checked in the match */
+ if (mangle->flags & ARPT_MANGLE_SDEV) {
+ if (ARPT_DEV_ADDR_LEN_MAX < hln ||
+ (arpptr + hln > (**pskb).tail))
+ return NF_DROP;
+ memcpy(arpptr, mangle->src_devaddr, hln);
+ }
+ arpptr += hln;
+ if (mangle->flags & ARPT_MANGLE_SIP) {
+ if (ARPT_MANGLE_ADDR_LEN_MAX < pln ||
+ (arpptr + pln > (**pskb).tail))
+ return NF_DROP;
+ memcpy(arpptr, &mangle->u_s.src_ip, pln);
+ }
+ arpptr += pln;
+ if (mangle->flags & ARPT_MANGLE_TDEV) {
+ if (ARPT_DEV_ADDR_LEN_MAX < hln ||
+ (arpptr + hln > (**pskb).tail))
+ return NF_DROP;
+ memcpy(arpptr, mangle->tgt_devaddr, hln);
+ }
+ arpptr += hln;
+ if (mangle->flags & ARPT_MANGLE_TIP) {
+ if (ARPT_MANGLE_ADDR_LEN_MAX < pln ||
+ (arpptr + pln > (**pskb).tail))
+ return NF_DROP;
+ memcpy(arpptr, &mangle->u_t.tgt_ip, pln);
+ }
+ return mangle->target;
+}
+
+static int
+checkentry(const char *tablename, const struct arpt_entry *e, void *targinfo,
+ unsigned int targinfosize, unsigned int hook_mask)
+{
+ const struct arpt_mangle *mangle = targinfo;
+
+ if (mangle->flags & ~ARPT_MANGLE_MASK ||
+ !(mangle->flags & ARPT_MANGLE_MASK))
+ return 0;
+
+ if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT &&
+ mangle->target != ARPT_CONTINUE)
+ return 0;
+ return 1;
+}
+
+static struct arpt_target arpt_mangle_reg = {
+ .name = "mangle",
+ .target = target,
+ .checkentry = checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ if (arpt_register_target(&arpt_mangle_reg))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void __exit fini(void)
+{
+ arpt_unregister_target(&arpt_mangle_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
new file mode 100644
index 00000000000..0d759f5a4ef
--- /dev/null
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -0,0 +1,214 @@
+/*
+ * Filtering ARP tables module.
+ *
+ * Copyright (C) 2002 David S. Miller (davem@redhat.com)
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/netfilter_arp/arp_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("David S. Miller <davem@redhat.com>");
+MODULE_DESCRIPTION("arptables filter table");
+
+#define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \
+ (1 << NF_ARP_FORWARD))
+
+/* Standard entry. */
+struct arpt_standard
+{
+ struct arpt_entry entry;
+ struct arpt_standard_target target;
+};
+
+struct arpt_error_target
+{
+ struct arpt_entry_target target;
+ char errorname[ARPT_FUNCTION_MAXNAMELEN];
+};
+
+struct arpt_error
+{
+ struct arpt_entry entry;
+ struct arpt_error_target target;
+};
+
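+/* Bootstrap ruleset for the "filter" table: one standard entry with an
+ * ACCEPT verdict for each of the three ARP hooks, followed by the
+ * terminating ERROR entry.
+ */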
+static struct
+{
+ struct arpt_replace repl;
+ struct arpt_standard entries[3];
+ struct arpt_error term;
+} initial_table __initdata
+= { { "filter", FILTER_VALID_HOOKS, 4,
+ sizeof(struct arpt_standard) * 3 + sizeof(struct arpt_error),
+ { [NF_ARP_IN] = 0,
+ [NF_ARP_OUT] = sizeof(struct arpt_standard),
+ [NF_ARP_FORWARD] = 2 * sizeof(struct arpt_standard), },
+ { [NF_ARP_IN] = 0,
+ [NF_ARP_OUT] = sizeof(struct arpt_standard),
+ [NF_ARP_FORWARD] = 2 * sizeof(struct arpt_standard), },
+ 0, NULL, { } },
+ {
+ /* ARP_IN */
+ {
+ {
+ {
+ { 0 }, { 0 }, { 0 }, { 0 },
+ 0, 0,
+ { { 0, }, { 0, } },
+ { { 0, }, { 0, } },
+ 0, 0,
+ 0, 0,
+ 0, 0,
+ "", "", { 0 }, { 0 },
+ 0, 0
+ },
+ sizeof(struct arpt_entry),
+ sizeof(struct arpt_standard),
+ 0,
+ { 0, 0 }, { } },
+ { { { { ARPT_ALIGN(sizeof(struct arpt_standard_target)), "" } }, { } },
+ -NF_ACCEPT - 1 }
+ },
+ /* ARP_OUT */
+ {
+ {
+ {
+ { 0 }, { 0 }, { 0 }, { 0 },
+ 0, 0,
+ { { 0, }, { 0, } },
+ { { 0, }, { 0, } },
+ 0, 0,
+ 0, 0,
+ 0, 0,
+ "", "", { 0 }, { 0 },
+ 0, 0
+ },
+ sizeof(struct arpt_entry),
+ sizeof(struct arpt_standard),
+ 0,
+ { 0, 0 }, { } },
+ { { { { ARPT_ALIGN(sizeof(struct arpt_standard_target)), "" } }, { } },
+ -NF_ACCEPT - 1 }
+ },
+ /* ARP_FORWARD */
+ {
+ {
+ {
+ { 0 }, { 0 }, { 0 }, { 0 },
+ 0, 0,
+ { { 0, }, { 0, } },
+ { { 0, }, { 0, } },
+ 0, 0,
+ 0, 0,
+ 0, 0,
+ "", "", { 0 }, { 0 },
+ 0, 0
+ },
+ sizeof(struct arpt_entry),
+ sizeof(struct arpt_standard),
+ 0,
+ { 0, 0 }, { } },
+ { { { { ARPT_ALIGN(sizeof(struct arpt_standard_target)), "" } }, { } },
+ -NF_ACCEPT - 1 }
+ }
+ },
+ /* ERROR */
+ {
+ {
+ {
+ { 0 }, { 0 }, { 0 }, { 0 },
+ 0, 0,
+ { { 0, }, { 0, } },
+ { { 0, }, { 0, } },
+ 0, 0,
+ 0, 0,
+ 0, 0,
+ "", "", { 0 }, { 0 },
+ 0, 0
+ },
+ sizeof(struct arpt_entry),
+ sizeof(struct arpt_error),
+ 0,
+ { 0, 0 }, { } },
+ { { { { ARPT_ALIGN(sizeof(struct arpt_error_target)), ARPT_ERROR_TARGET } },
+ { } },
+ "ERROR"
+ }
+ }
+};
+
+static struct arpt_table packet_filter = {
+ .name = "filter",
+ .valid_hooks = FILTER_VALID_HOOKS,
+ .lock = RW_LOCK_UNLOCKED,
+ .private = NULL,
+ .me = THIS_MODULE,
+};
+
+/* The work comes in here from netfilter.c */
+static unsigned int arpt_hook(unsigned int hook,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ return arpt_do_table(pskb, hook, in, out, &packet_filter, NULL);
+}
+
+static struct nf_hook_ops arpt_ops[] = {
+ {
+ .hook = arpt_hook,
+ .owner = THIS_MODULE,
+ .pf = NF_ARP,
+ .hooknum = NF_ARP_IN,
+ },
+ {
+ .hook = arpt_hook,
+ .owner = THIS_MODULE,
+ .pf = NF_ARP,
+ .hooknum = NF_ARP_OUT,
+ },
+ {
+ .hook = arpt_hook,
+ .owner = THIS_MODULE,
+ .pf = NF_ARP,
+ .hooknum = NF_ARP_FORWARD,
+ },
+};
+
+static int __init init(void)
+{
+ int ret, i;
+
+ /* Register table */
+ ret = arpt_register_table(&packet_filter, &initial_table.repl);
+ if (ret < 0)
+ return ret;
+
+ for (i = 0; i < ARRAY_SIZE(arpt_ops); i++)
+ if ((ret = nf_register_hook(&arpt_ops[i])) < 0)
+ goto cleanup_hooks;
+ return ret;
+
+cleanup_hooks:
+ while (--i >= 0)
+ nf_unregister_hook(&arpt_ops[i]);
+
+ arpt_unregister_table(&packet_filter);
+ return ret;
+}
+
+static void __exit fini(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(arpt_ops); i++)
+ nf_unregister_hook(&arpt_ops[i]);
+
+ arpt_unregister_table(&packet_filter);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
new file mode 100644
index 00000000000..3dbddd06260
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -0,0 +1,167 @@
+/* Amanda extension for IP connection tracking, Version 0.2
+ * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca>
+ * based on HW's ip_conntrack_irc.c as well as other modules
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Module load syntax:
+ * insmod ip_conntrack_amanda.o [master_timeout=n]
+ *
+ * Where master_timeout is the timeout (in seconds) of the master
+ * connection (port 10080). This defaults to 5 minutes but if
+ * your clients take longer than 5 minutes to do their work
+ * before getting back to the Amanda server, you can increase
+ * this value.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/ip.h>
+#include <linux/moduleparam.h>
+#include <net/checksum.h>
+#include <net/udp.h>
+
+#include <linux/netfilter_ipv4/lockhelp.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_amanda.h>
+
+static unsigned int master_timeout = 300;
+
+MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
+MODULE_DESCRIPTION("Amanda connection tracking module");
+MODULE_LICENSE("GPL");
+module_param(master_timeout, int, 0600);
+MODULE_PARM_DESC(master_timeout, "timeout for the master connection");
+
+static char *conns[] = { "DATA ", "MESG ", "INDEX " };
+
+/* This is slow, but it's simple. --RR */
+static char amanda_buffer[65536];
+static DECLARE_LOCK(amanda_buffer_lock);
+
+unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb,
+ enum ip_conntrack_info ctinfo,
+ unsigned int matchoff,
+ unsigned int matchlen,
+ struct ip_conntrack_expect *exp);
+EXPORT_SYMBOL_GPL(ip_nat_amanda_hook);
+
+static int help(struct sk_buff **pskb,
+ struct ip_conntrack *ct, enum ip_conntrack_info ctinfo)
+{
+ struct ip_conntrack_expect *exp;
+ char *data, *data_limit, *tmp;
+ unsigned int dataoff, i;
+ u_int16_t port, len;
+ int ret = NF_ACCEPT;
+
+ /* Only look at packets from the Amanda server */
+ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
+ return NF_ACCEPT;
+
+ /* increase the UDP timeout of the master connection as replies from
+ * Amanda clients to the server can be quite delayed */
+ ip_ct_refresh_acct(ct, ctinfo, NULL, master_timeout * HZ);
+
+ /* No data? */
+ dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
+ if (dataoff >= (*pskb)->len) {
+ if (net_ratelimit())
+ printk("amanda_help: skblen = %u\n", (*pskb)->len);
+ return NF_ACCEPT;
+ }
+
+ LOCK_BH(&amanda_buffer_lock);
+ skb_copy_bits(*pskb, dataoff, amanda_buffer, (*pskb)->len - dataoff);
+ data = amanda_buffer;
+ data_limit = amanda_buffer + (*pskb)->len - dataoff;
+ *data_limit = '\0';
+
+ /* Search for the CONNECT string */
+ data = strstr(data, "CONNECT ");
+ if (!data)
+ goto out;
+ data += strlen("CONNECT ");
+
+ /* Only search first line. */
+ if ((tmp = strchr(data, '\n')))
+ *tmp = '\0';
+
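+	/* Each of "DATA ", "MESG " and "INDEX " is followed by a decimal TCP
+	 * port on the server that the client is expected to connect to;
+	 * register an expectation for each one found on this line (and let
+	 * the NAT helper mangle the port if it is loaded).
+	 */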
+ for (i = 0; i < ARRAY_SIZE(conns); i++) {
+ char *match = strstr(data, conns[i]);
+ if (!match)
+ continue;
+ tmp = data = match + strlen(conns[i]);
+ port = simple_strtoul(data, &data, 10);
+ len = data - tmp;
+ if (port == 0 || len > 5)
+ break;
+
+ exp = ip_conntrack_expect_alloc();
+ if (exp == NULL) {
+ ret = NF_DROP;
+ goto out;
+ }
+
+ exp->expectfn = NULL;
+ exp->master = ct;
+
+ exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip;
+ exp->tuple.src.u.tcp.port = 0;
+ exp->tuple.dst.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip;
+ exp->tuple.dst.protonum = IPPROTO_TCP;
+ exp->tuple.dst.u.tcp.port = htons(port);
+
+ exp->mask.src.ip = 0xFFFFFFFF;
+ exp->mask.src.u.tcp.port = 0;
+ exp->mask.dst.ip = 0xFFFFFFFF;
+ exp->mask.dst.protonum = 0xFF;
+ exp->mask.dst.u.tcp.port = 0xFFFF;
+
+ if (ip_nat_amanda_hook)
+ ret = ip_nat_amanda_hook(pskb, ctinfo,
+ tmp - amanda_buffer,
+ len, exp);
+ else if (ip_conntrack_expect_related(exp) != 0) {
+ ip_conntrack_expect_free(exp);
+ ret = NF_DROP;
+ }
+ }
+
+out:
+ UNLOCK_BH(&amanda_buffer_lock);
+ return ret;
+}
+
+static struct ip_conntrack_helper amanda_helper = {
+ .max_expected = ARRAY_SIZE(conns),
+ .timeout = 180,
+ .me = THIS_MODULE,
+ .help = help,
+ .name = "amanda",
+
+ .tuple = { .src = { .u = { __constant_htons(10080) } },
+ .dst = { .protonum = IPPROTO_UDP },
+ },
+ .mask = { .src = { .u = { 0xFFFF } },
+ .dst = { .protonum = 0xFF },
+ },
+};
+
+static void __exit fini(void)
+{
+ ip_conntrack_helper_unregister(&amanda_helper);
+}
+
+static int __init init(void)
+{
+ return ip_conntrack_helper_register(&amanda_helper);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
new file mode 100644
index 00000000000..28d9425d5c3
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -0,0 +1,1247 @@
+/* Connection state tracking for netfilter. This is separated from,
+ but required by, the NAT layer; it can also be used by an iptables
+ extension. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
+ * - new API and handling of conntrack/nat helpers
+ * - now capable of multiple expectations for one master
+ * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
+ * - add usage/reference counts to ip_conntrack_expect
+ * - export ip_conntrack[_expect]_{find_get,put} functions
+ * */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/icmp.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/vmalloc.h>
+#include <net/checksum.h>
+#include <net/ip.h>
+#include <linux/stddef.h>
+#include <linux/sysctl.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/jhash.h>
+#include <linux/err.h>
+#include <linux/percpu.h>
+#include <linux/moduleparam.h>
+
+/* This rwlock protects the main hash table, protocol/helper/expected
+   registrations, and conntrack timers. */
+#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
+#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
+
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+#define IP_CONNTRACK_VERSION "2.1"
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+DECLARE_RWLOCK(ip_conntrack_lock);
+
+/* ip_conntrack_standalone needs this */
+atomic_t ip_conntrack_count = ATOMIC_INIT(0);
+
+void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
+LIST_HEAD(ip_conntrack_expect_list);
+struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
+static LIST_HEAD(helpers);
+unsigned int ip_conntrack_htable_size = 0;
+int ip_conntrack_max;
+struct list_head *ip_conntrack_hash;
+static kmem_cache_t *ip_conntrack_cachep;
+static kmem_cache_t *ip_conntrack_expect_cachep;
+struct ip_conntrack ip_conntrack_untracked;
+unsigned int ip_ct_log_invalid;
+static LIST_HEAD(unconfirmed);
+static int ip_conntrack_vmalloc;
+
+DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
+
+void
+ip_conntrack_put(struct ip_conntrack *ct)
+{
+ IP_NF_ASSERT(ct);
+ nf_conntrack_put(&ct->ct_general);
+}
+
+static int ip_conntrack_hash_rnd_initted;
+static unsigned int ip_conntrack_hash_rnd;
+
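+/* Hash a tuple into a bucket: jhash over the source address, the
+ * destination address xor'd with the protocol number, and both layer-4
+ * id/port words, keyed with a lazily initialised random value so the
+ * bucket layout is not predictable from outside.
+ */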
+static u_int32_t
+hash_conntrack(const struct ip_conntrack_tuple *tuple)
+{
+#if 0
+ dump_tuple(tuple);
+#endif
+ return (jhash_3words(tuple->src.ip,
+ (tuple->dst.ip ^ tuple->dst.protonum),
+ (tuple->src.u.all | (tuple->dst.u.all << 16)),
+ ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
+}
+
+int
+ip_ct_get_tuple(const struct iphdr *iph,
+ const struct sk_buff *skb,
+ unsigned int dataoff,
+ struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack_protocol *protocol)
+{
+ /* Never happen */
+ if (iph->frag_off & htons(IP_OFFSET)) {
+ printk("ip_conntrack_core: Frag of proto %u.\n",
+ iph->protocol);
+ return 0;
+ }
+
+ tuple->src.ip = iph->saddr;
+ tuple->dst.ip = iph->daddr;
+ tuple->dst.protonum = iph->protocol;
+ tuple->dst.dir = IP_CT_DIR_ORIGINAL;
+
+ return protocol->pkt_to_tuple(skb, dataoff, tuple);
+}
+
+int
+ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
+ const struct ip_conntrack_tuple *orig,
+ const struct ip_conntrack_protocol *protocol)
+{
+ inverse->src.ip = orig->dst.ip;
+ inverse->dst.ip = orig->src.ip;
+ inverse->dst.protonum = orig->dst.protonum;
+ inverse->dst.dir = !orig->dst.dir;
+
+ return protocol->invert_tuple(inverse, orig);
+}
+
+
+/* ip_conntrack_expect helper functions */
+static void destroy_expect(struct ip_conntrack_expect *exp)
+{
+ ip_conntrack_put(exp->master);
+ IP_NF_ASSERT(!timer_pending(&exp->timeout));
+ kmem_cache_free(ip_conntrack_expect_cachep, exp);
+ CONNTRACK_STAT_INC(expect_delete);
+}
+
+static void unlink_expect(struct ip_conntrack_expect *exp)
+{
+ MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
+ list_del(&exp->list);
+ /* Logically in destroy_expect, but we hold the lock here. */
+ exp->master->expecting--;
+}
+
+static void expectation_timed_out(unsigned long ul_expect)
+{
+ struct ip_conntrack_expect *exp = (void *)ul_expect;
+
+ WRITE_LOCK(&ip_conntrack_lock);
+ unlink_expect(exp);
+ WRITE_UNLOCK(&ip_conntrack_lock);
+ destroy_expect(exp);
+}
+
+/* If an expectation for this connection is found, it is deleted from the
+ * global list and then returned. */
+static struct ip_conntrack_expect *
+find_expectation(const struct ip_conntrack_tuple *tuple)
+{
+ struct ip_conntrack_expect *i;
+
+ list_for_each_entry(i, &ip_conntrack_expect_list, list) {
+		/* If the master is not in the hash table yet (i.e. the packet
+		   hasn't left this machine yet), how can the other end know
+		   about the expectation?  Hence these are not the droids you
+		   are looking for (if the master ct never got confirmed, we'd
+		   hold a reference to it and weird things would happen to
+		   future packets). */
+ if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
+ && is_confirmed(i->master)
+ && del_timer(&i->timeout)) {
+ unlink_expect(i);
+ return i;
+ }
+ }
+ return NULL;
+}
+
+/* delete all expectations for this conntrack */
+static void remove_expectations(struct ip_conntrack *ct)
+{
+ struct ip_conntrack_expect *i, *tmp;
+
+	/* Optimization: most connections never expect any others. */
+ if (ct->expecting == 0)
+ return;
+
+ list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
+ if (i->master == ct && del_timer(&i->timeout)) {
+ unlink_expect(i);
+ destroy_expect(i);
+ }
+ }
+}
+
+static void
+clean_from_lists(struct ip_conntrack *ct)
+{
+ unsigned int ho, hr;
+
+ DEBUGP("clean_from_lists(%p)\n", ct);
+ MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
+
+ ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+ LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
+ LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
+
+ /* Destroy all pending expectations */
+ remove_expectations(ct);
+}
+
+static void
+destroy_conntrack(struct nf_conntrack *nfct)
+{
+ struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
+ struct ip_conntrack_protocol *proto;
+
+ DEBUGP("destroy_conntrack(%p)\n", ct);
+ IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
+ IP_NF_ASSERT(!timer_pending(&ct->timeout));
+
+ /* To make sure we don't get any weird locking issues here:
+ * destroy_conntrack() MUST NOT be called with a write lock
+ * to ip_conntrack_lock!!! -HW */
+ proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
+ if (proto && proto->destroy)
+ proto->destroy(ct);
+
+ if (ip_conntrack_destroyed)
+ ip_conntrack_destroyed(ct);
+
+ WRITE_LOCK(&ip_conntrack_lock);
+ /* Expectations will have been removed in clean_from_lists,
+ * except TFTP can create an expectation on the first packet,
+ * before connection is in the list, so we need to clean here,
+ * too. */
+ remove_expectations(ct);
+
+ /* We overload first tuple to link into unconfirmed list. */
+ if (!is_confirmed(ct)) {
+ BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
+ list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
+ }
+
+ CONNTRACK_STAT_INC(delete);
+ WRITE_UNLOCK(&ip_conntrack_lock);
+
+ if (ct->master)
+ ip_conntrack_put(ct->master);
+
+ DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
+ kmem_cache_free(ip_conntrack_cachep, ct);
+ atomic_dec(&ip_conntrack_count);
+}
+
+static void death_by_timeout(unsigned long ul_conntrack)
+{
+ struct ip_conntrack *ct = (void *)ul_conntrack;
+
+ WRITE_LOCK(&ip_conntrack_lock);
+ /* Inside lock so preempt is disabled on module removal path.
+ * Otherwise we can get spurious warnings. */
+ CONNTRACK_STAT_INC(delete_list);
+ clean_from_lists(ct);
+ WRITE_UNLOCK(&ip_conntrack_lock);
+ ip_conntrack_put(ct);
+}
+
+static inline int
+conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
+ const struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack *ignored_conntrack)
+{
+ MUST_BE_READ_LOCKED(&ip_conntrack_lock);
+ return tuplehash_to_ctrack(i) != ignored_conntrack
+ && ip_ct_tuple_equal(tuple, &i->tuple);
+}
+
+static struct ip_conntrack_tuple_hash *
+__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack *ignored_conntrack)
+{
+ struct ip_conntrack_tuple_hash *h;
+ unsigned int hash = hash_conntrack(tuple);
+
+ MUST_BE_READ_LOCKED(&ip_conntrack_lock);
+ list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
+ if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
+ CONNTRACK_STAT_INC(found);
+ return h;
+ }
+ CONNTRACK_STAT_INC(searched);
+ }
+
+ return NULL;
+}
+
+/* Find a connection corresponding to a tuple. */
+struct ip_conntrack_tuple_hash *
+ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack *ignored_conntrack)
+{
+ struct ip_conntrack_tuple_hash *h;
+
+ READ_LOCK(&ip_conntrack_lock);
+ h = __ip_conntrack_find(tuple, ignored_conntrack);
+ if (h)
+ atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
+ READ_UNLOCK(&ip_conntrack_lock);
+
+ return h;
+}
+
+/* Confirm a connection given skb; places it in hash table */
+int
+__ip_conntrack_confirm(struct sk_buff **pskb)
+{
+ unsigned int hash, repl_hash;
+ struct ip_conntrack *ct;
+ enum ip_conntrack_info ctinfo;
+
+ ct = ip_conntrack_get(*pskb, &ctinfo);
+
+	/* ipt_REJECT uses ip_conntrack_attach to attach related
+	   ICMP/TCP RST packets in the other direction.  The actual packet
+	   which created the connection will be IP_CT_NEW or, for an
+	   expected connection, IP_CT_RELATED. */
+ if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+ return NF_ACCEPT;
+
+ hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+ /* We're not in hash table, and we refuse to set up related
+ connections for unconfirmed conns. But packet copies and
+ REJECT will give spurious warnings here. */
+ /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
+
+	/* No external references means no one else could have
+ confirmed us. */
+ IP_NF_ASSERT(!is_confirmed(ct));
+ DEBUGP("Confirming conntrack %p\n", ct);
+
+ WRITE_LOCK(&ip_conntrack_lock);
+
+ /* See if there's one in the list already, including reverse:
+ NAT could have grabbed it without realizing, since we're
+ not in the hash. If there is, we lost race. */
+ if (!LIST_FIND(&ip_conntrack_hash[hash],
+ conntrack_tuple_cmp,
+ struct ip_conntrack_tuple_hash *,
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
+ && !LIST_FIND(&ip_conntrack_hash[repl_hash],
+ conntrack_tuple_cmp,
+ struct ip_conntrack_tuple_hash *,
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
+ /* Remove from unconfirmed list */
+ list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
+
+ list_prepend(&ip_conntrack_hash[hash],
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
+ list_prepend(&ip_conntrack_hash[repl_hash],
+ &ct->tuplehash[IP_CT_DIR_REPLY]);
+ /* Timer relative to confirmation time, not original
+ setting time, otherwise we'd get timer wrap in
+ weird delay cases. */
+ ct->timeout.expires += jiffies;
+ add_timer(&ct->timeout);
+ atomic_inc(&ct->ct_general.use);
+ set_bit(IPS_CONFIRMED_BIT, &ct->status);
+ CONNTRACK_STAT_INC(insert);
+ WRITE_UNLOCK(&ip_conntrack_lock);
+ return NF_ACCEPT;
+ }
+
+ CONNTRACK_STAT_INC(insert_failed);
+ WRITE_UNLOCK(&ip_conntrack_lock);
+
+ return NF_DROP;
+}
+
+/* Returns true if a connection corresponds to the tuple (required
+ for NAT). */
+int
+ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack *ignored_conntrack)
+{
+ struct ip_conntrack_tuple_hash *h;
+
+ READ_LOCK(&ip_conntrack_lock);
+ h = __ip_conntrack_find(tuple, ignored_conntrack);
+ READ_UNLOCK(&ip_conntrack_lock);
+
+ return h != NULL;
+}
+
+/* There's a small race here where we may free a just-assured
+ connection. Too bad: we're in trouble anyway. */
+static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
+{
+ return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
+}
+
+static int early_drop(struct list_head *chain)
+{
+ /* Traverse backwards: gives us oldest, which is roughly LRU */
+ struct ip_conntrack_tuple_hash *h;
+ struct ip_conntrack *ct = NULL;
+ int dropped = 0;
+
+ READ_LOCK(&ip_conntrack_lock);
+ h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
+ if (h) {
+ ct = tuplehash_to_ctrack(h);
+ atomic_inc(&ct->ct_general.use);
+ }
+ READ_UNLOCK(&ip_conntrack_lock);
+
+ if (!ct)
+ return dropped;
+
+ if (del_timer(&ct->timeout)) {
+ death_by_timeout((unsigned long)ct);
+ dropped = 1;
+ CONNTRACK_STAT_INC(early_drop);
+ }
+ ip_conntrack_put(ct);
+ return dropped;
+}
+
+static inline int helper_cmp(const struct ip_conntrack_helper *i,
+ const struct ip_conntrack_tuple *rtuple)
+{
+ return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
+}
+
+static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
+{
+ return LIST_FIND(&helpers, helper_cmp,
+ struct ip_conntrack_helper *,
+ tuple);
+}
+
+/* Allocate a new conntrack: we return -ENOMEM if classification
+ failed due to stress. Otherwise it really is unclassifiable. */
+static struct ip_conntrack_tuple_hash *
+init_conntrack(const struct ip_conntrack_tuple *tuple,
+ struct ip_conntrack_protocol *protocol,
+ struct sk_buff *skb)
+{
+ struct ip_conntrack *conntrack;
+ struct ip_conntrack_tuple repl_tuple;
+ size_t hash;
+ struct ip_conntrack_expect *exp;
+
+ if (!ip_conntrack_hash_rnd_initted) {
+ get_random_bytes(&ip_conntrack_hash_rnd, 4);
+ ip_conntrack_hash_rnd_initted = 1;
+ }
+
+ hash = hash_conntrack(tuple);
+
+ if (ip_conntrack_max
+ && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
+ /* Try dropping from this hash chain. */
+ if (!early_drop(&ip_conntrack_hash[hash])) {
+ if (net_ratelimit())
+ printk(KERN_WARNING
+ "ip_conntrack: table full, dropping"
+ " packet.\n");
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+
+ if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
+ DEBUGP("Can't invert tuple.\n");
+ return NULL;
+ }
+
+ conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
+ if (!conntrack) {
+ DEBUGP("Can't allocate conntrack.\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ memset(conntrack, 0, sizeof(*conntrack));
+ atomic_set(&conntrack->ct_general.use, 1);
+ conntrack->ct_general.destroy = destroy_conntrack;
+ conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
+ conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
+ if (!protocol->new(conntrack, skb)) {
+ kmem_cache_free(ip_conntrack_cachep, conntrack);
+ return NULL;
+ }
+ /* Don't set timer yet: wait for confirmation */
+ init_timer(&conntrack->timeout);
+ conntrack->timeout.data = (unsigned long)conntrack;
+ conntrack->timeout.function = death_by_timeout;
+
+ WRITE_LOCK(&ip_conntrack_lock);
+ exp = find_expectation(tuple);
+
+ if (exp) {
+ DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
+ conntrack, exp);
+ /* Welcome, Mr. Bond. We've been expecting you... */
+ __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
+ conntrack->master = exp->master;
+#ifdef CONFIG_IP_NF_CONNTRACK_MARK
+ conntrack->mark = exp->master->mark;
+#endif
+ nf_conntrack_get(&conntrack->master->ct_general);
+ CONNTRACK_STAT_INC(expect_new);
+ } else {
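+		/* Helpers are matched against the server side of the
+		 * connection, which is what the reply tuple describes, hence
+		 * the lookup on repl_tuple rather than on the original tuple.
+		 */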
+ conntrack->helper = ip_ct_find_helper(&repl_tuple);
+
+ CONNTRACK_STAT_INC(new);
+ }
+
+ /* Overload tuple linked list to put us in unconfirmed list. */
+ list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
+
+ atomic_inc(&ip_conntrack_count);
+ WRITE_UNLOCK(&ip_conntrack_lock);
+
+ if (exp) {
+ if (exp->expectfn)
+ exp->expectfn(conntrack, exp);
+ destroy_expect(exp);
+ }
+
+ return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
+}
+
+/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
+static inline struct ip_conntrack *
+resolve_normal_ct(struct sk_buff *skb,
+ struct ip_conntrack_protocol *proto,
+ int *set_reply,
+ unsigned int hooknum,
+ enum ip_conntrack_info *ctinfo)
+{
+ struct ip_conntrack_tuple tuple;
+ struct ip_conntrack_tuple_hash *h;
+ struct ip_conntrack *ct;
+
+ IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
+
+ if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
+ &tuple,proto))
+ return NULL;
+
+ /* look for tuple match */
+ h = ip_conntrack_find_get(&tuple, NULL);
+ if (!h) {
+ h = init_conntrack(&tuple, proto, skb);
+ if (!h)
+ return NULL;
+ if (IS_ERR(h))
+ return (void *)h;
+ }
+ ct = tuplehash_to_ctrack(h);
+
+ /* It exists; we have (non-exclusive) reference. */
+ if (DIRECTION(h) == IP_CT_DIR_REPLY) {
+ *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
+ /* Please set reply bit if this packet OK */
+ *set_reply = 1;
+ } else {
+ /* Once we've had two way comms, always ESTABLISHED. */
+ if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+ DEBUGP("ip_conntrack_in: normal packet for %p\n",
+ ct);
+ *ctinfo = IP_CT_ESTABLISHED;
+ } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
+ DEBUGP("ip_conntrack_in: related packet for %p\n",
+ ct);
+ *ctinfo = IP_CT_RELATED;
+ } else {
+ DEBUGP("ip_conntrack_in: new packet for %p\n",
+ ct);
+ *ctinfo = IP_CT_NEW;
+ }
+ *set_reply = 0;
+ }
+ skb->nfct = &ct->ct_general;
+ skb->nfctinfo = *ctinfo;
+ return ct;
+}
+
+/* Netfilter hook itself. */
+unsigned int ip_conntrack_in(unsigned int hooknum,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ struct ip_conntrack *ct;
+ enum ip_conntrack_info ctinfo;
+ struct ip_conntrack_protocol *proto;
+ int set_reply;
+ int ret;
+
+ /* Previously seen (loopback or untracked)? Ignore. */
+ if ((*pskb)->nfct) {
+ CONNTRACK_STAT_INC(ignore);
+ return NF_ACCEPT;
+ }
+
+ /* Never happen */
+ if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
+ if (net_ratelimit()) {
+ printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
+ (*pskb)->nh.iph->protocol, hooknum);
+ }
+ return NF_DROP;
+ }
+
+ /* FIXME: Do this right please. --RR */
+ (*pskb)->nfcache |= NFC_UNKNOWN;
+
+/* Doesn't cover locally-generated broadcast, so not worth it. */
+#if 0
+ /* Ignore broadcast: no `connection'. */
+ if ((*pskb)->pkt_type == PACKET_BROADCAST) {
+ printk("Broadcast packet!\n");
+ return NF_ACCEPT;
+ } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
+ == htonl(0x000000FF)) {
+ printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
+ NIPQUAD((*pskb)->nh.iph->saddr),
+ NIPQUAD((*pskb)->nh.iph->daddr),
+ (*pskb)->sk, (*pskb)->pkt_type);
+ }
+#endif
+
+ proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);
+
+	/* It may be a special packet, error, unclean...
+	 * The inverse of the return code tells the netfilter
+	 * core what to do with the packet. */
+ if (proto->error != NULL
+ && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
+ CONNTRACK_STAT_INC(error);
+ CONNTRACK_STAT_INC(invalid);
+ return -ret;
+ }
+
+ if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) {
+ /* Not valid part of a connection */
+ CONNTRACK_STAT_INC(invalid);
+ return NF_ACCEPT;
+ }
+
+ if (IS_ERR(ct)) {
+ /* Too stressed to deal. */
+ CONNTRACK_STAT_INC(drop);
+ return NF_DROP;
+ }
+
+ IP_NF_ASSERT((*pskb)->nfct);
+
+ ret = proto->packet(ct, *pskb, ctinfo);
+ if (ret < 0) {
+		/* Invalid: the inverse of the return code tells
+		 * the netfilter core what to do */
+ nf_conntrack_put((*pskb)->nfct);
+ (*pskb)->nfct = NULL;
+ CONNTRACK_STAT_INC(invalid);
+ return -ret;
+ }
+
+ if (set_reply)
+ set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
+
+ return ret;
+}
+
+int invert_tuplepr(struct ip_conntrack_tuple *inverse,
+ const struct ip_conntrack_tuple *orig)
+{
+ return ip_ct_invert_tuple(inverse, orig,
+ ip_ct_find_proto(orig->dst.protonum));
+}
+
+/* Would two expected things clash? */
+static inline int expect_clash(const struct ip_conntrack_expect *a,
+ const struct ip_conntrack_expect *b)
+{
+ /* Part covered by intersection of masks must be unequal,
+ otherwise they clash */
+ struct ip_conntrack_tuple intersect_mask
+ = { { a->mask.src.ip & b->mask.src.ip,
+ { a->mask.src.u.all & b->mask.src.u.all } },
+ { a->mask.dst.ip & b->mask.dst.ip,
+ { a->mask.dst.u.all & b->mask.dst.u.all },
+ a->mask.dst.protonum & b->mask.dst.protonum } };
+
+ return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
+}
+
+static inline int expect_matches(const struct ip_conntrack_expect *a,
+ const struct ip_conntrack_expect *b)
+{
+ return a->master == b->master
+ && ip_ct_tuple_equal(&a->tuple, &b->tuple)
+ && ip_ct_tuple_equal(&a->mask, &b->mask);
+}
+
+/* Generally a bad idea to call this: could have matched already. */
+void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
+{
+ struct ip_conntrack_expect *i;
+
+ WRITE_LOCK(&ip_conntrack_lock);
+	/* choose the oldest expectation to evict */
+ list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
+ if (expect_matches(i, exp) && del_timer(&i->timeout)) {
+ unlink_expect(i);
+ WRITE_UNLOCK(&ip_conntrack_lock);
+ destroy_expect(i);
+ return;
+ }
+ }
+ WRITE_UNLOCK(&ip_conntrack_lock);
+}
+
+struct ip_conntrack_expect *ip_conntrack_expect_alloc(void)
+{
+ struct ip_conntrack_expect *new;
+
+ new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
+ if (!new) {
+ DEBUGP("expect_related: OOM allocating expect\n");
+ return NULL;
+ }
+ new->master = NULL;
+ return new;
+}
+
+void ip_conntrack_expect_free(struct ip_conntrack_expect *expect)
+{
+ kmem_cache_free(ip_conntrack_expect_cachep, expect);
+}
+
+static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
+{
+ atomic_inc(&exp->master->ct_general.use);
+ exp->master->expecting++;
+ list_add(&exp->list, &ip_conntrack_expect_list);
+
+ if (exp->master->helper->timeout) {
+ init_timer(&exp->timeout);
+ exp->timeout.data = (unsigned long)exp;
+ exp->timeout.function = expectation_timed_out;
+ exp->timeout.expires
+ = jiffies + exp->master->helper->timeout * HZ;
+ add_timer(&exp->timeout);
+ } else
+ exp->timeout.function = NULL;
+
+ CONNTRACK_STAT_INC(expect_create);
+}
+
+/* Race with expectations being used means we could have none to find; OK. */
+static void evict_oldest_expect(struct ip_conntrack *master)
+{
+ struct ip_conntrack_expect *i;
+
+ list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
+ if (i->master == master) {
+ if (del_timer(&i->timeout)) {
+ unlink_expect(i);
+ destroy_expect(i);
+ }
+ break;
+ }
+ }
+}
+
+static inline int refresh_timer(struct ip_conntrack_expect *i)
+{
+ if (!del_timer(&i->timeout))
+ return 0;
+
+ i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
+ add_timer(&i->timeout);
+ return 1;
+}
+
+int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
+{
+ struct ip_conntrack_expect *i;
+ int ret;
+
+	DEBUGP("ip_conntrack_expect_related %p\n", expect);
+ DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
+ DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
+
+ WRITE_LOCK(&ip_conntrack_lock);
+ list_for_each_entry(i, &ip_conntrack_expect_list, list) {
+ if (expect_matches(i, expect)) {
+ /* Refresh timer: if it's dying, ignore.. */
+ if (refresh_timer(i)) {
+ ret = 0;
+ /* We don't need the one they've given us. */
+ ip_conntrack_expect_free(expect);
+ goto out;
+ }
+ } else if (expect_clash(i, expect)) {
+ ret = -EBUSY;
+ goto out;
+ }
+ }
+
+ /* Will be over limit? */
+ if (expect->master->helper->max_expected &&
+ expect->master->expecting >= expect->master->helper->max_expected)
+ evict_oldest_expect(expect->master);
+
+ ip_conntrack_expect_insert(expect);
+ ret = 0;
+out:
+ WRITE_UNLOCK(&ip_conntrack_lock);
+ return ret;
+}
+
+/* Alter reply tuple (maybe alter helper). This is for NAT, and is
+ implicitly racy: see __ip_conntrack_confirm */
+void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
+ const struct ip_conntrack_tuple *newreply)
+{
+ WRITE_LOCK(&ip_conntrack_lock);
+ /* Should be unconfirmed, so not in hash table yet */
+ IP_NF_ASSERT(!is_confirmed(conntrack));
+
+ DEBUGP("Altering reply tuple of %p to ", conntrack);
+ DUMP_TUPLE(newreply);
+
+ conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
+ if (!conntrack->master && conntrack->expecting == 0)
+ conntrack->helper = ip_ct_find_helper(newreply);
+ WRITE_UNLOCK(&ip_conntrack_lock);
+}
+
+int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
+{
+ BUG_ON(me->timeout == 0);
+ WRITE_LOCK(&ip_conntrack_lock);
+ list_prepend(&helpers, me);
+ WRITE_UNLOCK(&ip_conntrack_lock);
+
+ return 0;
+}
+
+static inline int unhelp(struct ip_conntrack_tuple_hash *i,
+ const struct ip_conntrack_helper *me)
+{
+ if (tuplehash_to_ctrack(i)->helper == me)
+ tuplehash_to_ctrack(i)->helper = NULL;
+ return 0;
+}
+
+void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
+{
+ unsigned int i;
+ struct ip_conntrack_expect *exp, *tmp;
+
+ /* Need write lock here, to delete helper. */
+ WRITE_LOCK(&ip_conntrack_lock);
+ LIST_DELETE(&helpers, me);
+
+ /* Get rid of expectations */
+ list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
+ if (exp->master->helper == me && del_timer(&exp->timeout)) {
+ unlink_expect(exp);
+ destroy_expect(exp);
+ }
+ }
+ /* Get rid of expecteds, set helpers to NULL. */
+ LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
+ for (i = 0; i < ip_conntrack_htable_size; i++)
+ LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
+ struct ip_conntrack_tuple_hash *, me);
+ WRITE_UNLOCK(&ip_conntrack_lock);
+
+ /* Someone could be still looking at the helper in a bh. */
+ synchronize_net();
+}
+
+static inline void ct_add_counters(struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo,
+ const struct sk_buff *skb)
+{
+#ifdef CONFIG_IP_NF_CT_ACCT
+ if (skb) {
+ ct->counters[CTINFO2DIR(ctinfo)].packets++;
+ ct->counters[CTINFO2DIR(ctinfo)].bytes +=
+ ntohs(skb->nh.iph->tot_len);
+ }
+#endif
+}
+
+/* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
+void ip_ct_refresh_acct(struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo,
+ const struct sk_buff *skb,
+ unsigned long extra_jiffies)
+{
+ IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
+
+ /* If not in hash table, timer will not be active yet */
+ if (!is_confirmed(ct)) {
+ ct->timeout.expires = extra_jiffies;
+ ct_add_counters(ct, ctinfo, skb);
+ } else {
+ WRITE_LOCK(&ip_conntrack_lock);
+ /* Need del_timer for race avoidance (may already be dying). */
+ if (del_timer(&ct->timeout)) {
+ ct->timeout.expires = jiffies + extra_jiffies;
+ add_timer(&ct->timeout);
+ }
+ ct_add_counters(ct, ctinfo, skb);
+ WRITE_UNLOCK(&ip_conntrack_lock);
+ }
+}
+
+/* Returns new sk_buff, or NULL */
+struct sk_buff *
+ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
+{
+ struct sock *sk = skb->sk;
+#ifdef CONFIG_NETFILTER_DEBUG
+ unsigned int olddebug = skb->nf_debug;
+#endif
+
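+	/* ip_defrag() may free the skb or return a different one, so drop
+	 * the socket ownership here and reattach it to whatever skb comes
+	 * back below.
+	 */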
+ if (sk) {
+ sock_hold(sk);
+ skb_orphan(skb);
+ }
+
+ local_bh_disable();
+ skb = ip_defrag(skb, user);
+ local_bh_enable();
+
+ if (!skb) {
+ if (sk)
+ sock_put(sk);
+ return skb;
+ }
+
+ if (sk) {
+ skb_set_owner_w(skb, sk);
+ sock_put(sk);
+ }
+
+ ip_send_check(skb->nh.iph);
+ skb->nfcache |= NFC_ALTERED;
+#ifdef CONFIG_NETFILTER_DEBUG
+ /* Packet path as if nothing had happened. */
+ skb->nf_debug = olddebug;
+#endif
+ return skb;
+}
+
+/* Used by ipt_REJECT. */
+static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
+{
+ struct ip_conntrack *ct;
+ enum ip_conntrack_info ctinfo;
+
+ /* This ICMP is in reverse direction to the packet which caused it */
+ ct = ip_conntrack_get(skb, &ctinfo);
+
+ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
+ ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
+ else
+ ctinfo = IP_CT_RELATED;
+
+ /* Attach to new skbuff, and increment count */
+ nskb->nfct = &ct->ct_general;
+ nskb->nfctinfo = ctinfo;
+ nf_conntrack_get(nskb->nfct);
+}
+
+static inline int
+do_iter(const struct ip_conntrack_tuple_hash *i,
+ int (*iter)(struct ip_conntrack *i, void *data),
+ void *data)
+{
+ return iter(tuplehash_to_ctrack(i), data);
+}
+
+/* Bring out ya dead! */
+static struct ip_conntrack_tuple_hash *
+get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
+ void *data, unsigned int *bucket)
+{
+ struct ip_conntrack_tuple_hash *h = NULL;
+
+ WRITE_LOCK(&ip_conntrack_lock);
+ for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
+ h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
+ struct ip_conntrack_tuple_hash *, iter, data);
+ if (h)
+ break;
+ }
+ if (!h)
+ h = LIST_FIND_W(&unconfirmed, do_iter,
+ struct ip_conntrack_tuple_hash *, iter, data);
+ if (h)
+ atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
+ WRITE_UNLOCK(&ip_conntrack_lock);
+
+ return h;
+}
+
+void
+ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
+{
+ struct ip_conntrack_tuple_hash *h;
+ unsigned int bucket = 0;
+
+ while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
+ struct ip_conntrack *ct = tuplehash_to_ctrack(h);
+		/* Time to push up daisies... */
+ if (del_timer(&ct->timeout))
+ death_by_timeout((unsigned long)ct);
+ /* ... else the timer will get him soon. */
+
+ ip_conntrack_put(ct);
+ }
+}
+
+/* Fast function for those who don't want to parse /proc (and I don't
+ blame them). */
+/* Reversing the socket's dst/src point of view gives us the reply
+ mapping. */
+static int
+getorigdst(struct sock *sk, int optval, void __user *user, int *len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_conntrack_tuple_hash *h;
+ struct ip_conntrack_tuple tuple;
+
+ IP_CT_TUPLE_U_BLANK(&tuple);
+ tuple.src.ip = inet->rcv_saddr;
+ tuple.src.u.tcp.port = inet->sport;
+ tuple.dst.ip = inet->daddr;
+ tuple.dst.u.tcp.port = inet->dport;
+ tuple.dst.protonum = IPPROTO_TCP;
+
+ /* We only do TCP at the moment: is there a better way? */
+ if (strcmp(sk->sk_prot->name, "TCP")) {
+ DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
+ return -ENOPROTOOPT;
+ }
+
+ if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
+		DEBUGP("SO_ORIGINAL_DST: len %u not %Zu\n",
+ *len, sizeof(struct sockaddr_in));
+ return -EINVAL;
+ }
+
+ h = ip_conntrack_find_get(&tuple, NULL);
+ if (h) {
+ struct sockaddr_in sin;
+ struct ip_conntrack *ct = tuplehash_to_ctrack(h);
+
+ sin.sin_family = AF_INET;
+ sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.dst.u.tcp.port;
+ sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.dst.ip;
+
+ DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
+ NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
+ ip_conntrack_put(ct);
+ if (copy_to_user(user, &sin, sizeof(sin)) != 0)
+ return -EFAULT;
+ else
+ return 0;
+ }
+ DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
+ NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
+ NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
+ return -ENOENT;
+}
+
+static struct nf_sockopt_ops so_getorigdst = {
+ .pf = PF_INET,
+ .get_optmin = SO_ORIGINAL_DST,
+ .get_optmax = SO_ORIGINAL_DST+1,
+ .get = &getorigdst,
+};
+
+static int kill_all(struct ip_conntrack *i, void *data)
+{
+ return 1;
+}
+
+static void free_conntrack_hash(void)
+{
+ if (ip_conntrack_vmalloc)
+ vfree(ip_conntrack_hash);
+ else
+ free_pages((unsigned long)ip_conntrack_hash,
+ get_order(sizeof(struct list_head)
+ * ip_conntrack_htable_size));
+}
+
+/* Mishearing the voices in his head, our hero wonders how he's
+ supposed to kill the mall. */
+void ip_conntrack_cleanup(void)
+{
+ ip_ct_attach = NULL;
+ /* This makes sure all current packets have passed through
+ netfilter framework. Roll on, two-stage module
+ delete... */
+ synchronize_net();
+
+ i_see_dead_people:
+ ip_ct_iterate_cleanup(kill_all, NULL);
+ if (atomic_read(&ip_conntrack_count) != 0) {
+ schedule();
+ goto i_see_dead_people;
+ }
+
+ kmem_cache_destroy(ip_conntrack_cachep);
+ kmem_cache_destroy(ip_conntrack_expect_cachep);
+ free_conntrack_hash();
+ nf_unregister_sockopt(&so_getorigdst);
+}
+
+static int hashsize;
+module_param(hashsize, int, 0400);
+
+int __init ip_conntrack_init(void)
+{
+ unsigned int i;
+ int ret;
+
+	/* Idea from tcp.c: use 1/16384 of memory.  On i386: a 32MB
+	 * machine has 256 buckets; >= 1GB machines have 8192 buckets. */
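+	/* (Illustrative arithmetic: 32MB = 33554432 bytes; /16384 = 2048;
+	 * /sizeof(struct list_head) = /8 on 32-bit = 256 buckets.) */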
+ if (hashsize) {
+ ip_conntrack_htable_size = hashsize;
+ } else {
+ ip_conntrack_htable_size
+ = (((num_physpages << PAGE_SHIFT) / 16384)
+ / sizeof(struct list_head));
+ if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
+ ip_conntrack_htable_size = 8192;
+ if (ip_conntrack_htable_size < 16)
+ ip_conntrack_htable_size = 16;
+ }
+ ip_conntrack_max = 8 * ip_conntrack_htable_size;
+
+ printk("ip_conntrack version %s (%u buckets, %d max)"
+ " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
+ ip_conntrack_htable_size, ip_conntrack_max,
+ sizeof(struct ip_conntrack));
+
+ ret = nf_register_sockopt(&so_getorigdst);
+ if (ret != 0) {
+ printk(KERN_ERR "Unable to register netfilter socket option\n");
+ return ret;
+ }
+
+	/* AK: the hash table is twice as big as needed because it
+	   uses list_head.  It would be much nicer to the caches to use
+	   a single-pointer list head here. */
+ ip_conntrack_vmalloc = 0;
+	ip_conntrack_hash
+		= (void *)__get_free_pages(GFP_KERNEL,
+					   get_order(sizeof(struct list_head)
+						     * ip_conntrack_htable_size));
+ if (!ip_conntrack_hash) {
+ ip_conntrack_vmalloc = 1;
+ printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
+ ip_conntrack_hash = vmalloc(sizeof(struct list_head)
+ * ip_conntrack_htable_size);
+ }
+ if (!ip_conntrack_hash) {
+ printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
+ goto err_unreg_sockopt;
+ }
+
+ ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
+ sizeof(struct ip_conntrack), 0,
+ 0, NULL, NULL);
+ if (!ip_conntrack_cachep) {
+ printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
+ goto err_free_hash;
+ }
+
+ ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
+ sizeof(struct ip_conntrack_expect),
+ 0, 0, NULL, NULL);
+ if (!ip_conntrack_expect_cachep) {
+ printk(KERN_ERR "Unable to create ip_expect slab cache\n");
+ goto err_free_conntrack_slab;
+ }
+
+ /* Don't NEED lock here, but good form anyway. */
+ WRITE_LOCK(&ip_conntrack_lock);
+ for (i = 0; i < MAX_IP_CT_PROTO; i++)
+ ip_ct_protos[i] = &ip_conntrack_generic_protocol;
+ /* Sew in builtin protocols. */
+ ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
+ ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
+ ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
+ WRITE_UNLOCK(&ip_conntrack_lock);
+
+ for (i = 0; i < ip_conntrack_htable_size; i++)
+ INIT_LIST_HEAD(&ip_conntrack_hash[i]);
+
+ /* For use by ipt_REJECT */
+ ip_ct_attach = ip_conntrack_attach;
+
+ /* Set up fake conntrack:
+ - to never be deleted, not in any hashes */
+ atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
+	/*  - and make it look like a confirmed connection */
+ set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
+
+ return ret;
+
+err_free_conntrack_slab:
+ kmem_cache_destroy(ip_conntrack_cachep);
+err_free_hash:
+ free_conntrack_hash();
+err_unreg_sockopt:
+ nf_unregister_sockopt(&so_getorigdst);
+
+ return -ENOMEM;
+}
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c
new file mode 100644
index 00000000000..12b88cbb11d
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_ftp.c
@@ -0,0 +1,501 @@
+/* FTP extension for IP connection tracking. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/ip.h>
+#include <linux/ctype.h>
+#include <net/checksum.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter_ipv4/lockhelp.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_ftp.h>
+#include <linux/moduleparam.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
+MODULE_DESCRIPTION("ftp connection tracking helper");
+
+/* This is slow, but it's simple. --RR */
+static char ftp_buffer[65536];
+
+static DECLARE_LOCK(ip_ftp_lock);
+
+#define MAX_PORTS 8
+static int ports[MAX_PORTS];
+static int ports_c;
+module_param_array(ports, int, &ports_c, 0400);
+
+static int loose;
+module_param(loose, int, 0600);
+
+unsigned int (*ip_nat_ftp_hook)(struct sk_buff **pskb,
+ enum ip_conntrack_info ctinfo,
+ enum ip_ct_ftp_type type,
+ unsigned int matchoff,
+ unsigned int matchlen,
+ struct ip_conntrack_expect *exp,
+ u32 *seq);
+EXPORT_SYMBOL_GPL(ip_nat_ftp_hook);
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+static int try_rfc959(const char *, size_t, u_int32_t [], char);
+static int try_eprt(const char *, size_t, u_int32_t [], char);
+static int try_epsv_response(const char *, size_t, u_int32_t [], char);
+
+static struct ftp_search {
+ enum ip_conntrack_dir dir;
+ const char *pattern;
+ size_t plen;
+ char skip;
+ char term;
+ enum ip_ct_ftp_type ftptype;
+ int (*getnum)(const char *, size_t, u_int32_t[], char);
+} search[] = {
+ {
+ IP_CT_DIR_ORIGINAL,
+ "PORT", sizeof("PORT") - 1, ' ', '\r',
+ IP_CT_FTP_PORT,
+ try_rfc959,
+ },
+ {
+ IP_CT_DIR_REPLY,
+ "227 ", sizeof("227 ") - 1, '(', ')',
+ IP_CT_FTP_PASV,
+ try_rfc959,
+ },
+ {
+ IP_CT_DIR_ORIGINAL,
+ "EPRT", sizeof("EPRT") - 1, ' ', '\r',
+ IP_CT_FTP_EPRT,
+ try_eprt,
+ },
+ {
+ IP_CT_DIR_REPLY,
+ "229 ", sizeof("229 ") - 1, '(', ')',
+ IP_CT_FTP_EPSV,
+ try_epsv_response,
+ },
+};
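+/* Illustrative command/response lines the table above matches:
+ *   "PORT 192,168,1,1,5,6\r\n"                        -> try_rfc959
+ *   "227 Entering Passive Mode (192,168,1,1,5,6)"     -> try_rfc959
+ *   "EPRT |1|132.235.1.2|6275|\r\n"                   -> try_eprt
+ *   "229 Entering Extended Passive Mode (|||6446|)"   -> try_epsv_response
+ */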
+
+static int try_number(const char *data, size_t dlen, u_int32_t array[],
+ int array_size, char sep, char term)
+{
+ u_int32_t i, len;
+
+ memset(array, 0, sizeof(array[0])*array_size);
+
+ /* Keep data pointing at next char. */
+ for (i = 0, len = 0; len < dlen && i < array_size; len++, data++) {
+ if (*data >= '0' && *data <= '9') {
+ array[i] = array[i]*10 + *data - '0';
+ }
+ else if (*data == sep)
+ i++;
+ else {
+ /* Unexpected character; true if it's the
+ terminator and we're finished. */
+ if (*data == term && i == array_size - 1)
+ return len;
+
+ DEBUGP("Char %u (got %u nums) `%u' unexpected\n",
+ len, i, *data);
+ return 0;
+ }
+ }
+ DEBUGP("Failed to fill %u numbers separated by %c\n", array_size, sep);
+
+ return 0;
+}
+
+/* Returns 0, or length of numbers: 192,168,1,1,5,6 */
+static int try_rfc959(const char *data, size_t dlen, u_int32_t array[6],
+ char term)
+{
+ return try_number(data, dlen, array, 6, ',', term);
+}
+
+/* Grab port: number up to delimiter */
+static int get_port(const char *data, int start, size_t dlen, char delim,
+ u_int32_t array[2])
+{
+ u_int16_t port = 0;
+ int i;
+
+ for (i = start; i < dlen; i++) {
+ /* Finished? */
+ if (data[i] == delim) {
+ if (port == 0)
+ break;
+ array[0] = port >> 8;
+ array[1] = port;
+ return i + 1;
+ }
+ else if (data[i] >= '0' && data[i] <= '9')
+ port = port*10 + data[i] - '0';
+ else /* Some other crap */
+ break;
+ }
+ return 0;
+}
+
+/* Returns 0, or length of numbers: |1|132.235.1.2|6275| */
+static int try_eprt(const char *data, size_t dlen, u_int32_t array[6],
+ char term)
+{
+ char delim;
+ int length;
+
+ /* First character is delimiter, then "1" for IPv4, then
+ delimiter again. */
+ if (dlen <= 3) return 0;
+ delim = data[0];
+ if (isdigit(delim) || delim < 33 || delim > 126
+ || data[1] != '1' || data[2] != delim)
+ return 0;
+
+ DEBUGP("EPRT: Got |1|!\n");
+ /* Now we have IP address. */
+ length = try_number(data + 3, dlen - 3, array, 4, '.', delim);
+ if (length == 0)
+ return 0;
+
+ DEBUGP("EPRT: Got IP address!\n");
+ /* Start offset includes initial "|1|", and trailing delimiter */
+ return get_port(data, 3 + length + 1, dlen, delim, array+4);
+}
+
+/* Returns 0, or length of numbers: |||6446| */
+static int try_epsv_response(const char *data, size_t dlen, u_int32_t array[6],
+ char term)
+{
+ char delim;
+
+ /* Three delimiters. */
+ if (dlen <= 3) return 0;
+ delim = data[0];
+ if (isdigit(delim) || delim < 33 || delim > 126
+ || data[1] != delim || data[2] != delim)
+ return 0;
+
+ return get_port(data, 3, dlen, delim, array+4);
+}
+
+/* Return 1 for match, 0 for no match (accept), -1 for partial match. */
+static int find_pattern(const char *data, size_t dlen,
+ const char *pattern, size_t plen,
+ char skip, char term,
+ unsigned int *numoff,
+ unsigned int *numlen,
+ u_int32_t array[6],
+ int (*getnum)(const char *, size_t, u_int32_t[], char))
+{
+ size_t i;
+
+ DEBUGP("find_pattern `%s': dlen = %u\n", pattern, dlen);
+ if (dlen == 0)
+ return 0;
+
+ if (dlen <= plen) {
+ /* Short packet: try for partial? */
+ if (strnicmp(data, pattern, dlen) == 0)
+ return -1;
+ else return 0;
+ }
+
+ if (strnicmp(data, pattern, plen) != 0) {
+#if 0
+ size_t i;
+
+ DEBUGP("ftp: string mismatch\n");
+ for (i = 0; i < plen; i++) {
+ DEBUGP("ftp:char %u `%c'(%u) vs `%c'(%u)\n",
+ i, data[i], data[i],
+ pattern[i], pattern[i]);
+ }
+#endif
+ return 0;
+ }
+
+ DEBUGP("Pattern matches!\n");
+ /* Now we've found the constant string, try to skip
+ to the 'skip' character */
+ for (i = plen; data[i] != skip; i++)
+ if (i == dlen - 1) return -1;
+
+ /* Skip over the last character */
+ i++;
+
+ DEBUGP("Skipped up to `%c'!\n", skip);
+
+ *numoff = i;
+ *numlen = getnum(data + i, dlen - i, array, term);
+ if (!*numlen)
+ return -1;
+
+ DEBUGP("Match succeeded!\n");
+ return 1;
+}
+
+/* Look up to see if we're just after a \n. */
+static int find_nl_seq(u32 seq, const struct ip_ct_ftp_master *info, int dir)
+{
+ unsigned int i;
+
+ for (i = 0; i < info->seq_aft_nl_num[dir]; i++)
+ if (info->seq_aft_nl[dir][i] == seq)
+ return 1;
+ return 0;
+}
+
+/* We don't update if it's older than what we have. */
+static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir)
+{
+ unsigned int i, oldest = NUM_SEQ_TO_REMEMBER;
+
+ /* Look for oldest: if we find exact match, we're done. */
+ for (i = 0; i < info->seq_aft_nl_num[dir]; i++) {
+ if (info->seq_aft_nl[dir][i] == nl_seq)
+ return;
+
+		if (oldest == NUM_SEQ_TO_REMEMBER
+		    || before(info->seq_aft_nl[dir][i],
+			      info->seq_aft_nl[dir][oldest]))
+ oldest = i;
+ }
+
+ if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER)
+ info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq;
+ else if (oldest != NUM_SEQ_TO_REMEMBER)
+ info->seq_aft_nl[dir][oldest] = nl_seq;
+}
+
+static int help(struct sk_buff **pskb,
+ struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ unsigned int dataoff, datalen;
+ struct tcphdr _tcph, *th;
+ char *fb_ptr;
+ int ret;
+ u32 seq, array[6] = { 0 };
+ int dir = CTINFO2DIR(ctinfo);
+ unsigned int matchlen, matchoff;
+ struct ip_ct_ftp_master *ct_ftp_info = &ct->help.ct_ftp_info;
+ struct ip_conntrack_expect *exp;
+ unsigned int i;
+ int found = 0, ends_in_nl;
+
+ /* Until there's been traffic both ways, don't look in packets. */
+ if (ctinfo != IP_CT_ESTABLISHED
+ && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) {
+ DEBUGP("ftp: Conntrackinfo = %u\n", ctinfo);
+ return NF_ACCEPT;
+ }
+
+ th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4,
+ sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ return NF_ACCEPT;
+
+ dataoff = (*pskb)->nh.iph->ihl*4 + th->doff*4;
+ /* No data? */
+ if (dataoff >= (*pskb)->len) {
+ DEBUGP("ftp: pskblen = %u\n", (*pskb)->len);
+ return NF_ACCEPT;
+ }
+ datalen = (*pskb)->len - dataoff;
+
+ LOCK_BH(&ip_ftp_lock);
+ fb_ptr = skb_header_pointer(*pskb, dataoff,
+ (*pskb)->len - dataoff, ftp_buffer);
+ BUG_ON(fb_ptr == NULL);
+
+ ends_in_nl = (fb_ptr[datalen - 1] == '\n');
+ seq = ntohl(th->seq) + datalen;
+
+ /* Look up to see if we're just after a \n. */
+ if (!find_nl_seq(ntohl(th->seq), ct_ftp_info, dir)) {
+		DEBUGP("ip_conntrack_ftp_help: wrong seq pos %u (dir %d)\n",
+		       ntohl(th->seq), dir);
+ ret = NF_ACCEPT;
+ goto out_update_nl;
+ }
+
+ /* Initialize IP array to expected address (it's not mentioned
+ in EPSV responses) */
+ array[0] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 24) & 0xFF;
+ array[1] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 16) & 0xFF;
+ array[2] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 8) & 0xFF;
+ array[3] = ntohl(ct->tuplehash[dir].tuple.src.ip) & 0xFF;
+
+ for (i = 0; i < ARRAY_SIZE(search); i++) {
+ if (search[i].dir != dir) continue;
+
+ found = find_pattern(fb_ptr, (*pskb)->len - dataoff,
+ search[i].pattern,
+ search[i].plen,
+ search[i].skip,
+ search[i].term,
+ &matchoff, &matchlen,
+ array,
+ search[i].getnum);
+ if (found) break;
+ }
+ if (found == -1) {
+ /* We don't usually drop packets. After all, this is
+ connection tracking, not packet filtering.
+ However, it is necessary for accurate tracking in
+ this case. */
+ if (net_ratelimit())
+ printk("conntrack_ftp: partial %s %u+%u\n",
+ search[i].pattern,
+ ntohl(th->seq), datalen);
+ ret = NF_DROP;
+ goto out;
+ } else if (found == 0) { /* No match */
+ ret = NF_ACCEPT;
+ goto out_update_nl;
+ }
+
+ DEBUGP("conntrack_ftp: match `%s' (%u bytes at %u)\n",
+ fb_ptr + matchoff, matchlen, ntohl(th->seq) + matchoff);
+
+ /* Allocate expectation which will be inserted */
+ exp = ip_conntrack_expect_alloc();
+ if (exp == NULL) {
+ ret = NF_DROP;
+ goto out;
+ }
+
+ /* We refer to the reverse direction ("!dir") tuples here,
+ * because we're expecting something in the other direction.
+ * Doesn't matter unless NAT is happening. */
+ exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip;
+
+ if (htonl((array[0] << 24) | (array[1] << 16) | (array[2] << 8) | array[3])
+ != ct->tuplehash[dir].tuple.src.ip) {
+ /* Enrico Scholz's passive FTP to partially RNAT'd ftp
+ server: it really wants us to connect to a
+ different IP address. Simply don't record it for
+ NAT. */
+ DEBUGP("conntrack_ftp: NOT RECORDING: %u,%u,%u,%u != %u.%u.%u.%u\n",
+ array[0], array[1], array[2], array[3],
+ NIPQUAD(ct->tuplehash[dir].tuple.src.ip));
+
+ /* Thanks to Cristiano Lincoln Mattos
+ <lincoln@cesar.org.br> for reporting this potential
+ problem (DMZ machines opening holes to internal
+ networks, or the packet filter itself). */
+ if (!loose) {
+ ret = NF_ACCEPT;
+ ip_conntrack_expect_free(exp);
+ goto out_update_nl;
+ }
+ exp->tuple.dst.ip = htonl((array[0] << 24) | (array[1] << 16)
+ | (array[2] << 8) | array[3]);
+ }
+
+ exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
+ exp->tuple.dst.u.tcp.port = htons(array[4] << 8 | array[5]);
+ exp->tuple.src.u.tcp.port = 0; /* Don't care. */
+ exp->tuple.dst.protonum = IPPROTO_TCP;
+ exp->mask = ((struct ip_conntrack_tuple)
+ { { 0xFFFFFFFF, { 0 } },
+ { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }});
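+	/* Mask: the source IP, destination IP, destination port and
+	 * protocol above must match exactly; the source port (set to 0
+	 * in the tuple, masked 0) is a wildcard. */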
+
+ exp->expectfn = NULL;
+ exp->master = ct;
+
+ /* Now, NAT might want to mangle the packet, and register the
+ * (possibly changed) expectation itself. */
+ if (ip_nat_ftp_hook)
+ ret = ip_nat_ftp_hook(pskb, ctinfo, search[i].ftptype,
+ matchoff, matchlen, exp, &seq);
+ else {
+ /* Can't expect this? Best to drop packet now. */
+ if (ip_conntrack_expect_related(exp) != 0) {
+ ip_conntrack_expect_free(exp);
+ ret = NF_DROP;
+ } else
+ ret = NF_ACCEPT;
+ }
+
+out_update_nl:
+ /* Now if this ends in \n, update ftp info. Seq may have been
+ * adjusted by NAT code. */
+ if (ends_in_nl)
+		update_nl_seq(seq, ct_ftp_info, dir);
+ out:
+ UNLOCK_BH(&ip_ftp_lock);
+ return ret;
+}
+
+static struct ip_conntrack_helper ftp[MAX_PORTS];
+static char ftp_names[MAX_PORTS][10];
+
+/* Not __exit: called from init() */
+static void fini(void)
+{
+ int i;
+ for (i = 0; i < ports_c; i++) {
+ DEBUGP("ip_ct_ftp: unregistering helper for port %d\n",
+ ports[i]);
+ ip_conntrack_helper_unregister(&ftp[i]);
+ }
+}
+
+static int __init init(void)
+{
+ int i, ret;
+ char *tmpname;
+
+ if (ports_c == 0)
+ ports[ports_c++] = FTP_PORT;
+
+ for (i = 0; i < ports_c; i++) {
+ ftp[i].tuple.src.u.tcp.port = htons(ports[i]);
+ ftp[i].tuple.dst.protonum = IPPROTO_TCP;
+ ftp[i].mask.src.u.tcp.port = 0xFFFF;
+ ftp[i].mask.dst.protonum = 0xFF;
+ ftp[i].max_expected = 1;
+ ftp[i].timeout = 5 * 60; /* 5 minutes */
+ ftp[i].me = THIS_MODULE;
+ ftp[i].help = help;
+
+ tmpname = &ftp_names[i][0];
+ if (ports[i] == FTP_PORT)
+ sprintf(tmpname, "ftp");
+ else
+ sprintf(tmpname, "ftp-%d", ports[i]);
+ ftp[i].name = tmpname;
+
+ DEBUGP("ip_ct_ftp: registering helper for port %d\n",
+ ports[i]);
+ ret = ip_conntrack_helper_register(&ftp[i]);
+
+ if (ret) {
+ fini();
+ return ret;
+ }
+ }
+ return 0;
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c
new file mode 100644
index 00000000000..33cc7348b6e
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_irc.c
@@ -0,0 +1,313 @@
+/* IRC extension for IP connection tracking, Version 1.21
+ * (C) 2000-2002 by Harald Welte <laforge@gnumonks.org>
+ * based on RR's ip_conntrack_ftp.c
+ *
+ * ip_conntrack_irc.c,v 1.21 2002/02/05 14:49:26 laforge Exp
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ **
+ * Module load syntax:
+ * insmod ip_conntrack_irc.o ports=port1,port2,...port<MAX_PORTS>
+ * max_dcc_channels=n dcc_timeout=secs
+ *
+ *	Please give the ports of all IRC servers you wish to connect to.
+ *	If you don't specify ports, the default will be port 6667.
+ *	With max_dcc_channels you can define the maximum number of
+ *	not-yet-answered DCC channels per IRC session (default 8).
+ *	With dcc_timeout you can specify how long the system waits for
+ *	an expected DCC channel (default 300 seconds).
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter_ipv4/lockhelp.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_irc.h>
+#include <linux/moduleparam.h>
+
+#define MAX_PORTS 8
+static int ports[MAX_PORTS];
+static int ports_c;
+static int max_dcc_channels = 8;
+static int dcc_timeout = 300;
+/* This is slow, but it's simple. --RR */
+static char irc_buffer[65536];
+static DECLARE_LOCK(irc_buffer_lock);
+
+unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb,
+ enum ip_conntrack_info ctinfo,
+ unsigned int matchoff,
+ unsigned int matchlen,
+ struct ip_conntrack_expect *exp);
+EXPORT_SYMBOL_GPL(ip_nat_irc_hook);
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("IRC (DCC) connection tracking helper");
+MODULE_LICENSE("GPL");
+module_param_array(ports, int, &ports_c, 0400);
+MODULE_PARM_DESC(ports, "port numbers of IRC servers");
+module_param(max_dcc_channels, int, 0400);
+MODULE_PARM_DESC(max_dcc_channels, "max number of expected DCC channels per IRC session");
+module_param(dcc_timeout, int, 0400);
+MODULE_PARM_DESC(dcc_timeout, "timeout (in seconds) for unestablished DCC channels");
+
+static char *dccprotos[] = { "SEND ", "CHAT ", "MOVE ", "TSEND ", "SCHAT " };
+#define MINMATCHLEN 5
+
+#if 0
+#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s:" format, \
+ __FILE__, __FUNCTION__ , ## args)
+#else
+#define DEBUGP(format, args...)
+#endif
+
+static int parse_dcc(char *data, char *data_end, u_int32_t *ip,
+ u_int16_t *port, char **ad_beg_p, char **ad_end_p)
+/* tries to get the ip_addr and port out of a dcc command
+ return value: -1 on failure, 0 on success
+ data pointer to first byte of DCC command data
+ data_end pointer to last byte of dcc command data
+ ip returns parsed ip of dcc command
+ port returns parsed port of dcc command
+ ad_beg_p returns pointer to first byte of addr data
+ ad_end_p returns pointer to last byte of addr data */
+{
+
+ /* at least 12: "AAAAAAAA P\1\n" */
+ while (*data++ != ' ')
+ if (data > data_end - 12)
+ return -1;
+
+ *ad_beg_p = data;
+ *ip = simple_strtoul(data, &data, 10);
+
+ /* skip blanks between ip and port */
+ while (*data == ' ') {
+ if (data >= data_end)
+ return -1;
+ data++;
+ }
+
+ *port = simple_strtoul(data, &data, 10);
+ *ad_end_p = data;
+
+ return 0;
+}
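+/* Illustrative example (not from the original source): for the CTCP
+ * message "\1DCC SEND somefile 3232235521 2000\1", parse_dcc() gets the
+ * data after "SEND ", skips the filename, and returns ip = 3232235521
+ * (192.168.0.1) and port = 2000, with ad_beg_p/ad_end_p bracketing the
+ * "3232235521 2000" substring so NAT can rewrite it in place. */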
+
+static int help(struct sk_buff **pskb,
+ struct ip_conntrack *ct, enum ip_conntrack_info ctinfo)
+{
+ unsigned int dataoff;
+ struct tcphdr _tcph, *th;
+ char *data, *data_limit, *ib_ptr;
+ int dir = CTINFO2DIR(ctinfo);
+ struct ip_conntrack_expect *exp;
+ u32 seq;
+ u_int32_t dcc_ip;
+ u_int16_t dcc_port;
+ int i, ret = NF_ACCEPT;
+ char *addr_beg_p, *addr_end_p;
+
+ DEBUGP("entered\n");
+
+ /* If packet is coming from IRC server */
+ if (dir == IP_CT_DIR_REPLY)
+ return NF_ACCEPT;
+
+ /* Until there's been traffic both ways, don't look in packets. */
+ if (ctinfo != IP_CT_ESTABLISHED
+ && ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) {
+ DEBUGP("Conntrackinfo = %u\n", ctinfo);
+ return NF_ACCEPT;
+ }
+
+ /* Not a full tcp header? */
+ th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4,
+ sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ return NF_ACCEPT;
+
+ /* No data? */
+ dataoff = (*pskb)->nh.iph->ihl*4 + th->doff*4;
+ if (dataoff >= (*pskb)->len)
+ return NF_ACCEPT;
+
+ LOCK_BH(&irc_buffer_lock);
+ ib_ptr = skb_header_pointer(*pskb, dataoff,
+ (*pskb)->len - dataoff, irc_buffer);
+ BUG_ON(ib_ptr == NULL);
+
+ data = ib_ptr;
+ data_limit = ib_ptr + (*pskb)->len - dataoff;
+
+	/* strlen("\1DCC SEND t AAAAAAAA P\1\n") = 24:
+	 * 5 ("\1DCC ") + MINMATCHLEN + strlen("t AAAAAAAA P\1\n") (= 14) */
+ while (data < (data_limit - (19 + MINMATCHLEN))) {
+ if (memcmp(data, "\1DCC ", 5)) {
+ data++;
+ continue;
+ }
+
+ data += 5;
+ /* we have at least (19+MINMATCHLEN)-5 bytes valid data left */
+
+		DEBUGP("DCC found in master %u.%u.%u.%u:%u %u.%u.%u.%u:%u...\n",
+			NIPQUAD((*pskb)->nh.iph->saddr), ntohs(th->source),
+			NIPQUAD((*pskb)->nh.iph->daddr), ntohs(th->dest));
+
+ for (i = 0; i < ARRAY_SIZE(dccprotos); i++) {
+ if (memcmp(data, dccprotos[i], strlen(dccprotos[i]))) {
+ /* no match */
+ continue;
+ }
+
+ DEBUGP("DCC %s detected\n", dccprotos[i]);
+ data += strlen(dccprotos[i]);
+ /* we have at least
+ * (19+MINMATCHLEN)-5-dccprotos[i].matchlen bytes valid
+ * data left (== 14/13 bytes) */
+ if (parse_dcc((char *)data, data_limit, &dcc_ip,
+ &dcc_port, &addr_beg_p, &addr_end_p)) {
+ /* unable to parse */
+ DEBUGP("unable to parse dcc command\n");
+ continue;
+ }
+ DEBUGP("DCC bound ip/port: %u.%u.%u.%u:%u\n",
+ HIPQUAD(dcc_ip), dcc_port);
+
+ /* dcc_ip can be the internal OR external (NAT'ed) IP
+ * Tiago Sousa <mirage@kaotik.org> */
+ if (ct->tuplehash[dir].tuple.src.ip != htonl(dcc_ip)
+ && ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip != htonl(dcc_ip)) {
+ if (net_ratelimit())
+ printk(KERN_WARNING
+ "Forged DCC command from "
+ "%u.%u.%u.%u: %u.%u.%u.%u:%u\n",
+ NIPQUAD(ct->tuplehash[dir].tuple.src.ip),
+ HIPQUAD(dcc_ip), dcc_port);
+
+ continue;
+ }
+
+ exp = ip_conntrack_expect_alloc();
+ if (exp == NULL) {
+ ret = NF_DROP;
+ goto out;
+ }
+
+ /* save position of address in dcc string,
+ * necessary for NAT */
+ DEBUGP("tcph->seq = %u\n", th->seq);
+ seq = ntohl(th->seq) + (addr_beg_p - ib_ptr);
+
+ /* We refer to the reverse direction ("!dir")
+ * tuples here, because we're expecting
+			 * something in the other direction.
+ * Doesn't matter unless NAT is happening. */
+ exp->tuple = ((struct ip_conntrack_tuple)
+ { { 0, { 0 } },
+ { ct->tuplehash[!dir].tuple.dst.ip,
+ { .tcp = { htons(dcc_port) } },
+ IPPROTO_TCP }});
+ exp->mask = ((struct ip_conntrack_tuple)
+ { { 0, { 0 } },
+ { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }});
+ exp->expectfn = NULL;
+ exp->master = ct;
+ if (ip_nat_irc_hook)
+ ret = ip_nat_irc_hook(pskb, ctinfo,
+ addr_beg_p - ib_ptr,
+ addr_end_p - addr_beg_p,
+ exp);
+ else if (ip_conntrack_expect_related(exp) != 0) {
+ ip_conntrack_expect_free(exp);
+ ret = NF_DROP;
+ }
+ goto out;
+ } /* for .. NUM_DCCPROTO */
+ } /* while data < ... */
+
+ out:
+ UNLOCK_BH(&irc_buffer_lock);
+ return ret;
+}
+
+static struct ip_conntrack_helper irc_helpers[MAX_PORTS];
+static char irc_names[MAX_PORTS][10];
+
+static void fini(void);
+
+static int __init init(void)
+{
+ int i, ret;
+ struct ip_conntrack_helper *hlpr;
+ char *tmpname;
+
+ if (max_dcc_channels < 1) {
+ printk("ip_conntrack_irc: max_dcc_channels must be a positive integer\n");
+ return -EBUSY;
+ }
+ if (dcc_timeout < 0) {
+ printk("ip_conntrack_irc: dcc_timeout must be a positive integer\n");
+ return -EBUSY;
+ }
+
+ /* If no port given, default to standard irc port */
+ if (ports_c == 0)
+ ports[ports_c++] = IRC_PORT;
+
+ for (i = 0; i < ports_c; i++) {
+ hlpr = &irc_helpers[i];
+ hlpr->tuple.src.u.tcp.port = htons(ports[i]);
+ hlpr->tuple.dst.protonum = IPPROTO_TCP;
+ hlpr->mask.src.u.tcp.port = 0xFFFF;
+ hlpr->mask.dst.protonum = 0xFF;
+ hlpr->max_expected = max_dcc_channels;
+ hlpr->timeout = dcc_timeout;
+ hlpr->me = THIS_MODULE;
+ hlpr->help = help;
+
+ tmpname = &irc_names[i][0];
+ if (ports[i] == IRC_PORT)
+ sprintf(tmpname, "irc");
+ else
+			sprintf(tmpname, "irc-%d", ports[i]);
+ hlpr->name = tmpname;
+
+ DEBUGP("port #%d: %d\n", i, ports[i]);
+
+ ret = ip_conntrack_helper_register(hlpr);
+
+ if (ret) {
+ printk("ip_conntrack_irc: ERROR registering port %d\n",
+ ports[i]);
+ fini();
+ return -EBUSY;
+ }
+ }
+ return 0;
+}
+
+/* This function is intentionally _NOT_ defined as __exit, because
+ * it is needed by the init function */
+static void fini(void)
+{
+ int i;
+ for (i = 0; i < ports_c; i++) {
+ DEBUGP("unregistering port %d\n",
+ ports[i]);
+ ip_conntrack_helper_unregister(&irc_helpers[i]);
+ }
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_generic.c b/net/ipv4/netfilter/ip_conntrack_proto_generic.c
new file mode 100644
index 00000000000..88c3712bd25
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_proto_generic.c
@@ -0,0 +1,75 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+
+unsigned long ip_ct_generic_timeout = 600*HZ;
+
+static int generic_pkt_to_tuple(const struct sk_buff *skb,
+ unsigned int dataoff,
+ struct ip_conntrack_tuple *tuple)
+{
+ tuple->src.u.all = 0;
+ tuple->dst.u.all = 0;
+
+ return 1;
+}
+
+static int generic_invert_tuple(struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack_tuple *orig)
+{
+ tuple->src.u.all = 0;
+ tuple->dst.u.all = 0;
+
+ return 1;
+}
+
+/* Print out the per-protocol part of the tuple. */
+static int generic_print_tuple(struct seq_file *s,
+ const struct ip_conntrack_tuple *tuple)
+{
+ return 0;
+}
+
+/* Print out the private part of the conntrack. */
+static int generic_print_conntrack(struct seq_file *s,
+ const struct ip_conntrack *state)
+{
+ return 0;
+}
+
+/* Returns verdict for packet, or -1 for invalid. */
+static int packet(struct ip_conntrack *conntrack,
+ const struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo)
+{
+ ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_generic_timeout);
+ return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol found. */
+static int new(struct ip_conntrack *conntrack, const struct sk_buff *skb)
+{
+ return 1;
+}
+
+struct ip_conntrack_protocol ip_conntrack_generic_protocol =
+{
+ .proto = 0,
+ .name = "unknown",
+ .pkt_to_tuple = generic_pkt_to_tuple,
+ .invert_tuple = generic_invert_tuple,
+ .print_tuple = generic_print_tuple,
+ .print_conntrack = generic_print_conntrack,
+ .packet = packet,
+ .new = new,
+};
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
new file mode 100644
index 00000000000..602c74db325
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
@@ -0,0 +1,279 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/netfilter.h>
+#include <linux/in.h>
+#include <linux/icmp.h>
+#include <linux/seq_file.h>
+#include <net/ip.h>
+#include <net/checksum.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+
+unsigned long ip_ct_icmp_timeout = 30*HZ;
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+static int icmp_pkt_to_tuple(const struct sk_buff *skb,
+ unsigned int dataoff,
+ struct ip_conntrack_tuple *tuple)
+{
+ struct icmphdr _hdr, *hp;
+
+ hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+ if (hp == NULL)
+ return 0;
+
+ tuple->dst.u.icmp.type = hp->type;
+ tuple->src.u.icmp.id = hp->un.echo.id;
+ tuple->dst.u.icmp.code = hp->code;
+
+ return 1;
+}
+
+static int icmp_invert_tuple(struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack_tuple *orig)
+{
+ /* Add 1; spaces filled with 0. */
+ static u_int8_t invmap[]
+ = { [ICMP_ECHO] = ICMP_ECHOREPLY + 1,
+ [ICMP_ECHOREPLY] = ICMP_ECHO + 1,
+ [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1,
+ [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1,
+ [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1,
+ [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1,
+ [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1,
+ [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1};
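+	/* The "+ 1" lets a zero entry mean "no mapping": e.g.
+	 * invmap[ICMP_ECHO] holds ICMP_ECHOREPLY + 1, and the lookup
+	 * below subtracts 1 again after the zero check. */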
+
+ if (orig->dst.u.icmp.type >= sizeof(invmap)
+ || !invmap[orig->dst.u.icmp.type])
+ return 0;
+
+ tuple->src.u.icmp.id = orig->src.u.icmp.id;
+ tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1;
+ tuple->dst.u.icmp.code = orig->dst.u.icmp.code;
+ return 1;
+}
+
+/* Print out the per-protocol part of the tuple. */
+static int icmp_print_tuple(struct seq_file *s,
+ const struct ip_conntrack_tuple *tuple)
+{
+ return seq_printf(s, "type=%u code=%u id=%u ",
+ tuple->dst.u.icmp.type,
+ tuple->dst.u.icmp.code,
+ ntohs(tuple->src.u.icmp.id));
+}
+
+/* Print out the private part of the conntrack. */
+static int icmp_print_conntrack(struct seq_file *s,
+ const struct ip_conntrack *conntrack)
+{
+ return 0;
+}
+
+/* Returns verdict for packet, or -1 for invalid. */
+static int icmp_packet(struct ip_conntrack *ct,
+ const struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo)
+{
+ /* Try to delete connection immediately after all replies:
+ won't actually vanish as we still have skb, and del_timer
+ means this will only run once even if count hits zero twice
+ (theoretically possible with SMP) */
+ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) {
+ if (atomic_dec_and_test(&ct->proto.icmp.count)
+ && del_timer(&ct->timeout))
+ ct->timeout.function((unsigned long)ct);
+ } else {
+ atomic_inc(&ct->proto.icmp.count);
+ ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout);
+ }
+
+ return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol found. */
+static int icmp_new(struct ip_conntrack *conntrack,
+ const struct sk_buff *skb)
+{
+ static u_int8_t valid_new[]
+ = { [ICMP_ECHO] = 1,
+ [ICMP_TIMESTAMP] = 1,
+ [ICMP_INFO_REQUEST] = 1,
+ [ICMP_ADDRESS] = 1 };
+
+ if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new)
+ || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) {
+ /* Can't create a new ICMP `conn' with this. */
+ DEBUGP("icmp: can't create new conn with type %u\n",
+ conntrack->tuplehash[0].tuple.dst.u.icmp.type);
+ DUMP_TUPLE(&conntrack->tuplehash[0].tuple);
+ return 0;
+ }
+ atomic_set(&conntrack->proto.icmp.count, 0);
+ return 1;
+}
+
+static int
+icmp_error_message(struct sk_buff *skb,
+ enum ip_conntrack_info *ctinfo,
+ unsigned int hooknum)
+{
+ struct ip_conntrack_tuple innertuple, origtuple;
+ struct {
+ struct icmphdr icmp;
+ struct iphdr ip;
+ } _in, *inside;
+ struct ip_conntrack_protocol *innerproto;
+ struct ip_conntrack_tuple_hash *h;
+ int dataoff;
+
+ IP_NF_ASSERT(skb->nfct == NULL);
+
+ /* Not enough header? */
+ inside = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_in), &_in);
+ if (inside == NULL)
+ return NF_ACCEPT;
+
+	/* Ignore ICMPs containing fragments (shouldn't happen) */
+ if (inside->ip.frag_off & htons(IP_OFFSET)) {
+ DEBUGP("icmp_error_track: fragment of proto %u\n",
+ inside->ip.protocol);
+ return NF_ACCEPT;
+ }
+
+ innerproto = ip_ct_find_proto(inside->ip.protocol);
+ dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp) + inside->ip.ihl*4;
+ /* Are they talking about one of our connections? */
+ if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) {
+ DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol);
+ return NF_ACCEPT;
+ }
+
+ /* Ordinarily, we'd expect the inverted tupleproto, but it's
+ been preserved inside the ICMP. */
+ if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) {
+ DEBUGP("icmp_error_track: Can't invert tuple\n");
+ return NF_ACCEPT;
+ }
+
+ *ctinfo = IP_CT_RELATED;
+
+ h = ip_conntrack_find_get(&innertuple, NULL);
+ if (!h) {
+ /* Locally generated ICMPs will match inverted if they
+ haven't been SNAT'ed yet */
+ /* FIXME: NAT code has to handle half-done double NAT --RR */
+ if (hooknum == NF_IP_LOCAL_OUT)
+ h = ip_conntrack_find_get(&origtuple, NULL);
+
+ if (!h) {
+ DEBUGP("icmp_error_track: no match\n");
+ return NF_ACCEPT;
+ }
+ /* Reverse direction from that found */
+ if (DIRECTION(h) != IP_CT_DIR_REPLY)
+ *ctinfo += IP_CT_IS_REPLY;
+ } else {
+ if (DIRECTION(h) == IP_CT_DIR_REPLY)
+ *ctinfo += IP_CT_IS_REPLY;
+ }
+
+ /* Update skb to refer to this connection */
+ skb->nfct = &tuplehash_to_ctrack(h)->ct_general;
+ skb->nfctinfo = *ctinfo;
+ return -NF_ACCEPT;
+}
+
+/* Small and modified version of icmp_rcv */
+static int
+icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
+ unsigned int hooknum)
+{
+ struct icmphdr _ih, *icmph;
+
+ /* Not enough header? */
+ icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih);
+ if (icmph == NULL) {
+ if (LOG_INVALID(IPPROTO_ICMP))
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ "ip_ct_icmp: short packet ");
+ return -NF_ACCEPT;
+ }
+
+ /* See ip_conntrack_proto_tcp.c */
+ if (hooknum != NF_IP_PRE_ROUTING)
+ goto checksum_skipped;
+
+ switch (skb->ip_summed) {
+ case CHECKSUM_HW:
+ if (!(u16)csum_fold(skb->csum))
+ break;
+ if (LOG_INVALID(IPPROTO_ICMP))
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ "ip_ct_icmp: bad HW ICMP checksum ");
+ return -NF_ACCEPT;
+ case CHECKSUM_NONE:
+ if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) {
+ if (LOG_INVALID(IPPROTO_ICMP))
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ "ip_ct_icmp: bad ICMP checksum ");
+ return -NF_ACCEPT;
+ }
+ default:
+ break;
+ }
+
+checksum_skipped:
+ /*
+ * 18 is the highest 'known' ICMP type. Anything else is a mystery
+ *
+	 * RFC 1122: 3.2.2  Unknown ICMP message types MUST be silently
+ * discarded.
+ */
+ if (icmph->type > NR_ICMP_TYPES) {
+ if (LOG_INVALID(IPPROTO_ICMP))
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ "ip_ct_icmp: invalid ICMP type ");
+ return -NF_ACCEPT;
+ }
+
+ /* Need to track icmp error message? */
+ if (icmph->type != ICMP_DEST_UNREACH
+ && icmph->type != ICMP_SOURCE_QUENCH
+ && icmph->type != ICMP_TIME_EXCEEDED
+ && icmph->type != ICMP_PARAMETERPROB
+ && icmph->type != ICMP_REDIRECT)
+ return NF_ACCEPT;
+
+ return icmp_error_message(skb, ctinfo, hooknum);
+}
+
+struct ip_conntrack_protocol ip_conntrack_protocol_icmp =
+{
+ .proto = IPPROTO_ICMP,
+ .name = "icmp",
+ .pkt_to_tuple = icmp_pkt_to_tuple,
+ .invert_tuple = icmp_invert_tuple,
+ .print_tuple = icmp_print_tuple,
+ .print_conntrack = icmp_print_conntrack,
+ .packet = icmp_packet,
+ .new = icmp_new,
+ .error = icmp_error,
+};
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
new file mode 100644
index 00000000000..ff8c34a860f
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
@@ -0,0 +1,649 @@
+/*
+ * Connection tracking protocol helper module for SCTP.
+ *
+ * SCTP is defined in RFC 2960. References to various sections in this code
+ * are to this RFC.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * Added support for proc manipulation of timeouts.
+ */
+
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/sctp.h>
+#include <linux/string.h>
+#include <linux/seq_file.h>
+
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+#include <linux/netfilter_ipv4/lockhelp.h>
+
+#if 0
+#define DEBUGP(format, ...) printk(format, ## __VA_ARGS__)
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/* Protects conntrack->proto.sctp */
+static DECLARE_RWLOCK(sctp_lock);
+
+/* FIXME: Examine ipfilter's timeouts and conntrack transitions more
+ closely. They're more complex. --RR
+
+ And so for me for SCTP :D -Kiran */
+
+static const char *sctp_conntrack_names[] = {
+ "NONE",
+ "CLOSED",
+ "COOKIE_WAIT",
+ "COOKIE_ECHOED",
+ "ESTABLISHED",
+ "SHUTDOWN_SENT",
+ "SHUTDOWN_RECD",
+ "SHUTDOWN_ACK_SENT",
+};
+
+#define SECS * HZ
+#define MINS * 60 SECS
+#define HOURS * 60 MINS
+#define DAYS * 24 HOURS
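+/* (The macros above just paste multipliers, e.g. "3 SECS" expands to
+   3 * HZ and "5 DAYS" to 5 * 24 * 60 * 60 * HZ.) */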
+
+static unsigned long ip_ct_sctp_timeout_closed = 10 SECS;
+static unsigned long ip_ct_sctp_timeout_cookie_wait = 3 SECS;
+static unsigned long ip_ct_sctp_timeout_cookie_echoed = 3 SECS;
+static unsigned long ip_ct_sctp_timeout_established = 5 DAYS;
+static unsigned long ip_ct_sctp_timeout_shutdown_sent = 300 SECS / 1000;
+static unsigned long ip_ct_sctp_timeout_shutdown_recd = 300 SECS / 1000;
+static unsigned long ip_ct_sctp_timeout_shutdown_ack_sent = 3 SECS;
+
+static unsigned long * sctp_timeouts[]
+= { NULL, /* SCTP_CONNTRACK_NONE */
+ &ip_ct_sctp_timeout_closed, /* SCTP_CONNTRACK_CLOSED */
+ &ip_ct_sctp_timeout_cookie_wait, /* SCTP_CONNTRACK_COOKIE_WAIT */
+ &ip_ct_sctp_timeout_cookie_echoed, /* SCTP_CONNTRACK_COOKIE_ECHOED */
+ &ip_ct_sctp_timeout_established, /* SCTP_CONNTRACK_ESTABLISHED */
+ &ip_ct_sctp_timeout_shutdown_sent, /* SCTP_CONNTRACK_SHUTDOWN_SENT */
+ &ip_ct_sctp_timeout_shutdown_recd, /* SCTP_CONNTRACK_SHUTDOWN_RECD */
+ &ip_ct_sctp_timeout_shutdown_ack_sent /* SCTP_CONNTRACK_SHUTDOWN_ACK_SENT */
+ };
+
+#define sNO SCTP_CONNTRACK_NONE
+#define sCL SCTP_CONNTRACK_CLOSED
+#define sCW SCTP_CONNTRACK_COOKIE_WAIT
+#define sCE SCTP_CONNTRACK_COOKIE_ECHOED
+#define sES SCTP_CONNTRACK_ESTABLISHED
+#define sSS SCTP_CONNTRACK_SHUTDOWN_SENT
+#define sSR SCTP_CONNTRACK_SHUTDOWN_RECD
+#define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT
+#define sIV SCTP_CONNTRACK_MAX
+
+/*
+ These are the descriptions of the states:
+
+NOTE: These state names are tantalizingly similar to the states of an
+SCTP endpoint. But the interpretation of the states is a little different,
+considering that these are the states of the connection and not of an end
+point. Please note the subtleties. -Kiran
+
+NONE - Nothing so far.
+COOKIE WAIT - We have seen an INIT chunk in the original direction, and
+	      possibly an INIT_ACK chunk in the reply direction.
+COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction.
+ESTABLISHED - We have seen a COOKIE_ACK in the reply direction.
+SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction.
+SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply direction.
+SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite
+ to that of the SHUTDOWN chunk.
+CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of
+ the SHUTDOWN chunk. Connection is closed.
+*/
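+/* Illustrative walk through a normal association, per the table below:
+     INIT (orig)               NONE              -> COOKIE_WAIT
+     INIT_ACK (reply)          COOKIE_WAIT       -> COOKIE_WAIT
+     COOKIE_ECHO (orig)        COOKIE_WAIT       -> COOKIE_ECHOED
+     COOKIE_ACK (reply)        COOKIE_ECHOED     -> ESTABLISHED
+     SHUTDOWN (orig)           ESTABLISHED       -> SHUTDOWN_SENT
+     SHUTDOWN_ACK (reply)      SHUTDOWN_SENT     -> SHUTDOWN_ACK_SENT
+     SHUTDOWN_COMPLETE (orig)  SHUTDOWN_ACK_SENT -> CLOSED
+*/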
+
+/* TODO
+   - I have assumed that the first INIT is in the original direction.
+     This messes things up when an INIT comes in the reply direction in
+     the CLOSED state.
+   - Check the error type in the reply dir before transitioning from
+     COOKIE_ECHOED to CLOSED.
+   - Sec 5.2.4 of RFC 2960
+   - Multi Homing support.
+*/
+
+/* SCTP conntrack state transitions */
+static enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = {
+ {
+/* ORIGINAL */
+/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */
+/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA},
+/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},
+/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA},
+/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA},
+/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant have Stale cookie*/
+/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */
+/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in orig dir */
+/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL}
+ },
+ {
+/* REPLY */
+/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */
+/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* INIT in sCL Big TODO */
+/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},
+/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA},
+/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA},
+/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA},
+/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in reply dir */
+/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA},
+/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL}
+ }
+};
+
+static int sctp_pkt_to_tuple(const struct sk_buff *skb,
+ unsigned int dataoff,
+ struct ip_conntrack_tuple *tuple)
+{
+ sctp_sctphdr_t _hdr, *hp;
+
+ DEBUGP(__FUNCTION__);
+ DEBUGP("\n");
+
+ /* Actually only need first 8 bytes. */
+ hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
+ if (hp == NULL)
+ return 0;
+
+ tuple->src.u.sctp.port = hp->source;
+ tuple->dst.u.sctp.port = hp->dest;
+ return 1;
+}
+
+static int sctp_invert_tuple(struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack_tuple *orig)
+{
+ DEBUGP(__FUNCTION__);
+ DEBUGP("\n");
+
+ tuple->src.u.sctp.port = orig->dst.u.sctp.port;
+ tuple->dst.u.sctp.port = orig->src.u.sctp.port;
+ return 1;
+}
+
+/* Print out the per-protocol part of the tuple. */
+static int sctp_print_tuple(struct seq_file *s,
+ const struct ip_conntrack_tuple *tuple)
+{
+ DEBUGP(__FUNCTION__);
+ DEBUGP("\n");
+
+ return seq_printf(s, "sport=%hu dport=%hu ",
+ ntohs(tuple->src.u.sctp.port),
+ ntohs(tuple->dst.u.sctp.port));
+}
+
+/* Print out the private part of the conntrack. */
+static int sctp_print_conntrack(struct seq_file *s,
+ const struct ip_conntrack *conntrack)
+{
+ enum sctp_conntrack state;
+
+ DEBUGP(__FUNCTION__);
+ DEBUGP("\n");
+
+ READ_LOCK(&sctp_lock);
+ state = conntrack->proto.sctp.state;
+ READ_UNLOCK(&sctp_lock);
+
+ return seq_printf(s, "%s ", sctp_conntrack_names[state]);
+}
+
+#define for_each_sctp_chunk(skb, sch, _sch, offset, count) \
+for (offset = skb->nh.iph->ihl * 4 + sizeof(sctp_sctphdr_t), count = 0; \
+ offset < skb->len && \
+ (sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch)); \
+ offset += (htons(sch->length) + 3) & ~3, count++)
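+/* Chunk lengths are padded to a 4-byte boundary on the wire (RFC 2960
+   Sec 3.2), hence the "(length + 3) & ~3" rounding above when stepping
+   to the next chunk. */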
+
+/* Some validity checks to make sure the chunks are fine */
+static int do_basic_checks(struct ip_conntrack *conntrack,
+ const struct sk_buff *skb,
+ char *map)
+{
+ u_int32_t offset, count;
+ sctp_chunkhdr_t _sch, *sch;
+ int flag;
+
+ DEBUGP(__FUNCTION__);
+ DEBUGP("\n");
+
+ flag = 0;
+
+ for_each_sctp_chunk (skb, sch, _sch, offset, count) {
+ DEBUGP("Chunk Num: %d Type: %d\n", count, sch->type);
+
+ if (sch->type == SCTP_CID_INIT
+ || sch->type == SCTP_CID_INIT_ACK
+ || sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
+ flag = 1;
+ }
+
+ /* Cookie Ack/Echo chunks not the first OR
+ Init / Init Ack / Shutdown compl chunks not the only chunks */
+ if ((sch->type == SCTP_CID_COOKIE_ACK
+ || sch->type == SCTP_CID_COOKIE_ECHO
+ || flag)
+		    && count != 0) {
+ DEBUGP("Basic checks failed\n");
+ return 1;
+ }
+
+ if (map) {
+ set_bit(sch->type, (void *)map);
+ }
+ }
+
+ DEBUGP("Basic checks passed\n");
+ return 0;
+}
+
+static int new_state(enum ip_conntrack_dir dir,
+ enum sctp_conntrack cur_state,
+ int chunk_type)
+{
+ int i;
+
+ DEBUGP(__FUNCTION__);
+ DEBUGP("\n");
+
+ DEBUGP("Chunk type: %d\n", chunk_type);
+
+ switch (chunk_type) {
+ case SCTP_CID_INIT:
+ DEBUGP("SCTP_CID_INIT\n");
+ i = 0; break;
+ case SCTP_CID_INIT_ACK:
+ DEBUGP("SCTP_CID_INIT_ACK\n");
+ i = 1; break;
+ case SCTP_CID_ABORT:
+ DEBUGP("SCTP_CID_ABORT\n");
+ i = 2; break;
+ case SCTP_CID_SHUTDOWN:
+ DEBUGP("SCTP_CID_SHUTDOWN\n");
+ i = 3; break;
+ case SCTP_CID_SHUTDOWN_ACK:
+ DEBUGP("SCTP_CID_SHUTDOWN_ACK\n");
+ i = 4; break;
+ case SCTP_CID_ERROR:
+ DEBUGP("SCTP_CID_ERROR\n");
+ i = 5; break;
+ case SCTP_CID_COOKIE_ECHO:
+ DEBUGP("SCTP_CID_COOKIE_ECHO\n");
+ i = 6; break;
+ case SCTP_CID_COOKIE_ACK:
+ DEBUGP("SCTP_CID_COOKIE_ACK\n");
+ i = 7; break;
+ case SCTP_CID_SHUTDOWN_COMPLETE:
+ DEBUGP("SCTP_CID_SHUTDOWN_COMPLETE\n");
+ i = 8; break;
+ default:
+		/* Other chunks like DATA, SACK, HEARTBEAT and
+		   HEARTBEAT-ACK do not cause a change in state */
+ DEBUGP("Unknown chunk type, Will stay in %s\n",
+ sctp_conntrack_names[cur_state]);
+ return cur_state;
+ }
+
+ DEBUGP("dir: %d cur_state: %s chunk_type: %d new_state: %s\n",
+ dir, sctp_conntrack_names[cur_state], chunk_type,
+ sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]);
+
+ return sctp_conntracks[dir][i][cur_state];
+}
+
+/* Returns verdict for packet, or -1 for invalid. */
+static int sctp_packet(struct ip_conntrack *conntrack,
+ const struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo)
+{
+ enum sctp_conntrack newconntrack, oldsctpstate;
+ struct iphdr *iph = skb->nh.iph;
+ sctp_sctphdr_t _sctph, *sh;
+ sctp_chunkhdr_t _sch, *sch;
+ u_int32_t offset, count;
+ char map[256 / sizeof (char)] = {0};
+
+ DEBUGP(__FUNCTION__);
+ DEBUGP("\n");
+
+ sh = skb_header_pointer(skb, iph->ihl * 4, sizeof(_sctph), &_sctph);
+ if (sh == NULL)
+ return -1;
+
+ if (do_basic_checks(conntrack, skb, map) != 0)
+ return -1;
+
+ /* Check the verification tag (Sec 8.5) */
+ if (!test_bit(SCTP_CID_INIT, (void *)map)
+ && !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, (void *)map)
+ && !test_bit(SCTP_CID_COOKIE_ECHO, (void *)map)
+ && !test_bit(SCTP_CID_ABORT, (void *)map)
+ && !test_bit(SCTP_CID_SHUTDOWN_ACK, (void *)map)
+ && (sh->vtag != conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) {
+ DEBUGP("Verification tag check failed\n");
+ return -1;
+ }
+
+ oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX;
+ for_each_sctp_chunk (skb, sch, _sch, offset, count) {
+ WRITE_LOCK(&sctp_lock);
+
+ /* Special cases of Verification tag check (Sec 8.5.1) */
+ if (sch->type == SCTP_CID_INIT) {
+ /* Sec 8.5.1 (A) */
+ if (sh->vtag != 0) {
+ WRITE_UNLOCK(&sctp_lock);
+ return -1;
+ }
+ } else if (sch->type == SCTP_CID_ABORT) {
+ /* Sec 8.5.1 (B) */
+ if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])
+ && !(sh->vtag == conntrack->proto.sctp.vtag
+ [1 - CTINFO2DIR(ctinfo)])) {
+ WRITE_UNLOCK(&sctp_lock);
+ return -1;
+ }
+ } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
+ /* Sec 8.5.1 (C) */
+ if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])
+ && !(sh->vtag == conntrack->proto.sctp.vtag
+ [1 - CTINFO2DIR(ctinfo)]
+ && (sch->flags & 1))) {
+ WRITE_UNLOCK(&sctp_lock);
+ return -1;
+ }
+ } else if (sch->type == SCTP_CID_COOKIE_ECHO) {
+ /* Sec 8.5.1 (D) */
+ if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) {
+ WRITE_UNLOCK(&sctp_lock);
+ return -1;
+ }
+ }
+
+ oldsctpstate = conntrack->proto.sctp.state;
+ newconntrack = new_state(CTINFO2DIR(ctinfo), oldsctpstate, sch->type);
+
+ /* Invalid */
+ if (newconntrack == SCTP_CONNTRACK_MAX) {
+ DEBUGP("ip_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n",
+ CTINFO2DIR(ctinfo), sch->type, oldsctpstate);
+ WRITE_UNLOCK(&sctp_lock);
+ return -1;
+ }
+
+ /* If it is an INIT or an INIT ACK note down the vtag */
+ if (sch->type == SCTP_CID_INIT
+ || sch->type == SCTP_CID_INIT_ACK) {
+ sctp_inithdr_t _inithdr, *ih;
+
+ ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t),
+ sizeof(_inithdr), &_inithdr);
+ if (ih == NULL) {
+ WRITE_UNLOCK(&sctp_lock);
+ return -1;
+ }
+ DEBUGP("Setting vtag %x for dir %d\n",
+ ih->init_tag, !CTINFO2DIR(ctinfo));
+ conntrack->proto.sctp.vtag[!CTINFO2DIR(ctinfo)] = ih->init_tag;
+ }
+
+ conntrack->proto.sctp.state = newconntrack;
+ WRITE_UNLOCK(&sctp_lock);
+ }
+
+ ip_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]);
+
+ if (oldsctpstate == SCTP_CONNTRACK_COOKIE_ECHOED
+ && CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY
+ && newconntrack == SCTP_CONNTRACK_ESTABLISHED) {
+ DEBUGP("Setting assured bit\n");
+ set_bit(IPS_ASSURED_BIT, &conntrack->status);
+ }
+
+ return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol found. */
+static int sctp_new(struct ip_conntrack *conntrack,
+ const struct sk_buff *skb)
+{
+ enum sctp_conntrack newconntrack;
+ struct iphdr *iph = skb->nh.iph;
+ sctp_sctphdr_t _sctph, *sh;
+ sctp_chunkhdr_t _sch, *sch;
+ u_int32_t offset, count;
+ char map[256 / sizeof (char)] = {0};
+
+ DEBUGP(__FUNCTION__);
+ DEBUGP("\n");
+
+ sh = skb_header_pointer(skb, iph->ihl * 4, sizeof(_sctph), &_sctph);
+ if (sh == NULL)
+ return 0;
+
+ if (do_basic_checks(conntrack, skb, map) != 0)
+ return 0;
+
+ /* If an OOTB packet has any of these chunks discard (Sec 8.4) */
+ if ((test_bit (SCTP_CID_ABORT, (void *)map))
+ || (test_bit (SCTP_CID_SHUTDOWN_COMPLETE, (void *)map))
+ || (test_bit (SCTP_CID_COOKIE_ACK, (void *)map))) {
+ return 0;
+ }
+
+ newconntrack = SCTP_CONNTRACK_MAX;
+ for_each_sctp_chunk (skb, sch, _sch, offset, count) {
+ /* Don't need lock here: this conntrack not in circulation yet */
+ newconntrack = new_state (IP_CT_DIR_ORIGINAL,
+ SCTP_CONNTRACK_NONE, sch->type);
+
+ /* Invalid: delete conntrack */
+ if (newconntrack == SCTP_CONNTRACK_MAX) {
+ DEBUGP("ip_conntrack_sctp: invalid new deleting.\n");
+ return 0;
+ }
+
+ /* Copy the vtag into the state info */
+ if (sch->type == SCTP_CID_INIT) {
+ if (sh->vtag == 0) {
+ sctp_inithdr_t _inithdr, *ih;
+
+ ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t),
+ sizeof(_inithdr), &_inithdr);
+ if (ih == NULL)
+ return 0;
+
+ DEBUGP("Setting vtag %x for new conn\n",
+ ih->init_tag);
+
+ conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] =
+ ih->init_tag;
+ } else {
+ /* Sec 8.5.1 (A) */
+ return 0;
+ }
+ }
+ /* If it is a shutdown ack OOTB packet, we expect a return
+ shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */
+ else {
+ DEBUGP("Setting vtag %x for new conn OOTB\n",
+ sh->vtag);
+ conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag;
+ }
+
+ conntrack->proto.sctp.state = newconntrack;
+ }
+
+ return 1;
+}
+
+static struct ip_conntrack_protocol ip_conntrack_protocol_sctp = {
+ .proto = IPPROTO_SCTP,
+ .name = "sctp",
+ .pkt_to_tuple = sctp_pkt_to_tuple,
+ .invert_tuple = sctp_invert_tuple,
+ .print_tuple = sctp_print_tuple,
+ .print_conntrack = sctp_print_conntrack,
+ .packet = sctp_packet,
+ .new = sctp_new,
+ .destroy = NULL,
+ .me = THIS_MODULE
+};
+
+#ifdef CONFIG_SYSCTL
+static ctl_table ip_ct_sysctl_table[] = {
+ {
+ .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED,
+ .procname = "ip_conntrack_sctp_timeout_closed",
+ .data = &ip_ct_sctp_timeout_closed,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT,
+ .procname = "ip_conntrack_sctp_timeout_cookie_wait",
+ .data = &ip_ct_sctp_timeout_cookie_wait,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED,
+ .procname = "ip_conntrack_sctp_timeout_cookie_echoed",
+ .data = &ip_ct_sctp_timeout_cookie_echoed,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED,
+ .procname = "ip_conntrack_sctp_timeout_established",
+ .data = &ip_ct_sctp_timeout_established,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT,
+ .procname = "ip_conntrack_sctp_timeout_shutdown_sent",
+ .data = &ip_ct_sctp_timeout_shutdown_sent,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD,
+ .procname = "ip_conntrack_sctp_timeout_shutdown_recd",
+ .data = &ip_ct_sctp_timeout_shutdown_recd,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT,
+ .procname = "ip_conntrack_sctp_timeout_shutdown_ack_sent",
+ .data = &ip_ct_sctp_timeout_shutdown_ack_sent,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table ip_ct_netfilter_table[] = {
+ {
+ .ctl_name = NET_IPV4_NETFILTER,
+ .procname = "netfilter",
+ .mode = 0555,
+ .child = ip_ct_sysctl_table,
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table ip_ct_ipv4_table[] = {
+ {
+ .ctl_name = NET_IPV4,
+ .procname = "ipv4",
+ .mode = 0555,
+ .child = ip_ct_netfilter_table,
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table ip_ct_net_table[] = {
+ {
+ .ctl_name = CTL_NET,
+ .procname = "net",
+ .mode = 0555,
+ .child = ip_ct_ipv4_table,
+ },
+ { .ctl_name = 0 }
+};
+
+static struct ctl_table_header *ip_ct_sysctl_header;
+#endif
+
+static int __init init(void)
+{
+ int ret;
+
+ ret = ip_conntrack_protocol_register(&ip_conntrack_protocol_sctp);
+ if (ret) {
+ printk("ip_conntrack_proto_sctp: protocol register failed\n");
+ goto out;
+ }
+
+#ifdef CONFIG_SYSCTL
+ ip_ct_sysctl_header = register_sysctl_table(ip_ct_net_table, 0);
+ if (ip_ct_sysctl_header == NULL) {
+ ret = -ENOMEM;
+ printk("ip_conntrack_proto_sctp: can't register to sysctl.\n");
+ goto cleanup;
+ }
+#endif
+
+ return ret;
+
+#ifdef CONFIG_SYSCTL
+ cleanup:
+ ip_conntrack_protocol_unregister(&ip_conntrack_protocol_sctp);
+#endif
+ out:
+ DEBUGP("SCTP conntrack module loading %s\n",
+ ret ? "failed": "succeeded");
+ return ret;
+}
+
+static void __exit fini(void)
+{
+ ip_conntrack_protocol_unregister(&ip_conntrack_protocol_sctp);
+#ifdef CONFIG_SYSCTL
+ unregister_sysctl_table(ip_ct_sysctl_header);
+#endif
+ DEBUGP("SCTP conntrack module unloaded\n");
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Kiran Kumar Immidi");
+MODULE_DESCRIPTION("Netfilter connection tracking protocol helper for SCTP");
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
new file mode 100644
index 00000000000..e800b16fc92
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -0,0 +1,1098 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>:
+ * - Real stateful connection tracking
+ * - Modified state transitions table
+ * - Window scaling support added
+ * - SACK support added
+ *
+ * Willy Tarreau:
+ * - State table bugfixes
+ * - More robust state changes
+ * - Tuning timer parameters
+ *
+ * version 2.2
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/spinlock.h>
+
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+#include <linux/netfilter_ipv4/lockhelp.h>
+
+#if 0
+#define DEBUGP printk
+#define DEBUGP_VARS
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/* Protects conntrack->proto.tcp */
+static DECLARE_RWLOCK(tcp_lock);
+
+/* "Be conservative in what you do,
+ be liberal in what you accept from others."
+ If it is non-zero, we mark only out-of-window RST segments as INVALID. */
+int ip_ct_tcp_be_liberal = 0;
+
+/* When a connection is picked up from the middle, this is how many packets
+ must pass in each direction before we assume we are in sync - if either
+ side uses window scaling, we have lost the game.
+ If it is set to zero, picking up already established connections is
+ disabled. */
+int ip_ct_tcp_loose = 3;
+
+/* Maximum number of retransmitted packets allowed without receiving an
+ (acceptable) ACK from the destination. If this number is reached, a
+ shorter timer will be started. */
+int ip_ct_tcp_max_retrans = 3;
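+
+/*
+ * These three tunables are exported as sysctls by
+ * ip_conntrack_standalone.c: ip_ct_tcp_be_liberal is consulted in
+ * tcp_in_window() when a segment fails the window check,
+ * ip_ct_tcp_loose seeds the per-direction "loose" counters in
+ * tcp_new(), and ip_ct_tcp_max_retrans selects the shorter
+ * retransmission timeout in tcp_packet().
+ */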
+
+ /* FIXME: Examine ipfilter's timeouts and conntrack transitions more
+ closely. They're more complex. --RR */
+
+static const char *tcp_conntrack_names[] = {
+ "NONE",
+ "SYN_SENT",
+ "SYN_RECV",
+ "ESTABLISHED",
+ "FIN_WAIT",
+ "CLOSE_WAIT",
+ "LAST_ACK",
+ "TIME_WAIT",
+ "CLOSE",
+ "LISTEN"
+};
+
+#define SECS * HZ
+#define MINS * 60 SECS
+#define HOURS * 60 MINS
+#define DAYS * 24 HOURS
+
+unsigned long ip_ct_tcp_timeout_syn_sent = 2 MINS;
+unsigned long ip_ct_tcp_timeout_syn_recv = 60 SECS;
+unsigned long ip_ct_tcp_timeout_established = 5 DAYS;
+unsigned long ip_ct_tcp_timeout_fin_wait = 2 MINS;
+unsigned long ip_ct_tcp_timeout_close_wait = 60 SECS;
+unsigned long ip_ct_tcp_timeout_last_ack = 30 SECS;
+unsigned long ip_ct_tcp_timeout_time_wait = 2 MINS;
+unsigned long ip_ct_tcp_timeout_close = 10 SECS;
+
+/* RFC1122 says the R2 limit should be at least 100 seconds.
+ Linux uses 15 packets as the limit, which corresponds
+ to ~13-30 minutes depending on the RTO. */
+unsigned long ip_ct_tcp_timeout_max_retrans = 5 MINS;
+
+static unsigned long * tcp_timeouts[]
+= { NULL, /* TCP_CONNTRACK_NONE */
+ &ip_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */
+ &ip_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */
+ &ip_ct_tcp_timeout_established, /* TCP_CONNTRACK_ESTABLISHED, */
+ &ip_ct_tcp_timeout_fin_wait, /* TCP_CONNTRACK_FIN_WAIT, */
+ &ip_ct_tcp_timeout_close_wait, /* TCP_CONNTRACK_CLOSE_WAIT, */
+ &ip_ct_tcp_timeout_last_ack, /* TCP_CONNTRACK_LAST_ACK, */
+ &ip_ct_tcp_timeout_time_wait, /* TCP_CONNTRACK_TIME_WAIT, */
+ &ip_ct_tcp_timeout_close, /* TCP_CONNTRACK_CLOSE, */
+ NULL, /* TCP_CONNTRACK_LISTEN */
+ };
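+
+/*
+ * Note on the notation above: SECS/MINS/HOURS/DAYS are postfix
+ * multipliers, so "2 MINS" expands to "2 * 60 * HZ" and "5 DAYS" to
+ * "5 * 24 * 60 * 60 * HZ"; all timeouts are kept in jiffies. The
+ * tcp_timeouts[] array is indexed by enum tcp_conntrack; the NULL
+ * slots (NONE, LISTEN) belong to states the transition table below
+ * never produces.
+ */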
+
+#define sNO TCP_CONNTRACK_NONE
+#define sSS TCP_CONNTRACK_SYN_SENT
+#define sSR TCP_CONNTRACK_SYN_RECV
+#define sES TCP_CONNTRACK_ESTABLISHED
+#define sFW TCP_CONNTRACK_FIN_WAIT
+#define sCW TCP_CONNTRACK_CLOSE_WAIT
+#define sLA TCP_CONNTRACK_LAST_ACK
+#define sTW TCP_CONNTRACK_TIME_WAIT
+#define sCL TCP_CONNTRACK_CLOSE
+#define sLI TCP_CONNTRACK_LISTEN
+#define sIV TCP_CONNTRACK_MAX
+#define sIG TCP_CONNTRACK_IGNORE
+
+/* What TCP flags are set from RST/SYN/FIN/ACK. */
+enum tcp_bit_set {
+ TCP_SYN_SET,
+ TCP_SYNACK_SET,
+ TCP_FIN_SET,
+ TCP_ACK_SET,
+ TCP_RST_SET,
+ TCP_NONE_SET,
+};
+
+/*
+ * The TCP state transition table needs a few words...
+ *
+ * We are the man in the middle. All the packets go through us
+ * but might get lost in transit to the destination.
+ * It is assumed that the destinations can't receive segments
+ * we haven't seen.
+ *
+ * The checked segment is in window, but our windows are *not*
+ * equivalent with the ones of the sender/receiver. We always
+ * try to guess the state of the current sender.
+ *
+ * The meanings of the states are:
+ *
+ * NONE: initial state
+ * SYN_SENT: SYN-only packet seen
+ * SYN_RECV: SYN-ACK packet seen
+ * ESTABLISHED: ACK packet seen
+ * FIN_WAIT: FIN packet seen
+ * CLOSE_WAIT: ACK seen (after FIN)
+ * LAST_ACK: FIN seen (after FIN)
+ * TIME_WAIT: last ACK seen
+ * CLOSE: closed connection
+ *
+ * LISTEN state is not used.
+ *
+ * Packets are marked as IGNORED (sIG) when they may be either valid
+ * or invalid and the receiver may answer with a connection-closing
+ * RST or a SYN/ACK.
+ *
+ * Packets are marked as INVALID (sIV) when they are invalid or when
+ * we do not support the request (simultaneous open).
+ */
+static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
+ {
+/* ORIGINAL */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
+/*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sIV },
+/*
+ * sNO -> sSS Initialize a new connection
+ * sSS -> sSS Retransmitted SYN
+ * sSR -> sIG Late retransmitted SYN?
+ * sES -> sIG Error: a SYN in the window outside the SYN_SENT
+ * state is an error; the receiver will reply with RST
+ * and close the connection,
+ * or we are not in sync and hold a dead connection.
+ * sFW -> sIG
+ * sCW -> sIG
+ * sLA -> sIG
+ * sTW -> sSS Reopened connection (RFC 1122).
+ * sCL -> sSS
+ */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
+/*synack*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV },
+/*
+ * A SYN/ACK from the client is always invalid:
+ * - either it tries to set up a simultaneous open, which is
+ * not supported;
+ * - or the firewall has just been inserted between the two hosts
+ * during the session set-up. The SYN will be retransmitted
+ * by the true client (or it'll time out).
+ */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
+/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
+/*
+ * sNO -> sIV Too late and no reason to do anything...
+ * sSS -> sIV Client might not send FIN in this state:
+ * we enforce waiting for a SYN/ACK reply first.
+ * sSR -> sFW Close started.
+ * sES -> sFW
+ * sFW -> sLA FIN seen in both directions, waiting for
+ * the last ACK.
+ * Might be a retransmitted FIN as well...
+ * sCW -> sLA
+ * sLA -> sLA Retransmitted FIN. Remain in the same state.
+ * sTW -> sTW
+ * sCL -> sCL
+ */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
+/*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV },
+/*
+ * sNO -> sES Assumed.
+ * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet.
+ * sSR -> sES Established state is reached.
+ * sES -> sES :-)
+ * sFW -> sCW Normal close request answered by ACK.
+ * sCW -> sCW
+ * sLA -> sTW Last ACK detected.
+ * sTW -> sTW Retransmitted last ACK. Remain in the same state.
+ * sCL -> sCL
+ */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
+/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV },
+/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
+ },
+ {
+/* REPLY */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
+/*syn*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV },
+/*
+ * sNO -> sIV Never reached.
+ * sSS -> sIV Simultaneous open, not supported
+ * sSR -> sIV Simultaneous open, not supported.
+ * sES -> sIV Server may not initiate a connection.
+ * sFW -> sIV
+ * sCW -> sIV
+ * sLA -> sIV
+ * sTW -> sIV Reopened connection, but server may not do it.
+ * sCL -> sIV
+ */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
+/*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIV },
+/*
+ * sSS -> sSR Standard open.
+ * sSR -> sSR Retransmitted SYN/ACK.
+ * sES -> sIG Late retransmitted SYN/ACK?
+ * sFW -> sIG Might be SYN/ACK answering ignored SYN
+ * sCW -> sIG
+ * sLA -> sIG
+ * sTW -> sIG
+ * sCL -> sIG
+ */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
+/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
+/*
+ * sSS -> sIV Server might not send FIN in this state.
+ * sSR -> sFW Close started.
+ * sES -> sFW
+ * sFW -> sLA FIN seen in both directions.
+ * sCW -> sLA
+ * sLA -> sLA Retransmitted FIN.
+ * sTW -> sTW
+ * sCL -> sCL
+ */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
+/*ack*/ { sIV, sIV, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV },
+/*
+ * sSS -> sIV Might be a half-open connection.
+ * sSR -> sSR Might answer late resent SYN.
+ * sES -> sES :-)
+ * sFW -> sCW Normal close request answered by ACK.
+ * sCW -> sCW
+ * sLA -> sTW Last ACK detected.
+ * sTW -> sTW Retransmitted last ACK.
+ * sCL -> sCL
+ */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
+/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV },
+/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
+ }
+};
+
+static int tcp_pkt_to_tuple(const struct sk_buff *skb,
+ unsigned int dataoff,
+ struct ip_conntrack_tuple *tuple)
+{
+ struct tcphdr _hdr, *hp;
+
+ /* Actually only need first 8 bytes. */
+ hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
+ if (hp == NULL)
+ return 0;
+
+ tuple->src.u.tcp.port = hp->source;
+ tuple->dst.u.tcp.port = hp->dest;
+
+ return 1;
+}
+
+static int tcp_invert_tuple(struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack_tuple *orig)
+{
+ tuple->src.u.tcp.port = orig->dst.u.tcp.port;
+ tuple->dst.u.tcp.port = orig->src.u.tcp.port;
+ return 1;
+}
+
+/* Print out the per-protocol part of the tuple. */
+static int tcp_print_tuple(struct seq_file *s,
+ const struct ip_conntrack_tuple *tuple)
+{
+ return seq_printf(s, "sport=%hu dport=%hu ",
+ ntohs(tuple->src.u.tcp.port),
+ ntohs(tuple->dst.u.tcp.port));
+}
+
+/* Print out the private part of the conntrack. */
+static int tcp_print_conntrack(struct seq_file *s,
+ const struct ip_conntrack *conntrack)
+{
+ enum tcp_conntrack state;
+
+ READ_LOCK(&tcp_lock);
+ state = conntrack->proto.tcp.state;
+ READ_UNLOCK(&tcp_lock);
+
+ return seq_printf(s, "%s ", tcp_conntrack_names[state]);
+}
+
+static unsigned int get_conntrack_index(const struct tcphdr *tcph)
+{
+ if (tcph->rst) return TCP_RST_SET;
+ else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET);
+ else if (tcph->fin) return TCP_FIN_SET;
+ else if (tcph->ack) return TCP_ACK_SET;
+ else return TCP_NONE_SET;
+}
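+
+/*
+ * A minimal illustrative helper (a sketch added for clarity; it is not
+ * used elsewhere in this file): this is how the state table above is
+ * consulted, mirroring the lookups done later in tcp_new() and
+ * tcp_packet().
+ */
+static inline enum tcp_conntrack tcp_next_state(enum ip_conntrack_dir dir,
+ const struct tcphdr *tcph,
+ enum tcp_conntrack old_state)
+{
+ /* The direction selects ORIGINAL/REPLY, the flag combination selects
+ the row, and the current state selects the column. */
+ return tcp_conntracks[dir][get_conntrack_index(tcph)][old_state];
+}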
+
+/* TCP connection tracking based on 'Real Stateful TCP Packet Filtering
+ in IP Filter' by Guido van Rooij.
+
+ http://www.nluug.nl/events/sane2000/papers.html
+ http://www.iae.nl/users/guido/papers/tcp_filtering.ps.gz
+
+ The boundaries and the conditions are changed according to RFC793:
+ the packet must intersect the window (i.e. segments may be
+ after the right or before the left edge) and thus receivers may ACK
+ segments after the right edge of the window.
+
+ td_maxend = max(sack + max(win,1)) seen in reply packets
+ td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets
+ td_maxwin += seq + len - sender.td_maxend
+ if seq + len > sender.td_maxend
+ td_end = max(seq + len) seen in sent packets
+
+ I. Upper bound for valid data: seq <= sender.td_maxend
+ II. Lower bound for valid data: seq + len >= sender.td_end - receiver.td_maxwin
+ III. Upper bound for valid ack: sack <= receiver.td_end
+ IV. Lower bound for valid ack: ack >= receiver.td_end - MAXACKWINDOW
+
+ where sack is the highest right edge of sack block found in the packet.
+
+ The upper bound limit for a valid ack is not ignored -
+ we don't have to deal with fragments.
+*/
+
+static inline __u32 segment_seq_plus_len(__u32 seq,
+ size_t len,
+ struct iphdr *iph,
+ struct tcphdr *tcph)
+{
+ return (seq + len - (iph->ihl + tcph->doff)*4
+ + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0));
+}
+
+/* Fixme: what about big packets? */
+#define MAXACKWINCONST 66000
+#define MAXACKWINDOW(sender) \
+ ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin \
+ : MAXACKWINCONST)
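+
+/*
+ * A compact sketch of conditions I-IV from the comment above (for
+ * illustration only; tcp_in_window() below performs the real check and
+ * additionally handles the per-direction "loose" counters and the
+ * ip_ct_tcp_be_liberal override).
+ */
+static inline int segment_in_window(const struct ip_ct_tcp_state *sender,
+ const struct ip_ct_tcp_state *receiver,
+ __u32 seq, __u32 end, __u32 ack, __u32 sack)
+{
+ return before(seq, sender->td_maxend + 1) /* I. */
+ && after(end, sender->td_end - receiver->td_maxwin - 1) /* II. */
+ && before(sack, receiver->td_end + 1) /* III. */
+ && after(ack, receiver->td_end - MAXACKWINDOW(sender)); /* IV. */
+}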
+
+/*
+ * Simplified tcp_parse_options routine from tcp_input.c
+ */
+static void tcp_options(const struct sk_buff *skb,
+ struct iphdr *iph,
+ struct tcphdr *tcph,
+ struct ip_ct_tcp_state *state)
+{
+ unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
+ unsigned char *ptr;
+ int length = (tcph->doff*4) - sizeof(struct tcphdr);
+
+ if (!length)
+ return;
+
+ ptr = skb_header_pointer(skb,
+ (iph->ihl * 4) + sizeof(struct tcphdr),
+ length, buff);
+ BUG_ON(ptr == NULL);
+
+ state->td_scale =
+ state->flags = 0;
+
+ while (length > 0) {
+ int opcode=*ptr++;
+ int opsize;
+
+ switch (opcode) {
+ case TCPOPT_EOL:
+ return;
+ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
+ length--;
+ continue;
+ default:
+ opsize=*ptr++;
+ if (opsize < 2) /* "silly options" */
+ return;
+ if (opsize > length)
+ break; /* don't parse partial options */
+
+ if (opcode == TCPOPT_SACK_PERM
+ && opsize == TCPOLEN_SACK_PERM)
+ state->flags |= IP_CT_TCP_FLAG_SACK_PERM;
+ else if (opcode == TCPOPT_WINDOW
+ && opsize == TCPOLEN_WINDOW) {
+ state->td_scale = *(u_int8_t *)ptr;
+
+ if (state->td_scale > 14) {
+ /* See RFC1323 */
+ state->td_scale = 14;
+ }
+ state->flags |=
+ IP_CT_TCP_FLAG_WINDOW_SCALE;
+ }
+ ptr += opsize - 2;
+ length -= opsize;
+ }
+ }
+}
+
+static void tcp_sack(const struct sk_buff *skb,
+ struct iphdr *iph,
+ struct tcphdr *tcph,
+ __u32 *sack)
+{
+ unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
+ unsigned char *ptr;
+ int length = (tcph->doff*4) - sizeof(struct tcphdr);
+ __u32 tmp;
+
+ if (!length)
+ return;
+
+ ptr = skb_header_pointer(skb,
+ (iph->ihl * 4) + sizeof(struct tcphdr),
+ length, buff);
+ BUG_ON(ptr == NULL);
+
+ /* Fast path for timestamp-only option */
+ if (length == TCPOLEN_TSTAMP_ALIGNED*4
+ && *(__u32 *)ptr ==
+ __constant_ntohl((TCPOPT_NOP << 24)
+ | (TCPOPT_NOP << 16)
+ | (TCPOPT_TIMESTAMP << 8)
+ | TCPOLEN_TIMESTAMP))
+ return;
+
+ while (length > 0) {
+ int opcode=*ptr++;
+ int opsize, i;
+
+ switch (opcode) {
+ case TCPOPT_EOL:
+ return;
+ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
+ length--;
+ continue;
+ default:
+ opsize=*ptr++;
+ if (opsize < 2) /* "silly options" */
+ return;
+ if (opsize > length)
+ break; /* don't parse partial options */
+
+ if (opcode == TCPOPT_SACK
+ && opsize >= (TCPOLEN_SACK_BASE
+ + TCPOLEN_SACK_PERBLOCK)
+ && !((opsize - TCPOLEN_SACK_BASE)
+ % TCPOLEN_SACK_PERBLOCK)) {
+ for (i = 0;
+ i < (opsize - TCPOLEN_SACK_BASE);
+ i += TCPOLEN_SACK_PERBLOCK) {
+ tmp = ntohl(*((u_int32_t *)(ptr+i)+1));
+
+ if (after(tmp, *sack))
+ *sack = tmp;
+ }
+ return;
+ }
+ ptr += opsize - 2;
+ length -= opsize;
+ }
+ }
+}
+
+static int tcp_in_window(struct ip_ct_tcp *state,
+ enum ip_conntrack_dir dir,
+ unsigned int index,
+ const struct sk_buff *skb,
+ struct iphdr *iph,
+ struct tcphdr *tcph)
+{
+ struct ip_ct_tcp_state *sender = &state->seen[dir];
+ struct ip_ct_tcp_state *receiver = &state->seen[!dir];
+ __u32 seq, ack, sack, end, win, swin;
+ int res;
+
+ /*
+ * Get the required data from the packet.
+ */
+ seq = ntohl(tcph->seq);
+ ack = sack = ntohl(tcph->ack_seq);
+ win = ntohs(tcph->window);
+ end = segment_seq_plus_len(seq, skb->len, iph, tcph);
+
+ if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM)
+ tcp_sack(skb, iph, tcph, &sack);
+
+ DEBUGP("tcp_in_window: START\n");
+ DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
+ "seq=%u ack=%u sack=%u win=%u end=%u\n",
+ NIPQUAD(iph->saddr), ntohs(tcph->source),
+ NIPQUAD(iph->daddr), ntohs(tcph->dest),
+ seq, ack, sack, win, end);
+ DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
+ "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
+ sender->td_end, sender->td_maxend, sender->td_maxwin,
+ sender->td_scale,
+ receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
+ receiver->td_scale);
+
+ if (sender->td_end == 0) {
+ /*
+ * Initialize sender data.
+ */
+ if (tcph->syn && tcph->ack) {
+ /*
+ * Outgoing SYN-ACK in reply to a SYN.
+ */
+ sender->td_end =
+ sender->td_maxend = end;
+ sender->td_maxwin = (win == 0 ? 1 : win);
+
+ tcp_options(skb, iph, tcph, sender);
+ /*
+ * RFC 1323:
+ * Both sides must send the Window Scale option
+ * to enable window scaling in either direction.
+ */
+ if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE
+ && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE))
+ sender->td_scale =
+ receiver->td_scale = 0;
+ } else {
+ /*
+ * We are in the middle of a connection,
+ * its history is lost for us.
+ * Let's try to use the data from the packet.
+ */
+ sender->td_end = end;
+ sender->td_maxwin = (win == 0 ? 1 : win);
+ sender->td_maxend = end + sender->td_maxwin;
+ }
+ } else if (((state->state == TCP_CONNTRACK_SYN_SENT
+ && dir == IP_CT_DIR_ORIGINAL)
+ || (state->state == TCP_CONNTRACK_SYN_RECV
+ && dir == IP_CT_DIR_REPLY))
+ && after(end, sender->td_end)) {
+ /*
+ * RFC 793: "if a TCP is reinitialized ... then it need
+ * not wait at all; it must only be sure to use sequence
+ * numbers larger than those recently used."
+ */
+ sender->td_end =
+ sender->td_maxend = end;
+ sender->td_maxwin = (win == 0 ? 1 : win);
+
+ tcp_options(skb, iph, tcph, sender);
+ }
+
+ if (!(tcph->ack)) {
+ /*
+ * If there is no ACK, just pretend it was set and OK.
+ */
+ ack = sack = receiver->td_end;
+ } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) ==
+ (TCP_FLAG_ACK|TCP_FLAG_RST))
+ && (ack == 0)) {
+ /*
+ * Broken TCP stacks that set ACK in RST packets
+ * with a zero ack value.
+ */
+ ack = sack = receiver->td_end;
+ }
+
+ if (seq == end
+ && (!tcph->rst
+ || (seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT)))
+ /*
+ * Packet contains no data: we assume it is valid
+ * and check the ack value only.
+ * However, RST segments are always validated by their
+ * SEQ number, except when seq == 0 (RST sent in answer
+ * to a SYN).
+ */
+ seq = end = sender->td_end;
+
+ DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
+ "seq=%u ack=%u sack =%u win=%u end=%u\n",
+ NIPQUAD(iph->saddr), ntohs(tcph->source),
+ NIPQUAD(iph->daddr), ntohs(tcph->dest),
+ seq, ack, sack, win, end);
+ DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
+ "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
+ sender->td_end, sender->td_maxend, sender->td_maxwin,
+ sender->td_scale,
+ receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
+ receiver->td_scale);
+
+ DEBUGP("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
+ before(seq, sender->td_maxend + 1),
+ after(end, sender->td_end - receiver->td_maxwin - 1),
+ before(sack, receiver->td_end + 1),
+ after(ack, receiver->td_end - MAXACKWINDOW(sender)));
+
+ if (sender->loose || receiver->loose ||
+ (before(seq, sender->td_maxend + 1) &&
+ after(end, sender->td_end - receiver->td_maxwin - 1) &&
+ before(sack, receiver->td_end + 1) &&
+ after(ack, receiver->td_end - MAXACKWINDOW(sender)))) {
+ /*
+ * Take into account window scaling (RFC 1323).
+ */
+ if (!tcph->syn)
+ win <<= sender->td_scale;
+
+ /*
+ * Update sender data.
+ */
+ swin = win + (sack - ack);
+ if (sender->td_maxwin < swin)
+ sender->td_maxwin = swin;
+ if (after(end, sender->td_end))
+ sender->td_end = end;
+ /*
+ * Update receiver data.
+ */
+ if (after(end, sender->td_maxend))
+ receiver->td_maxwin += end - sender->td_maxend;
+ if (after(sack + win, receiver->td_maxend - 1)) {
+ receiver->td_maxend = sack + win;
+ if (win == 0)
+ receiver->td_maxend++;
+ }
+
+ /*
+ * Check retransmissions.
+ */
+ if (index == TCP_ACK_SET) {
+ if (state->last_dir == dir
+ && state->last_seq == seq
+ && state->last_ack == ack
+ && state->last_end == end)
+ state->retrans++;
+ else {
+ state->last_dir = dir;
+ state->last_seq = seq;
+ state->last_ack = ack;
+ state->last_end = end;
+ state->retrans = 0;
+ }
+ }
+ /*
+ * Close the window of disabled window tracking :-)
+ */
+ if (sender->loose)
+ sender->loose--;
+
+ res = 1;
+ } else {
+ if (LOG_INVALID(IPPROTO_TCP))
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ "ip_ct_tcp: %s ",
+ before(seq, sender->td_maxend + 1) ?
+ after(end, sender->td_end - receiver->td_maxwin - 1) ?
+ before(sack, receiver->td_end + 1) ?
+ after(ack, receiver->td_end - MAXACKWINDOW(sender)) ? "BUG"
+ : "ACK is under the lower bound (possible overly delayed ACK)"
+ : "ACK is over the upper bound (ACKed data not seen yet)"
+ : "SEQ is under the lower bound (already ACKed data retransmitted)"
+ : "SEQ is over the upper bound (over the window of the receiver)");
+
+ res = ip_ct_tcp_be_liberal;
+ }
+
+ DEBUGP("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u "
+ "receiver end=%u maxend=%u maxwin=%u\n",
+ res, sender->td_end, sender->td_maxend, sender->td_maxwin,
+ receiver->td_end, receiver->td_maxend, receiver->td_maxwin);
+
+ return res;
+}
+
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+/* Update sender->td_end after NAT successfully mangled the packet */
+void ip_conntrack_tcp_update(struct sk_buff *skb,
+ struct ip_conntrack *conntrack,
+ enum ip_conntrack_dir dir)
+{
+ struct iphdr *iph = skb->nh.iph;
+ struct tcphdr *tcph = (void *)skb->nh.iph + skb->nh.iph->ihl*4;
+ __u32 end;
+#ifdef DEBUGP_VARS
+ struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[dir];
+ struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[!dir];
+#endif
+
+ end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, iph, tcph);
+
+ WRITE_LOCK(&tcp_lock);
+ /*
+ * We only have to worry about the ack in the reply packet...
+ */
+ if (after(end, conntrack->proto.tcp.seen[dir].td_end))
+ conntrack->proto.tcp.seen[dir].td_end = end;
+ conntrack->proto.tcp.last_end = end;
+ WRITE_UNLOCK(&tcp_lock);
+ DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i "
+ "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
+ sender->td_end, sender->td_maxend, sender->td_maxwin,
+ sender->td_scale,
+ receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
+ receiver->td_scale);
+}
+
+#endif
+
+#define TH_FIN 0x01
+#define TH_SYN 0x02
+#define TH_RST 0x04
+#define TH_PUSH 0x08
+#define TH_ACK 0x10
+#define TH_URG 0x20
+#define TH_ECE 0x40
+#define TH_CWR 0x80
+
+/* table of valid flag combinations - ECE and CWR are always valid */
+static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] =
+{
+ [TH_SYN] = 1,
+ [TH_SYN|TH_ACK] = 1,
+ [TH_RST] = 1,
+ [TH_RST|TH_ACK] = 1,
+ [TH_RST|TH_ACK|TH_PUSH] = 1,
+ [TH_FIN|TH_ACK] = 1,
+ [TH_ACK] = 1,
+ [TH_ACK|TH_PUSH] = 1,
+ [TH_ACK|TH_URG] = 1,
+ [TH_ACK|TH_URG|TH_PUSH] = 1,
+ [TH_FIN|TH_ACK|TH_PUSH] = 1,
+ [TH_FIN|TH_ACK|TH_URG] = 1,
+ [TH_FIN|TH_ACK|TH_URG|TH_PUSH] = 1,
+};
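+
+/*
+ * tcp_error() below masks ECE and CWR out of the flag octet (byte 13 of
+ * the TCP header) and uses the result as an index into this table; any
+ * flag combination not listed above is rejected as invalid.
+ */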
+
+/* Protect conntrack against broken packets. Code taken from ipt_unclean.c. */
+static int tcp_error(struct sk_buff *skb,
+ enum ip_conntrack_info *ctinfo,
+ unsigned int hooknum)
+{
+ struct iphdr *iph = skb->nh.iph;
+ struct tcphdr _tcph, *th;
+ unsigned int tcplen = skb->len - iph->ihl * 4;
+ u_int8_t tcpflags;
+
+ /* Smaller than the minimal TCP header? */
+ th = skb_header_pointer(skb, iph->ihl * 4,
+ sizeof(_tcph), &_tcph);
+ if (th == NULL) {
+ if (LOG_INVALID(IPPROTO_TCP))
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ "ip_ct_tcp: short packet ");
+ return -NF_ACCEPT;
+ }
+
+ /* Not whole TCP header or malformed packet */
+ if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
+ if (LOG_INVALID(IPPROTO_TCP))
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ "ip_ct_tcp: truncated/malformed packet ");
+ return -NF_ACCEPT;
+ }
+
+ /* Checksum invalid? Ignore.
+ * We skip checking packets on the outgoing path
+ * because the semantics of CHECKSUM_HW are different there
+ * and moreover root might send raw packets.
+ */
+ /* FIXME: Source route IP option packets --RR */
+ if (hooknum == NF_IP_PRE_ROUTING
+ && csum_tcpudp_magic(iph->saddr, iph->daddr, tcplen, IPPROTO_TCP,
+ skb->ip_summed == CHECKSUM_HW ? skb->csum
+ : skb_checksum(skb, iph->ihl*4, tcplen, 0))) {
+ if (LOG_INVALID(IPPROTO_TCP))
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ "ip_ct_tcp: bad TCP checksum ");
+ return -NF_ACCEPT;
+ }
+
+ /* Check TCP flags. */
+ tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR));
+ if (!tcp_valid_flags[tcpflags]) {
+ if (LOG_INVALID(IPPROTO_TCP))
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ "ip_ct_tcp: invalid TCP flag combination ");
+ return -NF_ACCEPT;
+ }
+
+ return NF_ACCEPT;
+}
+
+/* Returns verdict for packet, or -1 for invalid. */
+static int tcp_packet(struct ip_conntrack *conntrack,
+ const struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo)
+{
+ enum tcp_conntrack new_state, old_state;
+ enum ip_conntrack_dir dir;
+ struct iphdr *iph = skb->nh.iph;
+ struct tcphdr *th, _tcph;
+ unsigned long timeout;
+ unsigned int index;
+
+ th = skb_header_pointer(skb, iph->ihl * 4,
+ sizeof(_tcph), &_tcph);
+ BUG_ON(th == NULL);
+
+ WRITE_LOCK(&tcp_lock);
+ old_state = conntrack->proto.tcp.state;
+ dir = CTINFO2DIR(ctinfo);
+ index = get_conntrack_index(th);
+ new_state = tcp_conntracks[dir][index][old_state];
+
+ switch (new_state) {
+ case TCP_CONNTRACK_IGNORE:
+ /* Either SYN in ORIGINAL
+ * or SYN/ACK in REPLY. */
+ if (index == TCP_SYNACK_SET
+ && conntrack->proto.tcp.last_index == TCP_SYN_SET
+ && conntrack->proto.tcp.last_dir != dir
+ && ntohl(th->ack_seq) ==
+ conntrack->proto.tcp.last_end) {
+ /* This SYN/ACK acknowledges a SYN that we earlier
+ * ignored as invalid. This means that the client and
+ * the server are both in sync, while the firewall is
+ * not. We kill this session and block the SYN/ACK so
+ * that the client cannot but retransmit its SYN and
+ * thus initiate a clean new session.
+ */
+ WRITE_UNLOCK(&tcp_lock);
+ if (LOG_INVALID(IPPROTO_TCP))
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ "ip_ct_tcp: killing out of sync session ");
+ if (del_timer(&conntrack->timeout))
+ conntrack->timeout.function((unsigned long)
+ conntrack);
+ return -NF_DROP;
+ }
+ conntrack->proto.tcp.last_index = index;
+ conntrack->proto.tcp.last_dir = dir;
+ conntrack->proto.tcp.last_seq = ntohl(th->seq);
+ conntrack->proto.tcp.last_end =
+ segment_seq_plus_len(ntohl(th->seq), skb->len, iph, th);
+
+ WRITE_UNLOCK(&tcp_lock);
+ if (LOG_INVALID(IPPROTO_TCP))
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ "ip_ct_tcp: invalid packet ignored ");
+ return NF_ACCEPT;
+ case TCP_CONNTRACK_MAX:
+ /* Invalid packet */
+ DEBUGP("ip_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
+ dir, get_conntrack_index(th),
+ old_state);
+ WRITE_UNLOCK(&tcp_lock);
+ if (LOG_INVALID(IPPROTO_TCP))
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ "ip_ct_tcp: invalid state ");
+ return -NF_ACCEPT;
+ case TCP_CONNTRACK_SYN_SENT:
+ if (old_state < TCP_CONNTRACK_TIME_WAIT)
+ break;
+ if ((conntrack->proto.tcp.seen[dir].flags &
+ IP_CT_TCP_FLAG_CLOSE_INIT)
+ || after(ntohl(th->seq),
+ conntrack->proto.tcp.seen[dir].td_end)) {
+ /* Attempt to reopen a closed connection.
+ * Delete this connection and look up again. */
+ WRITE_UNLOCK(&tcp_lock);
+ if (del_timer(&conntrack->timeout))
+ conntrack->timeout.function((unsigned long)
+ conntrack);
+ return -NF_REPEAT;
+ } else {
+ WRITE_UNLOCK(&tcp_lock);
+ if (LOG_INVALID(IPPROTO_TCP))
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ "ip_ct_tcp: invalid SYN");
+ return -NF_ACCEPT;
+ }
+ case TCP_CONNTRACK_CLOSE:
+ if (index == TCP_RST_SET
+ && test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)
+ && conntrack->proto.tcp.last_index == TCP_SYN_SET
+ && ntohl(th->ack_seq) == conntrack->proto.tcp.last_end) {
+ /* RST sent in reply to an invalid SYN that we had let through:
+ * the SYN was in the window then, so tear down the connection.
+ * We skip window checking, because the packet might ACK
+ * segments that we ignored along with the SYN. */
+ goto in_window;
+ }
+ /* Just fall through */
+ default:
+ /* Keep compilers happy. */
+ break;
+ }
+
+ if (!tcp_in_window(&conntrack->proto.tcp, dir, index,
+ skb, iph, th)) {
+ WRITE_UNLOCK(&tcp_lock);
+ return -NF_ACCEPT;
+ }
+ in_window:
+ /* From now on we have got in-window packets */
+ conntrack->proto.tcp.last_index = index;
+
+ DEBUGP("tcp_conntracks: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
+ "syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
+ NIPQUAD(iph->saddr), ntohs(th->source),
+ NIPQUAD(iph->daddr), ntohs(th->dest),
+ (th->syn ? 1 : 0), (th->ack ? 1 : 0),
+ (th->fin ? 1 : 0), (th->rst ? 1 : 0),
+ old_state, new_state);
+
+ conntrack->proto.tcp.state = new_state;
+ if (old_state != new_state
+ && (new_state == TCP_CONNTRACK_FIN_WAIT
+ || new_state == TCP_CONNTRACK_CLOSE))
+ conntrack->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
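+ /* Too many retransmissions without an acceptable answer: switch to
+ the shorter max_retrans timeout, otherwise use the timeout of the
+ new state. */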
+ timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans
+ && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans
+ ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state];
+ WRITE_UNLOCK(&tcp_lock);
+
+ if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
+ /* If the only reply is a RST, we can consider ourselves not to
+ have an established connection: this is a fairly common
+ problem case, so we can delete the conntrack
+ immediately. --RR */
+ if (th->rst) {
+ if (del_timer(&conntrack->timeout))
+ conntrack->timeout.function((unsigned long)
+ conntrack);
+ return NF_ACCEPT;
+ }
+ } else if (!test_bit(IPS_ASSURED_BIT, &conntrack->status)
+ && (old_state == TCP_CONNTRACK_SYN_RECV
+ || old_state == TCP_CONNTRACK_ESTABLISHED)
+ && new_state == TCP_CONNTRACK_ESTABLISHED) {
+ /* Set ASSURED if we see a valid ack in ESTABLISHED
+ after SYN_RECV, or a valid answer for a picked-up
+ connection. */
+ set_bit(IPS_ASSURED_BIT, &conntrack->status);
+ }
+ ip_ct_refresh_acct(conntrack, ctinfo, skb, timeout);
+
+ return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol is found. */
+static int tcp_new(struct ip_conntrack *conntrack,
+ const struct sk_buff *skb)
+{
+ enum tcp_conntrack new_state;
+ struct iphdr *iph = skb->nh.iph;
+ struct tcphdr *th, _tcph;
+#ifdef DEBUGP_VARS
+ struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[0];
+ struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[1];
+#endif
+
+ th = skb_header_pointer(skb, iph->ihl * 4,
+ sizeof(_tcph), &_tcph);
+ BUG_ON(th == NULL);
+
+ /* Don't need lock here: this conntrack not in circulation yet */
+ new_state
+ = tcp_conntracks[0][get_conntrack_index(th)]
+ [TCP_CONNTRACK_NONE];
+
+ /* Invalid: delete conntrack */
+ if (new_state >= TCP_CONNTRACK_MAX) {
+ DEBUGP("ip_ct_tcp: invalid new deleting.\n");
+ return 0;
+ }
+
+ if (new_state == TCP_CONNTRACK_SYN_SENT) {
+ /* SYN packet */
+ conntrack->proto.tcp.seen[0].td_end =
+ segment_seq_plus_len(ntohl(th->seq), skb->len,
+ iph, th);
+ conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
+ if (conntrack->proto.tcp.seen[0].td_maxwin == 0)
+ conntrack->proto.tcp.seen[0].td_maxwin = 1;
+ conntrack->proto.tcp.seen[0].td_maxend =
+ conntrack->proto.tcp.seen[0].td_end;
+
+ tcp_options(skb, iph, th, &conntrack->proto.tcp.seen[0]);
+ conntrack->proto.tcp.seen[1].flags = 0;
+ conntrack->proto.tcp.seen[0].loose =
+ conntrack->proto.tcp.seen[1].loose = 0;
+ } else if (ip_ct_tcp_loose == 0) {
+ /* Don't try to pick up connections. */
+ return 0;
+ } else {
+ /*
+ * We are in the middle of a connection,
+ * its history is lost for us.
+ * Let's try to use the data from the packet.
+ */
+ conntrack->proto.tcp.seen[0].td_end =
+ segment_seq_plus_len(ntohl(th->seq), skb->len,
+ iph, th);
+ conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
+ if (conntrack->proto.tcp.seen[0].td_maxwin == 0)
+ conntrack->proto.tcp.seen[0].td_maxwin = 1;
+ conntrack->proto.tcp.seen[0].td_maxend =
+ conntrack->proto.tcp.seen[0].td_end +
+ conntrack->proto.tcp.seen[0].td_maxwin;
+ conntrack->proto.tcp.seen[0].td_scale = 0;
+
+ /* We assume SACK. Should we assume window scaling too? */
+ conntrack->proto.tcp.seen[0].flags =
+ conntrack->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM;
+ conntrack->proto.tcp.seen[0].loose =
+ conntrack->proto.tcp.seen[1].loose = ip_ct_tcp_loose;
+ }
+
+ conntrack->proto.tcp.seen[1].td_end = 0;
+ conntrack->proto.tcp.seen[1].td_maxend = 0;
+ conntrack->proto.tcp.seen[1].td_maxwin = 1;
+ conntrack->proto.tcp.seen[1].td_scale = 0;
+
+ /* tcp_packet will set them */
+ conntrack->proto.tcp.state = TCP_CONNTRACK_NONE;
+ conntrack->proto.tcp.last_index = TCP_NONE_SET;
+
+ DEBUGP("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i "
+ "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
+ sender->td_end, sender->td_maxend, sender->td_maxwin,
+ sender->td_scale,
+ receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
+ receiver->td_scale);
+ return 1;
+}
+
+struct ip_conntrack_protocol ip_conntrack_protocol_tcp =
+{
+ .proto = IPPROTO_TCP,
+ .name = "tcp",
+ .pkt_to_tuple = tcp_pkt_to_tuple,
+ .invert_tuple = tcp_invert_tuple,
+ .print_tuple = tcp_print_tuple,
+ .print_conntrack = tcp_print_conntrack,
+ .packet = tcp_packet,
+ .new = tcp_new,
+ .error = tcp_error,
+};
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
new file mode 100644
index 00000000000..5bc28a22462
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -0,0 +1,146 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/netfilter.h>
+#include <linux/in.h>
+#include <linux/udp.h>
+#include <linux/seq_file.h>
+#include <net/checksum.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+
+unsigned long ip_ct_udp_timeout = 30*HZ;
+unsigned long ip_ct_udp_timeout_stream = 180*HZ;
+
+static int udp_pkt_to_tuple(const struct sk_buff *skb,
+ unsigned int dataoff,
+ struct ip_conntrack_tuple *tuple)
+{
+ struct udphdr _hdr, *hp;
+
+ /* Actually only need first 8 bytes. */
+ hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+ if (hp == NULL)
+ return 0;
+
+ tuple->src.u.udp.port = hp->source;
+ tuple->dst.u.udp.port = hp->dest;
+
+ return 1;
+}
+
+static int udp_invert_tuple(struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack_tuple *orig)
+{
+ tuple->src.u.udp.port = orig->dst.u.udp.port;
+ tuple->dst.u.udp.port = orig->src.u.udp.port;
+ return 1;
+}
+
+/* Print out the per-protocol part of the tuple. */
+static int udp_print_tuple(struct seq_file *s,
+ const struct ip_conntrack_tuple *tuple)
+{
+ return seq_printf(s, "sport=%hu dport=%hu ",
+ ntohs(tuple->src.u.udp.port),
+ ntohs(tuple->dst.u.udp.port));
+}
+
+/* Print out the private part of the conntrack. */
+static int udp_print_conntrack(struct seq_file *s,
+ const struct ip_conntrack *conntrack)
+{
+ return 0;
+}
+
+/* Returns verdict for packet, and may modify conntrack type */
+static int udp_packet(struct ip_conntrack *conntrack,
+ const struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo)
+{
+ /* If we've seen traffic both ways, this is some kind of UDP
+ stream. Extend timeout. */
+ if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
+ ip_ct_refresh_acct(conntrack, ctinfo, skb,
+ ip_ct_udp_timeout_stream);
+ /* Also, more likely to be important, and not a probe */
+ set_bit(IPS_ASSURED_BIT, &conntrack->status);
+ } else
+ ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout);
+
+ return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol is found. */
+static int udp_new(struct ip_conntrack *conntrack, const struct sk_buff *skb)
+{
+ return 1;
+}
+
+static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
+ unsigned int hooknum)
+{
+ struct iphdr *iph = skb->nh.iph;
+ unsigned int udplen = skb->len - iph->ihl * 4;
+ struct udphdr _hdr, *hdr;
+
+ /* Header is too small? */
+ hdr = skb_header_pointer(skb, iph->ihl*4, sizeof(_hdr), &_hdr);
+ if (hdr == NULL) {
+ if (LOG_INVALID(IPPROTO_UDP))
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ "ip_ct_udp: short packet ");
+ return -NF_ACCEPT;
+ }
+
+ /* Truncated/malformed packets */
+ if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) {
+ if (LOG_INVALID(IPPROTO_UDP))
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ "ip_ct_udp: truncated/malformed packet ");
+ return -NF_ACCEPT;
+ }
+
+ /* Packet with no checksum */
+ if (!hdr->check)
+ return NF_ACCEPT;
+
+ /* Checksum invalid? Ignore.
+ * We skip checking packets on the outgoing path
+ * because the semantics of CHECKSUM_HW are different there
+ * and moreover root might send raw packets.
+ * FIXME: Source route IP option packets --RR */
+ if (hooknum == NF_IP_PRE_ROUTING
+ && csum_tcpudp_magic(iph->saddr, iph->daddr, udplen, IPPROTO_UDP,
+ skb->ip_summed == CHECKSUM_HW ? skb->csum
+ : skb_checksum(skb, iph->ihl*4, udplen, 0))) {
+ if (LOG_INVALID(IPPROTO_UDP))
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ "ip_ct_udp: bad UDP checksum ");
+ return -NF_ACCEPT;
+ }
+
+ return NF_ACCEPT;
+}
+
+struct ip_conntrack_protocol ip_conntrack_protocol_udp =
+{
+ .proto = IPPROTO_UDP,
+ .name = "udp",
+ .pkt_to_tuple = udp_pkt_to_tuple,
+ .invert_tuple = udp_invert_tuple,
+ .print_tuple = udp_print_tuple,
+ .print_conntrack = udp_print_conntrack,
+ .packet = udp_packet,
+ .new = udp_new,
+ .error = udp_error,
+};
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
new file mode 100644
index 00000000000..80a7bde2a57
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -0,0 +1,961 @@
+/* This file contains all the functions required for the standalone
+ ip_conntrack module.
+
+ These are not required by the compatibility layer.
+*/
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/percpu.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+#include <net/checksum.h>
+#include <net/ip.h>
+
+#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
+#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
+
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+MODULE_LICENSE("GPL");
+
+extern atomic_t ip_conntrack_count;
+DECLARE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
+
+static int kill_proto(struct ip_conntrack *i, void *data)
+{
+ return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum ==
+ *((u_int8_t *) data));
+}
+
+#ifdef CONFIG_PROC_FS
+static int
+print_tuple(struct seq_file *s, const struct ip_conntrack_tuple *tuple,
+ struct ip_conntrack_protocol *proto)
+{
+ seq_printf(s, "src=%u.%u.%u.%u dst=%u.%u.%u.%u ",
+ NIPQUAD(tuple->src.ip), NIPQUAD(tuple->dst.ip));
+ return proto->print_tuple(s, tuple);
+}
+
+#ifdef CONFIG_IP_NF_CT_ACCT
+static unsigned int
+seq_print_counters(struct seq_file *s,
+ const struct ip_conntrack_counter *counter)
+{
+ return seq_printf(s, "packets=%llu bytes=%llu ",
+ (unsigned long long)counter->packets,
+ (unsigned long long)counter->bytes);
+}
+#else
+#define seq_print_counters(x, y) 0
+#endif
+
+struct ct_iter_state {
+ unsigned int bucket;
+};
+
+static struct list_head *ct_get_first(struct seq_file *seq)
+{
+ struct ct_iter_state *st = seq->private;
+
+ for (st->bucket = 0;
+ st->bucket < ip_conntrack_htable_size;
+ st->bucket++) {
+ if (!list_empty(&ip_conntrack_hash[st->bucket]))
+ return ip_conntrack_hash[st->bucket].next;
+ }
+ return NULL;
+}
+
+static struct list_head *ct_get_next(struct seq_file *seq, struct list_head *head)
+{
+ struct ct_iter_state *st = seq->private;
+
+ head = head->next;
+ while (head == &ip_conntrack_hash[st->bucket]) {
+ if (++st->bucket >= ip_conntrack_htable_size)
+ return NULL;
+ head = ip_conntrack_hash[st->bucket].next;
+ }
+ return head;
+}
+
+static struct list_head *ct_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct list_head *head = ct_get_first(seq);
+
+ if (head)
+ while (pos && (head = ct_get_next(seq, head)))
+ pos--;
+ return pos ? NULL : head;
+}
+
+static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ READ_LOCK(&ip_conntrack_lock);
+ return ct_get_idx(seq, *pos);
+}
+
+static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return ct_get_next(s, v);
+}
+
+static void ct_seq_stop(struct seq_file *s, void *v)
+{
+ READ_UNLOCK(&ip_conntrack_lock);
+}
+
+static int ct_seq_show(struct seq_file *s, void *v)
+{
+ const struct ip_conntrack_tuple_hash *hash = v;
+ const struct ip_conntrack *conntrack = tuplehash_to_ctrack(hash);
+ struct ip_conntrack_protocol *proto;
+
+ MUST_BE_READ_LOCKED(&ip_conntrack_lock);
+ IP_NF_ASSERT(conntrack);
+
+ /* we only want to print DIR_ORIGINAL */
+ if (DIRECTION(hash))
+ return 0;
+
+ proto = ip_ct_find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.dst.protonum);
+ IP_NF_ASSERT(proto);
+
+ if (seq_printf(s, "%-8s %u %ld ",
+ proto->name,
+ conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum,
+ timer_pending(&conntrack->timeout)
+ ? (long)(conntrack->timeout.expires - jiffies)/HZ
+ : 0) != 0)
+ return -ENOSPC;
+
+ if (proto->print_conntrack(s, conntrack))
+ return -ENOSPC;
+
+ if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ proto))
+ return -ENOSPC;
+
+ if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_ORIGINAL]))
+ return -ENOSPC;
+
+ if (!(test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)))
+ if (seq_printf(s, "[UNREPLIED] "))
+ return -ENOSPC;
+
+ if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple,
+ proto))
+ return -ENOSPC;
+
+ if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_REPLY]))
+ return -ENOSPC;
+
+ if (test_bit(IPS_ASSURED_BIT, &conntrack->status))
+ if (seq_printf(s, "[ASSURED] "))
+ return -ENOSPC;
+
+#if defined(CONFIG_IP_NF_CONNTRACK_MARK)
+ if (seq_printf(s, "mark=%lu ", conntrack->mark))
+ return -ENOSPC;
+#endif
+
+ if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use)))
+ return -ENOSPC;
+
+ return 0;
+}
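+
+/*
+ * With the printing above, a line of /proc/net/ip_conntrack looks
+ * roughly like the following (addresses and ports made up purely for
+ * illustration):
+ *
+ * tcp 6 431999 ESTABLISHED src=192.168.0.2 dst=192.168.0.1 sport=2048
+ * dport=80 src=192.168.0.1 dst=192.168.0.2 sport=80 dport=2048
+ * [ASSURED] use=1
+ *
+ * With CONFIG_IP_NF_CT_ACCT the packets=/bytes= counters follow each
+ * tuple, and [UNREPLIED] appears between the two tuples until a reply
+ * has been seen.
+ */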
+
+static struct seq_operations ct_seq_ops = {
+ .start = ct_seq_start,
+ .next = ct_seq_next,
+ .stop = ct_seq_stop,
+ .show = ct_seq_show
+};
+
+static int ct_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ struct ct_iter_state *st;
+ int ret;
+
+ st = kmalloc(sizeof(struct ct_iter_state), GFP_KERNEL);
+ if (st == NULL)
+ return -ENOMEM;
+ ret = seq_open(file, &ct_seq_ops);
+ if (ret)
+ goto out_free;
+ seq = file->private_data;
+ seq->private = st;
+ memset(st, 0, sizeof(struct ct_iter_state));
+ return ret;
+out_free:
+ kfree(st);
+ return ret;
+}
+
+static struct file_operations ct_file_ops = {
+ .owner = THIS_MODULE,
+ .open = ct_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+/* expects */
+static void *exp_seq_start(struct seq_file *s, loff_t *pos)
+{
+ struct list_head *e = &ip_conntrack_expect_list;
+ loff_t i;
+
+ /* The seq_file API calls stop even if start fails,
+ * so we need to grab the lock here since stop unlocks. */
+ READ_LOCK(&ip_conntrack_lock);
+
+ if (list_empty(e))
+ return NULL;
+
+ for (i = 0; i <= *pos; i++) {
+ e = e->next;
+ if (e == &ip_conntrack_expect_list)
+ return NULL;
+ }
+ return e;
+}
+
+static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ struct list_head *e = v;
+
+ e = e->next;
+
+ if (e == &ip_conntrack_expect_list)
+ return NULL;
+
+ return e;
+}
+
+static void exp_seq_stop(struct seq_file *s, void *v)
+{
+ READ_UNLOCK(&ip_conntrack_lock);
+}
+
+static int exp_seq_show(struct seq_file *s, void *v)
+{
+ struct ip_conntrack_expect *expect = v;
+
+ if (expect->timeout.function)
+ seq_printf(s, "%ld ", timer_pending(&expect->timeout)
+ ? (long)(expect->timeout.expires - jiffies)/HZ : 0);
+ else
+ seq_printf(s, "- ");
+
+ seq_printf(s, "proto=%u ", expect->tuple.dst.protonum);
+
+ print_tuple(s, &expect->tuple,
+ ip_ct_find_proto(expect->tuple.dst.protonum));
+ return seq_putc(s, '\n');
+}
+
+static struct seq_operations exp_seq_ops = {
+ .start = exp_seq_start,
+ .next = exp_seq_next,
+ .stop = exp_seq_stop,
+ .show = exp_seq_show
+};
+
+static int exp_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &exp_seq_ops);
+}
+
+static struct file_operations exp_file_ops = {
+ .owner = THIS_MODULE,
+ .open = exp_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release
+};
+
+static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ int cpu;
+
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
+ if (!cpu_possible(cpu))
+ continue;
+ *pos = cpu+1;
+ return &per_cpu(ip_conntrack_stat, cpu);
+ }
+
+ return NULL;
+}
+
+static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ int cpu;
+
+ for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
+ if (!cpu_possible(cpu))
+ continue;
+ *pos = cpu+1;
+ return &per_cpu(ip_conntrack_stat, cpu);
+ }
+
+ return NULL;
+}
+
+static void ct_cpu_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static int ct_cpu_seq_show(struct seq_file *seq, void *v)
+{
+ unsigned int nr_conntracks = atomic_read(&ip_conntrack_count);
+ struct ip_conntrack_stat *st = v;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete\n");
+ return 0;
+ }
+
+ seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
+ "%08x %08x %08x %08x %08x %08x %08x %08x \n",
+ nr_conntracks,
+ st->searched,
+ st->found,
+ st->new,
+ st->invalid,
+ st->ignore,
+ st->delete,
+ st->delete_list,
+ st->insert,
+ st->insert_failed,
+ st->drop,
+ st->early_drop,
+ st->error,
+
+ st->expect_new,
+ st->expect_create,
+ st->expect_delete
+ );
+ return 0;
+}
+
+static struct seq_operations ct_cpu_seq_ops = {
+ .start = ct_cpu_seq_start,
+ .next = ct_cpu_seq_next,
+ .stop = ct_cpu_seq_stop,
+ .show = ct_cpu_seq_show,
+};
+
+static int ct_cpu_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &ct_cpu_seq_ops);
+}
+
+static struct file_operations ct_cpu_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = ct_cpu_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+#endif
+
+static unsigned int ip_confirm(unsigned int hooknum,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ struct ip_conntrack *ct;
+ enum ip_conntrack_info ctinfo;
+
+ /* This is where we call the helper: as the packet goes out. */
+ ct = ip_conntrack_get(*pskb, &ctinfo);
+ if (ct && ct->helper) {
+ unsigned int ret;
+ ret = ct->helper->help(pskb, ct, ctinfo);
+ if (ret != NF_ACCEPT)
+ return ret;
+ }
+
+ /* We've seen it coming out the other side: confirm it */
+ return ip_conntrack_confirm(pskb);
+}
+
+static unsigned int ip_conntrack_defrag(unsigned int hooknum,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+#if !defined(CONFIG_IP_NF_NAT) && !defined(CONFIG_IP_NF_NAT_MODULE)
+ /* Previously seen (loopback)? Ignore. Do this before
+ fragment check. */
+ if ((*pskb)->nfct)
+ return NF_ACCEPT;
+#endif
+
+ /* Gather fragments. */
+ if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
+ *pskb = ip_ct_gather_frags(*pskb,
+ hooknum == NF_IP_PRE_ROUTING ?
+ IP_DEFRAG_CONNTRACK_IN :
+ IP_DEFRAG_CONNTRACK_OUT);
+ if (!*pskb)
+ return NF_STOLEN;
+ }
+ return NF_ACCEPT;
+}
+
+static unsigned int ip_refrag(unsigned int hooknum,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ struct rtable *rt = (struct rtable *)(*pskb)->dst;
+
+ /* We've seen it coming out the other side: confirm */
+ if (ip_confirm(hooknum, pskb, in, out, okfn) != NF_ACCEPT)
+ return NF_DROP;
+
+ /* Local packets are never produced too large for their
+ interface. We defragment them at LOCAL_OUT, however,
+ so we have to refragment them here. */
+ if ((*pskb)->len > dst_mtu(&rt->u.dst) &&
+ !skb_shinfo(*pskb)->tso_size) {
+ /* No hook can be after us, so this should be OK. */
+ ip_fragment(*pskb, okfn);
+ return NF_STOLEN;
+ }
+ return NF_ACCEPT;
+}
+
+static unsigned int ip_conntrack_local(unsigned int hooknum,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ /* root is playing with raw sockets. */
+ if ((*pskb)->len < sizeof(struct iphdr)
+ || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) {
+ if (net_ratelimit())
+ printk("ipt_hook: happy cracking.\n");
+ return NF_ACCEPT;
+ }
+ return ip_conntrack_in(hooknum, pskb, in, out, okfn);
+}
+
+/* Connection tracking may drop packets, but never alters them, so
+ make it the first hook. */
+static struct nf_hook_ops ip_conntrack_defrag_ops = {
+ .hook = ip_conntrack_defrag,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_PRE_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
+};
+
+static struct nf_hook_ops ip_conntrack_in_ops = {
+ .hook = ip_conntrack_in,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_PRE_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK,
+};
+
+static struct nf_hook_ops ip_conntrack_defrag_local_out_ops = {
+ .hook = ip_conntrack_defrag,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_LOCAL_OUT,
+ .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
+};
+
+static struct nf_hook_ops ip_conntrack_local_out_ops = {
+ .hook = ip_conntrack_local,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_LOCAL_OUT,
+ .priority = NF_IP_PRI_CONNTRACK,
+};
+
+/* Refragmenter; last chance. */
+static struct nf_hook_ops ip_conntrack_out_ops = {
+ .hook = ip_refrag,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_POST_ROUTING,
+ .priority = NF_IP_PRI_LAST,
+};
+
+static struct nf_hook_ops ip_conntrack_local_in_ops = {
+ .hook = ip_confirm,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_LOCAL_IN,
+ .priority = NF_IP_PRI_LAST-1,
+};
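+
+/*
+ * Hook layout defined above: defragmentation runs first at PRE_ROUTING
+ * and LOCAL_OUT (NF_IP_PRI_CONNTRACK_DEFRAG), the tracking itself runs
+ * next at the same two hooks (NF_IP_PRI_CONNTRACK), and the connection
+ * is confirmed as late as possible: at LOCAL_IN for packets delivered
+ * locally, or at POST_ROUTING (together with refragmentation) for
+ * packets leaving the box.
+ */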
+
+/* Sysctl support */
+
+#ifdef CONFIG_SYSCTL
+
+/* From ip_conntrack_core.c */
+extern int ip_conntrack_max;
+extern unsigned int ip_conntrack_htable_size;
+
+/* From ip_conntrack_proto_tcp.c */
+extern unsigned long ip_ct_tcp_timeout_syn_sent;
+extern unsigned long ip_ct_tcp_timeout_syn_recv;
+extern unsigned long ip_ct_tcp_timeout_established;
+extern unsigned long ip_ct_tcp_timeout_fin_wait;
+extern unsigned long ip_ct_tcp_timeout_close_wait;
+extern unsigned long ip_ct_tcp_timeout_last_ack;
+extern unsigned long ip_ct_tcp_timeout_time_wait;
+extern unsigned long ip_ct_tcp_timeout_close;
+extern unsigned long ip_ct_tcp_timeout_max_retrans;
+extern int ip_ct_tcp_loose;
+extern int ip_ct_tcp_be_liberal;
+extern int ip_ct_tcp_max_retrans;
+
+/* From ip_conntrack_proto_udp.c */
+extern unsigned long ip_ct_udp_timeout;
+extern unsigned long ip_ct_udp_timeout_stream;
+
+/* From ip_conntrack_proto_icmp.c */
+extern unsigned long ip_ct_icmp_timeout;
+
+/* From ip_conntrack_proto_generic.c */
+extern unsigned long ip_ct_generic_timeout;
+
+/* Log invalid packets of a given protocol */
+static int log_invalid_proto_min = 0;
+static int log_invalid_proto_max = 255;
+
+static struct ctl_table_header *ip_ct_sysctl_header;
+
+static ctl_table ip_ct_sysctl_table[] = {
+ {
+ .ctl_name = NET_IPV4_NF_CONNTRACK_MAX,
+ .procname = "ip_conntrack_max",
+ .data = &ip_conntrack_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_NF_CONNTRACK_COUNT,
+ .procname = "ip_conntrack_count",
+ .data = &ip_conntrack_count,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_NF_CONNTRACK_BUCKETS,
+ .procname = "ip_conntrack_buckets",
+ .data = &ip_conntrack_htable_size,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT,
+ .procname = "ip_conntrack_tcp_timeout_syn_sent",
+ .data = &ip_ct_tcp_timeout_syn_sent,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV,
+ .procname = "ip_conntrack_tcp_timeout_syn_recv",
+ .data = &ip_ct_tcp_timeout_syn_recv,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED,
+ .procname = "ip_conntrack_tcp_timeout_established",
+ .data = &ip_ct_tcp_timeout_established,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT,
+ .procname = "ip_conntrack_tcp_timeout_fin_wait",
+ .data = &ip_ct_tcp_timeout_fin_wait,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT,
+ .procname = "ip_conntrack_tcp_timeout_close_wait",
+ .data = &ip_ct_tcp_timeout_close_wait,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK,
+ .procname = "ip_conntrack_tcp_timeout_last_ack",
+ .data = &ip_ct_tcp_timeout_last_ack,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT,
+ .procname = "ip_conntrack_tcp_timeout_time_wait",
+ .data = &ip_ct_tcp_timeout_time_wait,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE,
+ .procname = "ip_conntrack_tcp_timeout_close",
+ .data = &ip_ct_tcp_timeout_close,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT,
+ .procname = "ip_conntrack_udp_timeout",
+ .data = &ip_ct_udp_timeout,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM,
+ .procname = "ip_conntrack_udp_timeout_stream",
+ .data = &ip_ct_udp_timeout_stream,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT,
+ .procname = "ip_conntrack_icmp_timeout",
+ .data = &ip_ct_icmp_timeout,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT,
+ .procname = "ip_conntrack_generic_timeout",
+ .data = &ip_ct_generic_timeout,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_NF_CONNTRACK_LOG_INVALID,
+ .procname = "ip_conntrack_log_invalid",
+ .data = &ip_ct_log_invalid,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &log_invalid_proto_min,
+ .extra2 = &log_invalid_proto_max,
+ },
+ {
+ .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS,
+ .procname = "ip_conntrack_tcp_timeout_max_retrans",
+ .data = &ip_ct_tcp_timeout_max_retrans,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_LOOSE,
+ .procname = "ip_conntrack_tcp_loose",
+ .data = &ip_ct_tcp_loose,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL,
+ .procname = "ip_conntrack_tcp_be_liberal",
+ .data = &ip_ct_tcp_be_liberal,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS,
+ .procname = "ip_conntrack_tcp_max_retrans",
+ .data = &ip_ct_tcp_max_retrans,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ { .ctl_name = 0 }
+};
+
+#define NET_IP_CONNTRACK_MAX 2089
+
+static ctl_table ip_ct_netfilter_table[] = {
+ {
+ .ctl_name = NET_IPV4_NETFILTER,
+ .procname = "netfilter",
+ .mode = 0555,
+ .child = ip_ct_sysctl_table,
+ },
+ {
+ .ctl_name = NET_IP_CONNTRACK_MAX,
+ .procname = "ip_conntrack_max",
+ .data = &ip_conntrack_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ { .ctl_name = 0 }
+};
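+
+/*
+ * The table above exposes the conntrack limit twice: once under its
+ * historical name as /proc/sys/net/ipv4/ip_conntrack_max and once,
+ * via the "netfilter" directory, as
+ * /proc/sys/net/ipv4/netfilter/ip_conntrack_max.
+ */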
+
+static ctl_table ip_ct_ipv4_table[] = {
+ {
+ .ctl_name = NET_IPV4,
+ .procname = "ipv4",
+ .mode = 0555,
+ .child = ip_ct_netfilter_table,
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table ip_ct_net_table[] = {
+ {
+ .ctl_name = CTL_NET,
+ .procname = "net",
+ .mode = 0555,
+ .child = ip_ct_ipv4_table,
+ },
+ { .ctl_name = 0 }
+};
+
+EXPORT_SYMBOL(ip_ct_log_invalid);
+#endif /* CONFIG_SYSCTL */
+
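+/* Shared bring-up/tear-down path: init==1 registers proc entries,
+ * netfilter hooks and sysctls in order; init==0 jumps straight to
+ * "cleanup" and unwinds the same steps in reverse, which also serves
+ * as the error path for a partially completed init. */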
+static int init_or_cleanup(int init)
+{
+#ifdef CONFIG_PROC_FS
+ struct proc_dir_entry *proc, *proc_exp, *proc_stat;
+#endif
+ int ret = 0;
+
+ if (!init) goto cleanup;
+
+ ret = ip_conntrack_init();
+ if (ret < 0)
+ goto cleanup_nothing;
+
+#ifdef CONFIG_PROC_FS
+ ret = -ENOMEM;
+ proc = proc_net_fops_create("ip_conntrack", 0440, &ct_file_ops);
+ if (!proc) goto cleanup_init;
+
+ proc_exp = proc_net_fops_create("ip_conntrack_expect", 0440,
+ &exp_file_ops);
+ if (!proc_exp) goto cleanup_proc;
+
+ proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, proc_net_stat);
+ if (!proc_stat)
+ goto cleanup_proc_exp;
+
+ proc_stat->proc_fops = &ct_cpu_seq_fops;
+ proc_stat->owner = THIS_MODULE;
+#endif
+
+ ret = nf_register_hook(&ip_conntrack_defrag_ops);
+ if (ret < 0) {
+ printk("ip_conntrack: can't register pre-routing defrag hook.\n");
+ goto cleanup_proc_stat;
+ }
+ ret = nf_register_hook(&ip_conntrack_defrag_local_out_ops);
+ if (ret < 0) {
+ printk("ip_conntrack: can't register local_out defrag hook.\n");
+ goto cleanup_defragops;
+ }
+ ret = nf_register_hook(&ip_conntrack_in_ops);
+ if (ret < 0) {
+ printk("ip_conntrack: can't register pre-routing hook.\n");
+ goto cleanup_defraglocalops;
+ }
+ ret = nf_register_hook(&ip_conntrack_local_out_ops);
+ if (ret < 0) {
+ printk("ip_conntrack: can't register local out hook.\n");
+ goto cleanup_inops;
+ }
+ ret = nf_register_hook(&ip_conntrack_out_ops);
+ if (ret < 0) {
+ printk("ip_conntrack: can't register post-routing hook.\n");
+ goto cleanup_inandlocalops;
+ }
+ ret = nf_register_hook(&ip_conntrack_local_in_ops);
+ if (ret < 0) {
+ printk("ip_conntrack: can't register local in hook.\n");
+ goto cleanup_inoutandlocalops;
+ }
+#ifdef CONFIG_SYSCTL
+ ip_ct_sysctl_header = register_sysctl_table(ip_ct_net_table, 0);
+ if (ip_ct_sysctl_header == NULL) {
+ printk("ip_conntrack: can't register to sysctl.\n");
+ ret = -ENOMEM;
+ goto cleanup_localinops;
+ }
+#endif
+
+ return ret;
+
+ cleanup:
+#ifdef CONFIG_SYSCTL
+ unregister_sysctl_table(ip_ct_sysctl_header);
+ cleanup_localinops:
+#endif
+ nf_unregister_hook(&ip_conntrack_local_in_ops);
+ cleanup_inoutandlocalops:
+ nf_unregister_hook(&ip_conntrack_out_ops);
+ cleanup_inandlocalops:
+ nf_unregister_hook(&ip_conntrack_local_out_ops);
+ cleanup_inops:
+ nf_unregister_hook(&ip_conntrack_in_ops);
+ cleanup_defraglocalops:
+ nf_unregister_hook(&ip_conntrack_defrag_local_out_ops);
+ cleanup_defragops:
+ nf_unregister_hook(&ip_conntrack_defrag_ops);
+ cleanup_proc_stat:
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("ip_conntrack", proc_net_stat);
+ cleanup_proc_exp:
+ proc_net_remove("ip_conntrack_expect");
+ cleanup_proc:
+ proc_net_remove("ip_conntrack");
+ cleanup_init:
+#endif /* CONFIG_PROC_FS */
+ ip_conntrack_cleanup();
+ cleanup_nothing:
+ return ret;
+}
+
+/* FIXME: Allow NULL functions and sub in pointers to generic for
+ them. --RR */
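+/* Layer-4 trackers register here: TCP, UDP and ICMP are installed
+ * directly at conntrack init, so this path is for loadable trackers
+ * (e.g. the SCTP helper); an already-claimed protocol number returns
+ * -EBUSY. */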
+int ip_conntrack_protocol_register(struct ip_conntrack_protocol *proto)
+{
+ int ret = 0;
+
+ WRITE_LOCK(&ip_conntrack_lock);
+ if (ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) {
+ ret = -EBUSY;
+ goto out;
+ }
+ ip_ct_protos[proto->proto] = proto;
+ out:
+ WRITE_UNLOCK(&ip_conntrack_lock);
+ return ret;
+}
+
+void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto)
+{
+ WRITE_LOCK(&ip_conntrack_lock);
+ ip_ct_protos[proto->proto] = &ip_conntrack_generic_protocol;
+ WRITE_UNLOCK(&ip_conntrack_lock);
+
+ /* Somebody could still be looking at the proto in a bh. */
+ synchronize_net();
+
+ /* Remove all conntrack entries for this protocol */
+ ip_ct_iterate_cleanup(kill_proto, &proto->proto);
+}
+
+static int __init init(void)
+{
+ return init_or_cleanup(1);
+}
+
+static void __exit fini(void)
+{
+ init_or_cleanup(0);
+}
+
+module_init(init);
+module_exit(fini);
+
+/* Some modules need us, but don't depend directly on any symbol.
+ They should call this. */
+void need_ip_conntrack(void)
+{
+}
+
+EXPORT_SYMBOL(ip_conntrack_protocol_register);
+EXPORT_SYMBOL(ip_conntrack_protocol_unregister);
+EXPORT_SYMBOL(ip_ct_get_tuple);
+EXPORT_SYMBOL(invert_tuplepr);
+EXPORT_SYMBOL(ip_conntrack_alter_reply);
+EXPORT_SYMBOL(ip_conntrack_destroyed);
+EXPORT_SYMBOL(need_ip_conntrack);
+EXPORT_SYMBOL(ip_conntrack_helper_register);
+EXPORT_SYMBOL(ip_conntrack_helper_unregister);
+EXPORT_SYMBOL(ip_ct_iterate_cleanup);
+EXPORT_SYMBOL(ip_ct_refresh_acct);
+EXPORT_SYMBOL(ip_ct_protos);
+EXPORT_SYMBOL(ip_ct_find_proto);
+EXPORT_SYMBOL(ip_conntrack_expect_alloc);
+EXPORT_SYMBOL(ip_conntrack_expect_free);
+EXPORT_SYMBOL(ip_conntrack_expect_related);
+EXPORT_SYMBOL(ip_conntrack_unexpect_related);
+EXPORT_SYMBOL(ip_conntrack_tuple_taken);
+EXPORT_SYMBOL(ip_ct_gather_frags);
+EXPORT_SYMBOL(ip_conntrack_htable_size);
+EXPORT_SYMBOL(ip_conntrack_lock);
+EXPORT_SYMBOL(ip_conntrack_hash);
+EXPORT_SYMBOL(ip_conntrack_untracked);
+EXPORT_SYMBOL_GPL(ip_conntrack_find_get);
+EXPORT_SYMBOL_GPL(ip_conntrack_put);
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+EXPORT_SYMBOL(ip_conntrack_tcp_update);
+#endif
diff --git a/net/ipv4/netfilter/ip_conntrack_tftp.c b/net/ipv4/netfilter/ip_conntrack_tftp.c
new file mode 100644
index 00000000000..992fac3e36e
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_tftp.c
@@ -0,0 +1,159 @@
+/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Version: 0.0.7
+ *
+ * Thu 21 Mar 2002 Harald Welte <laforge@gnumonks.org>
+ * - port to newnat API
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_tftp.h>
+#include <linux/moduleparam.h>
+
+MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
+MODULE_DESCRIPTION("tftp connection tracking helper");
+MODULE_LICENSE("GPL");
+
+#define MAX_PORTS 8
+static int ports[MAX_PORTS];
+static int ports_c;
+module_param_array(ports, int, &ports_c, 0400);
+MODULE_PARM_DESC(ports, "port numbers of tftp servers");
+
+#if 0
+#define DEBUGP(format, args...) printk("%s:%s:" format, \
+ __FILE__, __FUNCTION__ , ## args)
+#else
+#define DEBUGP(format, args...)
+#endif
+
+unsigned int (*ip_nat_tftp_hook)(struct sk_buff **pskb,
+ enum ip_conntrack_info ctinfo,
+ struct ip_conntrack_expect *exp);
+EXPORT_SYMBOL_GPL(ip_nat_tftp_hook);
+
+static int tftp_help(struct sk_buff **pskb,
+ struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ struct tftphdr _tftph, *tfh;
+ struct ip_conntrack_expect *exp;
+ unsigned int ret = NF_ACCEPT;
+
+ tfh = skb_header_pointer(*pskb,
+ (*pskb)->nh.iph->ihl*4+sizeof(struct udphdr),
+ sizeof(_tftph), &_tftph);
+ if (tfh == NULL)
+ return NF_ACCEPT;
+
+ switch (ntohs(tfh->opcode)) {
+ /* RRQ and WRQ work the same way */
+ case TFTP_OPCODE_READ:
+ case TFTP_OPCODE_WRITE:
+ DEBUGP("");
+ DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+ exp = ip_conntrack_expect_alloc();
+ if (exp == NULL)
+ return NF_DROP;
+
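+ /* Expect the server's data exchange back to the client: reuse the
+ * request's reply tuple, and let the mask pin the two addresses,
+ * the client port and the protocol but not the server's source
+ * port, since the server typically answers from a fresh ephemeral
+ * port. */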
+ exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+ exp->mask.src.ip = 0xffffffff;
+ exp->mask.dst.ip = 0xffffffff;
+ exp->mask.dst.u.udp.port = 0xffff;
+ exp->mask.dst.protonum = 0xff;
+ exp->expectfn = NULL;
+ exp->master = ct;
+
+ DEBUGP("expect: ");
+ DUMP_TUPLE(&exp->tuple);
+ DUMP_TUPLE(&exp->mask);
+ if (ip_nat_tftp_hook)
+ ret = ip_nat_tftp_hook(pskb, ctinfo, exp);
+ else if (ip_conntrack_expect_related(exp) != 0) {
+ ip_conntrack_expect_free(exp);
+ ret = NF_DROP;
+ }
+ break;
+ case TFTP_OPCODE_DATA:
+ case TFTP_OPCODE_ACK:
+ DEBUGP("Data/ACK opcode\n");
+ break;
+ case TFTP_OPCODE_ERROR:
+ DEBUGP("Error opcode\n");
+ break;
+ default:
+ DEBUGP("Unknown opcode\n");
+ }
+ return NF_ACCEPT;
+}
+
+static struct ip_conntrack_helper tftp[MAX_PORTS];
+static char tftp_names[MAX_PORTS][10];
+
+static void fini(void)
+{
+ int i;
+
+ for (i = 0; i < ports_c; i++) {
+ DEBUGP("unregistering helper for port %d\n",
+ ports[i]);
+ ip_conntrack_helper_unregister(&tftp[i]);
+ }
+}
+
+static int __init init(void)
+{
+ int i, ret;
+ char *tmpname;
+
+ if (ports_c == 0)
+ ports[ports_c++] = TFTP_PORT;
+
+ for (i = 0; i < ports_c; i++) {
+ /* Create helper structure */
+ memset(&tftp[i], 0, sizeof(struct ip_conntrack_helper));
+
+ tftp[i].tuple.dst.protonum = IPPROTO_UDP;
+ tftp[i].tuple.src.u.udp.port = htons(ports[i]);
+ tftp[i].mask.dst.protonum = 0xFF;
+ tftp[i].mask.src.u.udp.port = 0xFFFF;
+ tftp[i].max_expected = 1;
+ tftp[i].timeout = 5 * 60; /* 5 minutes */
+ tftp[i].me = THIS_MODULE;
+ tftp[i].help = tftp_help;
+
+ tmpname = &tftp_names[i][0];
+ if (ports[i] == TFTP_PORT)
+ sprintf(tmpname, "tftp");
+ else
+ sprintf(tmpname, "tftp-%d", i);
+ tftp[i].name = tmpname;
+
+ DEBUGP("port #%d: %d\n", i, ports[i]);
+
+ ret = ip_conntrack_helper_register(&tftp[i]);
+ if (ret) {
+ printk("ERROR registering helper for port %d\n",
+ ports[i]);
+ fini();
+ return ret;
+ }
+ }
+ return 0;
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_nat_amanda.c b/net/ipv4/netfilter/ip_nat_amanda.c
new file mode 100644
index 00000000000..da1f412583e
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_amanda.c
@@ -0,0 +1,88 @@
+/* Amanda extension for TCP NAT alteration.
+ * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca>
+ * based on a copy of HW's ip_nat_irc.c as well as other modules
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Module load syntax:
+ * insmod ip_nat_amanda.o
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_amanda.h>
+
+
+MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
+MODULE_DESCRIPTION("Amanda NAT helper");
+MODULE_LICENSE("GPL");
+
+static unsigned int help(struct sk_buff **pskb,
+ enum ip_conntrack_info ctinfo,
+ unsigned int matchoff,
+ unsigned int matchlen,
+ struct ip_conntrack_expect *exp)
+{
+ char buffer[sizeof("65535")];
+ u_int16_t port;
+ unsigned int ret;
+
+ /* Connection comes from client. */
+ exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+ exp->dir = IP_CT_DIR_ORIGINAL;
+
+ /* When you see the packet, we need to NAT it the same as
+ * this one (ie. same IP: it will be TCP and master is UDP). */
+ exp->expectfn = ip_nat_follow_master;
+
+ /* Try to get same port: if not, try to change it. */
+ for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+ exp->tuple.dst.u.tcp.port = htons(port);
+ if (ip_conntrack_expect_related(exp) == 0)
+ break;
+ }
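+ /* port is 16 bits: wrapping around to 0 means no port from the
+ * original value up through 65535 was free. */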
+
+ if (port == 0) {
+ ip_conntrack_expect_free(exp);
+ return NF_DROP;
+ }
+
+ sprintf(buffer, "%u", port);
+ ret = ip_nat_mangle_udp_packet(pskb, exp->master, ctinfo,
+ matchoff, matchlen,
+ buffer, strlen(buffer));
+ if (ret != NF_ACCEPT)
+ ip_conntrack_unexpect_related(exp);
+ return ret;
+}
+
+static void __exit fini(void)
+{
+ ip_nat_amanda_hook = NULL;
+ /* Make sure no one calls it, meanwhile. */
+ synchronize_net();
+}
+
+static int __init init(void)
+{
+ BUG_ON(ip_nat_amanda_hook);
+ ip_nat_amanda_hook = help;
+ return 0;
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
new file mode 100644
index 00000000000..162ceacfc29
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_core.c
@@ -0,0 +1,556 @@
+/* NAT for netfilter; shared with compatibility layer. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/vmalloc.h>
+#include <net/checksum.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/tcp.h> /* For tcp_prot in getorigdst */
+#include <linux/icmp.h>
+#include <linux/udp.h>
+#include <linux/jhash.h>
+
+#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
+#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
+
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+#include <linux/netfilter_ipv4/ip_nat_core.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+DECLARE_RWLOCK(ip_nat_lock);
+
+/* Calculated at init based on memory size */
+static unsigned int ip_nat_htable_size;
+
+static struct list_head *bysource;
+struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO];
+
+
+/* We keep an extra hash for each conntrack, for fast searching. */
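+/* The bysource hash is keyed on the original tuple's source address,
+ * source port (u.all) and protocol, so find_appropriate_src() below
+ * can reuse an existing SNAT mapping for new connections from the
+ * same endpoint. */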
+static inline unsigned int
+hash_by_src(const struct ip_conntrack_tuple *tuple)
+{
+ /* Original src, to ensure we map it consistently if poss. */
+ return jhash_3words(tuple->src.ip, tuple->src.u.all,
+ tuple->dst.protonum, 0) % ip_nat_htable_size;
+}
+
+/* No one is using the conntrack by the time this is called. */
+static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
+{
+ if (!(conn->status & IPS_NAT_DONE_MASK))
+ return;
+
+ WRITE_LOCK(&ip_nat_lock);
+ list_del(&conn->nat.info.bysource);
+ WRITE_UNLOCK(&ip_nat_lock);
+}
+
+/* We do checksum mangling, so if they were wrong before they're still
+ * wrong. Also works for incomplete packets (eg. ICMP dest
+ * unreachables.) */
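+/* Incremental one's-complement update (in the spirit of RFC 1624):
+ * fold a partial sum over { ~oldval, newval } into the old checksum,
+ * so a 32-bit field can be rewritten without re-summing the packet. */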
+u_int16_t
+ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
+{
+ u_int32_t diffs[] = { oldvalinv, newval };
+ return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
+ oldcheck^0xFFFF));
+}
+
+/* Is this tuple already taken? (not by us) */
+int
+ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack *ignored_conntrack)
+{
+ /* Conntrack doesn't keep track of outgoing tuples, only
+ incoming ones. NAT means they don't have a fixed mapping,
+ so we invert the tuple and look for the incoming reply.
+
+ We could keep a separate hash if this proves too slow. */
+ struct ip_conntrack_tuple reply;
+
+ invert_tuplepr(&reply, tuple);
+ return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
+}
+
+/* If we source map this tuple so reply looks like reply_tuple, will
+ * that meet the constraints of range. */
+static int
+in_range(const struct ip_conntrack_tuple *tuple,
+ const struct ip_nat_range *range)
+{
+ struct ip_nat_protocol *proto = ip_nat_find_proto(tuple->dst.protonum);
+
+ /* If we are supposed to map IPs, then we must be in the
+ range specified, otherwise let this drag us onto a new src IP. */
+ if (range->flags & IP_NAT_RANGE_MAP_IPS) {
+ if (ntohl(tuple->src.ip) < ntohl(range->min_ip)
+ || ntohl(tuple->src.ip) > ntohl(range->max_ip))
+ return 0;
+ }
+
+ if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
+ || proto->in_range(tuple, IP_NAT_MANIP_SRC,
+ &range->min, &range->max))
+ return 1;
+
+ return 0;
+}
+
+static inline int
+same_src(const struct ip_conntrack *ct,
+ const struct ip_conntrack_tuple *tuple)
+{
+ return (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
+ == tuple->dst.protonum
+ && ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
+ == tuple->src.ip
+ && ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
+ == tuple->src.u.all);
+}
+
+/* Only called for SRC manip */
+static int
+find_appropriate_src(const struct ip_conntrack_tuple *tuple,
+ struct ip_conntrack_tuple *result,
+ const struct ip_nat_range *range)
+{
+ unsigned int h = hash_by_src(tuple);
+ struct ip_conntrack *ct;
+
+ READ_LOCK(&ip_nat_lock);
+ list_for_each_entry(ct, &bysource[h], nat.info.bysource) {
+ if (same_src(ct, tuple)) {
+ /* Copy source part from reply tuple. */
+ invert_tuplepr(result,
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+ result->dst = tuple->dst;
+
+ if (in_range(result, range)) {
+ READ_UNLOCK(&ip_nat_lock);
+ return 1;
+ }
+ }
+ }
+ READ_UNLOCK(&ip_nat_lock);
+ return 0;
+}
+
+/* For [FUTURE] fragmentation handling, we want the least-used
+ src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus
+ if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
+ 1-65535, we don't do pro-rata allocation based on ports; we choose
+ the ip with the lowest src-ip/dst-ip/proto usage.
+*/
+static void
+find_best_ips_proto(struct ip_conntrack_tuple *tuple,
+ const struct ip_nat_range *range,
+ const struct ip_conntrack *conntrack,
+ enum ip_nat_manip_type maniptype)
+{
+ u_int32_t *var_ipp;
+ /* Host order */
+ u_int32_t minip, maxip, j;
+
+ /* No IP mapping? Do nothing. */
+ if (!(range->flags & IP_NAT_RANGE_MAP_IPS))
+ return;
+
+ if (maniptype == IP_NAT_MANIP_SRC)
+ var_ipp = &tuple->src.ip;
+ else
+ var_ipp = &tuple->dst.ip;
+
+ /* Fast path: only one choice. */
+ if (range->min_ip == range->max_ip) {
+ *var_ipp = range->min_ip;
+ return;
+ }
+
+ /* Hashing source and destination IPs gives a fairly even
+ * spread in practice (if there are a small number of IPs
+ * involved, there usually aren't that many connections
+ * anyway). The consistency means that servers see the same
+ * client coming from the same IP (some Internet Banking sites
+ * like this), even across reboots. */
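+ /* Illustration: a range of 10.0.0.1 - 10.0.0.4 gives j % 4, so each
+ * src/dst address pair consistently maps to one of the four IPs. */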
+ minip = ntohl(range->min_ip);
+ maxip = ntohl(range->max_ip);
+ j = jhash_2words(tuple->src.ip, tuple->dst.ip, 0);
+ *var_ipp = htonl(minip + j % (maxip - minip + 1));
+}
+
+/* Manipulate the tuple into the range given. For NF_IP_POST_ROUTING,
+ * we change the source to map into the range. For NF_IP_PRE_ROUTING
+ * and NF_IP_LOCAL_OUT, we change the destination to map into the
+ * range. It might not be possible to get a unique tuple, but we try.
+ * At worst (or if we race), we will end up with a final duplicate in
+ * __ip_conntrack_confirm and drop the packet. */
+static void
+get_unique_tuple(struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack_tuple *orig_tuple,
+ const struct ip_nat_range *range,
+ struct ip_conntrack *conntrack,
+ enum ip_nat_manip_type maniptype)
+{
+ struct ip_nat_protocol *proto
+ = ip_nat_find_proto(orig_tuple->dst.protonum);
+
+ /* 1) If this srcip/proto/src-proto-part is currently mapped,
+ and that same mapping gives a unique tuple within the given
+ range, use that.
+
+ This is only required for source (ie. NAT/masq) mappings.
+ So far, we don't do local source mappings, so multiple
+ manips are not an issue. */
+ if (maniptype == IP_NAT_MANIP_SRC) {
+ if (find_appropriate_src(orig_tuple, tuple, range)) {
+ DEBUGP("get_unique_tuple: Found current src map\n");
+ if (!ip_nat_used_tuple(tuple, conntrack))
+ return;
+ }
+ }
+
+ /* 2) Select the least-used IP/proto combination in the given
+ range. */
+ *tuple = *orig_tuple;
+ find_best_ips_proto(tuple, range, conntrack, maniptype);
+
+ /* 3) The per-protocol part of the manip is made to map into
+ the range to make a unique tuple. */
+
+ /* Only bother mapping if it's not already in range and unique */
+ if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
+ || proto->in_range(tuple, maniptype, &range->min, &range->max))
+ && !ip_nat_used_tuple(tuple, conntrack))
+ return;
+
+ /* Last chance: get protocol to try to obtain unique tuple. */
+ proto->unique_tuple(tuple, range, maniptype, conntrack);
+}
+
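+/* Entry point used by the NAT targets (e.g. SNAT/DNAT/MASQUERADE via
+ * ip_nat_rule.c): picks a unique tuple within "range" for this manip
+ * type, alters the conntrack's reply tuple if anything changed, hashes
+ * the conntrack by source on first NAT setup and marks the manip done. */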
+unsigned int
+ip_nat_setup_info(struct ip_conntrack *conntrack,
+ const struct ip_nat_range *range,
+ unsigned int hooknum)
+{
+ struct ip_conntrack_tuple curr_tuple, new_tuple;
+ struct ip_nat_info *info = &conntrack->nat.info;
+ int have_to_hash = !(conntrack->status & IPS_NAT_DONE_MASK);
+ enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum);
+
+ IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
+ || hooknum == NF_IP_POST_ROUTING
+ || hooknum == NF_IP_LOCAL_IN
+ || hooknum == NF_IP_LOCAL_OUT);
+ BUG_ON(ip_nat_initialized(conntrack, maniptype));
+
+ /* What we've got will look like the inverse of the reply. Normally
+ this is what is in the conntrack, except for prior
+ manipulations (future optimization: if num_manips == 0,
+ orig_tp =
+ conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
+ invert_tuplepr(&curr_tuple,
+ &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+ get_unique_tuple(&new_tuple, &curr_tuple, range, conntrack, maniptype);
+
+ if (!ip_ct_tuple_equal(&new_tuple, &curr_tuple)) {
+ struct ip_conntrack_tuple reply;
+
+ /* Alter conntrack table so will recognize replies. */
+ invert_tuplepr(&reply, &new_tuple);
+ ip_conntrack_alter_reply(conntrack, &reply);
+
+ /* Non-atomic: we own this at the moment. */
+ if (maniptype == IP_NAT_MANIP_SRC)
+ conntrack->status |= IPS_SRC_NAT;
+ else
+ conntrack->status |= IPS_DST_NAT;
+ }
+
+ /* Place in source hash if this is the first time. */
+ if (have_to_hash) {
+ unsigned int srchash
+ = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple);
+ WRITE_LOCK(&ip_nat_lock);
+ list_add(&info->bysource, &bysource[srchash]);
+ WRITE_UNLOCK(&ip_nat_lock);
+ }
+
+ /* It's done. */
+ if (maniptype == IP_NAT_MANIP_DST)
+ set_bit(IPS_DST_NAT_DONE_BIT, &conntrack->status);
+ else
+ set_bit(IPS_SRC_NAT_DONE_BIT, &conntrack->status);
+
+ return NF_ACCEPT;
+}
+
+/* Returns true if succeeded. */
+static int
+manip_pkt(u_int16_t proto,
+ struct sk_buff **pskb,
+ unsigned int iphdroff,
+ const struct ip_conntrack_tuple *target,
+ enum ip_nat_manip_type maniptype)
+{
+ struct iphdr *iph;
+
+ (*pskb)->nfcache |= NFC_ALTERED;
+ if (!skb_ip_make_writable(pskb, iphdroff + sizeof(*iph)))
+ return 0;
+
+ iph = (void *)(*pskb)->data + iphdroff;
+
+ /* Manipulate protocol part. */
+ if (!ip_nat_find_proto(proto)->manip_pkt(pskb, iphdroff,
+ target, maniptype))
+ return 0;
+
+ iph = (void *)(*pskb)->data + iphdroff;
+
+ if (maniptype == IP_NAT_MANIP_SRC) {
+ iph->check = ip_nat_cheat_check(~iph->saddr, target->src.ip,
+ iph->check);
+ iph->saddr = target->src.ip;
+ } else {
+ iph->check = ip_nat_cheat_check(~iph->daddr, target->dst.ip,
+ iph->check);
+ iph->daddr = target->dst.ip;
+ }
+ return 1;
+}
+
+/* Do packet manipulations according to ip_nat_setup_info. */
+unsigned int nat_packet(struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int hooknum,
+ struct sk_buff **pskb)
+{
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ unsigned long statusbit;
+ enum ip_nat_manip_type mtype = HOOK2MANIP(hooknum);
+
+ if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status)
+ && (hooknum == NF_IP_POST_ROUTING || hooknum == NF_IP_LOCAL_IN)) {
+ DEBUGP("ip_nat_core: adjusting sequence number\n");
+ /* future: put this in an l4-proto-specific function,
+ * and call this function here. */
+ if (!ip_nat_seq_adjust(pskb, ct, ctinfo))
+ return NF_DROP;
+ }
+
+ if (mtype == IP_NAT_MANIP_SRC)
+ statusbit = IPS_SRC_NAT;
+ else
+ statusbit = IPS_DST_NAT;
+
+ /* Invert if this is reply dir. */
+ if (dir == IP_CT_DIR_REPLY)
+ statusbit ^= IPS_NAT_MASK;
+
+ /* Non-atomic: these bits don't change. */
+ if (ct->status & statusbit) {
+ struct ip_conntrack_tuple target;
+
+ /* We are aiming to look like the inverse of the other direction. */
+ invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
+
+ if (!manip_pkt(target.dst.protonum, pskb, 0, &target, mtype))
+ return NF_DROP;
+ }
+ return NF_ACCEPT;
+}
+
+/* Dir is direction ICMP is coming from (opposite to packet it contains) */
+int icmp_reply_translation(struct sk_buff **pskb,
+ struct ip_conntrack *ct,
+ enum ip_nat_manip_type manip,
+ enum ip_conntrack_dir dir)
+{
+ struct {
+ struct icmphdr icmp;
+ struct iphdr ip;
+ } *inside;
+ struct ip_conntrack_tuple inner, target;
+ int hdrlen = (*pskb)->nh.iph->ihl * 4;
+
+ if (!skb_ip_make_writable(pskb, hdrlen + sizeof(*inside)))
+ return 0;
+
+ inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
+
+ /* We're actually going to mangle it beyond trivial checksum
+ adjustment, so make sure the current checksum is correct. */
+ if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY) {
+ hdrlen = (*pskb)->nh.iph->ihl * 4;
+ if ((u16)csum_fold(skb_checksum(*pskb, hdrlen,
+ (*pskb)->len - hdrlen, 0)))
+ return 0;
+ }
+
+ /* Must be RELATED */
+ IP_NF_ASSERT((*pskb)->nfctinfo == IP_CT_RELATED ||
+ (*pskb)->nfctinfo == IP_CT_RELATED+IP_CT_IS_REPLY);
+
+ /* Redirects on non-null nats must be dropped, else they'll
+ start talking to each other without our translation, and be
+ confused... --RR */
+ if (inside->icmp.type == ICMP_REDIRECT) {
+ /* If NAT isn't finished, assume it and drop. */
+ if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
+ return 0;
+
+ if (ct->status & IPS_NAT_MASK)
+ return 0;
+ }
+
+ DEBUGP("icmp_reply_translation: translating error %p manp %u dir %s\n",
+ *pskb, manip, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
+
+ if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 +
+ sizeof(struct icmphdr) + inside->ip.ihl*4,
+ &inner, ip_ct_find_proto(inside->ip.protocol)))
+ return 0;
+
+ /* Change inner back to look like incoming packet. We do the
+ opposite manip on this hook to normal, because it might not
+ pass all hooks (locally-generated ICMP). Consider incoming
+ packet: PREROUTING (DST manip), routing produces ICMP, goes
+ through POSTROUTING (which must correct the DST manip). */
+ if (!manip_pkt(inside->ip.protocol, pskb,
+ (*pskb)->nh.iph->ihl*4
+ + sizeof(inside->icmp),
+ &ct->tuplehash[!dir].tuple,
+ !manip))
+ return 0;
+
+ /* Reload "inside": manip_pkt on the inner header may have reallocated the skb data. */
+ inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
+ inside->icmp.checksum = 0;
+ inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen,
+ (*pskb)->len - hdrlen,
+ 0));
+
+ /* Change outer to look like the reply to an incoming packet
+ * (proto 0 means don't invert per-proto part). */
+
+ /* Obviously, we need to NAT destination IP, but source IP
+ should be NAT'ed only if it is from a NAT'd host.
+
+ Explanation: some people use NAT for anonymizing. Also,
+ CERT recommends dropping all packets from private IP
+ addresses (although ICMP errors from internal links with
+ such addresses are not too uncommon, as Alan Cox points
+ out) */
+ if (manip != IP_NAT_MANIP_SRC
+ || ((*pskb)->nh.iph->saddr == ct->tuplehash[dir].tuple.src.ip)) {
+ invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
+ if (!manip_pkt(0, pskb, 0, &target, manip))
+ return 0;
+ }
+
+ return 1;
+}
+
+/* Protocol registration. */
+int ip_nat_protocol_register(struct ip_nat_protocol *proto)
+{
+ int ret = 0;
+
+ WRITE_LOCK(&ip_nat_lock);
+ if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) {
+ ret = -EBUSY;
+ goto out;
+ }
+ ip_nat_protos[proto->protonum] = proto;
+ out:
+ WRITE_UNLOCK(&ip_nat_lock);
+ return ret;
+}
+
+/* No one stores the protocol anywhere; simply delete it. */
+void ip_nat_protocol_unregister(struct ip_nat_protocol *proto)
+{
+ WRITE_LOCK(&ip_nat_lock);
+ ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol;
+ WRITE_UNLOCK(&ip_nat_lock);
+
+ /* Someone could still be looking at the proto in a bh. */
+ synchronize_net();
+}
+
+int __init ip_nat_init(void)
+{
+ size_t i;
+
+ /* Leave them the same for the moment. */
+ ip_nat_htable_size = ip_conntrack_htable_size;
+
+ /* One vmalloc for both hash tables */
+ bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size);
+ if (!bysource)
+ return -ENOMEM;
+
+ /* Sew in builtin protocols. */
+ WRITE_LOCK(&ip_nat_lock);
+ for (i = 0; i < MAX_IP_NAT_PROTO; i++)
+ ip_nat_protos[i] = &ip_nat_unknown_protocol;
+ ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp;
+ ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp;
+ ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp;
+ WRITE_UNLOCK(&ip_nat_lock);
+
+ for (i = 0; i < ip_nat_htable_size; i++) {
+ INIT_LIST_HEAD(&bysource[i]);
+ }
+
+ /* FIXME: Man, this is a hack. <SIGH> */
+ IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
+ ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;
+
+ /* Initialize fake conntrack so that NAT will skip it */
+ ip_conntrack_untracked.status |= IPS_NAT_DONE_MASK;
+ return 0;
+}
+
+/* Clear NAT section of all conntracks, in case we're loaded again. */
+static int clean_nat(struct ip_conntrack *i, void *data)
+{
+ memset(&i->nat, 0, sizeof(i->nat));
+ i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST);
+ return 0;
+}
+
+/* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */
+void ip_nat_cleanup(void)
+{
+ ip_ct_iterate_cleanup(&clean_nat, NULL);
+ ip_conntrack_destroyed = NULL;
+ vfree(bysource);
+}
diff --git a/net/ipv4/netfilter/ip_nat_ftp.c b/net/ipv4/netfilter/ip_nat_ftp.c
new file mode 100644
index 00000000000..c6000e794ad
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_ftp.c
@@ -0,0 +1,183 @@
+/* FTP extension for TCP NAT alteration. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/moduleparam.h>
+#include <net/tcp.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ip_conntrack_ftp.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
+MODULE_DESCRIPTION("ftp NAT helper");
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/* FIXME: Time out? --RR */
+
+static int
+mangle_rfc959_packet(struct sk_buff **pskb,
+ u_int32_t newip,
+ u_int16_t port,
+ unsigned int matchoff,
+ unsigned int matchlen,
+ struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo,
+ u32 *seq)
+{
+ char buffer[sizeof("nnn,nnn,nnn,nnn,nnn,nnn")];
+
+ sprintf(buffer, "%u,%u,%u,%u,%u,%u",
+ NIPQUAD(newip), port>>8, port&0xFF);
+
+ DEBUGP("calling ip_nat_mangle_tcp_packet\n");
+
+ *seq += strlen(buffer) - matchlen;
+ return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff,
+ matchlen, buffer, strlen(buffer));
+}
+
+/* |1|132.235.1.2|6275| */
+static int
+mangle_eprt_packet(struct sk_buff **pskb,
+ u_int32_t newip,
+ u_int16_t port,
+ unsigned int matchoff,
+ unsigned int matchlen,
+ struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo,
+ u32 *seq)
+{
+ char buffer[sizeof("|1|255.255.255.255|65535|")];
+
+ sprintf(buffer, "|1|%u.%u.%u.%u|%u|", NIPQUAD(newip), port);
+
+ DEBUGP("calling ip_nat_mangle_tcp_packet\n");
+
+ *seq += strlen(buffer) - matchlen;
+ return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff,
+ matchlen, buffer, strlen(buffer));
+}
+
+/* |1|132.235.1.2|6275| */
+static int
+mangle_epsv_packet(struct sk_buff **pskb,
+ u_int32_t newip,
+ u_int16_t port,
+ unsigned int matchoff,
+ unsigned int matchlen,
+ struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo,
+ u32 *seq)
+{
+ char buffer[sizeof("|||65535|")];
+
+ sprintf(buffer, "|||%u|", port);
+
+ DEBUGP("calling ip_nat_mangle_tcp_packet\n");
+
+ *seq += strlen(buffer) - matchlen;
+ return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff,
+ matchlen, buffer, strlen(buffer));
+}
+
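+/* Dispatch table indexed by the ip_ct_ftp_type the conntrack helper
+ * detected: PORT and PASV use the classic h1,h2,h3,h4,p1,p2 form,
+ * while EPRT and EPSV use the RFC 2428 "|...|" forms mangled above. */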
+static int (*mangle[])(struct sk_buff **, u_int32_t, u_int16_t,
+ unsigned int,
+ unsigned int,
+ struct ip_conntrack *,
+ enum ip_conntrack_info,
+ u32 *seq)
+= { [IP_CT_FTP_PORT] = mangle_rfc959_packet,
+ [IP_CT_FTP_PASV] = mangle_rfc959_packet,
+ [IP_CT_FTP_EPRT] = mangle_eprt_packet,
+ [IP_CT_FTP_EPSV] = mangle_epsv_packet
+};
+
+/* So, this packet has hit the connection tracking matching code.
+ Mangle it, and change the expectation to match the new version. */
+static unsigned int ip_nat_ftp(struct sk_buff **pskb,
+ enum ip_conntrack_info ctinfo,
+ enum ip_ct_ftp_type type,
+ unsigned int matchoff,
+ unsigned int matchlen,
+ struct ip_conntrack_expect *exp,
+ u32 *seq)
+{
+ u_int32_t newip;
+ u_int16_t port;
+ int dir = CTINFO2DIR(ctinfo);
+ struct ip_conntrack *ct = exp->master;
+
+ DEBUGP("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen);
+
+ /* Connection will come from wherever this packet goes, hence !dir */
+ newip = ct->tuplehash[!dir].tuple.dst.ip;
+ exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+ exp->dir = !dir;
+
+ /* When you see the packet, we need to NAT it the same as
+ * this one. */
+ exp->expectfn = ip_nat_follow_master;
+
+ /* Try to get same port: if not, try to change it. */
+ for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+ exp->tuple.dst.u.tcp.port = htons(port);
+ if (ip_conntrack_expect_related(exp) == 0)
+ break;
+ }
+
+ if (port == 0) {
+ ip_conntrack_expect_free(exp);
+ return NF_DROP;
+ }
+
+ if (!mangle[type](pskb, newip, port, matchoff, matchlen, ct, ctinfo,
+ seq)) {
+ ip_conntrack_unexpect_related(exp);
+ return NF_DROP;
+ }
+ return NF_ACCEPT;
+}
+
+static void __exit fini(void)
+{
+ ip_nat_ftp_hook = NULL;
+ /* Make sure no one calls it, meanwhile. */
+ synchronize_net();
+}
+
+static int __init init(void)
+{
+ BUG_ON(ip_nat_ftp_hook);
+ ip_nat_ftp_hook = ip_nat_ftp;
+ return 0;
+}
+
+/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */
+static int warn_set(const char *val, struct kernel_param *kp)
+{
+ printk(KERN_INFO __stringify(KBUILD_MODNAME)
+ ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
+ return 0;
+}
+module_param_call(ports, warn_set, NULL, NULL, 0);
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c
new file mode 100644
index 00000000000..1637b96d8c0
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_helper.c
@@ -0,0 +1,430 @@
+/* ip_nat_helper.c - generic support functions for NAT helpers
+ *
+ * (C) 2000-2002 Harald Welte <laforge@netfilter.org>
+ * (C) 2003-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * 14 Jan 2002 Harald Welte <laforge@gnumonks.org>:
+ * - add support for SACK adjustment
+ * 14 Mar 2002 Harald Welte <laforge@gnumonks.org>:
+ * - merge SACK support into newnat API
+ * 16 Aug 2002 Brian J. Murrell <netfilter@interlinx.bc.ca>:
+ * - make ip_nat_resize_packet more generic (TCP and UDP)
+ * - add ip_nat_mangle_udp_packet
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/checksum.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+
+#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
+#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
+
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+#include <linux/netfilter_ipv4/ip_nat_core.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+#if 0
+#define DEBUGP printk
+#define DUMP_OFFSET(x) printk("offset_before=%d, offset_after=%d, correction_pos=%u\n", x->offset_before, x->offset_after, x->correction_pos);
+#else
+#define DEBUGP(format, args...)
+#define DUMP_OFFSET(x)
+#endif
+
+static DECLARE_LOCK(ip_nat_seqofs_lock);
+
+/* Setup TCP sequence correction given this change at this sequence */
+static inline void
+adjust_tcp_sequence(u32 seq,
+ int sizediff,
+ struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ int dir;
+ struct ip_nat_seq *this_way, *other_way;
+
+ DEBUGP("adjust_tcp_sequence: seq = %u, sizediff = %d\n",
+ seq, sizediff);
+
+ dir = CTINFO2DIR(ctinfo);
+
+ this_way = &ct->nat.info.seq[dir];
+ other_way = &ct->nat.info.seq[!dir];
+
+ DEBUGP("adjust_tcp_sequence: Seq_offset before: ");
+ DUMP_OFFSET(this_way);
+
+ LOCK_BH(&ip_nat_seqofs_lock);
+
+ /* SYN adjust. If it's uninitialized, or this is after last
+ * correction, record it: we don't handle more than one
+ * adjustment in the window, but do deal with the common case of a
+ * retransmit */
+ if (this_way->offset_before == this_way->offset_after
+ || before(this_way->correction_pos, seq)) {
+ this_way->correction_pos = seq;
+ this_way->offset_before = this_way->offset_after;
+ this_way->offset_after += sizediff;
+ }
+ UNLOCK_BH(&ip_nat_seqofs_lock);
+
+ DEBUGP("adjust_tcp_sequence: Seq_offset after: ");
+ DUMP_OFFSET(this_way);
+}
+
+/* Frobs data inside this packet, which is linear. */
+static void mangle_contents(struct sk_buff *skb,
+ unsigned int dataoff,
+ unsigned int match_offset,
+ unsigned int match_len,
+ const char *rep_buffer,
+ unsigned int rep_len)
+{
+ unsigned char *data;
+
+ BUG_ON(skb_is_nonlinear(skb));
+ data = (unsigned char *)skb->nh.iph + dataoff;
+
+ /* move post-replacement */
+ memmove(data + match_offset + rep_len,
+ data + match_offset + match_len,
+ skb->tail - (data + match_offset + match_len));
+
+ /* insert data from buffer */
+ memcpy(data + match_offset, rep_buffer, rep_len);
+
+ /* update skb info */
+ if (rep_len > match_len) {
+ DEBUGP("ip_nat_mangle_packet: Extending packet by "
+ "%u from %u bytes\n", rep_len - match_len,
+ skb->len);
+ skb_put(skb, rep_len - match_len);
+ } else {
+ DEBUGP("ip_nat_mangle_packet: Shrinking packet by "
+ "%u from %u bytes\n", match_len - rep_len,
+ skb->len);
+ __skb_trim(skb, skb->len + rep_len - match_len);
+ }
+
+ /* fix IP hdr checksum information */
+ skb->nh.iph->tot_len = htons(skb->len);
+ ip_send_check(skb->nh.iph);
+}
+
+/* Unusual, but possible case. */
+static int enlarge_skb(struct sk_buff **pskb, unsigned int extra)
+{
+ struct sk_buff *nskb;
+
+ if ((*pskb)->len + extra > 65535)
+ return 0;
+
+ nskb = skb_copy_expand(*pskb, skb_headroom(*pskb), extra, GFP_ATOMIC);
+ if (!nskb)
+ return 0;
+
+ /* Transfer socket to new skb. */
+ if ((*pskb)->sk)
+ skb_set_owner_w(nskb, (*pskb)->sk);
+#ifdef CONFIG_NETFILTER_DEBUG
+ nskb->nf_debug = (*pskb)->nf_debug;
+#endif
+ kfree_skb(*pskb);
+ *pskb = nskb;
+ return 1;
+}
+
+/* Generic function for mangling variable-length address changes inside
+ * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
+ * command in FTP).
+ *
+ * Takes care of all the nasty sequence number changes, checksumming,
+ * skb enlargement, ...
+ *
+ * */
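+/* Typical callers are the NAT ALG helpers (e.g. the FTP helper earlier
+ * in this patch): they pass the offset and length of the matched
+ * address text within the TCP payload plus a replacement string such
+ * as "h1,h2,h3,h4,p1,p2". */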
+int
+ip_nat_mangle_tcp_packet(struct sk_buff **pskb,
+ struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int match_offset,
+ unsigned int match_len,
+ const char *rep_buffer,
+ unsigned int rep_len)
+{
+ struct iphdr *iph;
+ struct tcphdr *tcph;
+ int datalen;
+
+ if (!skb_ip_make_writable(pskb, (*pskb)->len))
+ return 0;
+
+ if (rep_len > match_len
+ && rep_len - match_len > skb_tailroom(*pskb)
+ && !enlarge_skb(pskb, rep_len - match_len))
+ return 0;
+
+ SKB_LINEAR_ASSERT(*pskb);
+
+ iph = (*pskb)->nh.iph;
+ tcph = (void *)iph + iph->ihl*4;
+
+ mangle_contents(*pskb, iph->ihl*4 + tcph->doff*4,
+ match_offset, match_len, rep_buffer, rep_len);
+
+ datalen = (*pskb)->len - iph->ihl*4;
+ tcph->check = 0;
+ tcph->check = tcp_v4_check(tcph, datalen, iph->saddr, iph->daddr,
+ csum_partial((char *)tcph, datalen, 0));
+
+ if (rep_len != match_len) {
+ set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
+ adjust_tcp_sequence(ntohl(tcph->seq),
+ (int)rep_len - (int)match_len,
+ ct, ctinfo);
+ /* Tell TCP window tracking about seq change */
+ ip_conntrack_tcp_update(*pskb, ct, CTINFO2DIR(ctinfo));
+ }
+ return 1;
+}
+
+/* Generic function for mangling variable-length address changes inside
+ * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX
+ * command in the Amanda protocol)
+ *
+ * Takes care of all the nasty sequence number changes, checksumming,
+ * skb enlargement, ...
+ *
+ * XXX - This function could be merged with ip_nat_mangle_tcp_packet which
+ * should be fairly easy to do.
+ */
+int
+ip_nat_mangle_udp_packet(struct sk_buff **pskb,
+ struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int match_offset,
+ unsigned int match_len,
+ const char *rep_buffer,
+ unsigned int rep_len)
+{
+ struct iphdr *iph;
+ struct udphdr *udph;
+
+ /* UDP helpers might accidentally mangle the wrong packet */
+ iph = (*pskb)->nh.iph;
+ if ((*pskb)->len < iph->ihl*4 + sizeof(*udph) +
+ match_offset + match_len)
+ return 0;
+
+ if (!skb_ip_make_writable(pskb, (*pskb)->len))
+ return 0;
+
+ if (rep_len > match_len
+ && rep_len - match_len > skb_tailroom(*pskb)
+ && !enlarge_skb(pskb, rep_len - match_len))
+ return 0;
+
+ iph = (*pskb)->nh.iph;
+ udph = (void *)iph + iph->ihl*4;
+ mangle_contents(*pskb, iph->ihl*4 + sizeof(*udph),
+ match_offset, match_len, rep_buffer, rep_len);
+
+ /* update the length of the UDP packet */
+ udph->len = htons((*pskb)->len - iph->ihl*4);
+
+ /* fix udp checksum if udp checksum was previously calculated */
+ if (udph->check) {
+ int datalen = (*pskb)->len - iph->ihl * 4;
+ udph->check = 0;
+ udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
+ datalen, IPPROTO_UDP,
+ csum_partial((char *)udph,
+ datalen, 0));
+ }
+
+ return 1;
+}
+
+/* Adjust one found SACK option including checksum correction */
+static void
+sack_adjust(struct sk_buff *skb,
+ struct tcphdr *tcph,
+ unsigned int sackoff,
+ unsigned int sackend,
+ struct ip_nat_seq *natseq)
+{
+ while (sackoff < sackend) {
+ struct tcp_sack_block *sack;
+ u_int32_t new_start_seq, new_end_seq;
+
+ sack = (void *)skb->data + sackoff;
+ if (after(ntohl(sack->start_seq) - natseq->offset_before,
+ natseq->correction_pos))
+ new_start_seq = ntohl(sack->start_seq)
+ - natseq->offset_after;
+ else
+ new_start_seq = ntohl(sack->start_seq)
+ - natseq->offset_before;
+ new_start_seq = htonl(new_start_seq);
+
+ if (after(ntohl(sack->end_seq) - natseq->offset_before,
+ natseq->correction_pos))
+ new_end_seq = ntohl(sack->end_seq)
+ - natseq->offset_after;
+ else
+ new_end_seq = ntohl(sack->end_seq)
+ - natseq->offset_before;
+ new_end_seq = htonl(new_end_seq);
+
+ DEBUGP("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n",
+ ntohl(sack->start_seq), new_start_seq,
+ ntohl(sack->end_seq), new_end_seq);
+
+ tcph->check =
+ ip_nat_cheat_check(~sack->start_seq, new_start_seq,
+ ip_nat_cheat_check(~sack->end_seq,
+ new_end_seq,
+ tcph->check));
+ sack->start_seq = new_start_seq;
+ sack->end_seq = new_end_seq;
+ sackoff += sizeof(*sack);
+ }
+}
+
+/* TCP SACK sequence number adjustment */
+static inline unsigned int
+ip_nat_sack_adjust(struct sk_buff **pskb,
+ struct tcphdr *tcph,
+ struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ unsigned int dir, optoff, optend;
+
+ optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr);
+ optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4;
+
+ if (!skb_ip_make_writable(pskb, optend))
+ return 0;
+
+ dir = CTINFO2DIR(ctinfo);
+
+ while (optoff < optend) {
+ /* Usually: option, length. */
+ unsigned char *op = (*pskb)->data + optoff;
+
+ switch (op[0]) {
+ case TCPOPT_EOL:
+ return 1;
+ case TCPOPT_NOP:
+ optoff++;
+ continue;
+ default:
+ /* no partial options */
+ if (optoff + 1 == optend
+ || optoff + op[1] > optend
+ || op[1] < 2)
+ return 0;
+ if (op[0] == TCPOPT_SACK
+ && op[1] >= 2+TCPOLEN_SACK_PERBLOCK
+ && ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0)
+ sack_adjust(*pskb, tcph, optoff+2,
+ optoff+op[1],
+ &ct->nat.info.seq[!dir]);
+ optoff += op[1];
+ }
+ }
+ return 1;
+}
+
+/* TCP sequence number adjustment. Returns 1 on success, 0 on failure */
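+/* Sequence numbers at or before seq[dir].correction_pos are shifted by
+ * offset_before, anything after it by offset_after, so a retransmit of
+ * the segment that was resized still gets the old offset. */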
+int
+ip_nat_seq_adjust(struct sk_buff **pskb,
+ struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ struct tcphdr *tcph;
+ int dir, newseq, newack;
+ struct ip_nat_seq *this_way, *other_way;
+
+ dir = CTINFO2DIR(ctinfo);
+
+ this_way = &ct->nat.info.seq[dir];
+ other_way = &ct->nat.info.seq[!dir];
+
+ if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
+ return 0;
+
+ tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
+ if (after(ntohl(tcph->seq), this_way->correction_pos))
+ newseq = ntohl(tcph->seq) + this_way->offset_after;
+ else
+ newseq = ntohl(tcph->seq) + this_way->offset_before;
+ newseq = htonl(newseq);
+
+ if (after(ntohl(tcph->ack_seq) - other_way->offset_before,
+ other_way->correction_pos))
+ newack = ntohl(tcph->ack_seq) - other_way->offset_after;
+ else
+ newack = ntohl(tcph->ack_seq) - other_way->offset_before;
+ newack = htonl(newack);
+
+ tcph->check = ip_nat_cheat_check(~tcph->seq, newseq,
+ ip_nat_cheat_check(~tcph->ack_seq,
+ newack,
+ tcph->check));
+
+ DEBUGP("Adjusting sequence number from %u->%u, ack from %u->%u\n",
+ ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
+ ntohl(newack));
+
+ tcph->seq = newseq;
+ tcph->ack_seq = newack;
+
+ if (!ip_nat_sack_adjust(pskb, tcph, ct, ctinfo))
+ return 0;
+
+ ip_conntrack_tcp_update(*pskb, ct, dir);
+
+ return 1;
+}
+
+/* Setup NAT on this expected conntrack so it follows master. */
+/* If we fail to get a free NAT slot, we'll get dropped on confirm */
+void ip_nat_follow_master(struct ip_conntrack *ct,
+ struct ip_conntrack_expect *exp)
+{
+ struct ip_nat_range range;
+
+ /* This must be a fresh one. */
+ BUG_ON(ct->status & IPS_NAT_DONE_MASK);
+
+ /* Change src to where master sends to */
+ range.flags = IP_NAT_RANGE_MAP_IPS;
+ range.min_ip = range.max_ip
+ = ct->master->tuplehash[!exp->dir].tuple.dst.ip;
+ /* hook doesn't matter, but it has to do source manip */
+ ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING);
+
+ /* For DST manip, map port here to where it's expected. */
+ range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
+ range.min = range.max = exp->saved_proto;
+ range.min_ip = range.max_ip
+ = ct->master->tuplehash[!exp->dir].tuple.src.ip;
+ /* hook doesn't matter, but it has to do destination manip */
+ ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING);
+}
diff --git a/net/ipv4/netfilter/ip_nat_irc.c b/net/ipv4/netfilter/ip_nat_irc.c
new file mode 100644
index 00000000000..9c1ca3381d5
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_irc.c
@@ -0,0 +1,125 @@
+/* IRC extension for TCP NAT alteration.
+ * (C) 2000-2001 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2004 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
+ * based on a copy of RR's ip_nat_ftp.c
+ *
+ * ip_nat_irc.c,v 1.16 2001/12/06 07:42:10 laforge Exp
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/kernel.h>
+#include <net/tcp.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ip_conntrack_irc.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/moduleparam.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
+MODULE_DESCRIPTION("IRC (DCC) NAT helper");
+MODULE_LICENSE("GPL");
+
+static unsigned int help(struct sk_buff **pskb,
+ enum ip_conntrack_info ctinfo,
+ unsigned int matchoff,
+ unsigned int matchlen,
+ struct ip_conntrack_expect *exp)
+{
+ u_int16_t port;
+ unsigned int ret;
+
+ /* "4294967295 65535 " */
+ char buffer[18];
+
+ DEBUGP("IRC_NAT: match at offset %u, length %u\n",
+ matchoff, matchlen);
+
+ /* Reply comes from server. */
+ exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+ exp->dir = IP_CT_DIR_REPLY;
+
+ /* When you see the packet, we need to NAT it the same as
+ * this one. */
+ exp->expectfn = ip_nat_follow_master;
+
+ /* Try to get same port: if not, try to change it. */
+ for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+ exp->tuple.dst.u.tcp.port = htons(port);
+ if (ip_conntrack_expect_related(exp) == 0)
+ break;
+ }
+
+ if (port == 0) {
+ ip_conntrack_expect_free(exp);
+ return NF_DROP;
+ }
+
+ /* strlen("\1DCC CHAT chat AAAAAAAA P\1\n")=27
+ * strlen("\1DCC SCHAT chat AAAAAAAA P\1\n")=28
+ * strlen("\1DCC SEND F AAAAAAAA P S\1\n")=26
+ * strlen("\1DCC MOVE F AAAAAAAA P S\1\n")=26
+ * strlen("\1DCC TSEND F AAAAAAAA P S\1\n")=27
+ * AAAAAAAAA: bound addr (1.0.0.0==16777216, min 8 digits,
+ * 255.255.255.255==4294967295, 10 digits)
+ * P: bound port (min 1 d, max 5d (65535))
+ * F: filename (min 1 d )
+ * S: size (min 1 d )
+ * 0x01, \n: terminators
+ */
+
+ /* AAA = "us", ie. where server normally talks to. */
+ sprintf(buffer, "%u %u",
+ ntohl(exp->master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip),
+ port);
+ DEBUGP("ip_nat_irc: Inserting '%s' == %u.%u.%u.%u, port %u\n",
+ buffer, NIPQUAD(exp->tuple.src.ip), port);
+
+ ret = ip_nat_mangle_tcp_packet(pskb, exp->master, ctinfo,
+ matchoff, matchlen, buffer,
+ strlen(buffer));
+ if (ret != NF_ACCEPT)
+ ip_conntrack_unexpect_related(exp);
+ return ret;
+}
+
+static void __exit fini(void)
+{
+ ip_nat_irc_hook = NULL;
+ /* Make sure no one calls it, meanwhile. */
+ synchronize_net();
+}
+
+static int __init init(void)
+{
+ BUG_ON(ip_nat_irc_hook);
+ ip_nat_irc_hook = help;
+ return 0;
+}
+
+/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */
+static int warn_set(const char *val, struct kernel_param *kp)
+{
+ printk(KERN_INFO __stringify(KBUILD_MODNAME)
+ ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
+ return 0;
+}
+module_param_call(ports, warn_set, NULL, NULL, 0);
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c
new file mode 100644
index 00000000000..a558cf0eee8
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c
@@ -0,0 +1,115 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/netfilter.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/if.h>
+
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_core.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+
+static int
+icmp_in_range(const struct ip_conntrack_tuple *tuple,
+ enum ip_nat_manip_type maniptype,
+ const union ip_conntrack_manip_proto *min,
+ const union ip_conntrack_manip_proto *max)
+{
+ return (tuple->src.u.icmp.id >= min->icmp.id
+ && tuple->src.u.icmp.id <= max->icmp.id);
+}
+
+static int
+icmp_unique_tuple(struct ip_conntrack_tuple *tuple,
+ const struct ip_nat_range *range,
+ enum ip_nat_manip_type maniptype,
+ const struct ip_conntrack *conntrack)
+{
+ static u_int16_t id;
+ unsigned int range_size
+ = (unsigned int)range->max.icmp.id - range->min.icmp.id + 1;
+ unsigned int i;
+
+ /* If no range specified... */
+ if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED))
+ range_size = 0xFFFF;
+
+ for (i = 0; i < range_size; i++, id++) {
+ tuple->src.u.icmp.id = range->min.icmp.id + (id % range_size);
+ if (!ip_nat_used_tuple(tuple, conntrack))
+ return 1;
+ }
+ return 0;
+}
+
+static int
+icmp_manip_pkt(struct sk_buff **pskb,
+ unsigned int iphdroff,
+ const struct ip_conntrack_tuple *tuple,
+ enum ip_nat_manip_type maniptype)
+{
+ struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff);
+ struct icmphdr *hdr;
+ unsigned int hdroff = iphdroff + iph->ihl*4;
+
+ if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr)))
+ return 0;
+
+ hdr = (struct icmphdr *)((*pskb)->data + hdroff);
+
+ hdr->checksum = ip_nat_cheat_check(hdr->un.echo.id ^ 0xFFFF,
+ tuple->src.u.icmp.id,
+ hdr->checksum);
+ hdr->un.echo.id = tuple->src.u.icmp.id;
+ return 1;
+}
+
+static unsigned int
+icmp_print(char *buffer,
+ const struct ip_conntrack_tuple *match,
+ const struct ip_conntrack_tuple *mask)
+{
+ unsigned int len = 0;
+
+ if (mask->src.u.icmp.id)
+ len += sprintf(buffer + len, "id=%u ",
+ ntohs(match->src.u.icmp.id));
+
+ if (mask->dst.u.icmp.type)
+ len += sprintf(buffer + len, "type=%u ",
+ ntohs(match->dst.u.icmp.type));
+
+ if (mask->dst.u.icmp.code)
+ len += sprintf(buffer + len, "code=%u ",
+ ntohs(match->dst.u.icmp.code));
+
+ return len;
+}
+
+static unsigned int
+icmp_print_range(char *buffer, const struct ip_nat_range *range)
+{
+ if (range->min.icmp.id != 0 || range->max.icmp.id != 0xFFFF)
+ return sprintf(buffer, "id %u-%u ",
+ ntohs(range->min.icmp.id),
+ ntohs(range->max.icmp.id));
+ else return 0;
+}
+
+struct ip_nat_protocol ip_nat_protocol_icmp
+= { "ICMP", IPPROTO_ICMP,
+ icmp_manip_pkt,
+ icmp_in_range,
+ icmp_unique_tuple,
+ icmp_print,
+ icmp_print_range
+};
diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c
new file mode 100644
index 00000000000..a91cfceff27
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c
@@ -0,0 +1,178 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/netfilter.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/if.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+#include <linux/netfilter_ipv4/ip_nat_core.h>
+
+static int
+tcp_in_range(const struct ip_conntrack_tuple *tuple,
+ enum ip_nat_manip_type maniptype,
+ const union ip_conntrack_manip_proto *min,
+ const union ip_conntrack_manip_proto *max)
+{
+ u_int16_t port;
+
+ if (maniptype == IP_NAT_MANIP_SRC)
+ port = tuple->src.u.tcp.port;
+ else
+ port = tuple->dst.u.tcp.port;
+
+ return ntohs(port) >= ntohs(min->tcp.port)
+ && ntohs(port) <= ntohs(max->tcp.port);
+}
+
+static int
+tcp_unique_tuple(struct ip_conntrack_tuple *tuple,
+ const struct ip_nat_range *range,
+ enum ip_nat_manip_type maniptype,
+ const struct ip_conntrack *conntrack)
+{
+ static u_int16_t port, *portptr;
+ unsigned int range_size, min, i;
+
+ if (maniptype == IP_NAT_MANIP_SRC)
+ portptr = &tuple->src.u.tcp.port;
+ else
+ portptr = &tuple->dst.u.tcp.port;
+
+ /* If no range specified... */
+ if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
+ /* If it's dst rewrite, can't change port */
+ if (maniptype == IP_NAT_MANIP_DST)
+ return 0;
+
+ /* Map privileged onto privileged. */
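+ /* Source ports below 512 stay below 512, ports 512-1023 are
+ * remapped into 600-1023 (still privileged), and everything else
+ * lands in 1024-65535. */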
+ if (ntohs(*portptr) < 1024) {
+ /* Loose convention: >> 512 is credential passing */
+ if (ntohs(*portptr)<512) {
+ min = 1;
+ range_size = 511 - min + 1;
+ } else {
+ min = 600;
+ range_size = 1023 - min + 1;
+ }
+ } else {
+ min = 1024;
+ range_size = 65535 - 1024 + 1;
+ }
+ } else {
+ min = ntohs(range->min.tcp.port);
+ range_size = ntohs(range->max.tcp.port) - min + 1;
+ }
+
+ for (i = 0; i < range_size; i++, port++) {
+ *portptr = htons(min + port % range_size);
+ if (!ip_nat_used_tuple(tuple, conntrack)) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static int
+tcp_manip_pkt(struct sk_buff **pskb,
+ unsigned int iphdroff,
+ const struct ip_conntrack_tuple *tuple,
+ enum ip_nat_manip_type maniptype)
+{
+ struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff);
+ struct tcphdr *hdr;
+ unsigned int hdroff = iphdroff + iph->ihl*4;
+ u32 oldip, newip;
+ u16 *portptr, newport, oldport;
+ int hdrsize = 8; /* TCP connection tracking guarantees this much */
+
+ /* this could be an inner header returned in an icmp packet; in such
+ cases we cannot update the checksum field since it is outside
+ the 8 bytes of transport layer header we are guaranteed */
+ if ((*pskb)->len >= hdroff + sizeof(struct tcphdr))
+ hdrsize = sizeof(struct tcphdr);
+
+ if (!skb_ip_make_writable(pskb, hdroff + hdrsize))
+ return 0;
+
+ iph = (struct iphdr *)((*pskb)->data + iphdroff);
+ hdr = (struct tcphdr *)((*pskb)->data + hdroff);
+
+ if (maniptype == IP_NAT_MANIP_SRC) {
+ /* Get rid of src ip and src pt */
+ oldip = iph->saddr;
+ newip = tuple->src.ip;
+ newport = tuple->src.u.tcp.port;
+ portptr = &hdr->source;
+ } else {
+ /* Get rid of dst ip and dst pt */
+ oldip = iph->daddr;
+ newip = tuple->dst.ip;
+ newport = tuple->dst.u.tcp.port;
+ portptr = &hdr->dest;
+ }
+
+ oldport = *portptr;
+ *portptr = newport;
+
+ if (hdrsize < sizeof(*hdr))
+ return 1;
+
+ hdr->check = ip_nat_cheat_check(~oldip, newip,
+ ip_nat_cheat_check(oldport ^ 0xFFFF,
+ newport,
+ hdr->check));
+ return 1;
+}
+
+static unsigned int
+tcp_print(char *buffer,
+ const struct ip_conntrack_tuple *match,
+ const struct ip_conntrack_tuple *mask)
+{
+ unsigned int len = 0;
+
+ if (mask->src.u.tcp.port)
+ len += sprintf(buffer + len, "srcpt=%u ",
+ ntohs(match->src.u.tcp.port));
+
+
+ if (mask->dst.u.tcp.port)
+ len += sprintf(buffer + len, "dstpt=%u ",
+ ntohs(match->dst.u.tcp.port));
+
+ return len;
+}
+
+static unsigned int
+tcp_print_range(char *buffer, const struct ip_nat_range *range)
+{
+ if (range->min.tcp.port != 0 || range->max.tcp.port != 0xFFFF) {
+ if (range->min.tcp.port == range->max.tcp.port)
+ return sprintf(buffer, "port %u ",
+ ntohs(range->min.tcp.port));
+ else
+ return sprintf(buffer, "ports %u-%u ",
+ ntohs(range->min.tcp.port),
+ ntohs(range->max.tcp.port));
+ }
+ else return 0;
+}
+
+struct ip_nat_protocol ip_nat_protocol_tcp
+= { "TCP", IPPROTO_TCP,
+ tcp_manip_pkt,
+ tcp_in_range,
+ tcp_unique_tuple,
+ tcp_print,
+ tcp_print_range
+};
diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c
new file mode 100644
index 00000000000..c669e3b5f5d
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_proto_udp.c
@@ -0,0 +1,165 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/netfilter.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/if.h>
+
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_core.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+
+static int
+udp_in_range(const struct ip_conntrack_tuple *tuple,
+ enum ip_nat_manip_type maniptype,
+ const union ip_conntrack_manip_proto *min,
+ const union ip_conntrack_manip_proto *max)
+{
+ u_int16_t port;
+
+ if (maniptype == IP_NAT_MANIP_SRC)
+ port = tuple->src.u.udp.port;
+ else
+ port = tuple->dst.u.udp.port;
+
+ return ntohs(port) >= ntohs(min->udp.port)
+ && ntohs(port) <= ntohs(max->udp.port);
+}
+
+static int
+udp_unique_tuple(struct ip_conntrack_tuple *tuple,
+ const struct ip_nat_range *range,
+ enum ip_nat_manip_type maniptype,
+ const struct ip_conntrack *conntrack)
+{
+ static u_int16_t port, *portptr;
+ unsigned int range_size, min, i;
+
+ if (maniptype == IP_NAT_MANIP_SRC)
+ portptr = &tuple->src.u.udp.port;
+ else
+ portptr = &tuple->dst.u.udp.port;
+
+ /* If no range specified... */
+ if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
+ /* If it's dst rewrite, can't change port */
+ if (maniptype == IP_NAT_MANIP_DST)
+ return 0;
+
+ if (ntohs(*portptr) < 1024) {
+ /* Loose convention: >> 512 is credential passing */
+			if (ntohs(*portptr) < 512) {
+ min = 1;
+ range_size = 511 - min + 1;
+ } else {
+ min = 600;
+ range_size = 1023 - min + 1;
+ }
+ } else {
+ min = 1024;
+ range_size = 65535 - 1024 + 1;
+ }
+ } else {
+ min = ntohs(range->min.udp.port);
+ range_size = ntohs(range->max.udp.port) - min + 1;
+ }
+
+ for (i = 0; i < range_size; i++, port++) {
+ *portptr = htons(min + port % range_size);
+ if (!ip_nat_used_tuple(tuple, conntrack))
+ return 1;
+ }
+ return 0;
+}
+
+static int
+udp_manip_pkt(struct sk_buff **pskb,
+ unsigned int iphdroff,
+ const struct ip_conntrack_tuple *tuple,
+ enum ip_nat_manip_type maniptype)
+{
+ struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff);
+ struct udphdr *hdr;
+ unsigned int hdroff = iphdroff + iph->ihl*4;
+ u32 oldip, newip;
+ u16 *portptr, newport;
+
+ if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr)))
+ return 0;
+
+ iph = (struct iphdr *)((*pskb)->data + iphdroff);
+ hdr = (struct udphdr *)((*pskb)->data + hdroff);
+
+ if (maniptype == IP_NAT_MANIP_SRC) {
+ /* Get rid of src ip and src pt */
+ oldip = iph->saddr;
+ newip = tuple->src.ip;
+ newport = tuple->src.u.udp.port;
+ portptr = &hdr->source;
+ } else {
+ /* Get rid of dst ip and dst pt */
+ oldip = iph->daddr;
+ newip = tuple->dst.ip;
+ newport = tuple->dst.u.udp.port;
+ portptr = &hdr->dest;
+ }
+ if (hdr->check) /* 0 is a special case meaning no checksum */
+ hdr->check = ip_nat_cheat_check(~oldip, newip,
+ ip_nat_cheat_check(*portptr ^ 0xFFFF,
+ newport,
+ hdr->check));
+ *portptr = newport;
+ return 1;
+}
+
+static unsigned int
+udp_print(char *buffer,
+ const struct ip_conntrack_tuple *match,
+ const struct ip_conntrack_tuple *mask)
+{
+ unsigned int len = 0;
+
+ if (mask->src.u.udp.port)
+ len += sprintf(buffer + len, "srcpt=%u ",
+ ntohs(match->src.u.udp.port));
+
+
+ if (mask->dst.u.udp.port)
+ len += sprintf(buffer + len, "dstpt=%u ",
+ ntohs(match->dst.u.udp.port));
+
+ return len;
+}
+
+static unsigned int
+udp_print_range(char *buffer, const struct ip_nat_range *range)
+{
+ if (range->min.udp.port != 0 || range->max.udp.port != 0xFFFF) {
+ if (range->min.udp.port == range->max.udp.port)
+ return sprintf(buffer, "port %u ",
+ ntohs(range->min.udp.port));
+ else
+ return sprintf(buffer, "ports %u-%u ",
+ ntohs(range->min.udp.port),
+ ntohs(range->max.udp.port));
+ }
+ else return 0;
+}
+
+struct ip_nat_protocol ip_nat_protocol_udp
+= { "UDP", IPPROTO_UDP,
+ udp_manip_pkt,
+ udp_in_range,
+ udp_unique_tuple,
+ udp_print,
+ udp_print_range
+};
diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c
new file mode 100644
index 00000000000..f5525bd58d1
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c
@@ -0,0 +1,70 @@
+/* The "unknown" protocol. This is what is used for protocols we
+ * don't understand. It's returned by ip_ct_find_proto().
+ */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/netfilter.h>
+#include <linux/if.h>
+
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+
+static int unknown_in_range(const struct ip_conntrack_tuple *tuple,
+ enum ip_nat_manip_type manip_type,
+ const union ip_conntrack_manip_proto *min,
+ const union ip_conntrack_manip_proto *max)
+{
+ return 1;
+}
+
+static int unknown_unique_tuple(struct ip_conntrack_tuple *tuple,
+ const struct ip_nat_range *range,
+ enum ip_nat_manip_type maniptype,
+ const struct ip_conntrack *conntrack)
+{
+ /* Sorry: we can't help you; if it's not unique, we can't frob
+ anything. */
+ return 0;
+}
+
+static int
+unknown_manip_pkt(struct sk_buff **pskb,
+ unsigned int iphdroff,
+ const struct ip_conntrack_tuple *tuple,
+ enum ip_nat_manip_type maniptype)
+{
+ return 1;
+}
+
+static unsigned int
+unknown_print(char *buffer,
+ const struct ip_conntrack_tuple *match,
+ const struct ip_conntrack_tuple *mask)
+{
+ return 0;
+}
+
+static unsigned int
+unknown_print_range(char *buffer, const struct ip_nat_range *range)
+{
+ return 0;
+}
+
+struct ip_nat_protocol ip_nat_unknown_protocol = {
+ "unknown", 0,
+ unknown_manip_pkt,
+ unknown_in_range,
+ unknown_unique_tuple,
+ unknown_print,
+ unknown_print_range
+};
diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c
new file mode 100644
index 00000000000..581f097f5a2
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_rule.c
@@ -0,0 +1,319 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Everything about the rules for NAT. */
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <net/checksum.h>
+#include <net/route.h>
+#include <linux/bitops.h>
+
+#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
+#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_core.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+#define NAT_VALID_HOOKS ((1<<NF_IP_PRE_ROUTING) | (1<<NF_IP_POST_ROUTING) | (1<<NF_IP_LOCAL_OUT))
+
+static struct
+{
+ struct ipt_replace repl;
+ struct ipt_standard entries[3];
+ struct ipt_error term;
+} nat_initial_table __initdata
+= { { "nat", NAT_VALID_HOOKS, 4,
+ sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
+ { [NF_IP_PRE_ROUTING] = 0,
+ [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard),
+ [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 },
+ { [NF_IP_PRE_ROUTING] = 0,
+ [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard),
+ [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 },
+ 0, NULL, { } },
+ {
+ /* PRE_ROUTING */
+ { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+ 0,
+ sizeof(struct ipt_entry),
+ sizeof(struct ipt_standard),
+ 0, { 0, 0 }, { } },
+ { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
+ -NF_ACCEPT - 1 } },
+ /* POST_ROUTING */
+ { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+ 0,
+ sizeof(struct ipt_entry),
+ sizeof(struct ipt_standard),
+ 0, { 0, 0 }, { } },
+ { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
+ -NF_ACCEPT - 1 } },
+ /* LOCAL_OUT */
+ { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+ 0,
+ sizeof(struct ipt_entry),
+ sizeof(struct ipt_standard),
+ 0, { 0, 0 }, { } },
+ { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
+ -NF_ACCEPT - 1 } }
+ },
+ /* ERROR */
+ { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+ 0,
+ sizeof(struct ipt_entry),
+ sizeof(struct ipt_error),
+ 0, { 0, 0 }, { } },
+ { { { { IPT_ALIGN(sizeof(struct ipt_error_target)), IPT_ERROR_TARGET } },
+ { } },
+ "ERROR"
+ }
+ }
+};
+
+static struct ipt_table nat_table = {
+ .name = "nat",
+ .valid_hooks = NAT_VALID_HOOKS,
+ .lock = RW_LOCK_UNLOCKED,
+ .me = THIS_MODULE,
+};
+
+/* Source NAT */
+static unsigned int ipt_snat_target(struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ unsigned int hooknum,
+ const void *targinfo,
+ void *userinfo)
+{
+ struct ip_conntrack *ct;
+ enum ip_conntrack_info ctinfo;
+ const struct ip_nat_multi_range_compat *mr = targinfo;
+
+ IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING);
+
+ ct = ip_conntrack_get(*pskb, &ctinfo);
+
+ /* Connection must be valid and new. */
+ IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED
+ || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY));
+ IP_NF_ASSERT(out);
+
+ return ip_nat_setup_info(ct, &mr->range[0], hooknum);
+}
+
+/* Before 2.6.11 we did implicit source NAT if required. Warn about change. */
+static void warn_if_extra_mangle(u32 dstip, u32 srcip)
+{
+ static int warned = 0;
+ struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dstip } } };
+ struct rtable *rt;
+
+ if (ip_route_output_key(&rt, &fl) != 0)
+ return;
+
+ if (rt->rt_src != srcip && !warned) {
+ printk("NAT: no longer support implicit source local NAT\n");
+ printk("NAT: packet src %u.%u.%u.%u -> dst %u.%u.%u.%u\n",
+ NIPQUAD(srcip), NIPQUAD(dstip));
+ warned = 1;
+ }
+ ip_rt_put(rt);
+}
+
+static unsigned int ipt_dnat_target(struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ unsigned int hooknum,
+ const void *targinfo,
+ void *userinfo)
+{
+ struct ip_conntrack *ct;
+ enum ip_conntrack_info ctinfo;
+ const struct ip_nat_multi_range_compat *mr = targinfo;
+
+ IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
+ || hooknum == NF_IP_LOCAL_OUT);
+
+ ct = ip_conntrack_get(*pskb, &ctinfo);
+
+ /* Connection must be valid and new. */
+ IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
+
+ if (hooknum == NF_IP_LOCAL_OUT
+ && mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
+ warn_if_extra_mangle((*pskb)->nh.iph->daddr,
+ mr->range[0].min_ip);
+
+ return ip_nat_setup_info(ct, &mr->range[0], hooknum);
+}
+
+static int ipt_snat_checkentry(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ struct ip_nat_multi_range_compat *mr = targinfo;
+
+ /* Must be a valid range */
+ if (mr->rangesize != 1) {
+ printk("SNAT: multiple ranges no longer supported\n");
+ return 0;
+ }
+
+ if (targinfosize != IPT_ALIGN(sizeof(struct ip_nat_multi_range_compat))) {
+ DEBUGP("SNAT: Target size %u wrong for %u ranges\n",
+ targinfosize, mr->rangesize);
+ return 0;
+ }
+
+ /* Only allow these for NAT. */
+ if (strcmp(tablename, "nat") != 0) {
+ DEBUGP("SNAT: wrong table %s\n", tablename);
+ return 0;
+ }
+
+ if (hook_mask & ~(1 << NF_IP_POST_ROUTING)) {
+ DEBUGP("SNAT: hook mask 0x%x bad\n", hook_mask);
+ return 0;
+ }
+ return 1;
+}
+
+static int ipt_dnat_checkentry(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ struct ip_nat_multi_range_compat *mr = targinfo;
+
+ /* Must be a valid range */
+ if (mr->rangesize != 1) {
+ printk("DNAT: multiple ranges no longer supported\n");
+ return 0;
+ }
+
+ if (targinfosize != IPT_ALIGN(sizeof(struct ip_nat_multi_range_compat))) {
+ DEBUGP("DNAT: Target size %u wrong for %u ranges\n",
+ targinfosize, mr->rangesize);
+ return 0;
+ }
+
+ /* Only allow these for NAT. */
+ if (strcmp(tablename, "nat") != 0) {
+ DEBUGP("DNAT: wrong table %s\n", tablename);
+ return 0;
+ }
+
+ if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT))) {
+ DEBUGP("DNAT: hook mask 0x%x bad\n", hook_mask);
+ return 0;
+ }
+
+ return 1;
+}
+
+inline unsigned int
+alloc_null_binding(struct ip_conntrack *conntrack,
+ struct ip_nat_info *info,
+ unsigned int hooknum)
+{
+ /* Force range to this IP; let proto decide mapping for
+ per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
+ Use reply in case it's already been mangled (eg local packet).
+ */
+ u_int32_t ip
+ = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC
+ ? conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip
+ : conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip);
+ struct ip_nat_range range
+ = { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } };
+
+ DEBUGP("Allocating NULL binding for %p (%u.%u.%u.%u)\n", conntrack,
+ NIPQUAD(ip));
+ return ip_nat_setup_info(conntrack, &range, hooknum);
+}
+
+int ip_nat_rule_find(struct sk_buff **pskb,
+ unsigned int hooknum,
+ const struct net_device *in,
+ const struct net_device *out,
+ struct ip_conntrack *ct,
+ struct ip_nat_info *info)
+{
+ int ret;
+
+ ret = ipt_do_table(pskb, hooknum, in, out, &nat_table, NULL);
+
+ if (ret == NF_ACCEPT) {
+ if (!ip_nat_initialized(ct, HOOK2MANIP(hooknum)))
+			/* NULL mapping */
+ ret = alloc_null_binding(ct, info, hooknum);
+ }
+ return ret;
+}
+
+static struct ipt_target ipt_snat_reg = {
+ .name = "SNAT",
+ .target = ipt_snat_target,
+ .checkentry = ipt_snat_checkentry,
+};
+
+static struct ipt_target ipt_dnat_reg = {
+ .name = "DNAT",
+ .target = ipt_dnat_target,
+ .checkentry = ipt_dnat_checkentry,
+};
+
+int __init ip_nat_rule_init(void)
+{
+ int ret;
+
+ ret = ipt_register_table(&nat_table, &nat_initial_table.repl);
+ if (ret != 0)
+ return ret;
+ ret = ipt_register_target(&ipt_snat_reg);
+ if (ret != 0)
+ goto unregister_table;
+
+ ret = ipt_register_target(&ipt_dnat_reg);
+ if (ret != 0)
+ goto unregister_snat;
+
+ return ret;
+
+ unregister_snat:
+ ipt_unregister_target(&ipt_snat_reg);
+ unregister_table:
+ ipt_unregister_table(&nat_table);
+
+ return ret;
+}
+
+void ip_nat_rule_cleanup(void)
+{
+ ipt_unregister_target(&ipt_dnat_reg);
+ ipt_unregister_target(&ipt_snat_reg);
+ ipt_unregister_table(&nat_table);
+}
diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c
new file mode 100644
index 00000000000..2a48b6e635a
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c
@@ -0,0 +1,1347 @@
+/*
+ * ip_nat_snmp_basic.c
+ *
+ * Basic SNMP Application Layer Gateway
+ *
+ * This IP NAT module is intended for use with SNMP network
+ * discovery and monitoring applications where target networks use
+ * conflicting private address realms.
+ *
+ * Static NAT is used to remap the networks from the view of the network
+ * management system at the IP layer, and this module remaps some application
+ * layer addresses to match.
+ *
+ * The simplest form of ALG is performed, where only tagged IP addresses
+ * are modified. The module does not need to be MIB aware and only scans
+ * messages at the ASN.1/BER level.
+ *
+ * Currently, only SNMPv1 and SNMPv2 are supported.
+ *
+ * More information on ALG and associated issues can be found in
+ * RFC 2962
+ *
+ * The ASN.1/BER parsing code is derived from the gxsnmp package by Gregory
+ * McLean & Jochen Friedrich, stripped down for use in the kernel.
+ *
+ * Copyright (c) 2000 RP Internet (www.rpi.net.au).
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: James Morris <jmorris@intercode.com.au>
+ *
+ * Updates:
+ * 2000-08-06: Convert to new helper API (Harald Welte).
+ *
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+#include <net/udp.h>
+#include <asm/uaccess.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
+MODULE_DESCRIPTION("Basic SNMP Application Layer Gateway");
+
+#define SNMP_PORT 161
+#define SNMP_TRAP_PORT 162
+#define NOCT1(n) (u_int8_t)((n) & 0xff)
+
+static int debug;
+static DEFINE_SPINLOCK(snmp_lock);
+
+/*
+ * Application layer address mapping mimics the NAT mapping, but
+ * only for the first octet in this case (a more flexible system
+ * can be implemented if needed).
+ */
+struct oct1_map
+{
+ u_int8_t from;
+ u_int8_t to;
+};
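As the comment above notes, only the leading octet of an address is remapped. A hypothetical user-space sketch of that rule (the struct name and the main() harness here are illustrative, not part of the module):

    /* Hypothetical sketch: apply an oct1_map-style rewrite to the first
     * octet of an IPv4 address held as four bytes. */
    #include <stdio.h>
    #include <stdint.h>

    struct oct1_map_sketch { uint8_t from, to; };

    static void remap_first_octet(uint8_t addr[4], const struct oct1_map_sketch *m)
    {
            if (addr[0] == m->from)
                    addr[0] = m->to;        /* only the leading octet changes */
    }

    int main(void)
    {
            uint8_t a[4] = { 10, 1, 2, 3 };
            struct oct1_map_sketch m = { .from = 10, .to = 192 };

            remap_first_octet(a, &m);
            printf("%u.%u.%u.%u\n", a[0], a[1], a[2], a[3]);  /* 192.1.2.3 */
            return 0;
    }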
+
+
+/*****************************************************************************
+ *
+ * Basic ASN.1 decoding routines (gxsnmp author Dirk Wisse)
+ *
+ *****************************************************************************/
+
+/* Class */
+#define ASN1_UNI 0 /* Universal */
+#define ASN1_APL 1 /* Application */
+#define ASN1_CTX 2 /* Context */
+#define ASN1_PRV 3 /* Private */
+
+/* Tag */
+#define ASN1_EOC 0 /* End Of Contents */
+#define ASN1_BOL 1 /* Boolean */
+#define ASN1_INT 2 /* Integer */
+#define ASN1_BTS 3 /* Bit String */
+#define ASN1_OTS 4 /* Octet String */
+#define ASN1_NUL 5 /* Null */
+#define ASN1_OJI 6 /* Object Identifier */
+#define ASN1_OJD 7 /* Object Description */
+#define ASN1_EXT 8 /* External */
+#define ASN1_SEQ 16 /* Sequence */
+#define ASN1_SET 17 /* Set */
+#define ASN1_NUMSTR 18 /* Numerical String */
+#define ASN1_PRNSTR 19 /* Printable String */
+#define ASN1_TEXSTR 20 /* Teletext String */
+#define ASN1_VIDSTR 21 /* Video String */
+#define ASN1_IA5STR 22 /* IA5 String */
+#define ASN1_UNITIM 23 /* Universal Time */
+#define ASN1_GENTIM 24 /* General Time */
+#define ASN1_GRASTR 25 /* Graphical String */
+#define ASN1_VISSTR 26 /* Visible String */
+#define ASN1_GENSTR 27 /* General String */
+
+/* Primitive / Constructed methods*/
+#define ASN1_PRI 0 /* Primitive */
+#define ASN1_CON 1 /* Constructed */
+
+/*
+ * Error codes.
+ */
+#define ASN1_ERR_NOERROR 0
+#define ASN1_ERR_DEC_EMPTY 2
+#define ASN1_ERR_DEC_EOC_MISMATCH 3
+#define ASN1_ERR_DEC_LENGTH_MISMATCH 4
+#define ASN1_ERR_DEC_BADVALUE 5
+
+/*
+ * ASN.1 context.
+ */
+struct asn1_ctx
+{
+ int error; /* Error condition */
+ unsigned char *pointer; /* Octet just to be decoded */
+ unsigned char *begin; /* First octet */
+ unsigned char *end; /* Octet after last octet */
+};
+
+/*
+ * Octet string (not null terminated)
+ */
+struct asn1_octstr
+{
+ unsigned char *data;
+ unsigned int len;
+};
+
+static void asn1_open(struct asn1_ctx *ctx,
+ unsigned char *buf,
+ unsigned int len)
+{
+ ctx->begin = buf;
+ ctx->end = buf + len;
+ ctx->pointer = buf;
+ ctx->error = ASN1_ERR_NOERROR;
+}
+
+static unsigned char asn1_octet_decode(struct asn1_ctx *ctx, unsigned char *ch)
+{
+ if (ctx->pointer >= ctx->end) {
+ ctx->error = ASN1_ERR_DEC_EMPTY;
+ return 0;
+ }
+ *ch = *(ctx->pointer)++;
+ return 1;
+}
+
+static unsigned char asn1_tag_decode(struct asn1_ctx *ctx, unsigned int *tag)
+{
+ unsigned char ch;
+
+ *tag = 0;
+
+ do
+ {
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+ *tag <<= 7;
+ *tag |= ch & 0x7F;
+ } while ((ch & 0x80) == 0x80);
+ return 1;
+}
+
+static unsigned char asn1_id_decode(struct asn1_ctx *ctx,
+ unsigned int *cls,
+ unsigned int *con,
+ unsigned int *tag)
+{
+ unsigned char ch;
+
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ *cls = (ch & 0xC0) >> 6;
+ *con = (ch & 0x20) >> 5;
+ *tag = (ch & 0x1F);
+
+ if (*tag == 0x1F) {
+ if (!asn1_tag_decode(ctx, tag))
+ return 0;
+ }
+ return 1;
+}
+
+static unsigned char asn1_length_decode(struct asn1_ctx *ctx,
+ unsigned int *def,
+ unsigned int *len)
+{
+ unsigned char ch, cnt;
+
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ if (ch == 0x80)
+ *def = 0;
+ else {
+ *def = 1;
+
+ if (ch < 0x80)
+ *len = ch;
+ else {
+ cnt = (unsigned char) (ch & 0x7F);
+ *len = 0;
+
+ while (cnt > 0) {
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+ *len <<= 8;
+ *len |= ch;
+ cnt--;
+ }
+ }
+ }
+ return 1;
+}
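For reference, the length decoder above handles the three BER length forms: a first octet below 0x80 is a short-form length, 0x80 alone marks an indefinite length (terminated later by an end-of-contents marker), and 0x80|n means n length octets follow. A hedged standalone rendering of the same rules, fed an illustrative long-form input:

    #include <stdio.h>

    /* Hedged standalone rendering of the BER length rules; not the kernel
     * routine itself. Returns 0 on truncated input. */
    static int ber_length(const unsigned char *p, unsigned int avail,
                          unsigned int *definite, unsigned int *len)
    {
            unsigned int i, cnt;

            if (avail < 1)
                    return 0;
            if (p[0] == 0x80) {
                    *definite = 0;          /* indefinite form, no length */
                    return 1;
            }
            *definite = 1;
            if (p[0] < 0x80) {
                    *len = p[0];            /* short form */
                    return 1;
            }
            cnt = p[0] & 0x7F;              /* long form: cnt length octets follow */
            if (avail < 1 + cnt)
                    return 0;
            for (*len = 0, i = 1; i <= cnt; i++)
                    *len = (*len << 8) | p[i];
            return 1;
    }

    int main(void)
    {
            unsigned char longform[] = { 0x82, 0x01, 0x2C };
            unsigned int definite, len;

            if (ber_length(longform, sizeof(longform), &definite, &len))
                    printf("definite=%u len=%u\n", definite, len);   /* len=300 */
            return 0;
    }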
+
+static unsigned char asn1_header_decode(struct asn1_ctx *ctx,
+ unsigned char **eoc,
+ unsigned int *cls,
+ unsigned int *con,
+ unsigned int *tag)
+{
+ unsigned int def, len;
+
+ if (!asn1_id_decode(ctx, cls, con, tag))
+ return 0;
+
+ if (!asn1_length_decode(ctx, &def, &len))
+ return 0;
+
+ if (def)
+ *eoc = ctx->pointer + len;
+ else
+ *eoc = NULL;
+ return 1;
+}
+
+static unsigned char asn1_eoc_decode(struct asn1_ctx *ctx, unsigned char *eoc)
+{
+ unsigned char ch;
+
+	if (eoc == NULL) {
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ if (ch != 0x00) {
+ ctx->error = ASN1_ERR_DEC_EOC_MISMATCH;
+ return 0;
+ }
+
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ if (ch != 0x00) {
+ ctx->error = ASN1_ERR_DEC_EOC_MISMATCH;
+ return 0;
+ }
+ return 1;
+ } else {
+ if (ctx->pointer != eoc) {
+ ctx->error = ASN1_ERR_DEC_LENGTH_MISMATCH;
+ return 0;
+ }
+ return 1;
+ }
+}
+
+static unsigned char asn1_null_decode(struct asn1_ctx *ctx, unsigned char *eoc)
+{
+ ctx->pointer = eoc;
+ return 1;
+}
+
+static unsigned char asn1_long_decode(struct asn1_ctx *ctx,
+ unsigned char *eoc,
+ long *integer)
+{
+ unsigned char ch;
+ unsigned int len;
+
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ *integer = (signed char) ch;
+ len = 1;
+
+ while (ctx->pointer < eoc) {
+ if (++len > sizeof (long)) {
+ ctx->error = ASN1_ERR_DEC_BADVALUE;
+ return 0;
+ }
+
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ *integer <<= 8;
+ *integer |= ch;
+ }
+ return 1;
+}
+
+static unsigned char asn1_uint_decode(struct asn1_ctx *ctx,
+ unsigned char *eoc,
+ unsigned int *integer)
+{
+ unsigned char ch;
+ unsigned int len;
+
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ *integer = ch;
+ if (ch == 0) len = 0;
+ else len = 1;
+
+ while (ctx->pointer < eoc) {
+ if (++len > sizeof (unsigned int)) {
+ ctx->error = ASN1_ERR_DEC_BADVALUE;
+ return 0;
+ }
+
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ *integer <<= 8;
+ *integer |= ch;
+ }
+ return 1;
+}
+
+static unsigned char asn1_ulong_decode(struct asn1_ctx *ctx,
+ unsigned char *eoc,
+ unsigned long *integer)
+{
+ unsigned char ch;
+ unsigned int len;
+
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ *integer = ch;
+ if (ch == 0) len = 0;
+ else len = 1;
+
+ while (ctx->pointer < eoc) {
+ if (++len > sizeof (unsigned long)) {
+ ctx->error = ASN1_ERR_DEC_BADVALUE;
+ return 0;
+ }
+
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ *integer <<= 8;
+ *integer |= ch;
+ }
+ return 1;
+}
+
+static unsigned char asn1_octets_decode(struct asn1_ctx *ctx,
+ unsigned char *eoc,
+ unsigned char **octets,
+ unsigned int *len)
+{
+ unsigned char *ptr;
+
+ *len = 0;
+
+ *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC);
+ if (*octets == NULL) {
+ if (net_ratelimit())
+ printk("OOM in bsalg (%d)\n", __LINE__);
+ return 0;
+ }
+
+ ptr = *octets;
+ while (ctx->pointer < eoc) {
+ if (!asn1_octet_decode(ctx, (unsigned char *)ptr++)) {
+ kfree(*octets);
+ *octets = NULL;
+ return 0;
+ }
+ (*len)++;
+ }
+ return 1;
+}
+
+static unsigned char asn1_subid_decode(struct asn1_ctx *ctx,
+ unsigned long *subid)
+{
+ unsigned char ch;
+
+ *subid = 0;
+
+ do {
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ *subid <<= 7;
+ *subid |= ch & 0x7F;
+ } while ((ch & 0x80) == 0x80);
+ return 1;
+}
+
+static unsigned char asn1_oid_decode(struct asn1_ctx *ctx,
+ unsigned char *eoc,
+ unsigned long **oid,
+ unsigned int *len)
+{
+ unsigned long subid;
+ unsigned int size;
+ unsigned long *optr;
+
+ size = eoc - ctx->pointer + 1;
+ *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC);
+ if (*oid == NULL) {
+ if (net_ratelimit())
+ printk("OOM in bsalg (%d)\n", __LINE__);
+ return 0;
+ }
+
+ optr = *oid;
+
+ if (!asn1_subid_decode(ctx, &subid)) {
+ kfree(*oid);
+ *oid = NULL;
+ return 0;
+ }
+
+	if (subid < 40) {
+		optr[0] = 0;
+		optr[1] = subid;
+	} else if (subid < 80) {
+		optr[0] = 1;
+		optr[1] = subid - 40;
+	} else {
+		optr[0] = 2;
+		optr[1] = subid - 80;
+	}
+
+ *len = 2;
+ optr += 2;
+
+ while (ctx->pointer < eoc) {
+ if (++(*len) > size) {
+ ctx->error = ASN1_ERR_DEC_BADVALUE;
+ kfree(*oid);
+ *oid = NULL;
+ return 0;
+ }
+
+ if (!asn1_subid_decode(ctx, optr++)) {
+ kfree(*oid);
+ *oid = NULL;
+ return 0;
+ }
+ }
+ return 1;
+}
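The first decoded subidentifier packs the first two OID arcs as X*40+Y with X in {0, 1, 2}, which is what the three-way split above undoes. A small illustrative check of that rule (standalone, not kernel code):

    #include <stdio.h>

    /* Illustrative only: expand the packed first subidentifier 0x2B (43)
     * into the two leading OID arcs, giving the 1.3 of "1.3.6.1...". */
    int main(void)
    {
            unsigned long subid = 0x2B, first, second;

            if (subid < 40) {
                    first = 0; second = subid;
            } else if (subid < 80) {
                    first = 1; second = subid - 40;
            } else {
                    first = 2; second = subid - 80;
            }
            printf("%lu.%lu\n", first, second);     /* prints 1.3 */
            return 0;
    }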
+
+/*****************************************************************************
+ *
+ * SNMP decoding routines (gxsnmp author Dirk Wisse)
+ *
+ *****************************************************************************/
+
+/* SNMP Versions */
+#define SNMP_V1 0
+#define SNMP_V2C 1
+#define SNMP_V2 2
+#define SNMP_V3 3
+
+/* Default Sizes */
+#define SNMP_SIZE_COMM 256
+#define SNMP_SIZE_OBJECTID 128
+#define SNMP_SIZE_BUFCHR 256
+#define SNMP_SIZE_BUFINT 128
+#define SNMP_SIZE_SMALLOBJECTID 16
+
+/* Requests */
+#define SNMP_PDU_GET 0
+#define SNMP_PDU_NEXT 1
+#define SNMP_PDU_RESPONSE 2
+#define SNMP_PDU_SET 3
+#define SNMP_PDU_TRAP1 4
+#define SNMP_PDU_BULK 5
+#define SNMP_PDU_INFORM 6
+#define SNMP_PDU_TRAP2 7
+
+/* Errors */
+#define SNMP_NOERROR 0
+#define SNMP_TOOBIG 1
+#define SNMP_NOSUCHNAME 2
+#define SNMP_BADVALUE 3
+#define SNMP_READONLY 4
+#define SNMP_GENERROR 5
+#define SNMP_NOACCESS 6
+#define SNMP_WRONGTYPE 7
+#define SNMP_WRONGLENGTH 8
+#define SNMP_WRONGENCODING 9
+#define SNMP_WRONGVALUE 10
+#define SNMP_NOCREATION 11
+#define SNMP_INCONSISTENTVALUE 12
+#define SNMP_RESOURCEUNAVAILABLE 13
+#define SNMP_COMMITFAILED 14
+#define SNMP_UNDOFAILED 15
+#define SNMP_AUTHORIZATIONERROR 16
+#define SNMP_NOTWRITABLE 17
+#define SNMP_INCONSISTENTNAME 18
+
+/* General SNMP V1 Traps */
+#define SNMP_TRAP_COLDSTART 0
+#define SNMP_TRAP_WARMSTART 1
+#define SNMP_TRAP_LINKDOWN 2
+#define SNMP_TRAP_LINKUP 3
+#define SNMP_TRAP_AUTFAILURE 4
+#define SNMP_TRAP_EQPNEIGHBORLOSS 5
+#define SNMP_TRAP_ENTSPECIFIC 6
+
+/* SNMPv1 Types */
+#define SNMP_NULL 0
+#define SNMP_INTEGER 1 /* l */
+#define SNMP_OCTETSTR 2 /* c */
+#define SNMP_DISPLAYSTR 2 /* c */
+#define SNMP_OBJECTID 3 /* ul */
+#define SNMP_IPADDR 4 /* uc */
+#define SNMP_COUNTER 5 /* ul */
+#define SNMP_GAUGE 6 /* ul */
+#define SNMP_TIMETICKS 7 /* ul */
+#define SNMP_OPAQUE 8 /* c */
+
+/* Additional SNMPv2 Types */
+#define SNMP_UINTEGER 5 /* ul */
+#define SNMP_BITSTR 9 /* uc */
+#define SNMP_NSAP 10 /* uc */
+#define SNMP_COUNTER64 11 /* ul */
+#define SNMP_NOSUCHOBJECT 12
+#define SNMP_NOSUCHINSTANCE 13
+#define SNMP_ENDOFMIBVIEW 14
+
+union snmp_syntax
+{
+ unsigned char uc[0]; /* 8 bit unsigned */
+ char c[0]; /* 8 bit signed */
+ unsigned long ul[0]; /* 32 bit unsigned */
+ long l[0]; /* 32 bit signed */
+};
+
+struct snmp_object
+{
+ unsigned long *id;
+ unsigned int id_len;
+ unsigned short type;
+ unsigned int syntax_len;
+ union snmp_syntax syntax;
+};
+
+struct snmp_request
+{
+ unsigned long id;
+ unsigned int error_status;
+ unsigned int error_index;
+};
+
+struct snmp_v1_trap
+{
+ unsigned long *id;
+ unsigned int id_len;
+ unsigned long ip_address; /* pointer */
+ unsigned int general;
+ unsigned int specific;
+ unsigned long time;
+};
+
+/* SNMP types */
+#define SNMP_IPA 0
+#define SNMP_CNT 1
+#define SNMP_GGE 2
+#define SNMP_TIT 3
+#define SNMP_OPQ 4
+#define SNMP_C64 6
+
+/* SNMP errors */
+#define SERR_NSO 0
+#define SERR_NSI 1
+#define SERR_EOM 2
+
+static inline void mangle_address(unsigned char *begin,
+ unsigned char *addr,
+ const struct oct1_map *map,
+ u_int16_t *check);
+struct snmp_cnv
+{
+ unsigned int class;
+ unsigned int tag;
+ int syntax;
+};
+
+static struct snmp_cnv snmp_conv [] =
+{
+ {ASN1_UNI, ASN1_NUL, SNMP_NULL},
+ {ASN1_UNI, ASN1_INT, SNMP_INTEGER},
+ {ASN1_UNI, ASN1_OTS, SNMP_OCTETSTR},
+ {ASN1_UNI, ASN1_OTS, SNMP_DISPLAYSTR},
+ {ASN1_UNI, ASN1_OJI, SNMP_OBJECTID},
+ {ASN1_APL, SNMP_IPA, SNMP_IPADDR},
+ {ASN1_APL, SNMP_CNT, SNMP_COUNTER}, /* Counter32 */
+ {ASN1_APL, SNMP_GGE, SNMP_GAUGE}, /* Gauge32 == Unsigned32 */
+ {ASN1_APL, SNMP_TIT, SNMP_TIMETICKS},
+ {ASN1_APL, SNMP_OPQ, SNMP_OPAQUE},
+
+ /* SNMPv2 data types and errors */
+ {ASN1_UNI, ASN1_BTS, SNMP_BITSTR},
+ {ASN1_APL, SNMP_C64, SNMP_COUNTER64},
+ {ASN1_CTX, SERR_NSO, SNMP_NOSUCHOBJECT},
+ {ASN1_CTX, SERR_NSI, SNMP_NOSUCHINSTANCE},
+ {ASN1_CTX, SERR_EOM, SNMP_ENDOFMIBVIEW},
+ {0, 0, -1}
+};
+
+static unsigned char snmp_tag_cls2syntax(unsigned int tag,
+ unsigned int cls,
+ unsigned short *syntax)
+{
+ struct snmp_cnv *cnv;
+
+ cnv = snmp_conv;
+
+ while (cnv->syntax != -1) {
+ if (cnv->tag == tag && cnv->class == cls) {
+ *syntax = cnv->syntax;
+ return 1;
+ }
+ cnv++;
+ }
+ return 0;
+}
+
+static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
+ struct snmp_object **obj)
+{
+ unsigned int cls, con, tag, len, idlen;
+ unsigned short type;
+ unsigned char *eoc, *end, *p;
+ unsigned long *lp, *id;
+ unsigned long ul;
+ long l;
+
+ *obj = NULL;
+ id = NULL;
+
+ if (!asn1_header_decode(ctx, &eoc, &cls, &con, &tag))
+ return 0;
+
+ if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
+ return 0;
+
+ if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+ return 0;
+
+ if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI)
+ return 0;
+
+ if (!asn1_oid_decode(ctx, end, &id, &idlen))
+ return 0;
+
+ if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) {
+ kfree(id);
+ return 0;
+ }
+
+ if (con != ASN1_PRI) {
+ kfree(id);
+ return 0;
+ }
+
+ if (!snmp_tag_cls2syntax(tag, cls, &type)) {
+ kfree(id);
+ return 0;
+ }
+
+ switch (type) {
+ case SNMP_INTEGER:
+ len = sizeof(long);
+ if (!asn1_long_decode(ctx, end, &l)) {
+ kfree(id);
+ return 0;
+ }
+ *obj = kmalloc(sizeof(struct snmp_object) + len,
+ GFP_ATOMIC);
+ if (*obj == NULL) {
+ kfree(id);
+ if (net_ratelimit())
+ printk("OOM in bsalg (%d)\n", __LINE__);
+ return 0;
+ }
+ (*obj)->syntax.l[0] = l;
+ break;
+ case SNMP_OCTETSTR:
+ case SNMP_OPAQUE:
+ if (!asn1_octets_decode(ctx, end, &p, &len)) {
+ kfree(id);
+ return 0;
+ }
+ *obj = kmalloc(sizeof(struct snmp_object) + len,
+ GFP_ATOMIC);
+ if (*obj == NULL) {
+ kfree(id);
+ if (net_ratelimit())
+ printk("OOM in bsalg (%d)\n", __LINE__);
+ return 0;
+ }
+ memcpy((*obj)->syntax.c, p, len);
+ kfree(p);
+ break;
+ case SNMP_NULL:
+ case SNMP_NOSUCHOBJECT:
+ case SNMP_NOSUCHINSTANCE:
+ case SNMP_ENDOFMIBVIEW:
+ len = 0;
+ *obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC);
+ if (*obj == NULL) {
+ kfree(id);
+ if (net_ratelimit())
+ printk("OOM in bsalg (%d)\n", __LINE__);
+ return 0;
+ }
+ if (!asn1_null_decode(ctx, end)) {
+ kfree(id);
+ kfree(*obj);
+ *obj = NULL;
+ return 0;
+ }
+ break;
+ case SNMP_OBJECTID:
+ if (!asn1_oid_decode(ctx, end, (unsigned long **)&lp, &len)) {
+ kfree(id);
+ return 0;
+ }
+ len *= sizeof(unsigned long);
+ *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
+ if (*obj == NULL) {
+ kfree(id);
+ if (net_ratelimit())
+ printk("OOM in bsalg (%d)\n", __LINE__);
+ return 0;
+ }
+ memcpy((*obj)->syntax.ul, lp, len);
+ kfree(lp);
+ break;
+ case SNMP_IPADDR:
+ if (!asn1_octets_decode(ctx, end, &p, &len)) {
+ kfree(id);
+ return 0;
+ }
+ if (len != 4) {
+ kfree(p);
+ kfree(id);
+ return 0;
+ }
+ *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
+ if (*obj == NULL) {
+ kfree(p);
+ kfree(id);
+ if (net_ratelimit())
+ printk("OOM in bsalg (%d)\n", __LINE__);
+ return 0;
+ }
+ memcpy((*obj)->syntax.uc, p, len);
+ kfree(p);
+ break;
+ case SNMP_COUNTER:
+ case SNMP_GAUGE:
+ case SNMP_TIMETICKS:
+ len = sizeof(unsigned long);
+ if (!asn1_ulong_decode(ctx, end, &ul)) {
+ kfree(id);
+ return 0;
+ }
+ *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
+ if (*obj == NULL) {
+ kfree(id);
+ if (net_ratelimit())
+ printk("OOM in bsalg (%d)\n", __LINE__);
+ return 0;
+ }
+ (*obj)->syntax.ul[0] = ul;
+ break;
+ default:
+ kfree(id);
+ return 0;
+ }
+
+ (*obj)->syntax_len = len;
+ (*obj)->type = type;
+ (*obj)->id = id;
+ (*obj)->id_len = idlen;
+
+ if (!asn1_eoc_decode(ctx, eoc)) {
+ kfree(id);
+ kfree(*obj);
+ *obj = NULL;
+ return 0;
+ }
+ return 1;
+}
+
+static unsigned char snmp_request_decode(struct asn1_ctx *ctx,
+ struct snmp_request *request)
+{
+ unsigned int cls, con, tag;
+ unsigned char *end;
+
+ if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+ return 0;
+
+ if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
+ return 0;
+
+ if (!asn1_ulong_decode(ctx, end, &request->id))
+ return 0;
+
+ if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+ return 0;
+
+ if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
+ return 0;
+
+ if (!asn1_uint_decode(ctx, end, &request->error_status))
+ return 0;
+
+ if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+ return 0;
+
+ if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
+ return 0;
+
+ if (!asn1_uint_decode(ctx, end, &request->error_index))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * Fast checksum update for possibly oddly-aligned UDP byte, from the
+ * code example in the draft.
+ */
+static void fast_csum(unsigned char *csum,
+ const unsigned char *optr,
+ const unsigned char *nptr,
+ int odd)
+{
+ long x, old, new;
+
+ x = csum[0] * 256 + csum[1];
+
+	x = ~x & 0xFFFF;
+
+ if (odd) old = optr[0] * 256;
+ else old = optr[0];
+
+ x -= old & 0xFFFF;
+ if (x <= 0) {
+ x--;
+ x &= 0xFFFF;
+ }
+
+ if (odd) new = nptr[0] * 256;
+ else new = nptr[0];
+
+ x += new & 0xFFFF;
+ if (x & 0x10000) {
+ x++;
+ x &= 0xFFFF;
+ }
+
+	x = ~x & 0xFFFF;
+ csum[0] = x / 256;
+ csum[1] = x & 0xFF;
+}
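fast_csum() is, in effect, the per-octet form of the incremental one's-complement update HC' = ~(~HC + ~m + m') from RFC 1624. A hedged 16-bit user-space sketch of that update follows; the harness values are made up for illustration:

    #include <stdio.h>
    #include <stdint.h>

    /* Hedged sketch, not the kernel helper: RFC 1624 incremental update of
     * a 16-bit one's-complement checksum when one 16-bit field changes. */
    static uint16_t csum_update16(uint16_t check, uint16_t oldv, uint16_t newv)
    {
            uint32_t sum = (uint16_t)~check;

            sum += (uint16_t)~oldv;
            sum += newv;
            sum = (sum & 0xFFFF) + (sum >> 16);     /* fold carries */
            sum = (sum & 0xFFFF) + (sum >> 16);
            return (uint16_t)~sum;
    }

    int main(void)
    {
            /* Made-up values: a field changes from 0x0A01 to 0xC001. */
            printf("0x%04x\n", csum_update16(0x1234, 0x0A01, 0xC001));
            return 0;
    }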
+
+/*
+ * Mangle IP address.
+ * - begin points to the start of the snmp message
+ * - addr points to the start of the address
+ */
+static inline void mangle_address(unsigned char *begin,
+ unsigned char *addr,
+ const struct oct1_map *map,
+ u_int16_t *check)
+{
+ if (map->from == NOCT1(*addr)) {
+ u_int32_t old;
+
+ if (debug)
+ memcpy(&old, (unsigned char *)addr, sizeof(old));
+
+ *addr = map->to;
+
+ /* Update UDP checksum if being used */
+ if (*check) {
+ unsigned char odd = !((addr - begin) % 2);
+
+ fast_csum((unsigned char *)check,
+ &map->from, &map->to, odd);
+
+ }
+
+ if (debug)
+ printk(KERN_DEBUG "bsalg: mapped %u.%u.%u.%u to "
+ "%u.%u.%u.%u\n", NIPQUAD(old), NIPQUAD(*addr));
+ }
+}
+
+static unsigned char snmp_trap_decode(struct asn1_ctx *ctx,
+ struct snmp_v1_trap *trap,
+ const struct oct1_map *map,
+ u_int16_t *check)
+{
+ unsigned int cls, con, tag, len;
+ unsigned char *end;
+
+ if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+ return 0;
+
+ if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI)
+ return 0;
+
+ if (!asn1_oid_decode(ctx, end, &trap->id, &trap->id_len))
+ return 0;
+
+ if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+ goto err_id_free;
+
+ if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_IPA) ||
+ (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_OTS)))
+ goto err_id_free;
+
+ if (!asn1_octets_decode(ctx, end, (unsigned char **)&trap->ip_address, &len))
+ goto err_id_free;
+
+ /* IPv4 only */
+ if (len != 4)
+ goto err_addr_free;
+
+ mangle_address(ctx->begin, ctx->pointer - 4, map, check);
+
+ if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+ goto err_addr_free;
+
+ if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
+ goto err_addr_free;
+
+ if (!asn1_uint_decode(ctx, end, &trap->general))
+ goto err_addr_free;
+
+ if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+ goto err_addr_free;
+
+ if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
+ goto err_addr_free;
+
+ if (!asn1_uint_decode(ctx, end, &trap->specific))
+ goto err_addr_free;
+
+ if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+ goto err_addr_free;
+
+ if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_TIT) ||
+ (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_INT)))
+ goto err_addr_free;
+
+ if (!asn1_ulong_decode(ctx, end, &trap->time))
+ goto err_addr_free;
+
+ return 1;
+
+err_id_free:
+ kfree(trap->id);
+
+err_addr_free:
+ kfree((unsigned long *)trap->ip_address);
+
+ return 0;
+}
+
+/*****************************************************************************
+ *
+ * Misc. routines
+ *
+ *****************************************************************************/
+
+static void hex_dump(unsigned char *buf, size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len; i++) {
+ if (i && !(i % 16))
+ printk("\n");
+ printk("%02x ", *(buf + i));
+ }
+ printk("\n");
+}
+
+/*
+ * Parse and mangle SNMP message according to mapping.
+ * (And this is the fucking 'basic' method).
+ */
+static int snmp_parse_mangle(unsigned char *msg,
+ u_int16_t len,
+ const struct oct1_map *map,
+ u_int16_t *check)
+{
+ unsigned char *eoc, *end;
+ unsigned int cls, con, tag, vers, pdutype;
+ struct asn1_ctx ctx;
+ struct asn1_octstr comm;
+ struct snmp_object **obj;
+
+ if (debug > 1)
+ hex_dump(msg, len);
+
+ asn1_open(&ctx, msg, len);
+
+ /*
+ * Start of SNMP message.
+ */
+ if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag))
+ return 0;
+ if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
+ return 0;
+
+ /*
+ * Version 1 or 2 handled.
+ */
+ if (!asn1_header_decode(&ctx, &end, &cls, &con, &tag))
+ return 0;
+ if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
+ return 0;
+ if (!asn1_uint_decode (&ctx, end, &vers))
+ return 0;
+ if (debug > 1)
+ printk(KERN_DEBUG "bsalg: snmp version: %u\n", vers + 1);
+ if (vers > 1)
+ return 1;
+
+ /*
+ * Community.
+ */
+ if (!asn1_header_decode (&ctx, &end, &cls, &con, &tag))
+ return 0;
+ if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OTS)
+ return 0;
+ if (!asn1_octets_decode(&ctx, end, &comm.data, &comm.len))
+ return 0;
+ if (debug > 1) {
+ unsigned int i;
+
+ printk(KERN_DEBUG "bsalg: community: ");
+ for (i = 0; i < comm.len; i++)
+ printk("%c", comm.data[i]);
+ printk("\n");
+ }
+ kfree(comm.data);
+
+ /*
+ * PDU type
+ */
+ if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &pdutype))
+ return 0;
+ if (cls != ASN1_CTX || con != ASN1_CON)
+ return 0;
+ if (debug > 1) {
+ unsigned char *pdus[] = {
+ [SNMP_PDU_GET] = "get",
+ [SNMP_PDU_NEXT] = "get-next",
+ [SNMP_PDU_RESPONSE] = "response",
+ [SNMP_PDU_SET] = "set",
+ [SNMP_PDU_TRAP1] = "trapv1",
+ [SNMP_PDU_BULK] = "bulk",
+ [SNMP_PDU_INFORM] = "inform",
+ [SNMP_PDU_TRAP2] = "trapv2"
+ };
+
+ if (pdutype > SNMP_PDU_TRAP2)
+ printk(KERN_DEBUG "bsalg: bad pdu type %u\n", pdutype);
+ else
+ printk(KERN_DEBUG "bsalg: pdu: %s\n", pdus[pdutype]);
+ }
+ if (pdutype != SNMP_PDU_RESPONSE &&
+ pdutype != SNMP_PDU_TRAP1 && pdutype != SNMP_PDU_TRAP2)
+ return 1;
+
+ /*
+ * Request header or v1 trap
+ */
+ if (pdutype == SNMP_PDU_TRAP1) {
+ struct snmp_v1_trap trap;
+ unsigned char ret = snmp_trap_decode(&ctx, &trap, map, check);
+
+ /* Discard trap allocations regardless */
+ kfree(trap.id);
+ kfree((unsigned long *)trap.ip_address);
+
+ if (!ret)
+ return ret;
+
+ } else {
+ struct snmp_request req;
+
+ if (!snmp_request_decode(&ctx, &req))
+ return 0;
+
+ if (debug > 1)
+ printk(KERN_DEBUG "bsalg: request: id=0x%lx error_status=%u "
+ "error_index=%u\n", req.id, req.error_status,
+ req.error_index);
+ }
+
+ /*
+ * Loop through objects, look for IP addresses to mangle.
+ */
+ if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag))
+ return 0;
+
+ if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
+ return 0;
+
+ obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC);
+ if (obj == NULL) {
+ if (net_ratelimit())
+ printk(KERN_WARNING "OOM in bsalg(%d)\n", __LINE__);
+ return 0;
+ }
+
+ while (!asn1_eoc_decode(&ctx, eoc)) {
+ unsigned int i;
+
+ if (!snmp_object_decode(&ctx, obj)) {
+ if (*obj) {
+ if ((*obj)->id)
+ kfree((*obj)->id);
+ kfree(*obj);
+ }
+ kfree(obj);
+ return 0;
+ }
+
+ if (debug > 1) {
+ printk(KERN_DEBUG "bsalg: object: ");
+ for (i = 0; i < (*obj)->id_len; i++) {
+ if (i > 0)
+ printk(".");
+ printk("%lu", (*obj)->id[i]);
+ }
+ printk(": type=%u\n", (*obj)->type);
+
+ }
+
+ if ((*obj)->type == SNMP_IPADDR)
+			mangle_address(ctx.begin, ctx.pointer - 4, map, check);
+
+ kfree((*obj)->id);
+ kfree(*obj);
+ }
+ kfree(obj);
+
+ if (!asn1_eoc_decode(&ctx, eoc))
+ return 0;
+
+ return 1;
+}
+
+/*****************************************************************************
+ *
+ * NAT routines.
+ *
+ *****************************************************************************/
+
+/*
+ * SNMP translation routine.
+ */
+static int snmp_translate(struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo,
+ struct sk_buff **pskb)
+{
+ struct iphdr *iph = (*pskb)->nh.iph;
+ struct udphdr *udph = (struct udphdr *)((u_int32_t *)iph + iph->ihl);
+ u_int16_t udplen = ntohs(udph->len);
+ u_int16_t paylen = udplen - sizeof(struct udphdr);
+ int dir = CTINFO2DIR(ctinfo);
+ struct oct1_map map;
+
+ /*
+	 * Determine mapping for application layer addresses based
+ * on NAT manipulations for the packet.
+ */
+ if (dir == IP_CT_DIR_ORIGINAL) {
+ /* SNAT traps */
+ map.from = NOCT1(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip);
+ map.to = NOCT1(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip);
+ } else {
+ /* DNAT replies */
+ map.from = NOCT1(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip);
+ map.to = NOCT1(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip);
+ }
+
+ if (map.from == map.to)
+ return NF_ACCEPT;
+
+ if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr),
+ paylen, &map, &udph->check)) {
+ if (net_ratelimit())
+ printk(KERN_WARNING "bsalg: parser failed\n");
+ return NF_DROP;
+ }
+ return NF_ACCEPT;
+}
+
+/* We don't actually set up expectations, just adjust internal IP
+ * addresses if this is being NATted */
+static int help(struct sk_buff **pskb,
+ struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ int dir = CTINFO2DIR(ctinfo);
+ unsigned int ret;
+ struct iphdr *iph = (*pskb)->nh.iph;
+ struct udphdr *udph = (struct udphdr *)((u_int32_t *)iph + iph->ihl);
+
+ /* SNMP replies and originating SNMP traps get mangled */
+ if (udph->source == ntohs(SNMP_PORT) && dir != IP_CT_DIR_REPLY)
+ return NF_ACCEPT;
+ if (udph->dest == ntohs(SNMP_TRAP_PORT) && dir != IP_CT_DIR_ORIGINAL)
+ return NF_ACCEPT;
+
+ /* No NAT? */
+ if (!(ct->status & IPS_NAT_MASK))
+ return NF_ACCEPT;
+
+ /*
+ * Make sure the packet length is ok. So far, we were only guaranteed
+ * to have a valid length IP header plus 8 bytes, which means we have
+ * enough room for a UDP header. Just verify the UDP length field so we
+ * can mess around with the payload.
+ */
+ if (ntohs(udph->len) != (*pskb)->len - (iph->ihl << 2)) {
+ if (net_ratelimit())
+ printk(KERN_WARNING "SNMP: dropping malformed packet "
+ "src=%u.%u.%u.%u dst=%u.%u.%u.%u\n",
+ NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
+ return NF_DROP;
+ }
+
+ if (!skb_ip_make_writable(pskb, (*pskb)->len))
+ return NF_DROP;
+
+ spin_lock_bh(&snmp_lock);
+ ret = snmp_translate(ct, ctinfo, pskb);
+ spin_unlock_bh(&snmp_lock);
+ return ret;
+}
+
+static struct ip_conntrack_helper snmp_helper = {
+ .max_expected = 0,
+ .timeout = 180,
+ .me = THIS_MODULE,
+ .help = help,
+ .name = "snmp",
+
+ .tuple = { .src = { .u = { __constant_htons(SNMP_PORT) } },
+ .dst = { .protonum = IPPROTO_UDP },
+ },
+ .mask = { .src = { .u = { 0xFFFF } },
+ .dst = { .protonum = 0xFF },
+ },
+};
+
+static struct ip_conntrack_helper snmp_trap_helper = {
+ .max_expected = 0,
+ .timeout = 180,
+ .me = THIS_MODULE,
+ .help = help,
+ .name = "snmp_trap",
+
+ .tuple = { .src = { .u = { __constant_htons(SNMP_TRAP_PORT) } },
+ .dst = { .protonum = IPPROTO_UDP },
+ },
+ .mask = { .src = { .u = { 0xFFFF } },
+ .dst = { .protonum = 0xFF },
+ },
+};
+
+/*****************************************************************************
+ *
+ * Module stuff.
+ *
+ *****************************************************************************/
+
+static int __init init(void)
+{
+ int ret = 0;
+
+ ret = ip_conntrack_helper_register(&snmp_helper);
+ if (ret < 0)
+ return ret;
+ ret = ip_conntrack_helper_register(&snmp_trap_helper);
+ if (ret < 0) {
+ ip_conntrack_helper_unregister(&snmp_helper);
+ return ret;
+ }
+ return ret;
+}
+
+static void __exit fini(void)
+{
+ ip_conntrack_helper_unregister(&snmp_helper);
+ ip_conntrack_helper_unregister(&snmp_trap_helper);
+}
+
+module_init(init);
+module_exit(fini);
+
+module_param(debug, bool, 0600);
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
new file mode 100644
index 00000000000..dec4a74212c
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -0,0 +1,349 @@
+/* This file contains all the functions required for the standalone
+ ip_nat module.
+
+ These are not required by the compatibility layer.
+*/
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
+ * - new API and handling of conntrack/nat helpers
+ * - now capable of multiple expectations for one master
+ * */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/icmp.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <net/ip.h>
+#include <net/checksum.h>
+#include <linux/spinlock.h>
+
+#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
+#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
+
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+#include <linux/netfilter_ipv4/ip_nat_core.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+#define HOOKNAME(hooknum) ((hooknum) == NF_IP_POST_ROUTING ? "POST_ROUTING" \
+ : ((hooknum) == NF_IP_PRE_ROUTING ? "PRE_ROUTING" \
+ : ((hooknum) == NF_IP_LOCAL_OUT ? "LOCAL_OUT" \
+ : ((hooknum) == NF_IP_LOCAL_IN ? "LOCAL_IN" \
+ : "*ERROR*")))
+
+static unsigned int
+ip_nat_fn(unsigned int hooknum,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ struct ip_conntrack *ct;
+ enum ip_conntrack_info ctinfo;
+ struct ip_nat_info *info;
+ /* maniptype == SRC for postrouting. */
+ enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum);
+
+ /* We never see fragments: conntrack defrags on pre-routing
+ and local-out, and ip_nat_out protects post-routing. */
+ IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
+ & htons(IP_MF|IP_OFFSET)));
+
+ (*pskb)->nfcache |= NFC_UNKNOWN;
+
+ /* If we had a hardware checksum before, it's now invalid */
+ if ((*pskb)->ip_summed == CHECKSUM_HW)
+ if (skb_checksum_help(*pskb, (out == NULL)))
+ return NF_DROP;
+
+ ct = ip_conntrack_get(*pskb, &ctinfo);
+ /* Can't track? It's not due to stress, or conntrack would
+	   have dropped it. Hence it's the user's responsibility to
+ packet filter it out, or implement conntrack/NAT for that
+ protocol. 8) --RR */
+ if (!ct) {
+ /* Exception: ICMP redirect to new connection (not in
+ hash table yet). We must not let this through, in
+ case we're doing NAT to the same network. */
+ if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) {
+ struct icmphdr _hdr, *hp;
+
+ hp = skb_header_pointer(*pskb,
+ (*pskb)->nh.iph->ihl*4,
+ sizeof(_hdr), &_hdr);
+ if (hp != NULL &&
+ hp->type == ICMP_REDIRECT)
+ return NF_DROP;
+ }
+ return NF_ACCEPT;
+ }
+
+ switch (ctinfo) {
+ case IP_CT_RELATED:
+ case IP_CT_RELATED+IP_CT_IS_REPLY:
+ if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) {
+ if (!icmp_reply_translation(pskb, ct, maniptype,
+ CTINFO2DIR(ctinfo)))
+ return NF_DROP;
+ else
+ return NF_ACCEPT;
+ }
+ /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
+ case IP_CT_NEW:
+ info = &ct->nat.info;
+
+ /* Seen it before? This can happen for loopback, retrans,
+ or local packets.. */
+ if (!ip_nat_initialized(ct, maniptype)) {
+ unsigned int ret;
+
+ /* LOCAL_IN hook doesn't have a chain! */
+ if (hooknum == NF_IP_LOCAL_IN)
+ ret = alloc_null_binding(ct, info, hooknum);
+ else
+ ret = ip_nat_rule_find(pskb, hooknum,
+ in, out, ct,
+ info);
+
+ if (ret != NF_ACCEPT) {
+ return ret;
+ }
+ } else
+ DEBUGP("Already setup manip %s for ct %p\n",
+ maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST",
+ ct);
+ break;
+
+ default:
+ /* ESTABLISHED */
+ IP_NF_ASSERT(ctinfo == IP_CT_ESTABLISHED
+ || ctinfo == (IP_CT_ESTABLISHED+IP_CT_IS_REPLY));
+ info = &ct->nat.info;
+ }
+
+ IP_NF_ASSERT(info);
+ return nat_packet(ct, ctinfo, hooknum, pskb);
+}
+
+static unsigned int
+ip_nat_in(unsigned int hooknum,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ u_int32_t saddr, daddr;
+ unsigned int ret;
+
+ saddr = (*pskb)->nh.iph->saddr;
+ daddr = (*pskb)->nh.iph->daddr;
+
+ ret = ip_nat_fn(hooknum, pskb, in, out, okfn);
+ if (ret != NF_DROP && ret != NF_STOLEN
+ && ((*pskb)->nh.iph->saddr != saddr
+ || (*pskb)->nh.iph->daddr != daddr)) {
+ dst_release((*pskb)->dst);
+ (*pskb)->dst = NULL;
+ }
+ return ret;
+}
+
+static unsigned int
+ip_nat_out(unsigned int hooknum,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ /* root is playing with raw sockets. */
+ if ((*pskb)->len < sizeof(struct iphdr)
+ || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr))
+ return NF_ACCEPT;
+
+ /* We can hit fragment here; forwarded packets get
+ defragmented by connection tracking coming in, then
+ fragmented (grr) by the forward code.
+
+ In future: If we have nfct != NULL, AND we have NAT
+ initialized, AND there is no helper, then we can do full
+ NAPT on the head, and IP-address-only NAT on the rest.
+
+ I'm starting to have nightmares about fragments. */
+
+ if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
+ *pskb = ip_ct_gather_frags(*pskb, IP_DEFRAG_NAT_OUT);
+
+ if (!*pskb)
+ return NF_STOLEN;
+ }
+
+ return ip_nat_fn(hooknum, pskb, in, out, okfn);
+}
+
+static unsigned int
+ip_nat_local_fn(unsigned int hooknum,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ u_int32_t saddr, daddr;
+ unsigned int ret;
+
+ /* root is playing with raw sockets. */
+ if ((*pskb)->len < sizeof(struct iphdr)
+ || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr))
+ return NF_ACCEPT;
+
+ saddr = (*pskb)->nh.iph->saddr;
+ daddr = (*pskb)->nh.iph->daddr;
+
+ ret = ip_nat_fn(hooknum, pskb, in, out, okfn);
+ if (ret != NF_DROP && ret != NF_STOLEN
+ && ((*pskb)->nh.iph->saddr != saddr
+ || (*pskb)->nh.iph->daddr != daddr))
+ return ip_route_me_harder(pskb) == 0 ? ret : NF_DROP;
+ return ret;
+}
+
+/* We must be after connection tracking and before packet filtering. */
+
+/* Before packet filtering, change destination */
+static struct nf_hook_ops ip_nat_in_ops = {
+ .hook = ip_nat_in,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_PRE_ROUTING,
+ .priority = NF_IP_PRI_NAT_DST,
+};
+
+/* After packet filtering, change source */
+static struct nf_hook_ops ip_nat_out_ops = {
+ .hook = ip_nat_out,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_POST_ROUTING,
+ .priority = NF_IP_PRI_NAT_SRC,
+};
+
+/* Before packet filtering, change destination */
+static struct nf_hook_ops ip_nat_local_out_ops = {
+ .hook = ip_nat_local_fn,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_LOCAL_OUT,
+ .priority = NF_IP_PRI_NAT_DST,
+};
+
+/* After packet filtering, change source for reply packets of LOCAL_OUT DNAT */
+static struct nf_hook_ops ip_nat_local_in_ops = {
+ .hook = ip_nat_fn,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_LOCAL_IN,
+ .priority = NF_IP_PRI_NAT_SRC,
+};
+
+static int init_or_cleanup(int init)
+{
+ int ret = 0;
+
+ need_ip_conntrack();
+
+ if (!init) goto cleanup;
+
+ ret = ip_nat_rule_init();
+ if (ret < 0) {
+ printk("ip_nat_init: can't setup rules.\n");
+ goto cleanup_nothing;
+ }
+ ret = ip_nat_init();
+ if (ret < 0) {
+		printk("ip_nat_init: can't setup NAT core.\n");
+ goto cleanup_rule_init;
+ }
+ ret = nf_register_hook(&ip_nat_in_ops);
+ if (ret < 0) {
+ printk("ip_nat_init: can't register in hook.\n");
+ goto cleanup_nat;
+ }
+ ret = nf_register_hook(&ip_nat_out_ops);
+ if (ret < 0) {
+ printk("ip_nat_init: can't register out hook.\n");
+ goto cleanup_inops;
+ }
+ ret = nf_register_hook(&ip_nat_local_out_ops);
+ if (ret < 0) {
+ printk("ip_nat_init: can't register local out hook.\n");
+ goto cleanup_outops;
+ }
+ ret = nf_register_hook(&ip_nat_local_in_ops);
+ if (ret < 0) {
+ printk("ip_nat_init: can't register local in hook.\n");
+ goto cleanup_localoutops;
+ }
+ return ret;
+
+ cleanup:
+ nf_unregister_hook(&ip_nat_local_in_ops);
+ cleanup_localoutops:
+ nf_unregister_hook(&ip_nat_local_out_ops);
+ cleanup_outops:
+ nf_unregister_hook(&ip_nat_out_ops);
+ cleanup_inops:
+ nf_unregister_hook(&ip_nat_in_ops);
+ cleanup_nat:
+ ip_nat_cleanup();
+ cleanup_rule_init:
+ ip_nat_rule_cleanup();
+ cleanup_nothing:
+ MUST_BE_READ_WRITE_UNLOCKED(&ip_nat_lock);
+ return ret;
+}
+
+static int __init init(void)
+{
+ return init_or_cleanup(1);
+}
+
+static void __exit fini(void)
+{
+ init_or_cleanup(0);
+}
+
+module_init(init);
+module_exit(fini);
+
+EXPORT_SYMBOL(ip_nat_setup_info);
+EXPORT_SYMBOL(ip_nat_protocol_register);
+EXPORT_SYMBOL(ip_nat_protocol_unregister);
+EXPORT_SYMBOL(ip_nat_cheat_check);
+EXPORT_SYMBOL(ip_nat_mangle_tcp_packet);
+EXPORT_SYMBOL(ip_nat_mangle_udp_packet);
+EXPORT_SYMBOL(ip_nat_used_tuple);
+EXPORT_SYMBOL(ip_nat_follow_master);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/ip_nat_tftp.c b/net/ipv4/netfilter/ip_nat_tftp.c
new file mode 100644
index 00000000000..0343e0d6467
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_tftp.c
@@ -0,0 +1,70 @@
+/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Version: 0.0.7
+ *
+ * Thu 21 Mar 2002 Harald Welte <laforge@gnumonks.org>
+ * - Port to newnat API
+ *
+ * This module currently supports DNAT:
+ * iptables -t nat -A PREROUTING -d x.x.x.x -j DNAT --to-dest x.x.x.y
+ *
+ * and SNAT:
+ * iptables -t nat -A POSTROUTING { -j MASQUERADE , -j SNAT --to-source x.x.x.x }
+ *
+ * It has not been tested with
+ * -j SNAT --to-source x.x.x.x-x.x.x.y since I only have one external ip
+ * If you do test this please let me know if it works or not.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_tftp.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/moduleparam.h>
+
+MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
+MODULE_DESCRIPTION("tftp NAT helper");
+MODULE_LICENSE("GPL");
+
+static unsigned int help(struct sk_buff **pskb,
+ enum ip_conntrack_info ctinfo,
+ struct ip_conntrack_expect *exp)
+{
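+ /* Re-use the master connection's NAT mapping for the expected data
+ * connection; the tcp/udp members of the tuple's port union alias,
+ * hence the UDP port is read via u.tcp.port. */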
+ exp->saved_proto.udp.port = exp->tuple.dst.u.tcp.port;
+ exp->dir = IP_CT_DIR_REPLY;
+ exp->expectfn = ip_nat_follow_master;
+ if (ip_conntrack_expect_related(exp) != 0) {
+ ip_conntrack_expect_free(exp);
+ return NF_DROP;
+ }
+ return NF_ACCEPT;
+}
+
+static void __exit fini(void)
+{
+ ip_nat_tftp_hook = NULL;
+ /* Make sure no one calls it meanwhile. */
+ synchronize_net();
+}
+
+static int __init init(void)
+{
+ BUG_ON(ip_nat_tftp_hook);
+ ip_nat_tftp_hook = help;
+ return 0;
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
new file mode 100644
index 00000000000..9e40dffc204
--- /dev/null
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -0,0 +1,741 @@
+/*
+ * This is a module which is used for queueing IPv4 packets and
+ * communicating with userspace via netlink.
+ *
+ * (C) 2000-2002 James Morris <jmorris@intercode.com.au>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * 2000-03-27: Simplified code (thanks to Andi Kleen for clues).
+ * 2000-05-20: Fixed notifier problems (following Miguel Freitas' report).
+ * 2000-06-19: Fixed so nfmark is copied to metadata (reported by Sebastian
+ * Zander).
+ * 2000-08-01: Added Nick Williams' MAC support.
+ * 2002-06-25: Code cleanup.
+ * 2005-01-10: Added /proc counter for dropped packets; fixed so
+ * packets aren't delivered to user space if they're going
+ * to be dropped.
+ *
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/ip.h>
+#include <linux/notifier.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4/ip_queue.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netlink.h>
+#include <linux/spinlock.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include <linux/security.h>
+#include <net/sock.h>
+#include <net/route.h>
+
+#define IPQ_QMAX_DEFAULT 1024
+#define IPQ_PROC_FS_NAME "ip_queue"
+#define NET_IPQ_QMAX 2088
+#define NET_IPQ_QMAX_NAME "ip_queue_maxlen"
+
+struct ipq_rt_info {
+ __u8 tos;
+ __u32 daddr;
+ __u32 saddr;
+};
+
+struct ipq_queue_entry {
+ struct list_head list;
+ struct nf_info *info;
+ struct sk_buff *skb;
+ struct ipq_rt_info rt_info;
+};
+
+typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long);
+
+static unsigned char copy_mode = IPQ_COPY_NONE;
+static unsigned int queue_maxlen = IPQ_QMAX_DEFAULT;
+static DEFINE_RWLOCK(queue_lock);
+static int peer_pid;
+static unsigned int copy_range;
+static unsigned int queue_total;
+static unsigned int queue_dropped = 0;
+static unsigned int queue_user_dropped = 0;
+static struct sock *ipqnl;
+static LIST_HEAD(queue_list);
+static DECLARE_MUTEX(ipqnl_sem);
+
+static void
+ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict)
+{
+ nf_reinject(entry->skb, entry->info, verdict);
+ kfree(entry);
+}
+
+static inline void
+__ipq_enqueue_entry(struct ipq_queue_entry *entry)
+{
+ list_add(&entry->list, &queue_list);
+ queue_total++;
+}
+
+/*
+ * Find and return a queued entry matched by cmpfn, or return the last
+ * entry if cmpfn is NULL.
+ */
+static inline struct ipq_queue_entry *
+__ipq_find_entry(ipq_cmpfn cmpfn, unsigned long data)
+{
+ struct list_head *p;
+
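+ /* list is the first member of struct ipq_queue_entry, so the
+ * cast below is valid. */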
+ list_for_each_prev(p, &queue_list) {
+ struct ipq_queue_entry *entry = (struct ipq_queue_entry *)p;
+
+ if (!cmpfn || cmpfn(entry, data))
+ return entry;
+ }
+ return NULL;
+}
+
+static inline void
+__ipq_dequeue_entry(struct ipq_queue_entry *entry)
+{
+ list_del(&entry->list);
+ queue_total--;
+}
+
+static inline struct ipq_queue_entry *
+__ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data)
+{
+ struct ipq_queue_entry *entry;
+
+ entry = __ipq_find_entry(cmpfn, data);
+ if (entry == NULL)
+ return NULL;
+
+ __ipq_dequeue_entry(entry);
+ return entry;
+}
+
+
+static inline void
+__ipq_flush(int verdict)
+{
+ struct ipq_queue_entry *entry;
+
+ while ((entry = __ipq_find_dequeue_entry(NULL, 0)))
+ ipq_issue_verdict(entry, verdict);
+}
+
+static inline int
+__ipq_set_mode(unsigned char mode, unsigned int range)
+{
+ int status = 0;
+
+ switch(mode) {
+ case IPQ_COPY_NONE:
+ case IPQ_COPY_META:
+ copy_mode = mode;
+ copy_range = 0;
+ break;
+
+ case IPQ_COPY_PACKET:
+ copy_mode = mode;
+ copy_range = range;
+ if (copy_range > 0xFFFF)
+ copy_range = 0xFFFF;
+ break;
+
+ default:
+ status = -EINVAL;
+
+ }
+ return status;
+}
+
+static inline void
+__ipq_reset(void)
+{
+ peer_pid = 0;
+ net_disable_timestamp();
+ __ipq_set_mode(IPQ_COPY_NONE, 0);
+ __ipq_flush(NF_DROP);
+}
+
+static struct ipq_queue_entry *
+ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data)
+{
+ struct ipq_queue_entry *entry;
+
+ write_lock_bh(&queue_lock);
+ entry = __ipq_find_dequeue_entry(cmpfn, data);
+ write_unlock_bh(&queue_lock);
+ return entry;
+}
+
+static void
+ipq_flush(int verdict)
+{
+ write_lock_bh(&queue_lock);
+ __ipq_flush(verdict);
+ write_unlock_bh(&queue_lock);
+}
+
+static struct sk_buff *
+ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
+{
+ unsigned char *old_tail;
+ size_t size = 0;
+ size_t data_len = 0;
+ struct sk_buff *skb;
+ struct ipq_packet_msg *pmsg;
+ struct nlmsghdr *nlh;
+
+ read_lock_bh(&queue_lock);
+
+ switch (copy_mode) {
+ case IPQ_COPY_META:
+ case IPQ_COPY_NONE:
+ size = NLMSG_SPACE(sizeof(*pmsg));
+ data_len = 0;
+ break;
+
+ case IPQ_COPY_PACKET:
+ if (copy_range == 0 || copy_range > entry->skb->len)
+ data_len = entry->skb->len;
+ else
+ data_len = copy_range;
+
+ size = NLMSG_SPACE(sizeof(*pmsg) + data_len);
+ break;
+
+ default:
+ *errp = -EINVAL;
+ read_unlock_bh(&queue_lock);
+ return NULL;
+ }
+
+ read_unlock_bh(&queue_lock);
+
+ skb = alloc_skb(size, GFP_ATOMIC);
+ if (!skb)
+ goto nlmsg_failure;
+
+ old_tail = skb->tail;
+ nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh));
+ pmsg = NLMSG_DATA(nlh);
+ memset(pmsg, 0, sizeof(*pmsg));
+
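+ /* The entry's address doubles as the opaque packet id which
+ * userspace echoes back in its verdict (see id_cmp()). */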
+ pmsg->packet_id = (unsigned long )entry;
+ pmsg->data_len = data_len;
+ pmsg->timestamp_sec = entry->skb->stamp.tv_sec;
+ pmsg->timestamp_usec = entry->skb->stamp.tv_usec;
+ pmsg->mark = entry->skb->nfmark;
+ pmsg->hook = entry->info->hook;
+ pmsg->hw_protocol = entry->skb->protocol;
+
+ if (entry->info->indev)
+ strcpy(pmsg->indev_name, entry->info->indev->name);
+ else
+ pmsg->indev_name[0] = '\0';
+
+ if (entry->info->outdev)
+ strcpy(pmsg->outdev_name, entry->info->outdev->name);
+ else
+ pmsg->outdev_name[0] = '\0';
+
+ if (entry->info->indev && entry->skb->dev) {
+ pmsg->hw_type = entry->skb->dev->type;
+ if (entry->skb->dev->hard_header_parse)
+ pmsg->hw_addrlen =
+ entry->skb->dev->hard_header_parse(entry->skb,
+ pmsg->hw_addr);
+ }
+
+ if (data_len)
+ if (skb_copy_bits(entry->skb, 0, pmsg->payload, data_len))
+ BUG();
+
+ nlh->nlmsg_len = skb->tail - old_tail;
+ return skb;
+
+nlmsg_failure:
+ if (skb)
+ kfree_skb(skb);
+ *errp = -EINVAL;
+ printk(KERN_ERR "ip_queue: error creating packet message\n");
+ return NULL;
+}
+
+static int
+ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data)
+{
+ int status = -EINVAL;
+ struct sk_buff *nskb;
+ struct ipq_queue_entry *entry;
+
+ if (copy_mode == IPQ_COPY_NONE)
+ return -EAGAIN;
+
+ entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+ if (entry == NULL) {
+ printk(KERN_ERR "ip_queue: OOM in ipq_enqueue_packet()\n");
+ return -ENOMEM;
+ }
+
+ entry->info = info;
+ entry->skb = skb;
+
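+ /* For locally generated packets, remember the routing-relevant
+ * fields so ipq_mangle_ipv4() can re-route if userspace changes
+ * them. */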
+ if (entry->info->hook == NF_IP_LOCAL_OUT) {
+ struct iphdr *iph = skb->nh.iph;
+
+ entry->rt_info.tos = iph->tos;
+ entry->rt_info.daddr = iph->daddr;
+ entry->rt_info.saddr = iph->saddr;
+ }
+
+ nskb = ipq_build_packet_message(entry, &status);
+ if (nskb == NULL)
+ goto err_out_free;
+
+ write_lock_bh(&queue_lock);
+
+ if (!peer_pid)
+ goto err_out_free_nskb;
+
+ if (queue_total >= queue_maxlen) {
+ queue_dropped++;
+ status = -ENOSPC;
+ if (net_ratelimit())
+ printk(KERN_WARNING "ip_queue: full at %d entries, "
+ "dropping packet(s). Dropped: %d\n", queue_total,
+ queue_dropped);
+ goto err_out_free_nskb;
+ }
+
+ /* netlink_unicast will either free the nskb or attach it to a socket */
+ status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT);
+ if (status < 0) {
+ queue_user_dropped++;
+ goto err_out_unlock;
+ }
+
+ __ipq_enqueue_entry(entry);
+
+ write_unlock_bh(&queue_lock);
+ return status;
+
+err_out_free_nskb:
+ kfree_skb(nskb);
+
+err_out_unlock:
+ write_unlock_bh(&queue_lock);
+
+err_out_free:
+ kfree(entry);
+ return status;
+}
+
+static int
+ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e)
+{
+ int diff;
+ struct iphdr *user_iph = (struct iphdr *)v->payload;
+
+ if (v->data_len < sizeof(*user_iph))
+ return 0;
+ diff = v->data_len - e->skb->len;
+ if (diff < 0)
+ skb_trim(e->skb, v->data_len);
+ else if (diff > 0) {
+ if (v->data_len > 0xFFFF)
+ return -EINVAL;
+ if (diff > skb_tailroom(e->skb)) {
+ struct sk_buff *newskb;
+
+ newskb = skb_copy_expand(e->skb,
+ skb_headroom(e->skb),
+ diff,
+ GFP_ATOMIC);
+ if (newskb == NULL) {
+ printk(KERN_WARNING "ip_queue: OOM "
+ "in mangle, dropping packet\n");
+ return -ENOMEM;
+ }
+ if (e->skb->sk)
+ skb_set_owner_w(newskb, e->skb->sk);
+ kfree_skb(e->skb);
+ e->skb = newskb;
+ }
+ skb_put(e->skb, diff);
+ }
+ if (!skb_ip_make_writable(&e->skb, v->data_len))
+ return -ENOMEM;
+ memcpy(e->skb->data, v->payload, v->data_len);
+ e->skb->nfcache |= NFC_ALTERED;
+
+ /*
+ * Extra routing may be needed on local out, as the QUEUE target never
+ * returns control to the table.
+ */
+ if (e->info->hook == NF_IP_LOCAL_OUT) {
+ struct iphdr *iph = e->skb->nh.iph;
+
+ if (!(iph->tos == e->rt_info.tos
+ && iph->daddr == e->rt_info.daddr
+ && iph->saddr == e->rt_info.saddr))
+ return ip_route_me_harder(&e->skb);
+ }
+ return 0;
+}
+
+static inline int
+id_cmp(struct ipq_queue_entry *e, unsigned long id)
+{
+ return (id == (unsigned long )e);
+}
+
+static int
+ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len)
+{
+ struct ipq_queue_entry *entry;
+
+ if (vmsg->value > NF_MAX_VERDICT)
+ return -EINVAL;
+
+ entry = ipq_find_dequeue_entry(id_cmp, vmsg->id);
+ if (entry == NULL)
+ return -ENOENT;
+ else {
+ int verdict = vmsg->value;
+
+ if (vmsg->data_len && vmsg->data_len == len)
+ if (ipq_mangle_ipv4(vmsg, entry) < 0)
+ verdict = NF_DROP;
+
+ ipq_issue_verdict(entry, verdict);
+ return 0;
+ }
+}
+
+static int
+ipq_set_mode(unsigned char mode, unsigned int range)
+{
+ int status;
+
+ write_lock_bh(&queue_lock);
+ status = __ipq_set_mode(mode, range);
+ write_unlock_bh(&queue_lock);
+ return status;
+}
+
+static int
+ipq_receive_peer(struct ipq_peer_msg *pmsg,
+ unsigned char type, unsigned int len)
+{
+ int status = 0;
+
+ if (len < sizeof(*pmsg))
+ return -EINVAL;
+
+ switch (type) {
+ case IPQM_MODE:
+ status = ipq_set_mode(pmsg->msg.mode.value,
+ pmsg->msg.mode.range);
+ break;
+
+ case IPQM_VERDICT:
+ if (pmsg->msg.verdict.value > NF_MAX_VERDICT)
+ status = -EINVAL;
+ else
+ status = ipq_set_verdict(&pmsg->msg.verdict,
+ len - sizeof(*pmsg));
+ break;
+ default:
+ status = -EINVAL;
+ }
+ return status;
+}
+
+static int
+dev_cmp(struct ipq_queue_entry *entry, unsigned long ifindex)
+{
+ if (entry->info->indev)
+ if (entry->info->indev->ifindex == ifindex)
+ return 1;
+
+ if (entry->info->outdev)
+ if (entry->info->outdev->ifindex == ifindex)
+ return 1;
+
+ return 0;
+}
+
+static void
+ipq_dev_drop(int ifindex)
+{
+ struct ipq_queue_entry *entry;
+
+ while ((entry = ipq_find_dequeue_entry(dev_cmp, ifindex)) != NULL)
+ ipq_issue_verdict(entry, NF_DROP);
+}
+
+#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
+
+static inline void
+ipq_rcv_skb(struct sk_buff *skb)
+{
+ int status, type, pid, flags, nlmsglen, skblen;
+ struct nlmsghdr *nlh;
+
+ skblen = skb->len;
+ if (skblen < sizeof(*nlh))
+ return;
+
+ nlh = (struct nlmsghdr *)skb->data;
+ nlmsglen = nlh->nlmsg_len;
+ if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen)
+ return;
+
+ pid = nlh->nlmsg_pid;
+ flags = nlh->nlmsg_flags;
+
+ if (pid <= 0 || !(flags & NLM_F_REQUEST) || flags & NLM_F_MULTI)
+ RCV_SKB_FAIL(-EINVAL);
+
+ if (flags & MSG_TRUNC)
+ RCV_SKB_FAIL(-ECOMM);
+
+ type = nlh->nlmsg_type;
+ if (type < NLMSG_NOOP || type >= IPQM_MAX)
+ RCV_SKB_FAIL(-EINVAL);
+
+ if (type <= IPQM_BASE)
+ return;
+
+ if (security_netlink_recv(skb))
+ RCV_SKB_FAIL(-EPERM);
+
+ write_lock_bh(&queue_lock);
+
+ if (peer_pid) {
+ if (peer_pid != pid) {
+ write_unlock_bh(&queue_lock);
+ RCV_SKB_FAIL(-EBUSY);
+ }
+ } else {
+ net_enable_timestamp();
+ peer_pid = pid;
+ }
+
+ write_unlock_bh(&queue_lock);
+
+ status = ipq_receive_peer(NLMSG_DATA(nlh), type,
+ skblen - NLMSG_LENGTH(0));
+ if (status < 0)
+ RCV_SKB_FAIL(status);
+
+ if (flags & NLM_F_ACK)
+ netlink_ack(skb, nlh, 0);
+ return;
+}
+
+static void
+ipq_rcv_sk(struct sock *sk, int len)
+{
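+ /* If another context holds ipqnl_sem it will drain the queue for
+ * us; the outer loop catches skbs queued after we release it. */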
+ do {
+ struct sk_buff *skb;
+
+ if (down_trylock(&ipqnl_sem))
+ return;
+
+ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+ ipq_rcv_skb(skb);
+ kfree_skb(skb);
+ }
+
+ up(&ipqnl_sem);
+
+ } while (ipqnl && ipqnl->sk_receive_queue.qlen);
+}
+
+static int
+ipq_rcv_dev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = ptr;
+
+ /* Drop any packets associated with the downed device */
+ if (event == NETDEV_DOWN)
+ ipq_dev_drop(dev->ifindex);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block ipq_dev_notifier = {
+ .notifier_call = ipq_rcv_dev_event,
+};
+
+static int
+ipq_rcv_nl_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct netlink_notify *n = ptr;
+
+ if (event == NETLINK_URELEASE &&
+ n->protocol == NETLINK_FIREWALL && n->pid) {
+ write_lock_bh(&queue_lock);
+ if (n->pid == peer_pid)
+ __ipq_reset();
+ write_unlock_bh(&queue_lock);
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block ipq_nl_notifier = {
+ .notifier_call = ipq_rcv_nl_event,
+};
+
+static struct ctl_table_header *ipq_sysctl_header;
+
+static ctl_table ipq_table[] = {
+ {
+ .ctl_name = NET_IPQ_QMAX,
+ .procname = NET_IPQ_QMAX_NAME,
+ .data = &queue_maxlen,
+ .maxlen = sizeof(queue_maxlen),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table ipq_dir_table[] = {
+ {
+ .ctl_name = NET_IPV4,
+ .procname = "ipv4",
+ .mode = 0555,
+ .child = ipq_table
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table ipq_root_table[] = {
+ {
+ .ctl_name = CTL_NET,
+ .procname = "net",
+ .mode = 0555,
+ .child = ipq_dir_table
+ },
+ { .ctl_name = 0 }
+};
+
+#ifdef CONFIG_PROC_FS
+static int
+ipq_get_info(char *buffer, char **start, off_t offset, int length)
+{
+ int len;
+
+ read_lock_bh(&queue_lock);
+
+ len = sprintf(buffer,
+ "Peer PID : %d\n"
+ "Copy mode : %hu\n"
+ "Copy range : %u\n"
+ "Queue length : %u\n"
+ "Queue max. length : %u\n"
+ "Queue dropped : %u\n"
+ "Netlink dropped : %u\n",
+ peer_pid,
+ copy_mode,
+ copy_range,
+ queue_total,
+ queue_maxlen,
+ queue_dropped,
+ queue_user_dropped);
+
+ read_unlock_bh(&queue_lock);
+
+ *start = buffer + offset;
+ len -= offset;
+ if (len > length)
+ len = length;
+ else if (len < 0)
+ len = 0;
+ return len;
+}
+#endif /* CONFIG_PROC_FS */
+
+static int
+init_or_cleanup(int init)
+{
+ int status = -ENOMEM;
+ struct proc_dir_entry *proc;
+
+ if (!init)
+ goto cleanup;
+
+ netlink_register_notifier(&ipq_nl_notifier);
+ ipqnl = netlink_kernel_create(NETLINK_FIREWALL, ipq_rcv_sk);
+ if (ipqnl == NULL) {
+ printk(KERN_ERR "ip_queue: failed to create netlink socket\n");
+ goto cleanup_netlink_notifier;
+ }
+
+ proc = proc_net_create(IPQ_PROC_FS_NAME, 0, ipq_get_info);
+ if (proc)
+ proc->owner = THIS_MODULE;
+ else {
+ printk(KERN_ERR "ip_queue: failed to create proc entry\n");
+ goto cleanup_ipqnl;
+ }
+
+ register_netdevice_notifier(&ipq_dev_notifier);
+ ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0);
+
+ status = nf_register_queue_handler(PF_INET, ipq_enqueue_packet, NULL);
+ if (status < 0) {
+ printk(KERN_ERR "ip_queue: failed to register queue handler\n");
+ goto cleanup_sysctl;
+ }
+ return status;
+
+cleanup:
+ nf_unregister_queue_handler(PF_INET);
+ synchronize_net();
+ ipq_flush(NF_DROP);
+
+cleanup_sysctl:
+ unregister_sysctl_table(ipq_sysctl_header);
+ unregister_netdevice_notifier(&ipq_dev_notifier);
+ proc_net_remove(IPQ_PROC_FS_NAME);
+
+cleanup_ipqnl:
+ sock_release(ipqnl->sk_socket);
+ down(&ipqnl_sem);
+ up(&ipqnl_sem);
+
+cleanup_netlink_notifier:
+ netlink_unregister_notifier(&ipq_nl_notifier);
+ return status;
+}
+
+static int __init init(void)
+{
+ return init_or_cleanup(1);
+}
+
+static void __exit fini(void)
+{
+ init_or_cleanup(0);
+}
+
+MODULE_DESCRIPTION("IPv4 packet queue handler");
+MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
+MODULE_LICENSE("GPL");
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
new file mode 100644
index 00000000000..8a54f92b849
--- /dev/null
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -0,0 +1,1964 @@
+/*
+ * Packet matching code.
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * 19 Jan 2002 Harald Welte <laforge@gnumonks.org>
+ * - increase module usage count as soon as we have rules inside
+ * a table
+ */
+#include <linux/config.h>
+#include <linux/cache.h>
+#include <linux/skbuff.h>
+#include <linux/kmod.h>
+#include <linux/vmalloc.h>
+#include <linux/netdevice.h>
+#include <linux/module.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <net/ip.h>
+#include <asm/uaccess.h>
+#include <asm/semaphore.h>
+#include <linux/proc_fs.h>
+#include <linux/err.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("IPv4 packet filter");
+
+/*#define DEBUG_IP_FIREWALL*/
+/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */
+/*#define DEBUG_IP_FIREWALL_USER*/
+
+#ifdef DEBUG_IP_FIREWALL
+#define dprintf(format, args...) printk(format , ## args)
+#else
+#define dprintf(format, args...)
+#endif
+
+#ifdef DEBUG_IP_FIREWALL_USER
+#define duprintf(format, args...) printk(format , ## args)
+#else
+#define duprintf(format, args...)
+#endif
+
+#ifdef CONFIG_NETFILTER_DEBUG
+#define IP_NF_ASSERT(x) \
+do { \
+ if (!(x)) \
+ printk("IP_NF_ASSERT: %s:%s:%u\n", \
+ __FUNCTION__, __FILE__, __LINE__); \
+} while(0)
+#else
+#define IP_NF_ASSERT(x)
+#endif
+#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))
+
+static DECLARE_MUTEX(ipt_mutex);
+
+/* Must have mutex */
+#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0)
+#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0)
+#include <linux/netfilter_ipv4/lockhelp.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+#if 0
+/* All the better to debug you with... */
+#define static
+#define inline
+#endif
+
+/*
+ We keep a set of rules for each CPU, so we can avoid write-locking
+ them in the softirq when updating the counters and therefore
+ only need to read-lock in the softirq; doing a write_lock_bh() in user
+ context stops packets coming through and allows user context to read
+ the counters or update the rules.
+
+ To be cache friendly on SMP, we arrange them like so:
+ [ n-entries ]
+ ... cache-align padding ...
+ [ n-entries ]
+
+ Hence the start of any table is given by get_table() below. */
+
+/* The table itself */
+struct ipt_table_info
+{
+ /* Size per table */
+ unsigned int size;
+ /* Number of entries: FIXME. --RR */
+ unsigned int number;
+ /* Initial number of entries. Needed for module usage count */
+ unsigned int initial_entries;
+
+ /* Entry points and underflows */
+ unsigned int hook_entry[NF_IP_NUMHOOKS];
+ unsigned int underflow[NF_IP_NUMHOOKS];
+
+ /* ipt_entry tables: one per CPU */
+ char entries[0] ____cacheline_aligned;
+};
+
+static LIST_HEAD(ipt_target);
+static LIST_HEAD(ipt_match);
+static LIST_HEAD(ipt_tables);
+#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
+
+#ifdef CONFIG_SMP
+#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
+#else
+#define TABLE_OFFSET(t,p) 0
+#endif
+
+#if 0
+#define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0)
+#define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; })
+#define up(x) do { printk("UP:%u:" #x "\n", __LINE__); up(x); } while(0)
+#endif
+
+/* Returns whether the packet matches the rule. */
+static inline int
+ip_packet_match(const struct iphdr *ip,
+ const char *indev,
+ const char *outdev,
+ const struct ipt_ip *ipinfo,
+ int isfrag)
+{
+ size_t i;
+ unsigned long ret;
+
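+/* FWINV(test, flag): result of `test', inverted when the corresponding
+ * IPT_INV_* bit is set in ipinfo->invflags. */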
+#define FWINV(bool,invflg) ((bool) ^ !!(ipinfo->invflags & invflg))
+
+ if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr,
+ IPT_INV_SRCIP)
+ || FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr,
+ IPT_INV_DSTIP)) {
+ dprintf("Source or dest mismatch.\n");
+
+ dprintf("SRC: %u.%u.%u.%u. Mask: %u.%u.%u.%u. Target: %u.%u.%u.%u.%s\n",
+ NIPQUAD(ip->saddr),
+ NIPQUAD(ipinfo->smsk.s_addr),
+ NIPQUAD(ipinfo->src.s_addr),
+ ipinfo->invflags & IPT_INV_SRCIP ? " (INV)" : "");
+ dprintf("DST: %u.%u.%u.%u Mask: %u.%u.%u.%u Target: %u.%u.%u.%u.%s\n",
+ NIPQUAD(ip->daddr),
+ NIPQUAD(ipinfo->dmsk.s_addr),
+ NIPQUAD(ipinfo->dst.s_addr),
+ ipinfo->invflags & IPT_INV_DSTIP ? " (INV)" : "");
+ return 0;
+ }
+
+ /* Look for ifname matches; this should unroll nicely. */
+ for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) {
+ ret |= (((const unsigned long *)indev)[i]
+ ^ ((const unsigned long *)ipinfo->iniface)[i])
+ & ((const unsigned long *)ipinfo->iniface_mask)[i];
+ }
+
+ if (FWINV(ret != 0, IPT_INV_VIA_IN)) {
+ dprintf("VIA in mismatch (%s vs %s).%s\n",
+ indev, ipinfo->iniface,
+ ipinfo->invflags&IPT_INV_VIA_IN ?" (INV)":"");
+ return 0;
+ }
+
+ for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) {
+ ret |= (((const unsigned long *)outdev)[i]
+ ^ ((const unsigned long *)ipinfo->outiface)[i])
+ & ((const unsigned long *)ipinfo->outiface_mask)[i];
+ }
+
+ if (FWINV(ret != 0, IPT_INV_VIA_OUT)) {
+ dprintf("VIA out mismatch (%s vs %s).%s\n",
+ outdev, ipinfo->outiface,
+ ipinfo->invflags&IPT_INV_VIA_OUT ?" (INV)":"");
+ return 0;
+ }
+
+ /* Check specific protocol */
+ if (ipinfo->proto
+ && FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) {
+ dprintf("Packet protocol %hi does not match %hi.%s\n",
+ ip->protocol, ipinfo->proto,
+ ipinfo->invflags&IPT_INV_PROTO ? " (INV)":"");
+ return 0;
+ }
+
+ /* If we have a fragment rule but the packet is not a fragment
+ * then we return zero */
+ if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) {
+ dprintf("Fragment rule but not fragment.%s\n",
+ ipinfo->invflags & IPT_INV_FRAG ? " (INV)" : "");
+ return 0;
+ }
+
+ return 1;
+}
+
+static inline int
+ip_checkentry(const struct ipt_ip *ip)
+{
+ if (ip->flags & ~IPT_F_MASK) {
+ duprintf("Unknown flag bits set: %08X\n",
+ ip->flags & ~IPT_F_MASK);
+ return 0;
+ }
+ if (ip->invflags & ~IPT_INV_MASK) {
+ duprintf("Unknown invflag bits set: %08X\n",
+ ip->invflags & ~IPT_INV_MASK);
+ return 0;
+ }
+ return 1;
+}
+
+static unsigned int
+ipt_error(struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ unsigned int hooknum,
+ const void *targinfo,
+ void *userinfo)
+{
+ if (net_ratelimit())
+ printk("ip_tables: error: `%s'\n", (char *)targinfo);
+
+ return NF_DROP;
+}
+
+static inline
+int do_match(struct ipt_entry_match *m,
+ const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int offset,
+ int *hotdrop)
+{
+ /* Stop iteration if it doesn't match */
+ if (!m->u.kernel.match->match(skb, in, out, m->data, offset, hotdrop))
+ return 1;
+ else
+ return 0;
+}
+
+static inline struct ipt_entry *
+get_entry(void *base, unsigned int offset)
+{
+ return (struct ipt_entry *)(base + offset);
+}
+
+/* Returns one of the generic firewall policies, like NF_ACCEPT. */
+unsigned int
+ipt_do_table(struct sk_buff **pskb,
+ unsigned int hook,
+ const struct net_device *in,
+ const struct net_device *out,
+ struct ipt_table *table,
+ void *userdata)
+{
+ static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
+ u_int16_t offset;
+ struct iphdr *ip;
+ u_int16_t datalen;
+ int hotdrop = 0;
+ /* Initializing verdict to NF_DROP keeps gcc happy. */
+ unsigned int verdict = NF_DROP;
+ const char *indev, *outdev;
+ void *table_base;
+ struct ipt_entry *e, *back;
+
+ /* Initialization */
+ ip = (*pskb)->nh.iph;
+ datalen = (*pskb)->len - ip->ihl * 4;
+ indev = in ? in->name : nulldevname;
+ outdev = out ? out->name : nulldevname;
+ /* We handle fragments by dealing with the first fragment as
+ * if it was a normal packet. All other fragments are treated
+ * normally, except that they will NEVER match rules that ask
+ * things we don't know (ie. tcp syn flag or ports). If the
+ * rule is also a fragment-specific rule, non-fragments won't
+ * match it. */
+ offset = ntohs(ip->frag_off) & IP_OFFSET;
+
+ read_lock_bh(&table->lock);
+ IP_NF_ASSERT(table->valid_hooks & (1 << hook));
+ table_base = (void *)table->private->entries
+ + TABLE_OFFSET(table->private, smp_processor_id());
+ e = get_entry(table_base, table->private->hook_entry[hook]);
+
+#ifdef CONFIG_NETFILTER_DEBUG
+ /* Check no one else is using our table */
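+ /* Debug sentinels stored in the first entry's comefrom:
+ * 0xdead57ac = table idle, 0x57acc001 = traversal in progress,
+ * 0xeeeeeeec = inside a target call (which may legitimately
+ * re-enter the table). */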
+ if (((struct ipt_entry *)table_base)->comefrom != 0xdead57ac
+ && ((struct ipt_entry *)table_base)->comefrom != 0xeeeeeeec) {
+ printk("ASSERT: CPU #%u, %s comefrom(%p) = %X\n",
+ smp_processor_id(),
+ table->name,
+ &((struct ipt_entry *)table_base)->comefrom,
+ ((struct ipt_entry *)table_base)->comefrom);
+ }
+ ((struct ipt_entry *)table_base)->comefrom = 0x57acc001;
+#endif
+
+ /* For return from builtin chain */
+ back = get_entry(table_base, table->private->underflow[hook]);
+
+ do {
+ IP_NF_ASSERT(e);
+ IP_NF_ASSERT(back);
+ (*pskb)->nfcache |= e->nfcache;
+ if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) {
+ struct ipt_entry_target *t;
+
+ if (IPT_MATCH_ITERATE(e, do_match,
+ *pskb, in, out,
+ offset, &hotdrop) != 0)
+ goto no_match;
+
+ ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);
+
+ t = ipt_get_target(e);
+ IP_NF_ASSERT(t->u.kernel.target);
+ /* Standard target? */
+ if (!t->u.kernel.target->target) {
+ int v;
+
+ v = ((struct ipt_standard_target *)t)->verdict;
+ if (v < 0) {
+ /* Pop from stack? */
+ if (v != IPT_RETURN) {
+ verdict = (unsigned)(-v) - 1;
+ break;
+ }
+ e = back;
+ back = get_entry(table_base,
+ back->comefrom);
+ continue;
+ }
+ if (table_base + v
+ != (void *)e + e->next_offset) {
+ /* Save old back ptr in next entry */
+ struct ipt_entry *next
+ = (void *)e + e->next_offset;
+ next->comefrom
+ = (void *)back - table_base;
+ /* set back pointer to next entry */
+ back = next;
+ }
+
+ e = get_entry(table_base, v);
+ } else {
+ /* Targets which reenter must return
+ abs. verdicts */
+#ifdef CONFIG_NETFILTER_DEBUG
+ ((struct ipt_entry *)table_base)->comefrom
+ = 0xeeeeeeec;
+#endif
+ verdict = t->u.kernel.target->target(pskb,
+ in, out,
+ hook,
+ t->data,
+ userdata);
+
+#ifdef CONFIG_NETFILTER_DEBUG
+ if (((struct ipt_entry *)table_base)->comefrom
+ != 0xeeeeeeec
+ && verdict == IPT_CONTINUE) {
+ printk("Target %s reentered!\n",
+ t->u.kernel.target->name);
+ verdict = NF_DROP;
+ }
+ ((struct ipt_entry *)table_base)->comefrom
+ = 0x57acc001;
+#endif
+ /* Target might have changed stuff. */
+ ip = (*pskb)->nh.iph;
+ datalen = (*pskb)->len - ip->ihl * 4;
+
+ if (verdict == IPT_CONTINUE)
+ e = (void *)e + e->next_offset;
+ else
+ /* Verdict */
+ break;
+ }
+ } else {
+
+ no_match:
+ e = (void *)e + e->next_offset;
+ }
+ } while (!hotdrop);
+
+#ifdef CONFIG_NETFILTER_DEBUG
+ ((struct ipt_entry *)table_base)->comefrom = 0xdead57ac;
+#endif
+ read_unlock_bh(&table->lock);
+
+#ifdef DEBUG_ALLOW_ALL
+ return NF_ACCEPT;
+#else
+ if (hotdrop)
+ return NF_DROP;
+ else return verdict;
+#endif
+}
+
+/*
+ * These are weird, but module loading must not be done with mutex
+ * held (since they will register), and we have to have a single
+ * function to use try_then_request_module().
+ */
+
+/* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */
+static inline struct ipt_table *find_table_lock(const char *name)
+{
+ struct ipt_table *t;
+
+ if (down_interruptible(&ipt_mutex) != 0)
+ return ERR_PTR(-EINTR);
+
+ list_for_each_entry(t, &ipt_tables, list)
+ if (strcmp(t->name, name) == 0 && try_module_get(t->me))
+ return t;
+ up(&ipt_mutex);
+ return NULL;
+}
+
+/* Find match, grabs ref. Returns ERR_PTR() on error. */
+static inline struct ipt_match *find_match(const char *name, u8 revision)
+{
+ struct ipt_match *m;
+ int err = 0;
+
+ if (down_interruptible(&ipt_mutex) != 0)
+ return ERR_PTR(-EINTR);
+
+ list_for_each_entry(m, &ipt_match, list) {
+ if (strcmp(m->name, name) == 0) {
+ if (m->revision == revision) {
+ if (try_module_get(m->me)) {
+ up(&ipt_mutex);
+ return m;
+ }
+ } else
+ err = -EPROTOTYPE; /* Found something. */
+ }
+ }
+ up(&ipt_mutex);
+ return ERR_PTR(err);
+}
+
+/* Find target, grabs ref. Returns ERR_PTR() on error. */
+static inline struct ipt_target *find_target(const char *name, u8 revision)
+{
+ struct ipt_target *t;
+ int err = 0;
+
+ if (down_interruptible(&ipt_mutex) != 0)
+ return ERR_PTR(-EINTR);
+
+ list_for_each_entry(t, &ipt_target, list) {
+ if (strcmp(t->name, name) == 0) {
+ if (t->revision == revision) {
+ if (try_module_get(t->me)) {
+ up(&ipt_mutex);
+ return t;
+ }
+ } else
+ err = -EPROTOTYPE; /* Found something. */
+ }
+ }
+ up(&ipt_mutex);
+ return ERR_PTR(err);
+}
+
+struct ipt_target *ipt_find_target(const char *name, u8 revision)
+{
+ struct ipt_target *target;
+
+ target = try_then_request_module(find_target(name, revision),
+ "ipt_%s", name);
+ if (IS_ERR(target) || !target)
+ return NULL;
+ return target;
+}
+
+static int match_revfn(const char *name, u8 revision, int *bestp)
+{
+ struct ipt_match *m;
+ int have_rev = 0;
+
+ list_for_each_entry(m, &ipt_match, list) {
+ if (strcmp(m->name, name) == 0) {
+ if (m->revision > *bestp)
+ *bestp = m->revision;
+ if (m->revision == revision)
+ have_rev = 1;
+ }
+ }
+ return have_rev;
+}
+
+static int target_revfn(const char *name, u8 revision, int *bestp)
+{
+ struct ipt_target *t;
+ int have_rev = 0;
+
+ list_for_each_entry(t, &ipt_target, list) {
+ if (strcmp(t->name, name) == 0) {
+ if (t->revision > *bestp)
+ *bestp = t->revision;
+ if (t->revision == revision)
+ have_rev = 1;
+ }
+ }
+ return have_rev;
+}
+
+/* Returns true or false (if no such extension at all) */
+static inline int find_revision(const char *name, u8 revision,
+ int (*revfn)(const char *, u8, int *),
+ int *err)
+{
+ int have_rev, best = -1;
+
+ if (down_interruptible(&ipt_mutex) != 0) {
+ *err = -EINTR;
+ return 1;
+ }
+ have_rev = revfn(name, revision, &best);
+ up(&ipt_mutex);
+
+ /* Nothing at all? Return 0 to try loading module. */
+ if (best == -1) {
+ *err = -ENOENT;
+ return 0;
+ }
+
+ *err = best;
+ if (!have_rev)
+ *err = -EPROTONOSUPPORT;
+ return 1;
+}
+
+
+/* All zeroes == unconditional rule. */
+static inline int
+unconditional(const struct ipt_ip *ip)
+{
+ unsigned int i;
+
+ for (i = 0; i < sizeof(*ip)/sizeof(__u32); i++)
+ if (((__u32 *)ip)[i])
+ return 0;
+
+ return 1;
+}
+
+/* Figures out from what hook each rule can be called: returns 0 if
+ there are loops. Puts hook bitmask in comefrom. */
+static int
+mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks)
+{
+ unsigned int hook;
+
+ /* No recursion; use packet counter to save back ptrs (reset
+ to 0 as we leave), and comefrom to save source hook bitmask */
+ for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) {
+ unsigned int pos = newinfo->hook_entry[hook];
+ struct ipt_entry *e
+ = (struct ipt_entry *)(newinfo->entries + pos);
+
+ if (!(valid_hooks & (1 << hook)))
+ continue;
+
+ /* Set initial back pointer. */
+ e->counters.pcnt = pos;
+
+ for (;;) {
+ struct ipt_standard_target *t
+ = (void *)ipt_get_target(e);
+
+ if (e->comefrom & (1 << NF_IP_NUMHOOKS)) {
+ printk("iptables: loop hook %u pos %u %08X.\n",
+ hook, pos, e->comefrom);
+ return 0;
+ }
+ e->comefrom
+ |= ((1 << hook) | (1 << NF_IP_NUMHOOKS));
+
+ /* Unconditional return/END. */
+ if (e->target_offset == sizeof(struct ipt_entry)
+ && (strcmp(t->target.u.user.name,
+ IPT_STANDARD_TARGET) == 0)
+ && t->verdict < 0
+ && unconditional(&e->ip)) {
+ unsigned int oldpos, size;
+
+ /* Return: backtrack through the last
+ big jump. */
+ do {
+ e->comefrom ^= (1<<NF_IP_NUMHOOKS);
+#ifdef DEBUG_IP_FIREWALL_USER
+ if (e->comefrom
+ & (1 << NF_IP_NUMHOOKS)) {
+ duprintf("Back unset "
+ "on hook %u "
+ "rule %u\n",
+ hook, pos);
+ }
+#endif
+ oldpos = pos;
+ pos = e->counters.pcnt;
+ e->counters.pcnt = 0;
+
+ /* We're at the start. */
+ if (pos == oldpos)
+ goto next;
+
+ e = (struct ipt_entry *)
+ (newinfo->entries + pos);
+ } while (oldpos == pos + e->next_offset);
+
+ /* Move along one */
+ size = e->next_offset;
+ e = (struct ipt_entry *)
+ (newinfo->entries + pos + size);
+ e->counters.pcnt = pos;
+ pos += size;
+ } else {
+ int newpos = t->verdict;
+
+ if (strcmp(t->target.u.user.name,
+ IPT_STANDARD_TARGET) == 0
+ && newpos >= 0) {
+ /* This is a jump; chase it. */
+ duprintf("Jump rule %u -> %u\n",
+ pos, newpos);
+ } else {
+ /* ... this is a fallthru */
+ newpos = pos + e->next_offset;
+ }
+ e = (struct ipt_entry *)
+ (newinfo->entries + newpos);
+ e->counters.pcnt = pos;
+ pos = newpos;
+ }
+ }
+ next:
+ duprintf("Finished chain %u\n", hook);
+ }
+ return 1;
+}
+
+static inline int
+cleanup_match(struct ipt_entry_match *m, unsigned int *i)
+{
+ if (i && (*i)-- == 0)
+ return 1;
+
+ if (m->u.kernel.match->destroy)
+ m->u.kernel.match->destroy(m->data,
+ m->u.match_size - sizeof(*m));
+ module_put(m->u.kernel.match->me);
+ return 0;
+}
+
+static inline int
+standard_check(const struct ipt_entry_target *t,
+ unsigned int max_offset)
+{
+ struct ipt_standard_target *targ = (void *)t;
+
+ /* Check standard info. */
+ if (t->u.target_size
+ != IPT_ALIGN(sizeof(struct ipt_standard_target))) {
+ duprintf("standard_check: target size %u != %u\n",
+ t->u.target_size,
+ IPT_ALIGN(sizeof(struct ipt_standard_target)));
+ return 0;
+ }
+
+ if (targ->verdict >= 0
+ && targ->verdict > max_offset - sizeof(struct ipt_entry)) {
+ duprintf("ipt_standard_check: bad verdict (%i)\n",
+ targ->verdict);
+ return 0;
+ }
+
+ if (targ->verdict < -NF_MAX_VERDICT - 1) {
+ duprintf("ipt_standard_check: bad negative verdict (%i)\n",
+ targ->verdict);
+ return 0;
+ }
+ return 1;
+}
+
+static inline int
+check_match(struct ipt_entry_match *m,
+ const char *name,
+ const struct ipt_ip *ip,
+ unsigned int hookmask,
+ unsigned int *i)
+{
+ struct ipt_match *match;
+
+ match = try_then_request_module(find_match(m->u.user.name,
+ m->u.user.revision),
+ "ipt_%s", m->u.user.name);
+ if (IS_ERR(match) || !match) {
+ duprintf("check_match: `%s' not found\n", m->u.user.name);
+ return match ? PTR_ERR(match) : -ENOENT;
+ }
+ m->u.kernel.match = match;
+
+ if (m->u.kernel.match->checkentry
+ && !m->u.kernel.match->checkentry(name, ip, m->data,
+ m->u.match_size - sizeof(*m),
+ hookmask)) {
+ module_put(m->u.kernel.match->me);
+ duprintf("ip_tables: check failed for `%s'.\n",
+ m->u.kernel.match->name);
+ return -EINVAL;
+ }
+
+ (*i)++;
+ return 0;
+}
+
+static struct ipt_target ipt_standard_target;
+
+static inline int
+check_entry(struct ipt_entry *e, const char *name, unsigned int size,
+ unsigned int *i)
+{
+ struct ipt_entry_target *t;
+ struct ipt_target *target;
+ int ret;
+ unsigned int j;
+
+ if (!ip_checkentry(&e->ip)) {
+ duprintf("ip_tables: ip check failed %p %s.\n", e, name);
+ return -EINVAL;
+ }
+
+ j = 0;
+ ret = IPT_MATCH_ITERATE(e, check_match, name, &e->ip, e->comefrom, &j);
+ if (ret != 0)
+ goto cleanup_matches;
+
+ t = ipt_get_target(e);
+ target = try_then_request_module(find_target(t->u.user.name,
+ t->u.user.revision),
+ "ipt_%s", t->u.user.name);
+ if (IS_ERR(target) || !target) {
+ duprintf("check_entry: `%s' not found\n", t->u.user.name);
+ ret = target ? PTR_ERR(target) : -ENOENT;
+ goto cleanup_matches;
+ }
+ t->u.kernel.target = target;
+
+ if (t->u.kernel.target == &ipt_standard_target) {
+ if (!standard_check(t, size)) {
+ ret = -EINVAL;
+ goto cleanup_matches;
+ }
+ } else if (t->u.kernel.target->checkentry
+ && !t->u.kernel.target->checkentry(name, e, t->data,
+ t->u.target_size
+ - sizeof(*t),
+ e->comefrom)) {
+ module_put(t->u.kernel.target->me);
+ duprintf("ip_tables: check failed for `%s'.\n",
+ t->u.kernel.target->name);
+ ret = -EINVAL;
+ goto cleanup_matches;
+ }
+
+ (*i)++;
+ return 0;
+
+ cleanup_matches:
+ IPT_MATCH_ITERATE(e, cleanup_match, &j);
+ return ret;
+}
+
+static inline int
+check_entry_size_and_hooks(struct ipt_entry *e,
+ struct ipt_table_info *newinfo,
+ unsigned char *base,
+ unsigned char *limit,
+ const unsigned int *hook_entries,
+ const unsigned int *underflows,
+ unsigned int *i)
+{
+ unsigned int h;
+
+ if ((unsigned long)e % __alignof__(struct ipt_entry) != 0
+ || (unsigned char *)e + sizeof(struct ipt_entry) >= limit) {
+ duprintf("Bad offset %p\n", e);
+ return -EINVAL;
+ }
+
+ if (e->next_offset
+ < sizeof(struct ipt_entry) + sizeof(struct ipt_entry_target)) {
+ duprintf("checking: element %p size %u\n",
+ e, e->next_offset);
+ return -EINVAL;
+ }
+
+ /* Check hooks & underflows */
+ for (h = 0; h < NF_IP_NUMHOOKS; h++) {
+ if ((unsigned char *)e - base == hook_entries[h])
+ newinfo->hook_entry[h] = hook_entries[h];
+ if ((unsigned char *)e - base == underflows[h])
+ newinfo->underflow[h] = underflows[h];
+ }
+
+ /* FIXME: underflows must be unconditional, standard verdicts
+ < 0 (not IPT_RETURN). --RR */
+
+ /* Clear counters and comefrom */
+ e->counters = ((struct ipt_counters) { 0, 0 });
+ e->comefrom = 0;
+
+ (*i)++;
+ return 0;
+}
+
+static inline int
+cleanup_entry(struct ipt_entry *e, unsigned int *i)
+{
+ struct ipt_entry_target *t;
+
+ if (i && (*i)-- == 0)
+ return 1;
+
+ /* Cleanup all matches */
+ IPT_MATCH_ITERATE(e, cleanup_match, NULL);
+ t = ipt_get_target(e);
+ if (t->u.kernel.target->destroy)
+ t->u.kernel.target->destroy(t->data,
+ t->u.target_size - sizeof(*t));
+ module_put(t->u.kernel.target->me);
+ return 0;
+}
+
+/* Checks and translates the user-supplied table segment (held in
+ newinfo) */
+static int
+translate_table(const char *name,
+ unsigned int valid_hooks,
+ struct ipt_table_info *newinfo,
+ unsigned int size,
+ unsigned int number,
+ const unsigned int *hook_entries,
+ const unsigned int *underflows)
+{
+ unsigned int i;
+ int ret;
+
+ newinfo->size = size;
+ newinfo->number = number;
+
+ /* Init all hooks to impossible value. */
+ for (i = 0; i < NF_IP_NUMHOOKS; i++) {
+ newinfo->hook_entry[i] = 0xFFFFFFFF;
+ newinfo->underflow[i] = 0xFFFFFFFF;
+ }
+
+ duprintf("translate_table: size %u\n", newinfo->size);
+ i = 0;
+ /* Walk through entries, checking offsets. */
+ ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ check_entry_size_and_hooks,
+ newinfo,
+ newinfo->entries,
+ newinfo->entries + size,
+ hook_entries, underflows, &i);
+ if (ret != 0)
+ return ret;
+
+ if (i != number) {
+ duprintf("translate_table: %u not %u entries\n",
+ i, number);
+ return -EINVAL;
+ }
+
+ /* Check hooks all assigned */
+ for (i = 0; i < NF_IP_NUMHOOKS; i++) {
+ /* Only hooks which are valid */
+ if (!(valid_hooks & (1 << i)))
+ continue;
+ if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
+ duprintf("Invalid hook entry %u %u\n",
+ i, hook_entries[i]);
+ return -EINVAL;
+ }
+ if (newinfo->underflow[i] == 0xFFFFFFFF) {
+ duprintf("Invalid underflow %u %u\n",
+ i, underflows[i]);
+ return -EINVAL;
+ }
+ }
+
+ if (!mark_source_chains(newinfo, valid_hooks))
+ return -ELOOP;
+
+ /* Finally, each sanity check must pass */
+ i = 0;
+ ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ check_entry, name, size, &i);
+
+ if (ret != 0) {
+ IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ cleanup_entry, &i);
+ return ret;
+ }
+
+ /* And one copy for every other CPU */
+ for (i = 1; i < num_possible_cpus(); i++) {
+ memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i,
+ newinfo->entries,
+ SMP_ALIGN(newinfo->size));
+ }
+
+ return ret;
+}
+
+static struct ipt_table_info *
+replace_table(struct ipt_table *table,
+ unsigned int num_counters,
+ struct ipt_table_info *newinfo,
+ int *error)
+{
+ struct ipt_table_info *oldinfo;
+
+#ifdef CONFIG_NETFILTER_DEBUG
+ {
+ struct ipt_entry *table_base;
+ unsigned int i;
+
+ for (i = 0; i < num_possible_cpus(); i++) {
+ table_base =
+ (void *)newinfo->entries
+ + TABLE_OFFSET(newinfo, i);
+
+ table_base->comefrom = 0xdead57ac;
+ }
+ }
+#endif
+
+ /* Do the substitution. */
+ write_lock_bh(&table->lock);
+ /* Check inside lock: is the old number correct? */
+ if (num_counters != table->private->number) {
+ duprintf("num_counters != table->private->number (%u/%u)\n",
+ num_counters, table->private->number);
+ write_unlock_bh(&table->lock);
+ *error = -EAGAIN;
+ return NULL;
+ }
+ oldinfo = table->private;
+ table->private = newinfo;
+ newinfo->initial_entries = oldinfo->initial_entries;
+ write_unlock_bh(&table->lock);
+
+ return oldinfo;
+}
+
+/* Gets counters. */
+static inline int
+add_entry_to_counter(const struct ipt_entry *e,
+ struct ipt_counters total[],
+ unsigned int *i)
+{
+ ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
+
+ (*i)++;
+ return 0;
+}
+
+static void
+get_counters(const struct ipt_table_info *t,
+ struct ipt_counters counters[])
+{
+ unsigned int cpu;
+ unsigned int i;
+
+ for (cpu = 0; cpu < num_possible_cpus(); cpu++) {
+ i = 0;
+ IPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu),
+ t->size,
+ add_entry_to_counter,
+ counters,
+ &i);
+ }
+}
+
+static int
+copy_entries_to_user(unsigned int total_size,
+ struct ipt_table *table,
+ void __user *userptr)
+{
+ unsigned int off, num, countersize;
+ struct ipt_entry *e;
+ struct ipt_counters *counters;
+ int ret = 0;
+
+ /* We need atomic snapshot of counters: rest doesn't change
+ (other than comefrom, which userspace doesn't care
+ about). */
+ countersize = sizeof(struct ipt_counters) * table->private->number;
+ counters = vmalloc(countersize);
+
+ if (counters == NULL)
+ return -ENOMEM;
+
+ /* First, sum counters... */
+ memset(counters, 0, countersize);
+ write_lock_bh(&table->lock);
+ get_counters(table->private, counters);
+ write_unlock_bh(&table->lock);
+
+ /* ... then copy entire thing from CPU 0... */
+ if (copy_to_user(userptr, table->private->entries, total_size) != 0) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
+
+ /* FIXME: use iterator macros --RR */
+ /* ... then go back and fix counters and names */
+ for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
+ unsigned int i;
+ struct ipt_entry_match *m;
+ struct ipt_entry_target *t;
+
+ e = (struct ipt_entry *)(table->private->entries + off);
+ if (copy_to_user(userptr + off
+ + offsetof(struct ipt_entry, counters),
+ &counters[num],
+ sizeof(counters[num])) != 0) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
+
+ for (i = sizeof(struct ipt_entry);
+ i < e->target_offset;
+ i += m->u.match_size) {
+ m = (void *)e + i;
+
+ if (copy_to_user(userptr + off + i
+ + offsetof(struct ipt_entry_match,
+ u.user.name),
+ m->u.kernel.match->name,
+ strlen(m->u.kernel.match->name)+1)
+ != 0) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
+ }
+
+ t = ipt_get_target(e);
+ if (copy_to_user(userptr + off + e->target_offset
+ + offsetof(struct ipt_entry_target,
+ u.user.name),
+ t->u.kernel.target->name,
+ strlen(t->u.kernel.target->name)+1) != 0) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
+ }
+
+ free_counters:
+ vfree(counters);
+ return ret;
+}
+
+static int
+get_entries(const struct ipt_get_entries *entries,
+ struct ipt_get_entries __user *uptr)
+{
+ int ret;
+ struct ipt_table *t;
+
+ t = find_table_lock(entries->name);
+ if (t && !IS_ERR(t)) {
+ duprintf("t->private->number = %u\n",
+ t->private->number);
+ if (entries->size == t->private->size)
+ ret = copy_entries_to_user(t->private->size,
+ t, uptr->entrytable);
+ else {
+ duprintf("get_entries: I've got %u not %u!\n",
+ t->private->size,
+ entries->size);
+ ret = -EINVAL;
+ }
+ module_put(t->me);
+ up(&ipt_mutex);
+ } else
+ ret = t ? PTR_ERR(t) : -ENOENT;
+
+ return ret;
+}
+
+static int
+do_replace(void __user *user, unsigned int len)
+{
+ int ret;
+ struct ipt_replace tmp;
+ struct ipt_table *t;
+ struct ipt_table_info *newinfo, *oldinfo;
+ struct ipt_counters *counters;
+
+ if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ return -EFAULT;
+
+ /* Hack: Causes ipchains to give correct error msg --RR */
+ if (len != sizeof(tmp) + tmp.size)
+ return -ENOPROTOOPT;
+
+ /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */
+ if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
+ return -ENOMEM;
+
+ newinfo = vmalloc(sizeof(struct ipt_table_info)
+ + SMP_ALIGN(tmp.size) * num_possible_cpus());
+ if (!newinfo)
+ return -ENOMEM;
+
+ if (copy_from_user(newinfo->entries, user + sizeof(tmp),
+ tmp.size) != 0) {
+ ret = -EFAULT;
+ goto free_newinfo;
+ }
+
+ counters = vmalloc(tmp.num_counters * sizeof(struct ipt_counters));
+ if (!counters) {
+ ret = -ENOMEM;
+ goto free_newinfo;
+ }
+ memset(counters, 0, tmp.num_counters * sizeof(struct ipt_counters));
+
+ ret = translate_table(tmp.name, tmp.valid_hooks,
+ newinfo, tmp.size, tmp.num_entries,
+ tmp.hook_entry, tmp.underflow);
+ if (ret != 0)
+ goto free_newinfo_counters;
+
+ duprintf("ip_tables: Translated table\n");
+
+ t = try_then_request_module(find_table_lock(tmp.name),
+ "iptable_%s", tmp.name);
+ if (!t || IS_ERR(t)) {
+ ret = t ? PTR_ERR(t) : -ENOENT;
+ goto free_newinfo_counters_untrans;
+ }
+
+ /* You lied! */
+ if (tmp.valid_hooks != t->valid_hooks) {
+ duprintf("Valid hook crap: %08X vs %08X\n",
+ tmp.valid_hooks, t->valid_hooks);
+ ret = -EINVAL;
+ goto put_module;
+ }
+
+ oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret);
+ if (!oldinfo)
+ goto put_module;
+
+ /* Update module usage count based on number of rules */
+ duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n",
+ oldinfo->number, oldinfo->initial_entries, newinfo->number);
+ if ((oldinfo->number > oldinfo->initial_entries) ||
+ (newinfo->number <= oldinfo->initial_entries))
+ module_put(t->me);
+ if ((oldinfo->number > oldinfo->initial_entries) &&
+ (newinfo->number <= oldinfo->initial_entries))
+ module_put(t->me);
+
+ /* Get the old counters. */
+ get_counters(oldinfo, counters);
+ /* Decrease module usage counts and free resource */
+ IPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL);
+ vfree(oldinfo);
+ if (copy_to_user(tmp.counters, counters,
+ sizeof(struct ipt_counters) * tmp.num_counters) != 0)
+ ret = -EFAULT;
+ vfree(counters);
+ up(&ipt_mutex);
+ return ret;
+
+ put_module:
+ module_put(t->me);
+ up(&ipt_mutex);
+ free_newinfo_counters_untrans:
+ IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL);
+ free_newinfo_counters:
+ vfree(counters);
+ free_newinfo:
+ vfree(newinfo);
+ return ret;
+}
+
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static inline int
+add_counter_to_entry(struct ipt_entry *e,
+ const struct ipt_counters addme[],
+ unsigned int *i)
+{
+#if 0
+ duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n",
+ *i,
+ (long unsigned int)e->counters.pcnt,
+ (long unsigned int)e->counters.bcnt,
+ (long unsigned int)addme[*i].pcnt,
+ (long unsigned int)addme[*i].bcnt);
+#endif
+
+ ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+ (*i)++;
+ return 0;
+}
+
+static int
+do_add_counters(void __user *user, unsigned int len)
+{
+ unsigned int i;
+ struct ipt_counters_info tmp, *paddc;
+ struct ipt_table *t;
+ int ret = 0;
+
+ if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ return -EFAULT;
+
+ if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ipt_counters))
+ return -EINVAL;
+
+ paddc = vmalloc(len);
+ if (!paddc)
+ return -ENOMEM;
+
+ if (copy_from_user(paddc, user, len) != 0) {
+ ret = -EFAULT;
+ goto free;
+ }
+
+ t = find_table_lock(tmp.name);
+ if (!t || IS_ERR(t)) {
+ ret = t ? PTR_ERR(t) : -ENOENT;
+ goto free;
+ }
+
+ write_lock_bh(&t->lock);
+ if (t->private->number != paddc->num_counters) {
+ ret = -EINVAL;
+ goto unlock_up_free;
+ }
+
+ i = 0;
+ IPT_ENTRY_ITERATE(t->private->entries,
+ t->private->size,
+ add_counter_to_entry,
+ paddc->counters,
+ &i);
+ unlock_up_free:
+ write_unlock_bh(&t->lock);
+ up(&ipt_mutex);
+ module_put(t->me);
+ free:
+ vfree(paddc);
+
+ return ret;
+}
+
+static int
+do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+{
+ int ret;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case IPT_SO_SET_REPLACE:
+ ret = do_replace(user, len);
+ break;
+
+ case IPT_SO_SET_ADD_COUNTERS:
+ ret = do_add_counters(user, len);
+ break;
+
+ default:
+ duprintf("do_ipt_set_ctl: unknown request %i\n", cmd);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+static int
+do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+{
+ int ret;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case IPT_SO_GET_INFO: {
+ char name[IPT_TABLE_MAXNAMELEN];
+ struct ipt_table *t;
+
+ if (*len != sizeof(struct ipt_getinfo)) {
+ duprintf("length %u != %u\n", *len,
+ sizeof(struct ipt_getinfo));
+ ret = -EINVAL;
+ break;
+ }
+
+ if (copy_from_user(name, user, sizeof(name)) != 0) {
+ ret = -EFAULT;
+ break;
+ }
+ name[IPT_TABLE_MAXNAMELEN-1] = '\0';
+
+ t = try_then_request_module(find_table_lock(name),
+ "iptable_%s", name);
+ if (t && !IS_ERR(t)) {
+ struct ipt_getinfo info;
+
+ info.valid_hooks = t->valid_hooks;
+ memcpy(info.hook_entry, t->private->hook_entry,
+ sizeof(info.hook_entry));
+ memcpy(info.underflow, t->private->underflow,
+ sizeof(info.underflow));
+ info.num_entries = t->private->number;
+ info.size = t->private->size;
+ memcpy(info.name, name, sizeof(info.name));
+
+ if (copy_to_user(user, &info, *len) != 0)
+ ret = -EFAULT;
+ else
+ ret = 0;
+ up(&ipt_mutex);
+ module_put(t->me);
+ } else
+ ret = t ? PTR_ERR(t) : -ENOENT;
+ }
+ break;
+
+ case IPT_SO_GET_ENTRIES: {
+ struct ipt_get_entries get;
+
+ if (*len < sizeof(get)) {
+ duprintf("get_entries: %u < %u\n", *len, sizeof(get));
+ ret = -EINVAL;
+ } else if (copy_from_user(&get, user, sizeof(get)) != 0) {
+ ret = -EFAULT;
+ } else if (*len != sizeof(struct ipt_get_entries) + get.size) {
+ duprintf("get_entries: %u != %u\n", *len,
+ sizeof(struct ipt_get_entries) + get.size);
+ ret = -EINVAL;
+ } else
+ ret = get_entries(&get, user);
+ break;
+ }
+
+ case IPT_SO_GET_REVISION_MATCH:
+ case IPT_SO_GET_REVISION_TARGET: {
+ struct ipt_get_revision rev;
+ int (*revfn)(const char *, u8, int *);
+
+ if (*len != sizeof(rev)) {
+ ret = -EINVAL;
+ break;
+ }
+ if (copy_from_user(&rev, user, sizeof(rev)) != 0) {
+ ret = -EFAULT;
+ break;
+ }
+
+ if (cmd == IPT_SO_GET_REVISION_TARGET)
+ revfn = target_revfn;
+ else
+ revfn = match_revfn;
+
+ try_then_request_module(find_revision(rev.name, rev.revision,
+ revfn, &ret),
+ "ipt_%s", rev.name);
+ break;
+ }
+
+ default:
+ duprintf("do_ipt_get_ctl: unknown request %i\n", cmd);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+/* Registration hooks for targets. */
+int
+ipt_register_target(struct ipt_target *target)
+{
+ int ret;
+
+ ret = down_interruptible(&ipt_mutex);
+ if (ret != 0)
+ return ret;
+ list_add(&target->list, &ipt_target);
+ up(&ipt_mutex);
+ return ret;
+}
+
+void
+ipt_unregister_target(struct ipt_target *target)
+{
+ down(&ipt_mutex);
+ LIST_DELETE(&ipt_target, target);
+ up(&ipt_mutex);
+}
+
+int
+ipt_register_match(struct ipt_match *match)
+{
+ int ret;
+
+ ret = down_interruptible(&ipt_mutex);
+ if (ret != 0)
+ return ret;
+
+ list_add(&match->list, &ipt_match);
+ up(&ipt_mutex);
+
+ return ret;
+}
+
+void
+ipt_unregister_match(struct ipt_match *match)
+{
+ down(&ipt_mutex);
+ LIST_DELETE(&ipt_match, match);
+ up(&ipt_mutex);
+}
+
+int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl)
+{
+ int ret;
+ struct ipt_table_info *newinfo;
+ static struct ipt_table_info bootstrap
+ = { 0, 0, 0, { 0 }, { 0 }, { } };
+
+ newinfo = vmalloc(sizeof(struct ipt_table_info)
+ + SMP_ALIGN(repl->size) * num_possible_cpus());
+ if (!newinfo)
+ return -ENOMEM;
+
+ memcpy(newinfo->entries, repl->entries, repl->size);
+
+ ret = translate_table(table->name, table->valid_hooks,
+ newinfo, repl->size,
+ repl->num_entries,
+ repl->hook_entry,
+ repl->underflow);
+ if (ret != 0) {
+ vfree(newinfo);
+ return ret;
+ }
+
+ ret = down_interruptible(&ipt_mutex);
+ if (ret != 0) {
+ vfree(newinfo);
+ return ret;
+ }
+
+ /* Don't autoload: we'd eat our tail... */
+ if (list_named_find(&ipt_tables, table->name)) {
+ ret = -EEXIST;
+ goto free_unlock;
+ }
+
+ /* Simplifies replace_table code. */
+ table->private = &bootstrap;
+ if (!replace_table(table, 0, newinfo, &ret))
+ goto free_unlock;
+
+ duprintf("table->private->number = %u\n",
+ table->private->number);
+
+ /* save number of initial entries */
+ table->private->initial_entries = table->private->number;
+
+ rwlock_init(&table->lock);
+ list_prepend(&ipt_tables, table);
+
+ unlock:
+ up(&ipt_mutex);
+ return ret;
+
+ free_unlock:
+ vfree(newinfo);
+ goto unlock;
+}
+
+void ipt_unregister_table(struct ipt_table *table)
+{
+ down(&ipt_mutex);
+ LIST_DELETE(&ipt_tables, table);
+ up(&ipt_mutex);
+
+ /* Decrease module usage counts and free resources */
+ IPT_ENTRY_ITERATE(table->private->entries, table->private->size,
+ cleanup_entry, NULL);
+ vfree(table->private);
+}
+
+/* Returns 1 if the port is matched by the range, 0 otherwise */
+static inline int
+port_match(u_int16_t min, u_int16_t max, u_int16_t port, int invert)
+{
+ int ret;
+
+ ret = (port >= min && port <= max) ^ invert;
+ return ret;
+}
+
+static int
+tcp_find_option(u_int8_t option,
+ const struct sk_buff *skb,
+ unsigned int optlen,
+ int invert,
+ int *hotdrop)
+{
+ /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */
+ u_int8_t _opt[60 - sizeof(struct tcphdr)], *op;
+ unsigned int i;
+
+ duprintf("tcp_match: finding option\n");
+
+ if (!optlen)
+ return invert;
+
+ /* If we don't have the whole header, drop packet. */
+ op = skb_header_pointer(skb,
+ skb->nh.iph->ihl*4 + sizeof(struct tcphdr),
+ optlen, _opt);
+ if (op == NULL) {
+ *hotdrop = 1;
+ return 0;
+ }
+
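+ /* Walk the TCP options: EOL/NOP are single-byte options, all others
+ * carry a length byte (treat 0 as 1 so we cannot loop forever). */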
+ for (i = 0; i < optlen; ) {
+ if (op[i] == option) return !invert;
+ if (op[i] < 2) i++;
+ else i += op[i+1]?:1;
+ }
+
+ return invert;
+}
+
+static int
+tcp_match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ struct tcphdr _tcph, *th;
+ const struct ipt_tcp *tcpinfo = matchinfo;
+
+ if (offset) {
+ /* To quote Alan:
+
+ Don't allow a fragment of TCP 8 bytes in. Nobody normal
+ causes this. Its a cracker trying to break in by doing a
+ flag overwrite to pass the direction checks.
+ */
+ if (offset == 1) {
+ duprintf("Dropping evil TCP offset=1 frag.\n");
+ *hotdrop = 1;
+ }
+ /* Must not be a fragment. */
+ return 0;
+ }
+
+#define FWINVTCP(bool,invflg) ((bool) ^ !!(tcpinfo->invflags & invflg))
+
+ th = skb_header_pointer(skb, skb->nh.iph->ihl*4,
+ sizeof(_tcph), &_tcph);
+ if (th == NULL) {
+ /* We've been asked to examine this packet, and we
+ can't. Hence, no choice but to drop. */
+ duprintf("Dropping evil TCP offset=0 tinygram.\n");
+ *hotdrop = 1;
+ return 0;
+ }
+
+ if (!port_match(tcpinfo->spts[0], tcpinfo->spts[1],
+ ntohs(th->source),
+ !!(tcpinfo->invflags & IPT_TCP_INV_SRCPT)))
+ return 0;
+ if (!port_match(tcpinfo->dpts[0], tcpinfo->dpts[1],
+ ntohs(th->dest),
+ !!(tcpinfo->invflags & IPT_TCP_INV_DSTPT)))
+ return 0;
+ if (!FWINVTCP((((unsigned char *)th)[13] & tcpinfo->flg_mask)
+ == tcpinfo->flg_cmp,
+ IPT_TCP_INV_FLAGS))
+ return 0;
+ if (tcpinfo->option) {
+ if (th->doff * 4 < sizeof(_tcph)) {
+ *hotdrop = 1;
+ return 0;
+ }
+ if (!tcp_find_option(tcpinfo->option, skb,
+ th->doff*4 - sizeof(_tcph),
+ tcpinfo->invflags & IPT_TCP_INV_OPTION,
+ hotdrop))
+ return 0;
+ }
+ return 1;
+}
+
+/* Called when user tries to insert an entry of this type. */
+static int
+tcp_checkentry(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ const struct ipt_tcp *tcpinfo = matchinfo;
+
+ /* Must specify proto == TCP, and no unknown invflags */
+ return ip->proto == IPPROTO_TCP
+ && !(ip->invflags & IPT_INV_PROTO)
+ && matchsize == IPT_ALIGN(sizeof(struct ipt_tcp))
+ && !(tcpinfo->invflags & ~IPT_TCP_INV_MASK);
+}
+
+static int
+udp_match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ struct udphdr _udph, *uh;
+ const struct ipt_udp *udpinfo = matchinfo;
+
+ /* Must not be a fragment. */
+ if (offset)
+ return 0;
+
+ uh = skb_header_pointer(skb, skb->nh.iph->ihl*4,
+ sizeof(_udph), &_udph);
+ if (uh == NULL) {
+ /* We've been asked to examine this packet, and we
+ can't. Hence, no choice but to drop. */
+ duprintf("Dropping evil UDP tinygram.\n");
+ *hotdrop = 1;
+ return 0;
+ }
+
+ return port_match(udpinfo->spts[0], udpinfo->spts[1],
+ ntohs(uh->source),
+ !!(udpinfo->invflags & IPT_UDP_INV_SRCPT))
+ && port_match(udpinfo->dpts[0], udpinfo->dpts[1],
+ ntohs(uh->dest),
+ !!(udpinfo->invflags & IPT_UDP_INV_DSTPT));
+}
+
+/* Called when user tries to insert an entry of this type. */
+static int
+udp_checkentry(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchinfosize,
+ unsigned int hook_mask)
+{
+ const struct ipt_udp *udpinfo = matchinfo;
+
+ /* Must specify proto == UDP, and no unknown invflags */
+ if (ip->proto != IPPROTO_UDP || (ip->invflags & IPT_INV_PROTO)) {
+ duprintf("ipt_udp: Protocol %u != %u\n", ip->proto,
+ IPPROTO_UDP);
+ return 0;
+ }
+ if (matchinfosize != IPT_ALIGN(sizeof(struct ipt_udp))) {
+ duprintf("ipt_udp: matchsize %u != %u\n",
+ matchinfosize, IPT_ALIGN(sizeof(struct ipt_udp)));
+ return 0;
+ }
+ if (udpinfo->invflags & ~IPT_UDP_INV_MASK) {
+ duprintf("ipt_udp: unknown flags %X\n",
+ udpinfo->invflags);
+ return 0;
+ }
+
+ return 1;
+}
+
+/* Returns 1 if the type and code is matched by the range, 0 otherwise */
+static inline int
+icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code,
+ u_int8_t type, u_int8_t code,
+ int invert)
+{
+ return ((test_type == 0xFF) || (type == test_type && code >= min_code && code <= max_code))
+ ^ invert;
+}
+
+static int
+icmp_match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ struct icmphdr _icmph, *ic;
+ const struct ipt_icmp *icmpinfo = matchinfo;
+
+ /* Must not be a fragment. */
+ if (offset)
+ return 0;
+
+ ic = skb_header_pointer(skb, skb->nh.iph->ihl*4,
+ sizeof(_icmph), &_icmph);
+ if (ic == NULL) {
+ /* We've been asked to examine this packet, and we
+ * can't. Hence, no choice but to drop.
+ */
+ duprintf("Dropping evil ICMP tinygram.\n");
+ *hotdrop = 1;
+ return 0;
+ }
+
+ return icmp_type_code_match(icmpinfo->type,
+ icmpinfo->code[0],
+ icmpinfo->code[1],
+ ic->type, ic->code,
+ !!(icmpinfo->invflags&IPT_ICMP_INV));
+}
+
+/* Called when user tries to insert an entry of this type. */
+static int
+icmp_checkentry(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ const struct ipt_icmp *icmpinfo = matchinfo;
+
+ /* Must specify proto == ICMP, and no unknown invflags */
+ return ip->proto == IPPROTO_ICMP
+ && !(ip->invflags & IPT_INV_PROTO)
+ && matchsize == IPT_ALIGN(sizeof(struct ipt_icmp))
+ && !(icmpinfo->invflags & ~IPT_ICMP_INV);
+}
+
+/* The built-in targets: standard (NULL) and error. */
+static struct ipt_target ipt_standard_target = {
+ .name = IPT_STANDARD_TARGET,
+};
+
+static struct ipt_target ipt_error_target = {
+ .name = IPT_ERROR_TARGET,
+ .target = ipt_error,
+};
+
+static struct nf_sockopt_ops ipt_sockopts = {
+ .pf = PF_INET,
+ .set_optmin = IPT_BASE_CTL,
+ .set_optmax = IPT_SO_SET_MAX+1,
+ .set = do_ipt_set_ctl,
+ .get_optmin = IPT_BASE_CTL,
+ .get_optmax = IPT_SO_GET_MAX+1,
+ .get = do_ipt_get_ctl,
+};
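+
+/* iptables(8) drives this module through these sockopts on an IPv4
+ * socket: the IPT_SO_SET_* calls (e.g. IPT_SO_SET_REPLACE) load a whole
+ * table in one shot, and the IPT_SO_GET_* calls (IPT_SO_GET_INFO,
+ * IPT_SO_GET_ENTRIES) read the current table back out. */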
+
+static struct ipt_match tcp_matchstruct = {
+ .name = "tcp",
+ .match = &tcp_match,
+ .checkentry = &tcp_checkentry,
+};
+
+static struct ipt_match udp_matchstruct = {
+ .name = "udp",
+ .match = &udp_match,
+ .checkentry = &udp_checkentry,
+};
+
+static struct ipt_match icmp_matchstruct = {
+ .name = "icmp",
+ .match = &icmp_match,
+ .checkentry = &icmp_checkentry,
+};
+
+#ifdef CONFIG_PROC_FS
+static inline int print_name(const char *i,
+ off_t start_offset, char *buffer, int length,
+ off_t *pos, unsigned int *count)
+{
+ if ((*count)++ >= start_offset) {
+ unsigned int namelen;
+
+ namelen = sprintf(buffer + *pos, "%s\n",
+ i + sizeof(struct list_head));
+ if (*pos + namelen > length) {
+ /* Stop iterating */
+ return 1;
+ }
+ *pos += namelen;
+ }
+ return 0;
+}
+
+static inline int print_target(const struct ipt_target *t,
+ off_t start_offset, char *buffer, int length,
+ off_t *pos, unsigned int *count)
+{
+ if (t == &ipt_standard_target || t == &ipt_error_target)
+ return 0;
+ return print_name((char *)t, start_offset, buffer, length, pos, count);
+}
+
+static int ipt_get_tables(char *buffer, char **start, off_t offset, int length)
+{
+ off_t pos = 0;
+ unsigned int count = 0;
+
+ if (down_interruptible(&ipt_mutex) != 0)
+ return 0;
+
+ LIST_FIND(&ipt_tables, print_name, void *,
+ offset, buffer, length, &pos, &count);
+
+ up(&ipt_mutex);
+
+ /* `start' hack - see fs/proc/generic.c line ~105 */
+ *start=(char *)((unsigned long)count-offset);
+ return pos;
+}
+
+static int ipt_get_targets(char *buffer, char **start, off_t offset, int length)
+{
+ off_t pos = 0;
+ unsigned int count = 0;
+
+ if (down_interruptible(&ipt_mutex) != 0)
+ return 0;
+
+ LIST_FIND(&ipt_target, print_target, struct ipt_target *,
+ offset, buffer, length, &pos, &count);
+
+ up(&ipt_mutex);
+
+ *start = (char *)((unsigned long)count - offset);
+ return pos;
+}
+
+static int ipt_get_matches(char *buffer, char **start, off_t offset, int length)
+{
+ off_t pos = 0;
+ unsigned int count = 0;
+
+ if (down_interruptible(&ipt_mutex) != 0)
+ return 0;
+
+ LIST_FIND(&ipt_match, print_name, void *,
+ offset, buffer, length, &pos, &count);
+
+ up(&ipt_mutex);
+
+ *start = (char *)((unsigned long)count - offset);
+ return pos;
+}
+
+static struct { char *name; get_info_t *get_info; } ipt_proc_entry[] =
+{ { "ip_tables_names", ipt_get_tables },
+ { "ip_tables_targets", ipt_get_targets },
+ { "ip_tables_matches", ipt_get_matches },
+ { NULL, NULL} };
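+
+/* Once tables and extensions register, these read-only files list them;
+ * e.g. `cat /proc/net/ip_tables_names' prints one registered table name
+ * ("filter", "nat", "mangle", ...) per line. */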
+#endif /*CONFIG_PROC_FS*/
+
+static int __init init(void)
+{
+ int ret;
+
+ /* No one else will be downing the sem now, so we won't sleep */
+ down(&ipt_mutex);
+ list_append(&ipt_target, &ipt_standard_target);
+ list_append(&ipt_target, &ipt_error_target);
+ list_append(&ipt_match, &tcp_matchstruct);
+ list_append(&ipt_match, &udp_matchstruct);
+ list_append(&ipt_match, &icmp_matchstruct);
+ up(&ipt_mutex);
+
+ /* Register setsockopt */
+ ret = nf_register_sockopt(&ipt_sockopts);
+ if (ret < 0) {
+ duprintf("Unable to register sockopts.\n");
+ return ret;
+ }
+
+#ifdef CONFIG_PROC_FS
+ {
+ struct proc_dir_entry *proc;
+ int i;
+
+ for (i = 0; ipt_proc_entry[i].name; i++) {
+ proc = proc_net_create(ipt_proc_entry[i].name, 0,
+ ipt_proc_entry[i].get_info);
+ if (!proc) {
+ while (--i >= 0)
+ proc_net_remove(ipt_proc_entry[i].name);
+ nf_unregister_sockopt(&ipt_sockopts);
+ return -ENOMEM;
+ }
+ proc->owner = THIS_MODULE;
+ }
+ }
+#endif
+
+ printk("ip_tables: (C) 2000-2002 Netfilter core team\n");
+ return 0;
+}
+
+static void __exit fini(void)
+{
+ nf_unregister_sockopt(&ipt_sockopts);
+#ifdef CONFIG_PROC_FS
+ {
+ int i;
+ for (i = 0; ipt_proc_entry[i].name; i++)
+ proc_net_remove(ipt_proc_entry[i].name);
+ }
+#endif
+}
+
+EXPORT_SYMBOL(ipt_register_table);
+EXPORT_SYMBOL(ipt_unregister_table);
+EXPORT_SYMBOL(ipt_register_match);
+EXPORT_SYMBOL(ipt_unregister_match);
+EXPORT_SYMBOL(ipt_do_table);
+EXPORT_SYMBOL(ipt_register_target);
+EXPORT_SYMBOL(ipt_unregister_target);
+EXPORT_SYMBOL(ipt_find_target);
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_CLASSIFY.c b/net/ipv4/netfilter/ipt_CLASSIFY.c
new file mode 100644
index 00000000000..9842e6e2318
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_CLASSIFY.c
@@ -0,0 +1,92 @@
+/*
+ * This is a module which is used for setting the skb->priority field
+ * of an skb for qdisc classification.
+ */
+
+/* (C) 2001-2002 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_CLASSIFY.h>
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("iptables qdisc classification target module");
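+
+/* Typical rule, assuming the matching CLASSIFY userspace extension:
+ *   iptables -t mangle -A POSTROUTING -o eth0 -p tcp --sport 80 \
+ *     -j CLASSIFY --set-class 1:10
+ * which steers matching packets into qdisc class 1:10. */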
+
+static unsigned int
+target(struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ unsigned int hooknum,
+ const void *targinfo,
+ void *userinfo)
+{
+ const struct ipt_classify_target_info *clinfo = targinfo;
+
+ if((*pskb)->priority != clinfo->priority) {
+ (*pskb)->priority = clinfo->priority;
+ (*pskb)->nfcache |= NFC_ALTERED;
+ }
+
+ return IPT_CONTINUE;
+}
+
+static int
+checkentry(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ if (targinfosize != IPT_ALIGN(sizeof(struct ipt_classify_target_info))){
+ printk(KERN_ERR "CLASSIFY: invalid size (%u != %Zu).\n",
+ targinfosize,
+ IPT_ALIGN(sizeof(struct ipt_classify_target_info)));
+ return 0;
+ }
+
+ if (hook_mask & ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_FORWARD) |
+ (1 << NF_IP_POST_ROUTING))) {
+ printk(KERN_ERR "CLASSIFY: only valid in LOCAL_OUT, FORWARD "
+ "and POST_ROUTING.\n");
+ return 0;
+ }
+
+ if (strcmp(tablename, "mangle") != 0) {
+ printk(KERN_ERR "CLASSIFY: can only be called from "
+ "\"mangle\" table, not \"%s\".\n",
+ tablename);
+ return 0;
+ }
+
+ return 1;
+}
+
+static struct ipt_target ipt_classify_reg = {
+ .name = "CLASSIFY",
+ .target = target,
+ .checkentry = checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ return ipt_register_target(&ipt_classify_reg);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_target(&ipt_classify_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
new file mode 100644
index 00000000000..0f12e3a3dc7
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -0,0 +1,761 @@
+/* Cluster IP hashmark target
+ * (C) 2003-2004 by Harald Welte <laforge@netfilter.org>
+ * based on ideas of Fabio Olive Leite <olive@unixforge.org>
+ *
+ * Development of this code funded by SuSE Linux AG, http://www.suse.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/module.h>
+#include <linux/config.h>
+#include <linux/proc_fs.h>
+#include <linux/jhash.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <linux/if_arp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+#include <net/checksum.h>
+
+#include <linux/netfilter_arp.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/lockhelp.h>
+
+#define CLUSTERIP_VERSION "0.6"
+
+#define DEBUG_CLUSTERIP
+
+#ifdef DEBUG_CLUSTERIP
+#define DEBUGP printk
+#else
+#define DEBUGP
+#endif
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("iptables target for CLUSTERIP");
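+
+/* Typical rule, assuming the matching CLUSTERIP userspace extension:
+ *   iptables -A INPUT -d 192.168.0.5 -i eth0 -j CLUSTERIP --new \
+ *     --hashmode sourceip --clustermac 01:00:5e:00:00:20 \
+ *     --total-nodes 2 --local-node 1
+ * Every node runs the same rule; the hash below decides which node
+ * accepts a given flow. */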
+
+struct clusterip_config {
+ struct list_head list; /* list of all configs */
+ atomic_t refcount; /* reference count */
+
+ u_int32_t clusterip; /* the IP address */
+ u_int8_t clustermac[ETH_ALEN]; /* the MAC address */
+ struct net_device *dev; /* device */
+ u_int16_t num_total_nodes; /* total number of nodes */
+ u_int16_t num_local_nodes; /* number of local nodes */
+ u_int16_t local_nodes[CLUSTERIP_MAX_NODES]; /* node number array */
+
+#ifdef CONFIG_PROC_FS
+ struct proc_dir_entry *pde; /* proc dir entry */
+#endif
+ enum clusterip_hashmode hash_mode; /* which hashing mode */
+ u_int32_t hash_initval; /* hash initialization */
+};
+
+static LIST_HEAD(clusterip_configs);
+
+/* clusterip_lock protects the clusterip_configs list _AND_ the configurable
+ * data within all structures (num_local_nodes, local_nodes[]) */
+static DECLARE_RWLOCK(clusterip_lock);
+
+#ifdef CONFIG_PROC_FS
+static struct file_operations clusterip_proc_fops;
+static struct proc_dir_entry *clusterip_procdir;
+#endif
+
+static inline void
+clusterip_config_get(struct clusterip_config *c) {
+ atomic_inc(&c->refcount);
+}
+
+static inline void
+clusterip_config_put(struct clusterip_config *c) {
+ if (atomic_dec_and_test(&c->refcount)) {
+ WRITE_LOCK(&clusterip_lock);
+ list_del(&c->list);
+ WRITE_UNLOCK(&clusterip_lock);
+ dev_mc_delete(c->dev, c->clustermac, ETH_ALEN, 0);
+ dev_put(c->dev);
+ kfree(c);
+ }
+}
+
+
+static struct clusterip_config *
+__clusterip_config_find(u_int32_t clusterip)
+{
+ struct list_head *pos;
+
+ MUST_BE_READ_LOCKED(&clusterip_lock);
+ list_for_each(pos, &clusterip_configs) {
+ struct clusterip_config *c = list_entry(pos,
+ struct clusterip_config, list);
+ if (c->clusterip == clusterip) {
+ return c;
+ }
+ }
+
+ return NULL;
+}
+
+static inline struct clusterip_config *
+clusterip_config_find_get(u_int32_t clusterip)
+{
+ struct clusterip_config *c;
+
+ READ_LOCK(&clusterip_lock);
+ c = __clusterip_config_find(clusterip);
+ if (!c) {
+ READ_UNLOCK(&clusterip_lock);
+ return NULL;
+ }
+ atomic_inc(&c->refcount);
+ READ_UNLOCK(&clusterip_lock);
+
+ return c;
+}
+
+static struct clusterip_config *
+clusterip_config_init(struct ipt_clusterip_tgt_info *i, u_int32_t ip,
+ struct net_device *dev)
+{
+ struct clusterip_config *c;
+ char buffer[16];
+
+ c = kmalloc(sizeof(*c), GFP_ATOMIC);
+ if (!c)
+ return NULL;
+
+ memset(c, 0, sizeof(*c));
+ c->dev = dev;
+ c->clusterip = ip;
+ memcpy(&c->clustermac, &i->clustermac, ETH_ALEN);
+ c->num_total_nodes = i->num_total_nodes;
+ c->num_local_nodes = i->num_local_nodes;
+ memcpy(&c->local_nodes, &i->local_nodes, sizeof(c->local_nodes));
+ c->hash_mode = i->hash_mode;
+ c->hash_initval = i->hash_initval;
+ atomic_set(&c->refcount, 1);
+
+#ifdef CONFIG_PROC_FS
+ /* create proc dir entry */
+ sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(ip));
+ c->pde = create_proc_entry(buffer, S_IWUSR|S_IRUSR, clusterip_procdir);
+ if (!c->pde) {
+ kfree(c);
+ return NULL;
+ }
+ c->pde->proc_fops = &clusterip_proc_fops;
+ c->pde->data = c;
+#endif
+
+ WRITE_LOCK(&clusterip_lock);
+ list_add(&c->list, &clusterip_configs);
+ WRITE_UNLOCK(&clusterip_lock);
+
+ return c;
+}
+
+static int
+clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum)
+{
+ int i;
+
+ WRITE_LOCK(&clusterip_lock);
+
+ if (c->num_local_nodes >= CLUSTERIP_MAX_NODES
+ || nodenum > CLUSTERIP_MAX_NODES) {
+ WRITE_UNLOCK(&clusterip_lock);
+ return 1;
+ }
+
+ /* check if we already have this number in our array */
+ for (i = 0; i < c->num_local_nodes; i++) {
+ if (c->local_nodes[i] == nodenum) {
+ WRITE_UNLOCK(&clusterip_lock);
+ return 1;
+ }
+ }
+
+ c->local_nodes[c->num_local_nodes++] = nodenum;
+
+ WRITE_UNLOCK(&clusterip_lock);
+ return 0;
+}
+
+static int
+clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum)
+{
+ int i;
+
+ WRITE_LOCK(&clusterip_lock);
+
+ if (c->num_local_nodes <= 1 || nodenum > CLUSTERIP_MAX_NODES) {
+ WRITE_UNLOCK(&clusterip_lock);
+ return 1;
+ }
+
+ for (i = 0; i < c->num_local_nodes; i++) {
+ if (c->local_nodes[i] == nodenum) {
+ int size = sizeof(u_int16_t)*(c->num_local_nodes-(i+1));
+ memmove(&c->local_nodes[i], &c->local_nodes[i+1], size);
+ c->num_local_nodes--;
+ WRITE_UNLOCK(&clusterip_lock);
+ return 0;
+ }
+ }
+
+ WRITE_UNLOCK(&clusterip_lock);
+ return 1;
+}
+
+static inline u_int32_t
+clusterip_hashfn(struct sk_buff *skb, struct clusterip_config *config)
+{
+ struct iphdr *iph = skb->nh.iph;
+ unsigned long hashval;
+ u_int16_t sport, dport;
+ struct tcphdr *th;
+ struct udphdr *uh;
+ struct icmphdr *ih;
+
+ switch (iph->protocol) {
+ case IPPROTO_TCP:
+ th = (void *)iph+iph->ihl*4;
+ sport = ntohs(th->source);
+ dport = ntohs(th->dest);
+ break;
+ case IPPROTO_UDP:
+ uh = (void *)iph+iph->ihl*4;
+ sport = ntohs(uh->source);
+ dport = ntohs(uh->dest);
+ break;
+ case IPPROTO_ICMP:
+ ih = (void *)iph+iph->ihl*4;
+ sport = ntohs(ih->un.echo.id);
+ dport = (ih->type<<8)|ih->code;
+ break;
+ default:
+ if (net_ratelimit()) {
+ printk(KERN_NOTICE "CLUSTERIP: unknown protocol `%u'\n",
+ iph->protocol);
+ }
+ sport = dport = 0;
+ }
+
+ switch (config->hash_mode) {
+ case CLUSTERIP_HASHMODE_SIP:
+ hashval = jhash_1word(ntohl(iph->saddr),
+ config->hash_initval);
+ break;
+ case CLUSTERIP_HASHMODE_SIP_SPT:
+ hashval = jhash_2words(ntohl(iph->saddr), sport,
+ config->hash_initval);
+ break;
+ case CLUSTERIP_HASHMODE_SIP_SPT_DPT:
+ hashval = jhash_3words(ntohl(iph->saddr), sport, dport,
+ config->hash_initval);
+ break;
+ default:
+ /* to make gcc happy */
+ hashval = 0;
+ /* This cannot happen, unless the check function wasn't called
+ * at rule load time */
+ printk("CLUSTERIP: unknown mode `%u'\n", config->hash_mode);
+ BUG();
+ break;
+ }
+
+ /* node numbers are 1..n, not 0..n */
+ return ((hashval % config->num_total_nodes)+1);
+}
+
+static inline int
+clusterip_responsible(struct clusterip_config *config, u_int32_t hash)
+{
+ int i;
+
+ READ_LOCK(&clusterip_lock);
+
+ if (config->num_local_nodes == 0) {
+ READ_UNLOCK(&clusterip_lock);
+ return 0;
+ }
+
+ for (i = 0; i < config->num_local_nodes; i++) {
+ if (config->local_nodes[i] == hash) {
+ READ_UNLOCK(&clusterip_lock);
+ return 1;
+ }
+ }
+
+ READ_UNLOCK(&clusterip_lock);
+
+ return 0;
+}
+
+/***********************************************************************
+ * IPTABLES TARGET
+ ***********************************************************************/
+
+static unsigned int
+target(struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ unsigned int hooknum,
+ const void *targinfo,
+ void *userinfo)
+{
+ const struct ipt_clusterip_tgt_info *cipinfo = targinfo;
+ enum ip_conntrack_info ctinfo;
+ struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo);
+ u_int32_t hash;
+
+ /* don't need to clusterip_config_get() here, since refcount
+ * is only decremented by destroy() - and ip_tables guarantees
+ * that the ->target() function isn't called after ->destroy() */
+
+ if (!ct) {
+ printk(KERN_ERR "CLUSTERIP: no conntrack!\n");
+ /* FIXME: need to drop invalid ones, since replies
+ * to outgoing connections of other nodes will be
+ * marked as INVALID */
+ return NF_DROP;
+ }
+
+ /* special case: ICMP error handling. conntrack distinguishes between
+ * error messages (RELATED) and information requests (see below) */
+ if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
+ && (ctinfo == IP_CT_RELATED
+ || ctinfo == IP_CT_RELATED+IP_CT_IS_REPLY))
+ return IPT_CONTINUE;
+
+ /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO,
+ * TIMESTAMP, INFO_REQUEST or ADDRESS type icmp packets from here
+ * on, which all have an ID field [relevant for hashing]. */
+
+ hash = clusterip_hashfn(*pskb, cipinfo->config);
+
+ switch (ctinfo) {
+ case IP_CT_NEW:
+ ct->mark = hash;
+ break;
+ case IP_CT_RELATED:
+ case IP_CT_RELATED+IP_CT_IS_REPLY:
+ /* FIXME: we don't handle expectations at the
+ * moment. they can arrive on a different node than
+ * the master connection (e.g. FTP passive mode) */
+ case IP_CT_ESTABLISHED:
+ case IP_CT_ESTABLISHED+IP_CT_IS_REPLY:
+ break;
+ default:
+ break;
+ }
+
+#ifdef DEBUG_CLUSTERP
+ DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+#endif
+ DEBUGP("hash=%u ct_hash=%lu ", hash, ct->mark);
+ if (!clusterip_responsible(cipinfo->config, hash)) {
+ DEBUGP("not responsible\n");
+ return NF_DROP;
+ }
+ DEBUGP("responsible\n");
+
+ /* despite being received via linklayer multicast, this is
+ * actually a unicast IP packet. TCP doesn't like PACKET_MULTICAST */
+ (*pskb)->pkt_type = PACKET_HOST;
+
+ return IPT_CONTINUE;
+}
+
+static int
+checkentry(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ struct ipt_clusterip_tgt_info *cipinfo = targinfo;
+
+ struct clusterip_config *config;
+
+ if (targinfosize != IPT_ALIGN(sizeof(struct ipt_clusterip_tgt_info))) {
+ printk(KERN_WARNING "CLUSTERIP: targinfosize %u != %Zu\n",
+ targinfosize,
+ IPT_ALIGN(sizeof(struct ipt_clusterip_tgt_info)));
+ return 0;
+ }
+
+ if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP &&
+ cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT &&
+ cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) {
+ printk(KERN_WARNING "CLUSTERIP: unknown mode `%u'\n",
+ cipinfo->hash_mode);
+ return 0;
+
+ }
+ if (e->ip.dmsk.s_addr != 0xffffffff
+ || e->ip.dst.s_addr == 0) {
+ printk(KERN_ERR "CLUSTERIP: Please specify destination IP\n");
+ return 0;
+ }
+
+ /* FIXME: further sanity checks */
+
+ config = clusterip_config_find_get(e->ip.dst.s_addr);
+ if (!config) {
+ if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {
+ printk(KERN_WARNING "CLUSTERIP: no config found for %u.%u.%u.%u, need 'new'\n", NIPQUAD(e->ip.dst.s_addr));
+ return 0;
+ } else {
+ struct net_device *dev;
+
+ if (e->ip.iniface[0] == '\0') {
+ printk(KERN_WARNING "CLUSTERIP: Please specify an interface name\n");
+ return 0;
+ }
+
+ dev = dev_get_by_name(e->ip.iniface);
+ if (!dev) {
+ printk(KERN_WARNING "CLUSTERIP: no such interface %s\n", e->ip.iniface);
+ return 0;
+ }
+
+ config = clusterip_config_init(cipinfo,
+ e->ip.dst.s_addr, dev);
+ if (!config) {
+ printk(KERN_WARNING "CLUSTERIP: cannot allocate config\n");
+ dev_put(dev);
+ return 0;
+ }
+ dev_mc_add(config->dev, config->clustermac, ETH_ALEN, 0);
+ }
+ }
+
+ cipinfo->config = config;
+
+ return 1;
+}
+
+/* drop reference count of cluster config when rule is deleted */
+static void destroy(void *matchinfo, unsigned int matchinfosize)
+{
+ struct ipt_clusterip_tgt_info *cipinfo = matchinfo;
+
+ /* we first remove the proc entry and then drop the reference
+ * count. In case anyone still accesses the file, the open/close
+ * functions are also incrementing the refcount on their own */
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry(cipinfo->config->pde->name,
+ cipinfo->config->pde->parent);
+#endif
+ clusterip_config_put(cipinfo->config);
+}
+
+static struct ipt_target clusterip_tgt = {
+ .name = "CLUSTERIP",
+ .target = &target,
+ .checkentry = &checkentry,
+ .destroy = &destroy,
+ .me = THIS_MODULE
+};
+
+
+/***********************************************************************
+ * ARP MANGLING CODE
+ ***********************************************************************/
+
+/* hardcoded for 48bit ethernet and 32bit ipv4 addresses */
+struct arp_payload {
+ u_int8_t src_hw[ETH_ALEN];
+ u_int32_t src_ip;
+ u_int8_t dst_hw[ETH_ALEN];
+ u_int32_t dst_ip;
+} __attribute__ ((packed));
+
+#ifdef CLUSTERIP_DEBUG
+static void arp_print(struct arp_payload *payload)
+{
+#define HBUFFERLEN 30
+ char hbuffer[HBUFFERLEN];
+ int j,k;
+ const char hexbuf[]= "0123456789abcdef";
+
+ for (k=0, j=0; k < HBUFFERLEN-3 && j < ETH_ALEN; j++) {
+ hbuffer[k++]=hexbuf[(payload->src_hw[j]>>4)&15];
+ hbuffer[k++]=hexbuf[payload->src_hw[j]&15];
+ hbuffer[k++]=':';
+ }
+ hbuffer[--k]='\0';
+
+ printk("src %u.%u.%u.%u@%s, dst %u.%u.%u.%u\n",
+ NIPQUAD(payload->src_ip), hbuffer,
+ NIPQUAD(payload->dst_ip));
+}
+#endif
+
+static unsigned int
+arp_mangle(unsigned int hook,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ struct arphdr *arp = (*pskb)->nh.arph;
+ struct arp_payload *payload;
+ struct clusterip_config *c;
+
+ /* we don't care about non-ethernet and non-ipv4 ARP */
+ if (arp->ar_hrd != htons(ARPHRD_ETHER)
+ || arp->ar_pro != htons(ETH_P_IP)
+ || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN)
+ return NF_ACCEPT;
+
+ /* we only want to mangle arp replies */
+ if (arp->ar_op != htons(ARPOP_REPLY))
+ return NF_ACCEPT;
+
+ payload = (void *)(arp+1);
+
+ /* if there is no clusterip configuration for the arp reply's
+ * source ip, we don't want to mangle it */
+ c = clusterip_config_find_get(payload->src_ip);
+ if (!c)
+ return NF_ACCEPT;
+
+ /* normally the linux kernel always replies to arp queries for
+ * addresses on different interfaces. However, in the CLUSTERIP case
+ * this wouldn't work, since we didn't subscribe to the mcast group on
+ * other interfaces */
+ if (c->dev != out) {
+ DEBUGP("CLUSTERIP: not mangling arp reply on different "
+ "interface: cip'%s'-skb'%s'\n", c->dev->name, out->name);
+ clusterip_config_put(c);
+ return NF_ACCEPT;
+ }
+
+ /* mangle reply hardware address */
+ memcpy(payload->src_hw, c->clustermac, arp->ar_hln);
+
+#ifdef CLUSTERIP_DEBUG
+ DEBUGP(KERN_DEBUG "CLUSTERIP mangled arp reply: ");
+ arp_print(payload);
+#endif
+
+ clusterip_config_put(c);
+
+ return NF_ACCEPT;
+}
+
+static struct nf_hook_ops cip_arp_ops = {
+ .hook = arp_mangle,
+ .pf = NF_ARP,
+ .hooknum = NF_ARP_OUT,
+ .priority = -1
+};
+
+/***********************************************************************
+ * PROC DIR HANDLING
+ ***********************************************************************/
+
+#ifdef CONFIG_PROC_FS
+
+static void *clusterip_seq_start(struct seq_file *s, loff_t *pos)
+{
+ struct proc_dir_entry *pde = s->private;
+ struct clusterip_config *c = pde->data;
+ unsigned int *nodeidx;
+
+ READ_LOCK(&clusterip_lock);
+ if (*pos >= c->num_local_nodes)
+ return NULL;
+
+ nodeidx = kmalloc(sizeof(unsigned int), GFP_KERNEL);
+ if (!nodeidx)
+ return ERR_PTR(-ENOMEM);
+
+ *nodeidx = *pos;
+ return nodeidx;
+}
+
+static void *clusterip_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ struct proc_dir_entry *pde = s->private;
+ struct clusterip_config *c = pde->data;
+ unsigned int *nodeidx = (unsigned int *)v;
+
+ *pos = ++(*nodeidx);
+ if (*pos >= c->num_local_nodes) {
+ kfree(v);
+ return NULL;
+ }
+ return nodeidx;
+}
+
+static void clusterip_seq_stop(struct seq_file *s, void *v)
+{
+ kfree(v);
+
+ READ_UNLOCK(&clusterip_lock);
+}
+
+static int clusterip_seq_show(struct seq_file *s, void *v)
+{
+ struct proc_dir_entry *pde = s->private;
+ struct clusterip_config *c = pde->data;
+ unsigned int *nodeidx = (unsigned int *)v;
+
+ if (*nodeidx != 0)
+ seq_putc(s, ',');
+ seq_printf(s, "%u", c->local_nodes[*nodeidx]);
+
+ if (*nodeidx == c->num_local_nodes-1)
+ seq_putc(s, '\n');
+
+ return 0;
+}
+
+static struct seq_operations clusterip_seq_ops = {
+ .start = clusterip_seq_start,
+ .next = clusterip_seq_next,
+ .stop = clusterip_seq_stop,
+ .show = clusterip_seq_show,
+};
+
+static int clusterip_proc_open(struct inode *inode, struct file *file)
+{
+ int ret = seq_open(file, &clusterip_seq_ops);
+
+ if (!ret) {
+ struct seq_file *sf = file->private_data;
+ struct proc_dir_entry *pde = PDE(inode);
+ struct clusterip_config *c = pde->data;
+
+ sf->private = pde;
+
+ clusterip_config_get(c);
+ }
+
+ return ret;
+}
+
+static int clusterip_proc_release(struct inode *inode, struct file *file)
+{
+ struct proc_dir_entry *pde = PDE(inode);
+ struct clusterip_config *c = pde->data;
+ int ret;
+
+ ret = seq_release(inode, file);
+
+ if (!ret)
+ clusterip_config_put(c);
+
+ return ret;
+}
+
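+/* Runtime node management goes through the per-address proc file created
+ * above, e.g.:
+ *   echo "+2" > /proc/net/ipt_CLUSTERIP/192.168.0.5   (claim node 2)
+ *   echo "-2" > /proc/net/ipt_CLUSTERIP/192.168.0.5   (release node 2)
+ */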
+static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
+ size_t size, loff_t *ofs)
+{
+#define PROC_WRITELEN 10
+ char buffer[PROC_WRITELEN+1];
+ struct proc_dir_entry *pde = PDE(file->f_dentry->d_inode);
+ struct clusterip_config *c = pde->data;
+ unsigned long nodenum;
+
+ /* bound the copy by what the user actually wrote and NUL-terminate */
+ if (size > PROC_WRITELEN)
+ return -EIO;
+ if (copy_from_user(buffer, input, size))
+ return -EFAULT;
+ buffer[size] = '\0';
+
+ if (*buffer == '+') {
+ nodenum = simple_strtoul(buffer+1, NULL, 10);
+ if (clusterip_add_node(c, nodenum))
+ return -ENOMEM;
+ } else if (*buffer == '-') {
+ nodenum = simple_strtoul(buffer+1, NULL, 10);
+ if (clusterip_del_node(c, nodenum))
+ return -ENOENT;
+ } else
+ return -EIO;
+
+ return size;
+}
+
+static struct file_operations clusterip_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = clusterip_proc_open,
+ .read = seq_read,
+ .write = clusterip_proc_write,
+ .llseek = seq_lseek,
+ .release = clusterip_proc_release,
+};
+
+#endif /* CONFIG_PROC_FS */
+
+static int init_or_cleanup(int fini)
+{
+ int ret;
+
+ if (fini)
+ goto cleanup;
+
+ if (ipt_register_target(&clusterip_tgt)) {
+ ret = -EINVAL;
+ goto cleanup_none;
+ }
+
+ if (nf_register_hook(&cip_arp_ops) < 0) {
+ ret = -EINVAL;
+ goto cleanup_target;
+ }
+
+#ifdef CONFIG_PROC_FS
+ clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", proc_net);
+ if (!clusterip_procdir) {
+ printk(KERN_ERR "CLUSTERIP: Unable to create proc dir entry\n");
+ ret = -ENOMEM;
+ goto cleanup_hook;
+ }
+#endif /* CONFIG_PROC_FS */
+
+ printk(KERN_NOTICE "ClusterIP Version %s loaded successfully\n",
+ CLUSTERIP_VERSION);
+
+ return 0;
+
+cleanup:
+ printk(KERN_NOTICE "ClusterIP Version %s unloading\n",
+ CLUSTERIP_VERSION);
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry(clusterip_procdir->name, clusterip_procdir->parent);
+#endif
+cleanup_hook:
+ nf_unregister_hook(&cip_arp_ops);
+cleanup_target:
+ ipt_unregister_target(&clusterip_tgt);
+cleanup_none:
+ return -EINVAL;
+}
+
+static int __init init(void)
+{
+ return init_or_cleanup(0);
+}
+
+static void __exit fini(void)
+{
+ init_or_cleanup(1);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c
new file mode 100644
index 00000000000..30ddd3e18eb
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_CONNMARK.c
@@ -0,0 +1,118 @@
+/* This kernel module is used to modify the connection mark values, or
+ * to optionally restore the skb nfmark from the connection mark
+ *
+ * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com>
+ * by Henrik Nordstrom <hno@marasystems.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+
+MODULE_AUTHOR("Henrik Nordstrom <hno@marasytems.com>");
+MODULE_DESCRIPTION("IP tables CONNMARK target module");
+MODULE_LICENSE("GPL");
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_CONNMARK.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+
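+/* Typical rules, assuming the matching CONNMARK userspace extension:
+ *   iptables -t mangle -A PREROUTING -j CONNMARK --restore-mark
+ *   iptables -t mangle -A POSTROUTING -j CONNMARK --save-mark
+ * --set-mark/--save-mark/--restore-mark correspond to the SET/SAVE/RESTORE
+ * modes handled below. */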
+static unsigned int
+target(struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ unsigned int hooknum,
+ const void *targinfo,
+ void *userinfo)
+{
+ const struct ipt_connmark_target_info *markinfo = targinfo;
+ unsigned long diff;
+ unsigned long nfmark;
+ unsigned long newmark;
+
+ enum ip_conntrack_info ctinfo;
+ struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo);
+ if (ct) {
+ switch(markinfo->mode) {
+ case IPT_CONNMARK_SET:
+ newmark = (ct->mark & ~markinfo->mask) | markinfo->mark;
+ if (newmark != ct->mark)
+ ct->mark = newmark;
+ break;
+ case IPT_CONNMARK_SAVE:
+ newmark = (ct->mark & ~markinfo->mask) | ((*pskb)->nfmark & markinfo->mask);
+ if (ct->mark != newmark)
+ ct->mark = newmark;
+ break;
+ case IPT_CONNMARK_RESTORE:
+ nfmark = (*pskb)->nfmark;
+ diff = (ct->mark ^ nfmark) & markinfo->mask;
+ if (diff != 0) {
+ (*pskb)->nfmark = nfmark ^ diff;
+ (*pskb)->nfcache |= NFC_ALTERED;
+ }
+ break;
+ }
+ }
+
+ return IPT_CONTINUE;
+}
+
+static int
+checkentry(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ struct ipt_connmark_target_info *matchinfo = targinfo;
+ if (targinfosize != IPT_ALIGN(sizeof(struct ipt_connmark_target_info))) {
+ printk(KERN_WARNING "CONNMARK: targinfosize %u != %Zu\n",
+ targinfosize,
+ IPT_ALIGN(sizeof(struct ipt_connmark_target_info)));
+ return 0;
+ }
+
+ if (matchinfo->mode == IPT_CONNMARK_RESTORE) {
+ if (strcmp(tablename, "mangle") != 0) {
+ printk(KERN_WARNING "CONNMARK: restore can only be called from \"mangle\" table, not \"%s\"\n", tablename);
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+static struct ipt_target ipt_connmark_reg = {
+ .name = "CONNMARK",
+ .target = &target,
+ .checkentry = &checkentry,
+ .me = THIS_MODULE
+};
+
+static int __init init(void)
+{
+ return ipt_register_target(&ipt_connmark_reg);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_target(&ipt_connmark_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_DSCP.c b/net/ipv4/netfilter/ipt_DSCP.c
new file mode 100644
index 00000000000..3ea4509099f
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_DSCP.c
@@ -0,0 +1,106 @@
+/* iptables module for setting the IPv4 DSCP field, Version 1.8
+ *
+ * (C) 2002 by Harald Welte <laforge@netfilter.org>
+ * based on ipt_FTOS.c (C) 2000 by Matthew G. Marsh <mgm@paktronix.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * See RFC2474 for a description of the DSCP field within the IP Header.
+ *
+ * ipt_DSCP.c,v 1.8 2002/08/06 18:41:57 laforge Exp
+*/
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_DSCP.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("iptables DSCP modification module");
+MODULE_LICENSE("GPL");
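+
+/* Typical rule, assuming the matching DSCP userspace extension:
+ *   iptables -t mangle -A POSTROUTING -p tcp --dport 80 \
+ *     -j DSCP --set-dscp 0x20
+ * (--set-dscp-class is the symbolic form of the same operation). */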
+
+static unsigned int
+target(struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ unsigned int hooknum,
+ const void *targinfo,
+ void *userinfo)
+{
+ const struct ipt_DSCP_info *dinfo = targinfo;
+ u_int8_t sh_dscp = ((dinfo->dscp << IPT_DSCP_SHIFT) & IPT_DSCP_MASK);
+
+
+ if (((*pskb)->nh.iph->tos & IPT_DSCP_MASK) != sh_dscp) {
+ u_int16_t diffs[2];
+
+ if (!skb_ip_make_writable(pskb, sizeof(struct iphdr)))
+ return NF_DROP;
+
+ diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
+ (*pskb)->nh.iph->tos = ((*pskb)->nh.iph->tos & ~IPT_DSCP_MASK)
+ | sh_dscp;
+ diffs[1] = htons((*pskb)->nh.iph->tos);
+ (*pskb)->nh.iph->check
+ = csum_fold(csum_partial((char *)diffs,
+ sizeof(diffs),
+ (*pskb)->nh.iph->check
+ ^ 0xFFFF));
+ (*pskb)->nfcache |= NFC_ALTERED;
+ }
+ return IPT_CONTINUE;
+}
+
+static int
+checkentry(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ const u_int8_t dscp = ((struct ipt_DSCP_info *)targinfo)->dscp;
+
+ if (targinfosize != IPT_ALIGN(sizeof(struct ipt_DSCP_info))) {
+ printk(KERN_WARNING "DSCP: targinfosize %u != %Zu\n",
+ targinfosize,
+ IPT_ALIGN(sizeof(struct ipt_DSCP_info)));
+ return 0;
+ }
+
+ if (strcmp(tablename, "mangle") != 0) {
+ printk(KERN_WARNING "DSCP: can only be called from \"mangle\" table, not \"%s\"\n", tablename);
+ return 0;
+ }
+
+ if (dscp > IPT_DSCP_MAX) {
+ printk(KERN_WARNING "DSCP: dscp %x out of range\n", dscp);
+ return 0;
+ }
+
+ return 1;
+}
+
+static struct ipt_target ipt_dscp_reg = {
+ .name = "DSCP",
+ .target = target,
+ .checkentry = checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ return ipt_register_target(&ipt_dscp_reg);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_target(&ipt_dscp_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
new file mode 100644
index 00000000000..ada9911118e
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -0,0 +1,175 @@
+/* iptables module for the IPv4 and TCP ECN bits, Version 1.5
+ *
+ * (C) 2002 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * ipt_ECN.c,v 1.5 2002/08/18 19:36:51 laforge Exp
+*/
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <net/checksum.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_ECN.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("iptables ECN modification module");
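+
+/* Typical rule, assuming the matching ECN userspace extension:
+ *   iptables -t mangle -A POSTROUTING -p tcp -j ECN --ecn-tcp-remove
+ * which clears the TCP ECE/CWR bits via the IPT_ECN_OP_SET_* operations
+ * below. */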
+
+/* set ECT codepoint from IP header.
+ * return 0 if there was an error. */
+static inline int
+set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
+{
+ if (((*pskb)->nh.iph->tos & IPT_ECN_IP_MASK)
+ != (einfo->ip_ect & IPT_ECN_IP_MASK)) {
+ u_int16_t diffs[2];
+
+ if (!skb_ip_make_writable(pskb, sizeof(struct iphdr)))
+ return 0;
+
+ diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
+ (*pskb)->nh.iph->tos &= ~IPT_ECN_IP_MASK;
+ (*pskb)->nh.iph->tos |= (einfo->ip_ect & IPT_ECN_IP_MASK);
+ diffs[1] = htons((*pskb)->nh.iph->tos);
+ (*pskb)->nh.iph->check
+ = csum_fold(csum_partial((char *)diffs,
+ sizeof(diffs),
+ (*pskb)->nh.iph->check
+ ^0xFFFF));
+ (*pskb)->nfcache |= NFC_ALTERED;
+ }
+ return 1;
+}
+
+/* Return 0 if there was an error. */
+static inline int
+set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward)
+{
+ struct tcphdr _tcph, *tcph;
+ u_int16_t diffs[2];
+
+ /* Not enough header? */
+ tcph = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4,
+ sizeof(_tcph), &_tcph);
+ if (!tcph)
+ return 0;
+
+ if (!(einfo->operation & IPT_ECN_OP_SET_ECE
+ || tcph->ece == einfo->proto.tcp.ece)
+ && (!(einfo->operation & IPT_ECN_OP_SET_CWR
+ || tcph->cwr == einfo->proto.tcp.cwr)))
+ return 1;
+
+ if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
+ return 0;
+ tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4;
+
+ diffs[0] = ((u_int16_t *)tcph)[6];
+ if (einfo->operation & IPT_ECN_OP_SET_ECE)
+ tcph->ece = einfo->proto.tcp.ece;
+ if (einfo->operation & IPT_ECN_OP_SET_CWR)
+ tcph->cwr = einfo->proto.tcp.cwr;
+ diffs[1] = ((u_int16_t *)tcph)[6];
+ diffs[0] = diffs[0] ^ 0xFFFF;
+
+ if ((*pskb)->ip_summed != CHECKSUM_HW)
+ tcph->check = csum_fold(csum_partial((char *)diffs,
+ sizeof(diffs),
+ tcph->check^0xFFFF));
+ else
+ if (skb_checksum_help(*pskb, inward))
+ return 0;
+ (*pskb)->nfcache |= NFC_ALTERED;
+ return 1;
+}
+
+static unsigned int
+target(struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ unsigned int hooknum,
+ const void *targinfo,
+ void *userinfo)
+{
+ const struct ipt_ECN_info *einfo = targinfo;
+
+ if (einfo->operation & IPT_ECN_OP_SET_IP)
+ if (!set_ect_ip(pskb, einfo))
+ return NF_DROP;
+
+ if (einfo->operation & (IPT_ECN_OP_SET_ECE | IPT_ECN_OP_SET_CWR)
+ && (*pskb)->nh.iph->protocol == IPPROTO_TCP)
+ if (!set_ect_tcp(pskb, einfo, (out == NULL)))
+ return NF_DROP;
+
+ return IPT_CONTINUE;
+}
+
+static int
+checkentry(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ const struct ipt_ECN_info *einfo = (struct ipt_ECN_info *)targinfo;
+
+ if (targinfosize != IPT_ALIGN(sizeof(struct ipt_ECN_info))) {
+ printk(KERN_WARNING "ECN: targinfosize %u != %Zu\n",
+ targinfosize,
+ IPT_ALIGN(sizeof(struct ipt_ECN_info)));
+ return 0;
+ }
+
+ if (strcmp(tablename, "mangle") != 0) {
+ printk(KERN_WARNING "ECN: can only be called from \"mangle\" table, not \"%s\"\n", tablename);
+ return 0;
+ }
+
+ if (einfo->operation & IPT_ECN_OP_MASK) {
+ printk(KERN_WARNING "ECN: unsupported ECN operation %x\n",
+ einfo->operation);
+ return 0;
+ }
+ if (einfo->ip_ect & ~IPT_ECN_IP_MASK) {
+ printk(KERN_WARNING "ECN: new ECT codepoint %x out of mask\n",
+ einfo->ip_ect);
+ return 0;
+ }
+
+ if ((einfo->operation & (IPT_ECN_OP_SET_ECE|IPT_ECN_OP_SET_CWR))
+ && (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & IPT_INV_PROTO))) {
+ printk(KERN_WARNING "ECN: cannot use TCP operations on a "
+ "non-tcp rule\n");
+ return 0;
+ }
+
+ return 1;
+}
+
+static struct ipt_target ipt_ecn_reg = {
+ .name = "ECN",
+ .target = target,
+ .checkentry = checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ return ipt_register_target(&ipt_ecn_reg);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_target(&ipt_ecn_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
new file mode 100644
index 00000000000..ef08733d26d
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -0,0 +1,485 @@
+/*
+ * This is a module which is used for logging packets.
+ */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_LOG.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("iptables syslog logging module");
+
+static unsigned int nflog = 1;
+module_param(nflog, int, 0400);
+MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
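+
+/* Typical rule, assuming the matching LOG userspace extension:
+ *   iptables -A INPUT -p tcp --dport 22 --syn \
+ *     -j LOG --log-prefix "ssh: " --log-level info
+ * Output goes to the kernel log via printk, one line per packet. */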
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/* Use lock to serialize, so printks don't overlap */
+static DEFINE_SPINLOCK(log_lock);
+
+/* One level of recursion won't kill us */
+static void dump_packet(const struct ipt_log_info *info,
+ const struct sk_buff *skb,
+ unsigned int iphoff)
+{
+ struct iphdr _iph, *ih;
+
+ ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
+ if (ih == NULL) {
+ printk("TRUNCATED");
+ return;
+ }
+
+ /* Important fields:
+ * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
+ /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
+ printk("SRC=%u.%u.%u.%u DST=%u.%u.%u.%u ",
+ NIPQUAD(ih->saddr), NIPQUAD(ih->daddr));
+
+ /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
+ printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
+ ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
+ ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
+
+ /* Max length: 6 "CE DF MF " */
+ if (ntohs(ih->frag_off) & IP_CE)
+ printk("CE ");
+ if (ntohs(ih->frag_off) & IP_DF)
+ printk("DF ");
+ if (ntohs(ih->frag_off) & IP_MF)
+ printk("MF ");
+
+ /* Max length: 11 "FRAG:65535 " */
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
+
+ if ((info->logflags & IPT_LOG_IPOPT)
+ && ih->ihl * 4 > sizeof(struct iphdr)) {
+ unsigned char _opt[4 * 15 - sizeof(struct iphdr)], *op;
+ unsigned int i, optsize;
+
+ optsize = ih->ihl * 4 - sizeof(struct iphdr);
+ op = skb_header_pointer(skb, iphoff+sizeof(_iph),
+ optsize, _opt);
+ if (op == NULL) {
+ printk("TRUNCATED");
+ return;
+ }
+
+ /* Max length: 127 "OPT (" 15*4*2chars ") " */
+ printk("OPT (");
+ for (i = 0; i < optsize; i++)
+ printk("%02X", op[i]);
+ printk(") ");
+ }
+
+ switch (ih->protocol) {
+ case IPPROTO_TCP: {
+ struct tcphdr _tcph, *th;
+
+ /* Max length: 10 "PROTO=TCP " */
+ printk("PROTO=TCP ");
+
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ break;
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ th = skb_header_pointer(skb, iphoff + ih->ihl * 4,
+ sizeof(_tcph), &_tcph);
+ if (th == NULL) {
+ printk("INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl*4);
+ break;
+ }
+
+ /* Max length: 20 "SPT=65535 DPT=65535 " */
+ printk("SPT=%u DPT=%u ",
+ ntohs(th->source), ntohs(th->dest));
+ /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
+ if (info->logflags & IPT_LOG_TCPSEQ)
+ printk("SEQ=%u ACK=%u ",
+ ntohl(th->seq), ntohl(th->ack_seq));
+ /* Max length: 13 "WINDOW=65535 " */
+ printk("WINDOW=%u ", ntohs(th->window));
+ /* Max length: 9 "RES=0x3F " */
+ printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
+ /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
+ if (th->cwr)
+ printk("CWR ");
+ if (th->ece)
+ printk("ECE ");
+ if (th->urg)
+ printk("URG ");
+ if (th->ack)
+ printk("ACK ");
+ if (th->psh)
+ printk("PSH ");
+ if (th->rst)
+ printk("RST ");
+ if (th->syn)
+ printk("SYN ");
+ if (th->fin)
+ printk("FIN ");
+ /* Max length: 11 "URGP=65535 " */
+ printk("URGP=%u ", ntohs(th->urg_ptr));
+
+ if ((info->logflags & IPT_LOG_TCPOPT)
+ && th->doff * 4 > sizeof(struct tcphdr)) {
+ unsigned char _opt[4 * 15 - sizeof(struct tcphdr)];
+ unsigned char *op;
+ unsigned int i, optsize;
+
+ optsize = th->doff * 4 - sizeof(struct tcphdr);
+ op = skb_header_pointer(skb,
+ iphoff+ih->ihl*4+sizeof(_tcph),
+ optsize, _opt);
+ if (op == NULL) {
+ printk("TRUNCATED");
+ return;
+ }
+
+ /* Max length: 127 "OPT (" 15*4*2chars ") " */
+ printk("OPT (");
+ for (i = 0; i < optsize; i++)
+ printk("%02X", op[i]);
+ printk(") ");
+ }
+ break;
+ }
+ case IPPROTO_UDP: {
+ struct udphdr _udph, *uh;
+
+ /* Max length: 10 "PROTO=UDP " */
+ printk("PROTO=UDP ");
+
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ break;
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ uh = skb_header_pointer(skb, iphoff+ih->ihl*4,
+ sizeof(_udph), &_udph);
+ if (uh == NULL) {
+ printk("INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl*4);
+ break;
+ }
+
+ /* Max length: 20 "SPT=65535 DPT=65535 " */
+ printk("SPT=%u DPT=%u LEN=%u ",
+ ntohs(uh->source), ntohs(uh->dest),
+ ntohs(uh->len));
+ break;
+ }
+ case IPPROTO_ICMP: {
+ struct icmphdr _icmph, *ich;
+ static size_t required_len[NR_ICMP_TYPES+1]
+ = { [ICMP_ECHOREPLY] = 4,
+ [ICMP_DEST_UNREACH]
+ = 8 + sizeof(struct iphdr),
+ [ICMP_SOURCE_QUENCH]
+ = 8 + sizeof(struct iphdr),
+ [ICMP_REDIRECT]
+ = 8 + sizeof(struct iphdr),
+ [ICMP_ECHO] = 4,
+ [ICMP_TIME_EXCEEDED]
+ = 8 + sizeof(struct iphdr),
+ [ICMP_PARAMETERPROB]
+ = 8 + sizeof(struct iphdr),
+ [ICMP_TIMESTAMP] = 20,
+ [ICMP_TIMESTAMPREPLY] = 20,
+ [ICMP_ADDRESS] = 12,
+ [ICMP_ADDRESSREPLY] = 12 };
+
+ /* Max length: 11 "PROTO=ICMP " */
+ printk("PROTO=ICMP ");
+
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ break;
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
+ sizeof(_icmph), &_icmph);
+ if (ich == NULL) {
+ printk("INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl*4);
+ break;
+ }
+
+ /* Max length: 18 "TYPE=255 CODE=255 " */
+ printk("TYPE=%u CODE=%u ", ich->type, ich->code);
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ if (ich->type <= NR_ICMP_TYPES
+ && required_len[ich->type]
+ && skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
+ printk("INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl*4);
+ break;
+ }
+
+ switch (ich->type) {
+ case ICMP_ECHOREPLY:
+ case ICMP_ECHO:
+ /* Max length: 19 "ID=65535 SEQ=65535 " */
+ printk("ID=%u SEQ=%u ",
+ ntohs(ich->un.echo.id),
+ ntohs(ich->un.echo.sequence));
+ break;
+
+ case ICMP_PARAMETERPROB:
+ /* Max length: 14 "PARAMETER=255 " */
+ printk("PARAMETER=%u ",
+ ntohl(ich->un.gateway) >> 24);
+ break;
+ case ICMP_REDIRECT:
+ /* Max length: 24 "GATEWAY=255.255.255.255 " */
+ printk("GATEWAY=%u.%u.%u.%u ",
+ NIPQUAD(ich->un.gateway));
+ /* Fall through */
+ case ICMP_DEST_UNREACH:
+ case ICMP_SOURCE_QUENCH:
+ case ICMP_TIME_EXCEEDED:
+ /* Max length: 3+maxlen */
+ if (!iphoff) { /* Only recurse once. */
+ printk("[");
+ dump_packet(info, skb,
+ iphoff + ih->ihl*4+sizeof(_icmph));
+ printk("] ");
+ }
+
+ /* Max length: 10 "MTU=65535 " */
+ if (ich->type == ICMP_DEST_UNREACH
+ && ich->code == ICMP_FRAG_NEEDED)
+ printk("MTU=%u ", ntohs(ich->un.frag.mtu));
+ }
+ break;
+ }
+ /* Max Length */
+ case IPPROTO_AH: {
+ struct ip_auth_hdr _ahdr, *ah;
+
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ break;
+
+ /* Max length: 9 "PROTO=AH " */
+ printk("PROTO=AH ");
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
+ sizeof(_ahdr), &_ahdr);
+ if (ah == NULL) {
+ printk("INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl*4);
+ break;
+ }
+
+ /* Length: 15 "SPI=0xF1234567 " */
+ printk("SPI=0x%x ", ntohl(ah->spi));
+ break;
+ }
+ case IPPROTO_ESP: {
+ struct ip_esp_hdr _esph, *eh;
+
+ /* Max length: 10 "PROTO=ESP " */
+ printk("PROTO=ESP ");
+
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ break;
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
+ sizeof(_esph), &_esph);
+ if (eh == NULL) {
+ printk("INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl*4);
+ break;
+ }
+
+ /* Length: 15 "SPI=0xF1234567 " */
+ printk("SPI=0x%x ", ntohl(eh->spi));
+ break;
+ }
+ /* Max length: 10 "PROTO 255 " */
+ default:
+ printk("PROTO=%u ", ih->protocol);
+ }
+
+ /* Max length: 15 "UID=4294967295 " */
+ if ((info->logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
+ read_lock_bh(&skb->sk->sk_callback_lock);
+ if (skb->sk->sk_socket && skb->sk->sk_socket->file)
+ printk("UID=%u ", skb->sk->sk_socket->file->f_uid);
+ read_unlock_bh(&skb->sk->sk_callback_lock);
+ }
+
+ /* Proto Max log string length */
+ /* IP: 40+46+6+11+127 = 230 */
+ /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */
+ /* UDP: 10+max(25,20) = 35 */
+ /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */
+ /* ESP: 10+max(25)+15 = 50 */
+ /* AH: 9+max(25)+15 = 49 */
+ /* unknown: 10 */
+
+ /* (ICMP allows recursion one level deep) */
+ /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */
+ /* maxlen = 230+ 91 + 230 + 252 = 803 */
+}
+
+static void
+ipt_log_packet(unsigned int hooknum,
+ const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct ipt_log_info *loginfo,
+ const char *level_string,
+ const char *prefix)
+{
+ spin_lock_bh(&log_lock);
+ printk(level_string);
+ printk("%sIN=%s OUT=%s ",
+ prefix == NULL ? loginfo->prefix : prefix,
+ in ? in->name : "",
+ out ? out->name : "");
+#ifdef CONFIG_BRIDGE_NETFILTER
+ if (skb->nf_bridge) {
+ struct net_device *physindev = skb->nf_bridge->physindev;
+ struct net_device *physoutdev = skb->nf_bridge->physoutdev;
+
+ if (physindev && in != physindev)
+ printk("PHYSIN=%s ", physindev->name);
+ if (physoutdev && out != physoutdev)
+ printk("PHYSOUT=%s ", physoutdev->name);
+ }
+#endif
+
+ if (in && !out) {
+ /* MAC logging for input chain only. */
+ printk("MAC=");
+ if (skb->dev && skb->dev->hard_header_len
+ && skb->mac.raw != (void*)skb->nh.iph) {
+ int i;
+ unsigned char *p = skb->mac.raw;
+ for (i = 0; i < skb->dev->hard_header_len; i++,p++)
+ printk("%02x%c", *p,
+ i==skb->dev->hard_header_len - 1
+ ? ' ':':');
+ } else
+ printk(" ");
+ }
+
+ dump_packet(loginfo, skb, 0);
+ printk("\n");
+ spin_unlock_bh(&log_lock);
+}
+
+static unsigned int
+ipt_log_target(struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ unsigned int hooknum,
+ const void *targinfo,
+ void *userinfo)
+{
+ const struct ipt_log_info *loginfo = targinfo;
+ char level_string[4] = "< >";
+
+ level_string[1] = '0' + (loginfo->level % 8);
+ ipt_log_packet(hooknum, *pskb, in, out, loginfo, level_string, NULL);
+
+ return IPT_CONTINUE;
+}
+
+static void
+ipt_logfn(unsigned int hooknum,
+ const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const char *prefix)
+{
+ struct ipt_log_info loginfo = {
+ .level = 0,
+ .logflags = IPT_LOG_MASK,
+ .prefix = ""
+ };
+
+ ipt_log_packet(hooknum, skb, in, out, &loginfo, KERN_WARNING, prefix);
+}
+
+static int ipt_log_checkentry(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ const struct ipt_log_info *loginfo = targinfo;
+
+ if (targinfosize != IPT_ALIGN(sizeof(struct ipt_log_info))) {
+ DEBUGP("LOG: targinfosize %u != %u\n",
+ targinfosize, IPT_ALIGN(sizeof(struct ipt_log_info)));
+ return 0;
+ }
+
+ if (loginfo->level >= 8) {
+ DEBUGP("LOG: level %u >= 8\n", loginfo->level);
+ return 0;
+ }
+
+ if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') {
+ DEBUGP("LOG: prefix term %i\n",
+ loginfo->prefix[sizeof(loginfo->prefix)-1]);
+ return 0;
+ }
+
+ return 1;
+}
+
+static struct ipt_target ipt_log_reg = {
+ .name = "LOG",
+ .target = ipt_log_target,
+ .checkentry = ipt_log_checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ if (ipt_register_target(&ipt_log_reg))
+ return -EINVAL;
+ if (nflog)
+ nf_log_register(PF_INET, &ipt_logfn);
+
+ return 0;
+}
+
+static void __exit fini(void)
+{
+ if (nflog)
+ nf_log_unregister(PF_INET, &ipt_logfn);
+ ipt_unregister_target(&ipt_log_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_MARK.c b/net/ipv4/netfilter/ipt_MARK.c
new file mode 100644
index 00000000000..33c6f9b63b8
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_MARK.c
@@ -0,0 +1,162 @@
+/* This is a module which is used for setting the NFMARK field of an skb. */
+
+/* (C) 1999-2001 Marc Boucher <marc@mbsi.ca>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_MARK.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
+MODULE_DESCRIPTION("iptables MARK modification module");
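+
+/* Typical rule, assuming the matching MARK userspace extension:
+ *   iptables -t mangle -A PREROUTING -i ppp0 -j MARK --set-mark 0x1
+ * Revision 1 below additionally supports AND/OR-style mark updates
+ * (--and-mark / --or-mark in later userspace releases). */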
+
+static unsigned int
+target_v0(struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ unsigned int hooknum,
+ const void *targinfo,
+ void *userinfo)
+{
+ const struct ipt_mark_target_info *markinfo = targinfo;
+
+ if((*pskb)->nfmark != markinfo->mark) {
+ (*pskb)->nfmark = markinfo->mark;
+ (*pskb)->nfcache |= NFC_ALTERED;
+ }
+ return IPT_CONTINUE;
+}
+
+static unsigned int
+target_v1(struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ unsigned int hooknum,
+ const void *targinfo,
+ void *userinfo)
+{
+ const struct ipt_mark_target_info_v1 *markinfo = targinfo;
+ int mark = 0;
+
+ switch (markinfo->mode) {
+ case IPT_MARK_SET:
+ mark = markinfo->mark;
+ break;
+
+ case IPT_MARK_AND:
+ mark = (*pskb)->nfmark & markinfo->mark;
+ break;
+
+ case IPT_MARK_OR:
+ mark = (*pskb)->nfmark | markinfo->mark;
+ break;
+ }
+
+ if((*pskb)->nfmark != mark) {
+ (*pskb)->nfmark = mark;
+ (*pskb)->nfcache |= NFC_ALTERED;
+ }
+ return IPT_CONTINUE;
+}
+
+
+static int
+checkentry_v0(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info))) {
+ printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n",
+ targinfosize,
+ IPT_ALIGN(sizeof(struct ipt_mark_target_info)));
+ return 0;
+ }
+
+ if (strcmp(tablename, "mangle") != 0) {
+ printk(KERN_WARNING "MARK: can only be called from \"mangle\" table, not \"%s\"\n", tablename);
+ return 0;
+ }
+
+ return 1;
+}
+
+static int
+checkentry_v1(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ struct ipt_mark_target_info_v1 *markinfo = targinfo;
+
+ if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info_v1))){
+ printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n",
+ targinfosize,
+ IPT_ALIGN(sizeof(struct ipt_mark_target_info_v1)));
+ return 0;
+ }
+
+ if (strcmp(tablename, "mangle") != 0) {
+ printk(KERN_WARNING "MARK: can only be called from \"mangle\" table, not \"%s\"\n", tablename);
+ return 0;
+ }
+
+ if (markinfo->mode != IPT_MARK_SET
+ && markinfo->mode != IPT_MARK_AND
+ && markinfo->mode != IPT_MARK_OR) {
+ printk(KERN_WARNING "MARK: unknown mode %u\n",
+ markinfo->mode);
+ return 0;
+ }
+
+ return 1;
+}
+
+static struct ipt_target ipt_mark_reg_v0 = {
+ .name = "MARK",
+ .target = target_v0,
+ .checkentry = checkentry_v0,
+ .me = THIS_MODULE,
+ .revision = 0,
+};
+
+static struct ipt_target ipt_mark_reg_v1 = {
+ .name = "MARK",
+ .target = target_v1,
+ .checkentry = checkentry_v1,
+ .me = THIS_MODULE,
+ .revision = 1,
+};
+
+static int __init init(void)
+{
+ int err;
+
+ err = ipt_register_target(&ipt_mark_reg_v0);
+ if (!err) {
+ err = ipt_register_target(&ipt_mark_reg_v1);
+ if (err)
+ ipt_unregister_target(&ipt_mark_reg_v0);
+ }
+ return err;
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_target(&ipt_mark_reg_v0);
+ ipt_unregister_target(&ipt_mark_reg_v1);
+}
+
+module_init(init);
+module_exit(fini);
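
The revision-1 target above adds AND and OR combination modes on top of the plain overwrite that revision 0 supports. As a rough illustration of the mode semantics only, here is a minimal userspace sketch; the enum and helper names are invented for the example and are not the kernel's IPT_MARK_* definitions.

    /* Sketch of the three revision-1 mark modes handled in target_v1() above;
     * enum and helper are invented, not the kernel's IPT_MARK_* symbols. */
    #include <stdio.h>

    enum mark_mode { MODE_SET, MODE_AND, MODE_OR };

    static unsigned long apply_mark(enum mark_mode mode, unsigned long nfmark,
                                    unsigned long rule_mark)
    {
        switch (mode) {
        case MODE_SET:
            return rule_mark;              /* overwrite the whole mark */
        case MODE_AND:
            return nfmark & rule_mark;     /* keep only the masked bits */
        case MODE_OR:
            return nfmark | rule_mark;     /* set extra bits, keep the rest */
        }
        return nfmark;
    }

    int main(void)
    {
        unsigned long mark = 0x0f;

        printf("SET 0x20 -> %#lx\n", apply_mark(MODE_SET, mark, 0x20));
        printf("AND 0x03 -> %#lx\n", apply_mark(MODE_AND, mark, 0x03));
        printf("OR  0x10 -> %#lx\n", apply_mark(MODE_OR,  mark, 0x10));
        return 0;
    }

AND is typically used to clear bits of an existing mark and OR to add bits without disturbing the rest, which is why only revision 1 carries a mode field.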
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
new file mode 100644
index 00000000000..57e9f6cf1c3
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -0,0 +1,207 @@
+/* Masquerade. Simple mapping which alters range to a local IP address
+ (depending on route). */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/timer.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <net/protocol.h>
+#include <net/ip.h>
+#include <net/checksum.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("iptables MASQUERADE target module");
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/* Lock protects masq region inside conntrack */
+static DECLARE_RWLOCK(masq_lock);
+
+/* FIXME: Multiple targets. --RR */
+static int
+masquerade_check(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ const struct ip_nat_multi_range_compat *mr = targinfo;
+
+ if (strcmp(tablename, "nat") != 0) {
+ DEBUGP("masquerade_check: bad table `%s'.\n", tablename);
+ return 0;
+ }
+ if (targinfosize != IPT_ALIGN(sizeof(*mr))) {
+ DEBUGP("masquerade_check: size %u != %u.\n",
+ targinfosize, sizeof(*mr));
+ return 0;
+ }
+ if (hook_mask & ~(1 << NF_IP_POST_ROUTING)) {
+ DEBUGP("masquerade_check: bad hooks %x.\n", hook_mask);
+ return 0;
+ }
+ if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
+ DEBUGP("masquerade_check: bad MAP_IPS.\n");
+ return 0;
+ }
+ if (mr->rangesize != 1) {
+ DEBUGP("masquerade_check: bad rangesize %u.\n", mr->rangesize);
+ return 0;
+ }
+ return 1;
+}
+
+static unsigned int
+masquerade_target(struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ unsigned int hooknum,
+ const void *targinfo,
+ void *userinfo)
+{
+ struct ip_conntrack *ct;
+ enum ip_conntrack_info ctinfo;
+ const struct ip_nat_multi_range_compat *mr;
+ struct ip_nat_range newrange;
+ struct rtable *rt;
+ u_int32_t newsrc;
+
+ IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING);
+
+ /* FIXME: For the moment, don't do local packets, breaks
+ testsuite for 2.3.49 --RR */
+ if ((*pskb)->sk)
+ return NF_ACCEPT;
+
+ ct = ip_conntrack_get(*pskb, &ctinfo);
+ IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED
+ || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY));
+
+ mr = targinfo;
+ rt = (struct rtable *)(*pskb)->dst;
+ newsrc = inet_select_addr(out, rt->rt_gateway, RT_SCOPE_UNIVERSE);
+ if (!newsrc) {
+ printk("MASQUERADE: %s ate my IP address\n", out->name);
+ return NF_DROP;
+ }
+
+ WRITE_LOCK(&masq_lock);
+ ct->nat.masq_index = out->ifindex;
+ WRITE_UNLOCK(&masq_lock);
+
+ /* Transfer from original range. */
+ newrange = ((struct ip_nat_range)
+ { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS,
+ newsrc, newsrc,
+ mr->range[0].min, mr->range[0].max });
+
+ /* Hand modified range to generic setup. */
+ return ip_nat_setup_info(ct, &newrange, hooknum);
+}
+
+static inline int
+device_cmp(struct ip_conntrack *i, void *ifindex)
+{
+ int ret;
+
+ READ_LOCK(&masq_lock);
+ ret = (i->nat.masq_index == (int)(long)ifindex);
+ READ_UNLOCK(&masq_lock);
+
+ return ret;
+}
+
+static int masq_device_event(struct notifier_block *this,
+ unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = ptr;
+
+ if (event == NETDEV_DOWN) {
+ /* Device was downed. Search entire table for
+ conntracks which were associated with that device,
+ and forget them. */
+ IP_NF_ASSERT(dev->ifindex != 0);
+
+ ip_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex);
+ }
+
+ return NOTIFY_DONE;
+}
+
+static int masq_inet_event(struct notifier_block *this,
+ unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
+
+ if (event == NETDEV_DOWN) {
+ /* IP address was deleted. Search entire table for
+ conntracks which were associated with that device,
+ and forget them. */
+ IP_NF_ASSERT(dev->ifindex != 0);
+
+ ip_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex);
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block masq_dev_notifier = {
+ .notifier_call = masq_device_event,
+};
+
+static struct notifier_block masq_inet_notifier = {
+ .notifier_call = masq_inet_event,
+};
+
+static struct ipt_target masquerade = {
+ .name = "MASQUERADE",
+ .target = masquerade_target,
+ .checkentry = masquerade_check,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ int ret;
+
+ ret = ipt_register_target(&masquerade);
+
+ if (ret == 0) {
+ /* Register for device down reports */
+ register_netdevice_notifier(&masq_dev_notifier);
+ /* Register IP address change reports */
+ register_inetaddr_notifier(&masq_inet_notifier);
+ }
+
+ return ret;
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_target(&masquerade);
+ unregister_netdevice_notifier(&masq_dev_notifier);
+ unregister_inetaddr_notifier(&masq_inet_notifier);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
new file mode 100644
index 00000000000..06254b29d03
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_NETMAP.c
@@ -0,0 +1,117 @@
+/* NETMAP - static NAT mapping of IP network addresses (1:1).
+ * The mapping can be applied to source (POSTROUTING),
+ * destination (PREROUTING), or both (with separate rules).
+ */
+
+/* (C) 2000-2001 Svenning Soerensen <svenning@post5.tele.dk>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/config.h>
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+
+#define MODULENAME "NETMAP"
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Svenning Soerensen <svenning@post5.tele.dk>");
+MODULE_DESCRIPTION("iptables 1:1 NAT mapping of IP networks target");
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+static int
+check(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ const struct ip_nat_multi_range_compat *mr = targinfo;
+
+ if (strcmp(tablename, "nat") != 0) {
+ DEBUGP(MODULENAME":check: bad table `%s'.\n", tablename);
+ return 0;
+ }
+ if (targinfosize != IPT_ALIGN(sizeof(*mr))) {
+ DEBUGP(MODULENAME":check: size %u.\n", targinfosize);
+ return 0;
+ }
+ if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING))) {
+ DEBUGP(MODULENAME":check: bad hooks %x.\n", hook_mask);
+ return 0;
+ }
+ if (!(mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)) {
+ DEBUGP(MODULENAME":check: bad MAP_IPS.\n");
+ return 0;
+ }
+ if (mr->rangesize != 1) {
+ DEBUGP(MODULENAME":check: bad rangesize %u.\n", mr->rangesize);
+ return 0;
+ }
+ return 1;
+}
+
+static unsigned int
+target(struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ unsigned int hooknum,
+ const void *targinfo,
+ void *userinfo)
+{
+ struct ip_conntrack *ct;
+ enum ip_conntrack_info ctinfo;
+ u_int32_t new_ip, netmask;
+ const struct ip_nat_multi_range_compat *mr = targinfo;
+ struct ip_nat_range newrange;
+
+ IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
+ || hooknum == NF_IP_POST_ROUTING);
+ ct = ip_conntrack_get(*pskb, &ctinfo);
+
+ netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
+
+ if (hooknum == NF_IP_PRE_ROUTING)
+ new_ip = (*pskb)->nh.iph->daddr & ~netmask;
+ else
+ new_ip = (*pskb)->nh.iph->saddr & ~netmask;
+ new_ip |= mr->range[0].min_ip & netmask;
+
+ newrange = ((struct ip_nat_range)
+ { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS,
+ new_ip, new_ip,
+ mr->range[0].min, mr->range[0].max });
+
+ /* Hand modified range to generic setup. */
+ return ip_nat_setup_info(ct, &newrange, hooknum);
+}
+
+static struct ipt_target target_module = {
+ .name = MODULENAME,
+ .target = target,
+ .checkentry = check,
+ .me = THIS_MODULE
+};
+
+static int __init init(void)
+{
+ return ipt_register_target(&target_module);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_target(&target_module);
+}
+
+module_init(init);
+module_exit(fini);
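
The rewrite in target() above derives a netmask from the rule's min/max addresses and splices the packet's host bits onto the range's network bits. A minimal userspace sketch of that arithmetic follows; the helper name and example addresses are invented, and values are kept in host byte order for readability (the kernel does the same bitwise ops on network-order values).

    /* Sketch of the 1:1 NETMAP address arithmetic, userspace only. */
    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>

    /* Map old_ip into min_ip..max_ip while preserving the host bits,
     * the same mask/merge that target() performs. */
    static uint32_t netmap_addr(uint32_t old_ip, uint32_t min_ip, uint32_t max_ip)
    {
        uint32_t netmask = ~(min_ip ^ max_ip);  /* common prefix of the range */

        return (old_ip & ~netmask) | (min_ip & netmask);
    }

    int main(void)
    {
        /* example: 192.168.1.17 mapped into 10.0.0.0-10.0.0.255 -> 10.0.0.17 */
        uint32_t new_ip = netmap_addr(ntohl(inet_addr("192.168.1.17")),
                                      ntohl(inet_addr("10.0.0.0")),
                                      ntohl(inet_addr("10.0.0.255")));
        struct in_addr a = { .s_addr = htonl(new_ip) };

        printf("mapped to %s\n", inet_ntoa(a));
        return 0;
    }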
diff --git a/net/ipv4/netfilter/ipt_NOTRACK.c b/net/ipv4/netfilter/ipt_NOTRACK.c
new file mode 100644
index 00000000000..a4bb9b3bc29
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_NOTRACK.c
@@ -0,0 +1,76 @@
+/* This is a module which is used for setting up fake conntracks
+ * on packets so that they are not seen by the conntrack/NAT code.
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+
+static unsigned int
+target(struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ unsigned int hooknum,
+ const void *targinfo,
+ void *userinfo)
+{
+ /* Previously seen (loopback)? Ignore. */
+ if ((*pskb)->nfct != NULL)
+ return IPT_CONTINUE;
+
+ /* Attach fake conntrack entry.
+ If there is a real ct entry corresponding to this packet,
+ it'll hang around until it times out. We don't deal with it
+ for performance reasons. JK */
+ (*pskb)->nfct = &ip_conntrack_untracked.ct_general;
+ (*pskb)->nfctinfo = IP_CT_NEW;
+ nf_conntrack_get((*pskb)->nfct);
+
+ return IPT_CONTINUE;
+}
+
+static int
+checkentry(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ if (targinfosize != 0) {
+ printk(KERN_WARNING "NOTRACK: targinfosize %u != 0\n",
+ targinfosize);
+ return 0;
+ }
+
+ if (strcmp(tablename, "raw") != 0) {
+ printk(KERN_WARNING "NOTRACK: can only be called from \"raw\" table, not \"%s\"\n", tablename);
+ return 0;
+ }
+
+ return 1;
+}
+
+static struct ipt_target ipt_notrack_reg = {
+ .name = "NOTRACK",
+ .target = target,
+ .checkentry = checkentry,
+ .me = THIS_MODULE
+};
+
+static int __init init(void)
+{
+ if (ipt_register_target(&ipt_notrack_reg))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_target(&ipt_notrack_reg);
+}
+
+module_init(init);
+module_exit(fini);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
new file mode 100644
index 00000000000..d2e13447678
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_REDIRECT.c
@@ -0,0 +1,129 @@
+/* Redirect. Simple mapping which alters dst to a local IP address. */
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/timer.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/netdevice.h>
+#include <linux/if.h>
+#include <linux/inetdevice.h>
+#include <net/protocol.h>
+#include <net/checksum.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("iptables REDIRECT target module");
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/* FIXME: Take multiple ranges --RR */
+static int
+redirect_check(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ const struct ip_nat_multi_range_compat *mr = targinfo;
+
+ if (strcmp(tablename, "nat") != 0) {
+ DEBUGP("redirect_check: bad table `%s'.\n", tablename);
+ return 0;
+ }
+ if (targinfosize != IPT_ALIGN(sizeof(*mr))) {
+ DEBUGP("redirect_check: size %u.\n", targinfosize);
+ return 0;
+ }
+ if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT))) {
+ DEBUGP("redirect_check: bad hooks %x.\n", hook_mask);
+ return 0;
+ }
+ if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
+ DEBUGP("redirect_check: bad MAP_IPS.\n");
+ return 0;
+ }
+ if (mr->rangesize != 1) {
+ DEBUGP("redirect_check: bad rangesize %u.\n", mr->rangesize);
+ return 0;
+ }
+ return 1;
+}
+
+static unsigned int
+redirect_target(struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ unsigned int hooknum,
+ const void *targinfo,
+ void *userinfo)
+{
+ struct ip_conntrack *ct;
+ enum ip_conntrack_info ctinfo;
+ u_int32_t newdst;
+ const struct ip_nat_multi_range_compat *mr = targinfo;
+ struct ip_nat_range newrange;
+
+ IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
+ || hooknum == NF_IP_LOCAL_OUT);
+
+ ct = ip_conntrack_get(*pskb, &ctinfo);
+ IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
+
+ /* Local packets: make them go to loopback */
+ if (hooknum == NF_IP_LOCAL_OUT)
+ newdst = htonl(0x7F000001);
+ else {
+ struct in_device *indev;
+
+ /* Device might not have an associated in_device. */
+ indev = (struct in_device *)(*pskb)->dev->ip_ptr;
+ if (indev == NULL || indev->ifa_list == NULL)
+ return NF_DROP;
+
+ /* Grab first address on interface. */
+ newdst = indev->ifa_list->ifa_local;
+ }
+
+ /* Transfer from original range. */
+ newrange = ((struct ip_nat_range)
+ { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS,
+ newdst, newdst,
+ mr->range[0].min, mr->range[0].max });
+
+ /* Hand modified range to generic setup. */
+ return ip_nat_setup_info(ct, &newrange, hooknum);
+}
+
+static struct ipt_target redirect_reg = {
+ .name = "REDIRECT",
+ .target = redirect_target,
+ .checkentry = redirect_check,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ return ipt_register_target(&redirect_reg);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_target(&redirect_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
new file mode 100644
index 00000000000..266d6497928
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -0,0 +1,335 @@
+/*
+ * This is a module which is used for rejecting packets.
+ * Added support for customized reject packets (Jozsef Kadlecsik).
+ * Added support for ICMP type-3-code-13 (Maciej Soltysiak). [RFC 1812]
+ */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/route.h>
+#include <net/dst.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_REJECT.h>
+#ifdef CONFIG_BRIDGE_NETFILTER
+#include <linux/netfilter_bridge.h>
+#endif
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("iptables REJECT target module");
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+static inline struct rtable *route_reverse(struct sk_buff *skb,
+ struct tcphdr *tcph, int hook)
+{
+ struct iphdr *iph = skb->nh.iph;
+ struct dst_entry *odst;
+ struct flowi fl = {};
+ struct rtable *rt;
+
+ /* We don't require ip forwarding to be enabled to be able to
+ * send a RST reply for bridged traffic. */
+ if (hook != NF_IP_FORWARD
+#ifdef CONFIG_BRIDGE_NETFILTER
+ || (skb->nf_bridge && skb->nf_bridge->mask & BRNF_BRIDGED)
+#endif
+ ) {
+ fl.nl_u.ip4_u.daddr = iph->saddr;
+ if (hook == NF_IP_LOCAL_IN)
+ fl.nl_u.ip4_u.saddr = iph->daddr;
+ fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
+
+ if (ip_route_output_key(&rt, &fl) != 0)
+ return NULL;
+ } else {
+ /* non-local src, find valid iif to satisfy
+ * rp-filter when calling ip_route_input. */
+ fl.nl_u.ip4_u.daddr = iph->daddr;
+ if (ip_route_output_key(&rt, &fl) != 0)
+ return NULL;
+
+ odst = skb->dst;
+ if (ip_route_input(skb, iph->saddr, iph->daddr,
+ RT_TOS(iph->tos), rt->u.dst.dev) != 0) {
+ dst_release(&rt->u.dst);
+ return NULL;
+ }
+ dst_release(&rt->u.dst);
+ rt = (struct rtable *)skb->dst;
+ skb->dst = odst;
+
+ fl.nl_u.ip4_u.daddr = iph->saddr;
+ fl.nl_u.ip4_u.saddr = iph->daddr;
+ fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
+ }
+
+ if (rt->u.dst.error) {
+ dst_release(&rt->u.dst);
+ return NULL;
+ }
+
+ fl.proto = IPPROTO_TCP;
+ fl.fl_ip_sport = tcph->dest;
+ fl.fl_ip_dport = tcph->source;
+
+ if (xfrm_lookup((struct dst_entry **)&rt, &fl, NULL, 0)) {
+ dst_release(&rt->u.dst);
+ rt = NULL;
+ }
+
+ return rt;
+}
+
+/* Send RST reply */
+static void send_reset(struct sk_buff *oldskb, int hook)
+{
+ struct sk_buff *nskb;
+ struct tcphdr _otcph, *oth, *tcph;
+ struct rtable *rt;
+ u_int16_t tmp_port;
+ u_int32_t tmp_addr;
+ int needs_ack;
+ int hh_len;
+
+ /* IP header checks: fragment. */
+ if (oldskb->nh.iph->frag_off & htons(IP_OFFSET))
+ return;
+
+ oth = skb_header_pointer(oldskb, oldskb->nh.iph->ihl * 4,
+ sizeof(_otcph), &_otcph);
+ if (oth == NULL)
+ return;
+
+ /* No RST for RST. */
+ if (oth->rst)
+ return;
+
+ /* FIXME: Check checksum --RR */
+ if ((rt = route_reverse(oldskb, oth, hook)) == NULL)
+ return;
+
+ hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
+
+ /* We need a linear, writeable skb. We also need to expand
+ headroom in case hh_len of incoming interface < hh_len of
+ outgoing interface */
+ nskb = skb_copy_expand(oldskb, hh_len, skb_tailroom(oldskb),
+ GFP_ATOMIC);
+ if (!nskb) {
+ dst_release(&rt->u.dst);
+ return;
+ }
+
+ dst_release(nskb->dst);
+ nskb->dst = &rt->u.dst;
+
+ /* This packet will not be the same as the other: clear nf fields */
+ nf_reset(nskb);
+ nskb->nfcache = 0;
+ nskb->nfmark = 0;
+#ifdef CONFIG_BRIDGE_NETFILTER
+ nf_bridge_put(nskb->nf_bridge);
+ nskb->nf_bridge = NULL;
+#endif
+
+ tcph = (struct tcphdr *)((u_int32_t*)nskb->nh.iph + nskb->nh.iph->ihl);
+
+ /* Swap source and dest */
+ tmp_addr = nskb->nh.iph->saddr;
+ nskb->nh.iph->saddr = nskb->nh.iph->daddr;
+ nskb->nh.iph->daddr = tmp_addr;
+ tmp_port = tcph->source;
+ tcph->source = tcph->dest;
+ tcph->dest = tmp_port;
+
+ /* Truncate to length (no data) */
+ tcph->doff = sizeof(struct tcphdr)/4;
+ skb_trim(nskb, nskb->nh.iph->ihl*4 + sizeof(struct tcphdr));
+ nskb->nh.iph->tot_len = htons(nskb->len);
+
+ if (tcph->ack) {
+ needs_ack = 0;
+ tcph->seq = oth->ack_seq;
+ tcph->ack_seq = 0;
+ } else {
+ needs_ack = 1;
+ tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin
+ + oldskb->len - oldskb->nh.iph->ihl*4
+ - (oth->doff<<2));
+ tcph->seq = 0;
+ }
+
+ /* Reset flags */
+ ((u_int8_t *)tcph)[13] = 0;
+ tcph->rst = 1;
+ tcph->ack = needs_ack;
+
+ tcph->window = 0;
+ tcph->urg_ptr = 0;
+
+ /* Adjust TCP checksum */
+ tcph->check = 0;
+ tcph->check = tcp_v4_check(tcph, sizeof(struct tcphdr),
+ nskb->nh.iph->saddr,
+ nskb->nh.iph->daddr,
+ csum_partial((char *)tcph,
+ sizeof(struct tcphdr), 0));
+
+ /* Adjust IP TTL, DF */
+ nskb->nh.iph->ttl = MAXTTL;
+ /* Set DF, id = 0 */
+ nskb->nh.iph->frag_off = htons(IP_DF);
+ nskb->nh.iph->id = 0;
+
+ /* Adjust IP checksum */
+ nskb->nh.iph->check = 0;
+ nskb->nh.iph->check = ip_fast_csum((unsigned char *)nskb->nh.iph,
+ nskb->nh.iph->ihl);
+
+ /* "Never happens" */
+ if (nskb->len > dst_mtu(nskb->dst))
+ goto free_nskb;
+
+ nf_ct_attach(nskb, oldskb);
+
+ NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, nskb, NULL, nskb->dst->dev,
+ dst_output);
+ return;
+
+ free_nskb:
+ kfree_skb(nskb);
+}
+
+static inline void send_unreach(struct sk_buff *skb_in, int code)
+{
+ icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
+}
+
+static unsigned int reject(struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ unsigned int hooknum,
+ const void *targinfo,
+ void *userinfo)
+{
+ const struct ipt_reject_info *reject = targinfo;
+
+ /* Our naive response construction doesn't deal with IP
+ options, and probably shouldn't try. */
+ if ((*pskb)->nh.iph->ihl<<2 != sizeof(struct iphdr))
+ return NF_DROP;
+
+ /* WARNING: This code causes reentry within iptables.
+ This means that the iptables jump stack is now crap. We
+ must return an absolute verdict. --RR */
+ switch (reject->with) {
+ case IPT_ICMP_NET_UNREACHABLE:
+ send_unreach(*pskb, ICMP_NET_UNREACH);
+ break;
+ case IPT_ICMP_HOST_UNREACHABLE:
+ send_unreach(*pskb, ICMP_HOST_UNREACH);
+ break;
+ case IPT_ICMP_PROT_UNREACHABLE:
+ send_unreach(*pskb, ICMP_PROT_UNREACH);
+ break;
+ case IPT_ICMP_PORT_UNREACHABLE:
+ send_unreach(*pskb, ICMP_PORT_UNREACH);
+ break;
+ case IPT_ICMP_NET_PROHIBITED:
+ send_unreach(*pskb, ICMP_NET_ANO);
+ break;
+ case IPT_ICMP_HOST_PROHIBITED:
+ send_unreach(*pskb, ICMP_HOST_ANO);
+ break;
+ case IPT_ICMP_ADMIN_PROHIBITED:
+ send_unreach(*pskb, ICMP_PKT_FILTERED);
+ break;
+ case IPT_TCP_RESET:
+ send_reset(*pskb, hooknum);
+ case IPT_ICMP_ECHOREPLY:
+ /* Doesn't happen. */
+ break;
+ }
+
+ return NF_DROP;
+}
+
+static int check(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ const struct ipt_reject_info *rejinfo = targinfo;
+
+ if (targinfosize != IPT_ALIGN(sizeof(struct ipt_reject_info))) {
+ DEBUGP("REJECT: targinfosize %u != %Zu\n",
+ targinfosize, IPT_ALIGN(sizeof(struct ipt_reject_info)));
+ return 0;
+ }
+
+ /* Only allow these for packet filtering. */
+ if (strcmp(tablename, "filter") != 0) {
+ DEBUGP("REJECT: bad table `%s'.\n", tablename);
+ return 0;
+ }
+ if ((hook_mask & ~((1 << NF_IP_LOCAL_IN)
+ | (1 << NF_IP_FORWARD)
+ | (1 << NF_IP_LOCAL_OUT))) != 0) {
+ DEBUGP("REJECT: bad hook mask %X\n", hook_mask);
+ return 0;
+ }
+
+ if (rejinfo->with == IPT_ICMP_ECHOREPLY) {
+ printk("REJECT: ECHOREPLY no longer supported.\n");
+ return 0;
+ } else if (rejinfo->with == IPT_TCP_RESET) {
+ /* Must specify that it's a TCP packet */
+ if (e->ip.proto != IPPROTO_TCP
+ || (e->ip.invflags & IPT_INV_PROTO)) {
+ DEBUGP("REJECT: TCP_RESET invalid for non-tcp\n");
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+static struct ipt_target ipt_reject_reg = {
+ .name = "REJECT",
+ .target = reject,
+ .checkentry = check,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ return ipt_register_target(&ipt_reject_reg);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_target(&ipt_reject_reg);
+}
+
+module_init(init);
+module_exit(fini);
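
The sequence numbers chosen in send_reset() above follow the usual RST rules: echo the offending segment's ACK value when one is present, otherwise acknowledge everything that segment consumed (SYN and FIN each count as one sequence number). A small userspace sketch of just that selection; the struct and function names are invented and values are host order.

    /* Sketch of the sequence-number selection in send_reset() above. */
    #include <stdio.h>
    #include <stdint.h>

    struct rst_nums {
        uint32_t seq;
        uint32_t ack_seq;
        int needs_ack;
    };

    /* orig_* describe the segment being rejected; payload_len is its TCP
     * payload length in bytes. */
    static struct rst_nums pick_rst_nums(int orig_has_ack, uint32_t orig_seq,
                                         uint32_t orig_ack_seq, int orig_syn,
                                         int orig_fin, uint32_t payload_len)
    {
        struct rst_nums r;

        if (orig_has_ack) {
            r.needs_ack = 0;
            r.seq = orig_ack_seq;  /* resume at what the sender expected */
            r.ack_seq = 0;
        } else {
            r.needs_ack = 1;
            r.seq = 0;
            /* acknowledge everything the offending segment consumed */
            r.ack_seq = orig_seq + orig_syn + orig_fin + payload_len;
        }
        return r;
    }

    int main(void)
    {
        /* e.g. rejecting a bare SYN with sequence number 1000 */
        struct rst_nums r = pick_rst_nums(0, 1000, 0, 1, 0, 0);

        printf("seq=%u ack_seq=%u ack-flag=%d\n", r.seq, r.ack_seq, r.needs_ack);
        return 0;
    }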
diff --git a/net/ipv4/netfilter/ipt_SAME.c b/net/ipv4/netfilter/ipt_SAME.c
new file mode 100644
index 00000000000..7a0536d864a
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_SAME.c
@@ -0,0 +1,211 @@
+/* Same. Just like SNAT, only try to make the connections
+ * between client A and server B always have the same source ip.
+ *
+ * (C) 2000 Paul `Rusty' Russell
+ * (C) 2001 Martin Josefsson
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * 010320 Martin Josefsson <gandalf@wlug.westbo.se>
+ * * copied ipt_BALANCE.c to ipt_SAME.c and changed a few things.
+ * 010728 Martin Josefsson <gandalf@wlug.westbo.se>
+ * * added --nodst to not include destination-ip in new source
+ * calculations.
+ * * added some more sanity-checks.
+ * 010729 Martin Josefsson <gandalf@wlug.westbo.se>
+ * * fixed a buggy if-statement in same_check(), should have
+ * used ntohl() but didn't.
+ * * added support for multiple ranges. IPT_SAME_MAX_RANGE is
+ * defined in linux/include/linux/netfilter_ipv4/ipt_SAME.h
+ * and is currently set to 10.
+ * * added support for 1-address range, nice to have now that
+ * we have multiple ranges.
+ */
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/timer.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/netdevice.h>
+#include <linux/if.h>
+#include <linux/inetdevice.h>
+#include <net/protocol.h>
+#include <net/checksum.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ipt_SAME.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Martin Josefsson <gandalf@wlug.westbo.se>");
+MODULE_DESCRIPTION("iptables special SNAT module for consistent sourceip");
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+static int
+same_check(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ unsigned int count, countess, rangeip, index = 0;
+ struct ipt_same_info *mr = targinfo;
+
+ mr->ipnum = 0;
+
+ if (strcmp(tablename, "nat") != 0) {
+ DEBUGP("same_check: bad table `%s'.\n", tablename);
+ return 0;
+ }
+ if (targinfosize != IPT_ALIGN(sizeof(*mr))) {
+ DEBUGP("same_check: size %u.\n", targinfosize);
+ return 0;
+ }
+ if (hook_mask & ~(1 << NF_IP_PRE_ROUTING | 1 << NF_IP_POST_ROUTING)) {
+ DEBUGP("same_check: bad hooks %x.\n", hook_mask);
+ return 0;
+ }
+ if (mr->rangesize < 1) {
+ DEBUGP("same_check: need at least one dest range.\n");
+ return 0;
+ }
+ if (mr->rangesize > IPT_SAME_MAX_RANGE) {
+ DEBUGP("same_check: too many ranges specified, maximum "
+ "is %u ranges\n",
+ IPT_SAME_MAX_RANGE);
+ return 0;
+ }
+ for (count = 0; count < mr->rangesize; count++) {
+ if (ntohl(mr->range[count].min_ip) >
+ ntohl(mr->range[count].max_ip)) {
+ DEBUGP("same_check: min_ip is larger than max_ip in "
+ "range `%u.%u.%u.%u-%u.%u.%u.%u'.\n",
+ NIPQUAD(mr->range[count].min_ip),
+ NIPQUAD(mr->range[count].max_ip));
+ return 0;
+ }
+ if (!(mr->range[count].flags & IP_NAT_RANGE_MAP_IPS)) {
+ DEBUGP("same_check: bad MAP_IPS.\n");
+ return 0;
+ }
+ rangeip = (ntohl(mr->range[count].max_ip) -
+ ntohl(mr->range[count].min_ip) + 1);
+ mr->ipnum += rangeip;
+
+ DEBUGP("same_check: range %u, ipnum = %u\n", count, rangeip);
+ }
+ DEBUGP("same_check: total ipaddresses = %u\n", mr->ipnum);
+
+ mr->iparray = kmalloc((sizeof(u_int32_t) * mr->ipnum), GFP_KERNEL);
+ if (!mr->iparray) {
+ DEBUGP("same_check: Couldn't allocate %u bytes "
+ "for %u ipaddresses!\n",
+ (sizeof(u_int32_t) * mr->ipnum), mr->ipnum);
+ return 0;
+ }
+ DEBUGP("same_check: Allocated %u bytes for %u ipaddresses.\n",
+ (sizeof(u_int32_t) * mr->ipnum), mr->ipnum);
+
+ for (count = 0; count < mr->rangesize; count++) {
+ for (countess = ntohl(mr->range[count].min_ip);
+ countess <= ntohl(mr->range[count].max_ip);
+ countess++) {
+ mr->iparray[index] = countess;
+ DEBUGP("same_check: Added ipaddress `%u.%u.%u.%u' "
+ "in index %u.\n",
+ HIPQUAD(countess), index);
+ index++;
+ }
+ }
+ return 1;
+}
+
+static void
+same_destroy(void *targinfo,
+ unsigned int targinfosize)
+{
+ struct ipt_same_info *mr = targinfo;
+
+ kfree(mr->iparray);
+
+ DEBUGP("same_destroy: Deallocated %u bytes for %u ipaddresses.\n",
+ (sizeof(u_int32_t) * mr->ipnum), mr->ipnum);
+}
+
+static unsigned int
+same_target(struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ unsigned int hooknum,
+ const void *targinfo,
+ void *userinfo)
+{
+ struct ip_conntrack *ct;
+ enum ip_conntrack_info ctinfo;
+ u_int32_t tmpip, aindex, new_ip;
+ const struct ipt_same_info *same = targinfo;
+ struct ip_nat_range newrange;
+ const struct ip_conntrack_tuple *t;
+
+ IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING ||
+ hooknum == NF_IP_POST_ROUTING);
+ ct = ip_conntrack_get(*pskb, &ctinfo);
+
+ t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+
+ /* Base new source on real src ip and optionally dst ip,
+ giving some hope for consistency across reboots.
+ Here we calculate the index in same->iparray which
+ holds the ipaddress we should use */
+
+ tmpip = ntohl(t->src.ip);
+
+ if (!(same->info & IPT_SAME_NODST))
+ tmpip += ntohl(t->dst.ip);
+
+ aindex = tmpip % same->ipnum;
+
+ new_ip = htonl(same->iparray[aindex]);
+
+ DEBUGP("ipt_SAME: src=%u.%u.%u.%u dst=%u.%u.%u.%u, "
+ "new src=%u.%u.%u.%u\n",
+ NIPQUAD(t->src.ip), NIPQUAD(t->dst.ip),
+ NIPQUAD(new_ip));
+
+ /* Transfer from original range. */
+ newrange = ((struct ip_nat_range)
+ { same->range[0].flags, new_ip, new_ip,
+ /* FIXME: Use ports from correct range! */
+ same->range[0].min, same->range[0].max });
+
+ /* Hand modified range to generic setup. */
+ return ip_nat_setup_info(ct, &newrange, hooknum);
+}
+
+static struct ipt_target same_reg = {
+ .name = "SAME",
+ .target = same_target,
+ .checkentry = same_check,
+ .destroy = same_destroy,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ return ipt_register_target(&same_reg);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_target(&same_reg);
+}
+
+module_init(init);
+module_exit(fini);
+
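
same_target() above picks the new source by hashing the original source (and, unless --nodst was given, the destination) into the pre-expanded address array, so a given client keeps hitting the same pool address. A minimal userspace sketch of that selection; names and the example pool are invented, addresses are plain host-order integers.

    /* Sketch of the source selection in same_target() above. */
    #include <stdio.h>
    #include <stdint.h>

    static uint32_t pick_source(uint32_t src, uint32_t dst, int nodst,
                                const uint32_t *iparray, uint32_t ipnum)
    {
        uint32_t key = src;

        if (!nodst)            /* --nodst leaves the destination out of the key */
            key += dst;

        return iparray[key % ipnum];
    }

    int main(void)
    {
        /* a four-address pool, pre-expanded the way same_check() does it */
        uint32_t pool[4] = { 0x0a000001, 0x0a000002, 0x0a000003, 0x0a000004 };
        uint32_t src = 0xc0a80105, dst = 0xc6336401;

        /* the same (src, dst) pair always lands on the same pool entry */
        printf("first: 0x%08x\n", (unsigned)pick_source(src, dst, 0, pool, 4));
        printf("again: 0x%08x\n", (unsigned)pick_source(src, dst, 0, pool, 4));
        return 0;
    }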
diff --git a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c
new file mode 100644
index 00000000000..1049050b2bf
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_TCPMSS.c
@@ -0,0 +1,262 @@
+/*
+ * This is a module which is used for setting the MSS option in TCP packets.
+ *
+ * Copyright (C) 2000 Marc Boucher <marc@mbsi.ca>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/ip.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_TCPMSS.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
+MODULE_DESCRIPTION("iptables TCP MSS modification module");
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+static u_int16_t
+cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
+{
+ u_int32_t diffs[] = { oldvalinv, newval };
+ return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
+ oldcheck^0xFFFF));
+}
+
+static inline unsigned int
+optlen(const u_int8_t *opt, unsigned int offset)
+{
+ /* Beware zero-length options: make finite progress */
+ if (opt[offset] <= TCPOPT_NOP || opt[offset+1] == 0) return 1;
+ else return opt[offset+1];
+}
+
+static unsigned int
+ipt_tcpmss_target(struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ unsigned int hooknum,
+ const void *targinfo,
+ void *userinfo)
+{
+ const struct ipt_tcpmss_info *tcpmssinfo = targinfo;
+ struct tcphdr *tcph;
+ struct iphdr *iph;
+ u_int16_t tcplen, newtotlen, oldval, newmss;
+ unsigned int i;
+ u_int8_t *opt;
+
+ if (!skb_ip_make_writable(pskb, (*pskb)->len))
+ return NF_DROP;
+
+ iph = (*pskb)->nh.iph;
+ tcplen = (*pskb)->len - iph->ihl*4;
+
+ tcph = (void *)iph + iph->ihl*4;
+
+ /* Since it passed the flags test in the tcp match, we know it is
+ not a fragment, and has data >= tcp header length. SYN
+ packets should not contain data: if they did, then we risk
+ running over MTU, sending Frag Needed and breaking things
+ badly. --RR */
+ if (tcplen != tcph->doff*4) {
+ if (net_ratelimit())
+ printk(KERN_ERR
+ "ipt_tcpmss_target: bad length (%d bytes)\n",
+ (*pskb)->len);
+ return NF_DROP;
+ }
+
+ if(tcpmssinfo->mss == IPT_TCPMSS_CLAMP_PMTU) {
+ if(!(*pskb)->dst) {
+ if (net_ratelimit())
+ printk(KERN_ERR
+ "ipt_tcpmss_target: no dst?! can't determine path-MTU\n");
+ return NF_DROP; /* or IPT_CONTINUE ?? */
+ }
+
+ if(dst_mtu((*pskb)->dst) <= (sizeof(struct iphdr) + sizeof(struct tcphdr))) {
+ if (net_ratelimit())
+ printk(KERN_ERR
+ "ipt_tcpmss_target: unknown or invalid path-MTU (%d)\n", dst_mtu((*pskb)->dst));
+ return NF_DROP; /* or IPT_CONTINUE ?? */
+ }
+
+ newmss = dst_mtu((*pskb)->dst) - sizeof(struct iphdr) - sizeof(struct tcphdr);
+ } else
+ newmss = tcpmssinfo->mss;
+
+ opt = (u_int8_t *)tcph;
+ for (i = sizeof(struct tcphdr); i < tcph->doff*4; i += optlen(opt, i)){
+ if ((opt[i] == TCPOPT_MSS) &&
+ ((tcph->doff*4 - i) >= TCPOLEN_MSS) &&
+ (opt[i+1] == TCPOLEN_MSS)) {
+ u_int16_t oldmss;
+
+ oldmss = (opt[i+2] << 8) | opt[i+3];
+
+ if((tcpmssinfo->mss == IPT_TCPMSS_CLAMP_PMTU) &&
+ (oldmss <= newmss))
+ return IPT_CONTINUE;
+
+ opt[i+2] = (newmss & 0xff00) >> 8;
+ opt[i+3] = (newmss & 0x00ff);
+
+ tcph->check = cheat_check(htons(oldmss)^0xFFFF,
+ htons(newmss),
+ tcph->check);
+
+ DEBUGP(KERN_INFO "ipt_tcpmss_target: %u.%u.%u.%u:%hu"
+ "->%u.%u.%u.%u:%hu changed TCP MSS option"
+ " (from %u to %u)\n",
+ NIPQUAD((*pskb)->nh.iph->saddr),
+ ntohs(tcph->source),
+ NIPQUAD((*pskb)->nh.iph->daddr),
+ ntohs(tcph->dest),
+ oldmss, newmss);
+ goto retmodified;
+ }
+ }
+
+ /*
+ * MSS Option not found ?! add it..
+ */
+ if (skb_tailroom((*pskb)) < TCPOLEN_MSS) {
+ struct sk_buff *newskb;
+
+ newskb = skb_copy_expand(*pskb, skb_headroom(*pskb),
+ TCPOLEN_MSS, GFP_ATOMIC);
+ if (!newskb) {
+ if (net_ratelimit())
+ printk(KERN_ERR "ipt_tcpmss_target:"
+ " unable to allocate larger skb\n");
+ return NF_DROP;
+ }
+
+ kfree_skb(*pskb);
+ *pskb = newskb;
+ iph = (*pskb)->nh.iph;
+ tcph = (void *)iph + iph->ihl*4;
+ }
+
+ skb_put((*pskb), TCPOLEN_MSS);
+
+ opt = (u_int8_t *)tcph + sizeof(struct tcphdr);
+ memmove(opt + TCPOLEN_MSS, opt, tcplen - sizeof(struct tcphdr));
+
+ tcph->check = cheat_check(htons(tcplen) ^ 0xFFFF,
+ htons(tcplen + TCPOLEN_MSS), tcph->check);
+ tcplen += TCPOLEN_MSS;
+
+ opt[0] = TCPOPT_MSS;
+ opt[1] = TCPOLEN_MSS;
+ opt[2] = (newmss & 0xff00) >> 8;
+ opt[3] = (newmss & 0x00ff);
+
+ tcph->check = cheat_check(~0, *((u_int32_t *)opt), tcph->check);
+
+ oldval = ((u_int16_t *)tcph)[6];
+ tcph->doff += TCPOLEN_MSS/4;
+ tcph->check = cheat_check(oldval ^ 0xFFFF,
+ ((u_int16_t *)tcph)[6], tcph->check);
+
+ newtotlen = htons(ntohs(iph->tot_len) + TCPOLEN_MSS);
+ iph->check = cheat_check(iph->tot_len ^ 0xFFFF,
+ newtotlen, iph->check);
+ iph->tot_len = newtotlen;
+
+ DEBUGP(KERN_INFO "ipt_tcpmss_target: %u.%u.%u.%u:%hu"
+ "->%u.%u.%u.%u:%hu added TCP MSS option (%u)\n",
+ NIPQUAD((*pskb)->nh.iph->saddr),
+ ntohs(tcph->source),
+ NIPQUAD((*pskb)->nh.iph->daddr),
+ ntohs(tcph->dest),
+ newmss);
+
+ retmodified:
+ /* We never hw checksum SYN packets. */
+ BUG_ON((*pskb)->ip_summed == CHECKSUM_HW);
+
+ (*pskb)->nfcache |= NFC_UNKNOWN | NFC_ALTERED;
+ return IPT_CONTINUE;
+}
+
+#define TH_SYN 0x02
+
+static inline int find_syn_match(const struct ipt_entry_match *m)
+{
+ const struct ipt_tcp *tcpinfo = (const struct ipt_tcp *)m->data;
+
+ if (strcmp(m->u.kernel.match->name, "tcp") == 0
+ && (tcpinfo->flg_cmp & TH_SYN)
+ && !(tcpinfo->invflags & IPT_TCP_INV_FLAGS))
+ return 1;
+
+ return 0;
+}
+
+/* Must specify -p tcp --syn/--tcp-flags SYN */
+static int
+ipt_tcpmss_checkentry(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ const struct ipt_tcpmss_info *tcpmssinfo = targinfo;
+
+ if (targinfosize != IPT_ALIGN(sizeof(struct ipt_tcpmss_info))) {
+ DEBUGP("ipt_tcpmss_checkentry: targinfosize %u != %u\n",
+ targinfosize, IPT_ALIGN(sizeof(struct ipt_tcpmss_info)));
+ return 0;
+ }
+
+
+ if((tcpmssinfo->mss == IPT_TCPMSS_CLAMP_PMTU) &&
+ ((hook_mask & ~((1 << NF_IP_FORWARD)
+ | (1 << NF_IP_LOCAL_OUT)
+ | (1 << NF_IP_POST_ROUTING))) != 0)) {
+ printk("TCPMSS: path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n");
+ return 0;
+ }
+
+ if (e->ip.proto == IPPROTO_TCP
+ && !(e->ip.invflags & IPT_INV_PROTO)
+ && IPT_MATCH_ITERATE(e, find_syn_match))
+ return 1;
+
+ printk("TCPMSS: Only works on TCP SYN packets\n");
+ return 0;
+}
+
+static struct ipt_target ipt_tcpmss_reg = {
+ .name = "TCPMSS",
+ .target = ipt_tcpmss_target,
+ .checkentry = ipt_tcpmss_checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ return ipt_register_target(&ipt_tcpmss_reg);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_target(&ipt_tcpmss_reg);
+}
+
+module_init(init);
+module_exit(fini);
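
cheat_check() above updates the TCP and IP checksums incrementally instead of recomputing them: the one's complement of the old value and the new value are folded into the existing checksum (the RFC 1624 identity HC' = ~(~HC + ~m + m')). A self-contained userspace sketch of the 16-bit case follows; unlike cheat_check(), which is handed the old value already inverted, the helper below inverts it itself, and all names and example values are invented.

    /* Sketch of the incremental checksum update done by cheat_check() above. */
    #include <stdio.h>
    #include <stdint.h>

    static uint16_t csum_adjust16(uint16_t check, uint16_t oldval, uint16_t newval)
    {
        uint32_t sum = (uint16_t)~check;   /* back to the one's-complement sum */

        sum += (uint16_t)~oldval;
        sum += newval;
        while (sum >> 16)                  /* fold carries back into 16 bits */
            sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
    }

    /* naive full checksum over 16-bit words, used only to cross-check */
    static uint16_t csum_full(const uint16_t *words, int n)
    {
        uint32_t sum = 0;

        while (n--)
            sum += *words++;
        while (sum >> 16)
            sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
    }

    int main(void)
    {
        uint16_t hdr[4] = { 0x1234, 0x5678, 0x9abc, 0x0000 };
        uint16_t check = csum_full(hdr, 4);

        hdr[1] = 0x05b4;                               /* e.g. a rewritten MSS */
        check = csum_adjust16(check, 0x5678, hdr[1]);

        printf("incremental %#06x vs full %#06x\n",
               (unsigned)check, (unsigned)csum_full(hdr, 4));
        return 0;
    }

Both printed values agree; that identity is what lets the target touch only the bytes it changes rather than re-checksumming the whole packet.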
diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c
new file mode 100644
index 00000000000..85c70d240f8
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_TOS.c
@@ -0,0 +1,105 @@
+/* This is a module which is used for setting the TOS field of a packet. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_TOS.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("iptables TOS mangling module");
+
+static unsigned int
+target(struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ unsigned int hooknum,
+ const void *targinfo,
+ void *userinfo)
+{
+ const struct ipt_tos_target_info *tosinfo = targinfo;
+
+ if (((*pskb)->nh.iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) {
+ u_int16_t diffs[2];
+
+ if (!skb_ip_make_writable(pskb, sizeof(struct iphdr)))
+ return NF_DROP;
+
+ diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
+ (*pskb)->nh.iph->tos
+ = ((*pskb)->nh.iph->tos & IPTOS_PREC_MASK)
+ | tosinfo->tos;
+ diffs[1] = htons((*pskb)->nh.iph->tos);
+ (*pskb)->nh.iph->check
+ = csum_fold(csum_partial((char *)diffs,
+ sizeof(diffs),
+ (*pskb)->nh.iph->check
+ ^0xFFFF));
+ (*pskb)->nfcache |= NFC_ALTERED;
+ }
+ return IPT_CONTINUE;
+}
+
+static int
+checkentry(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ const u_int8_t tos = ((struct ipt_tos_target_info *)targinfo)->tos;
+
+ if (targinfosize != IPT_ALIGN(sizeof(struct ipt_tos_target_info))) {
+ printk(KERN_WARNING "TOS: targinfosize %u != %Zu\n",
+ targinfosize,
+ IPT_ALIGN(sizeof(struct ipt_tos_target_info)));
+ return 0;
+ }
+
+ if (strcmp(tablename, "mangle") != 0) {
+ printk(KERN_WARNING "TOS: can only be called from \"mangle\" table, not \"%s\"\n", tablename);
+ return 0;
+ }
+
+ if (tos != IPTOS_LOWDELAY
+ && tos != IPTOS_THROUGHPUT
+ && tos != IPTOS_RELIABILITY
+ && tos != IPTOS_MINCOST
+ && tos != IPTOS_NORMALSVC) {
+ printk(KERN_WARNING "TOS: bad tos value %#x\n", tos);
+ return 0;
+ }
+
+ return 1;
+}
+
+static struct ipt_target ipt_tos_reg = {
+ .name = "TOS",
+ .target = target,
+ .checkentry = checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ return ipt_register_target(&ipt_tos_reg);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_target(&ipt_tos_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
new file mode 100644
index 00000000000..6f2cefbe16c
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -0,0 +1,419 @@
+/*
+ * netfilter module for userspace packet logging daemons
+ *
+ * (C) 2000-2004 by Harald Welte <laforge@netfilter.org>
+ *
+ * 2000/09/22 ulog-cprange feature added
+ * 2001/01/04 in-kernel queue as proposed by Sebastian Zander
+ * <zander@fokus.gmd.de>
+ * 2001/01/30 per-rule nlgroup conflicts with global queue.
+ * nlgroup now global (sysctl)
+ * 2001/04/19 ulog-queue reworked, now fixed buffer size specified at
+ * module loadtime -HW
+ * 2002/07/07 remove broken nflog_rcv() function -HW
+ * 2002/08/29 fix shifted/unshifted nlgroup bug -HW
+ * 2002/10/30 fix uninitialized mac_len field - <Anders K. Pedersen>
+ * 2004/10/25 fix erroneous calculation of 'len' parameter to NLMSG_PUT
+ * resulting in bogus 'error during NLMSG_PUT' messages.
+ *
+ * (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This module accepts two parameters:
+ *
+ * nlbufsiz:
+ * The parameter specifies how big the buffer for each netlink multicast
+ * group is. E.g. with nlbufsiz=8192, up to 8kB of packets will be
+ * accumulated in the kernel until they are sent to userspace. It is
+ * NOT possible to allocate more than 128kB, and approaching that limit is discouraged,
+ * because atomically allocating 128kB inside the network rx softirq is not
+ * reliable. Please also keep in mind that this buffer size is allocated for
+ * each nlgroup you are using, so the total kernel memory usage increases
+ * by that factor.
+ *
+ * flushtimeout:
+ * Specifies after how many hundredths of a second the queue should be
+ * flushed even if it is not full yet.
+ *
+ * ipt_ULOG.c,v 1.22 2002/10/30 09:07:31 laforge Exp
+ */
+
+#include <linux/module.h>
+#include <linux/config.h>
+#include <linux/spinlock.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/netlink.h>
+#include <linux/netdevice.h>
+#include <linux/mm.h>
+#include <linux/moduleparam.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_ULOG.h>
+#include <linux/netfilter_ipv4/lockhelp.h>
+#include <net/sock.h>
+#include <linux/bitops.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
+MODULE_DESCRIPTION("iptables userspace logging module");
+
+#define ULOG_NL_EVENT 111 /* Harald's favorite number */
+#define ULOG_MAXNLGROUPS 32 /* number of nlgroups */
+
+#if 0
+#define DEBUGP(format, args...) printk("%s:%s:" format, \
+ __FILE__, __FUNCTION__ , ## args)
+#else
+#define DEBUGP(format, args...)
+#endif
+
+#define PRINTR(format, args...) do { if (net_ratelimit()) printk(format , ## args); } while (0)
+
+static unsigned int nlbufsiz = 4096;
+module_param(nlbufsiz, uint, 0600); /* FIXME: Check size < 128k --RR */
+MODULE_PARM_DESC(nlbufsiz, "netlink buffer size");
+
+static unsigned int flushtimeout = 10;
+module_param(flushtimeout, int, 0600);
+MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths of a second)");
+
+static unsigned int nflog = 1;
+module_param(nflog, int, 0400);
+MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
+
+/* global data structures */
+
+typedef struct {
+ unsigned int qlen; /* number of nlmsgs in the skb */
+ struct nlmsghdr *lastnlh; /* netlink header of last msg in skb */
+ struct sk_buff *skb; /* the pre-allocated skb */
+ struct timer_list timer; /* the timer function */
+} ulog_buff_t;
+
+static ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS]; /* array of buffers */
+
+static struct sock *nflognl; /* our socket */
+static DECLARE_LOCK(ulog_lock); /* spinlock */
+
+/* send one ulog_buff_t to userspace */
+static void ulog_send(unsigned int nlgroupnum)
+{
+ ulog_buff_t *ub = &ulog_buffers[nlgroupnum];
+
+ if (timer_pending(&ub->timer)) {
+ DEBUGP("ipt_ULOG: ulog_send: timer was pending, deleting\n");
+ del_timer(&ub->timer);
+ }
+
+ /* last nlmsg needs NLMSG_DONE */
+ if (ub->qlen > 1)
+ ub->lastnlh->nlmsg_type = NLMSG_DONE;
+
+ NETLINK_CB(ub->skb).dst_groups = (1 << nlgroupnum);
+ DEBUGP("ipt_ULOG: throwing %d packets to netlink mask %u\n",
+ ub->qlen, nlgroupnum);
+ netlink_broadcast(nflognl, ub->skb, 0, (1 << nlgroupnum), GFP_ATOMIC);
+
+ ub->qlen = 0;
+ ub->skb = NULL;
+ ub->lastnlh = NULL;
+
+}
+
+
+/* timer function to flush queue in flushtimeout time */
+static void ulog_timer(unsigned long data)
+{
+ DEBUGP("ipt_ULOG: timer function called, calling ulog_send\n");
+
+ /* lock to protect against somebody modifying our structure
+ * from ipt_ulog_target at the same time */
+ LOCK_BH(&ulog_lock);
+ ulog_send(data);
+ UNLOCK_BH(&ulog_lock);
+}
+
+static struct sk_buff *ulog_alloc_skb(unsigned int size)
+{
+ struct sk_buff *skb;
+
+ /* alloc skb which should be big enough for a whole
+ * multipart message. WARNING: has to be <= 131000
+ * due to slab allocator restrictions */
+
+ skb = alloc_skb(nlbufsiz, GFP_ATOMIC);
+ if (!skb) {
+ PRINTR("ipt_ULOG: can't alloc whole buffer %ub!\n",
+ nlbufsiz);
+
+ /* try to allocate only as much as we need for
+ * current packet */
+
+ skb = alloc_skb(size, GFP_ATOMIC);
+ if (!skb)
+ PRINTR("ipt_ULOG: can't even allocate %ub\n", size);
+ }
+
+ return skb;
+}
+
+static void ipt_ulog_packet(unsigned int hooknum,
+ const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct ipt_ulog_info *loginfo,
+ const char *prefix)
+{
+ ulog_buff_t *ub;
+ ulog_packet_msg_t *pm;
+ size_t size, copy_len;
+ struct nlmsghdr *nlh;
+
+ /* ffs == find first bit set, necessary because userspace
+ * is already shifting groupnumber, but we need unshifted.
+ * ffs() returns [1..32], we need [0..31] */
+ unsigned int groupnum = ffs(loginfo->nl_group) - 1;
+
+ /* calculate the size of the skb needed */
+ if ((loginfo->copy_range == 0) ||
+ (loginfo->copy_range > skb->len)) {
+ copy_len = skb->len;
+ } else {
+ copy_len = loginfo->copy_range;
+ }
+
+ size = NLMSG_SPACE(sizeof(*pm) + copy_len);
+
+ ub = &ulog_buffers[groupnum];
+
+ LOCK_BH(&ulog_lock);
+
+ if (!ub->skb) {
+ if (!(ub->skb = ulog_alloc_skb(size)))
+ goto alloc_failure;
+ } else if (ub->qlen >= loginfo->qthreshold ||
+ size > skb_tailroom(ub->skb)) {
+ /* either the queue len is too high or we don't have
+ * enough room in nlskb left. send it to userspace. */
+
+ ulog_send(groupnum);
+
+ if (!(ub->skb = ulog_alloc_skb(size)))
+ goto alloc_failure;
+ }
+
+ DEBUGP("ipt_ULOG: qlen %d, qthreshold %d\n", ub->qlen,
+ loginfo->qthreshold);
+
+ /* NLMSG_PUT contains a hidden goto nlmsg_failure !!! */
+ nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, ULOG_NL_EVENT,
+ sizeof(*pm)+copy_len);
+ ub->qlen++;
+
+ pm = NLMSG_DATA(nlh);
+
+ /* We might not have a timestamp, get one */
+ if (skb->stamp.tv_sec == 0)
+ do_gettimeofday((struct timeval *)&skb->stamp);
+
+ /* copy hook, prefix, timestamp, payload, etc. */
+ pm->data_len = copy_len;
+ pm->timestamp_sec = skb->stamp.tv_sec;
+ pm->timestamp_usec = skb->stamp.tv_usec;
+ pm->mark = skb->nfmark;
+ pm->hook = hooknum;
+ if (prefix != NULL)
+ strncpy(pm->prefix, prefix, sizeof(pm->prefix));
+ else if (loginfo->prefix[0] != '\0')
+ strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix));
+ else
+ *(pm->prefix) = '\0';
+
+ if (in && in->hard_header_len > 0
+ && skb->mac.raw != (void *) skb->nh.iph
+ && in->hard_header_len <= ULOG_MAC_LEN) {
+ memcpy(pm->mac, skb->mac.raw, in->hard_header_len);
+ pm->mac_len = in->hard_header_len;
+ } else
+ pm->mac_len = 0;
+
+ if (in)
+ strncpy(pm->indev_name, in->name, sizeof(pm->indev_name));
+ else
+ pm->indev_name[0] = '\0';
+
+ if (out)
+ strncpy(pm->outdev_name, out->name, sizeof(pm->outdev_name));
+ else
+ pm->outdev_name[0] = '\0';
+
+ /* copy_len <= skb->len, so can't fail. */
+ if (skb_copy_bits(skb, 0, pm->payload, copy_len) < 0)
+ BUG();
+
+ /* check if we are building multi-part messages */
+ if (ub->qlen > 1) {
+ ub->lastnlh->nlmsg_flags |= NLM_F_MULTI;
+ }
+
+ ub->lastnlh = nlh;
+
+ /* if timer isn't already running, start it */
+ if (!timer_pending(&ub->timer)) {
+ ub->timer.expires = jiffies + flushtimeout * HZ / 100;
+ add_timer(&ub->timer);
+ }
+
+ /* if threshold is reached, send message to userspace */
+ if (ub->qlen >= loginfo->qthreshold) {
+ if (loginfo->qthreshold > 1)
+ nlh->nlmsg_type = NLMSG_DONE;
+ ulog_send(groupnum);
+ }
+
+ UNLOCK_BH(&ulog_lock);
+
+ return;
+
+nlmsg_failure:
+ PRINTR("ipt_ULOG: error during NLMSG_PUT\n");
+
+alloc_failure:
+ PRINTR("ipt_ULOG: Error building netlink message\n");
+
+ UNLOCK_BH(&ulog_lock);
+}
+
+static unsigned int ipt_ulog_target(struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ unsigned int hooknum,
+ const void *targinfo, void *userinfo)
+{
+ struct ipt_ulog_info *loginfo = (struct ipt_ulog_info *) targinfo;
+
+ ipt_ulog_packet(hooknum, *pskb, in, out, loginfo, NULL);
+
+ return IPT_CONTINUE;
+}
+
+static void ipt_logfn(unsigned int hooknum,
+ const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const char *prefix)
+{
+ struct ipt_ulog_info loginfo = {
+ .nl_group = ULOG_DEFAULT_NLGROUP,
+ .copy_range = 0,
+ .qthreshold = ULOG_DEFAULT_QTHRESHOLD,
+ .prefix = ""
+ };
+
+ ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix);
+}
+
+static int ipt_ulog_checkentry(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hookmask)
+{
+ struct ipt_ulog_info *loginfo = (struct ipt_ulog_info *) targinfo;
+
+ if (targinfosize != IPT_ALIGN(sizeof(struct ipt_ulog_info))) {
+ DEBUGP("ipt_ULOG: targinfosize %u != %Zu\n", targinfosize,
+ IPT_ALIGN(sizeof(struct ipt_ulog_info)));
+ return 0;
+ }
+
+ if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') {
+ DEBUGP("ipt_ULOG: prefix term %i\n",
+ loginfo->prefix[sizeof(loginfo->prefix) - 1]);
+ return 0;
+ }
+
+ if (loginfo->qthreshold > ULOG_MAX_QLEN) {
+ DEBUGP("ipt_ULOG: queue threshold %i > MAX_QLEN\n",
+ loginfo->qthreshold);
+ return 0;
+ }
+
+ return 1;
+}
+
+static struct ipt_target ipt_ulog_reg = {
+ .name = "ULOG",
+ .target = ipt_ulog_target,
+ .checkentry = ipt_ulog_checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ int i;
+
+ DEBUGP("ipt_ULOG: init module\n");
+
+ if (nlbufsiz >= 128*1024) {
+ printk("Netlink buffer has to be <= 128kB\n");
+ return -EINVAL;
+ }
+
+ /* initialize ulog_buffers */
+ for (i = 0; i < ULOG_MAXNLGROUPS; i++) {
+ init_timer(&ulog_buffers[i].timer);
+ ulog_buffers[i].timer.function = ulog_timer;
+ ulog_buffers[i].timer.data = i;
+ }
+
+ nflognl = netlink_kernel_create(NETLINK_NFLOG, NULL);
+ if (!nflognl)
+ return -ENOMEM;
+
+ if (ipt_register_target(&ipt_ulog_reg) != 0) {
+ sock_release(nflognl->sk_socket);
+ return -EINVAL;
+ }
+ if (nflog)
+ nf_log_register(PF_INET, &ipt_logfn);
+
+ return 0;
+}
+
+static void __exit fini(void)
+{
+ ulog_buff_t *ub;
+ int i;
+
+ DEBUGP("ipt_ULOG: cleanup_module\n");
+
+ if (nflog)
+ nf_log_unregister(PF_INET, &ipt_logfn);
+ ipt_unregister_target(&ipt_ulog_reg);
+ sock_release(nflognl->sk_socket);
+
+ /* remove pending timers and free allocated skb's */
+ for (i = 0; i < ULOG_MAXNLGROUPS; i++) {
+ ub = &ulog_buffers[i];
+ if (timer_pending(&ub->timer)) {
+ DEBUGP("timer was pending, deleting\n");
+ del_timer(&ub->timer);
+ }
+
+ if (ub->skb) {
+ kfree_skb(ub->skb);
+ ub->skb = NULL;
+ }
+ }
+
+}
+
+module_init(init);
+module_exit(fini);
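
ipt_ulog_packet() above recovers the buffer index from the group bit mask with ffs() (userspace stores the group already shifted into a single set bit, the kernel needs the 0-based index), and then uses that same bit again as the netlink broadcast mask. A tiny userspace sketch of that round trip; the example group number is invented.

    /* Sketch of the netlink group handling in ipt_ulog_packet() above. */
    #include <stdio.h>
    #include <strings.h>   /* ffs() */

    int main(void)
    {
        unsigned int nl_group = 1u << 4;           /* group 5, stored shifted */
        unsigned int groupnum = ffs(nl_group) - 1; /* ffs() counts from 1 */
        unsigned int bcast = 1u << groupnum;       /* mask for netlink_broadcast() */

        printf("mask 0x%08x -> index %u -> broadcast mask 0x%08x\n",
               nl_group, groupnum, bcast);
        return 0;
    }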
diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c
new file mode 100644
index 00000000000..f5909a4c3fc
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_addrtype.c
@@ -0,0 +1,77 @@
+/*
+ * iptables module to match inet_addr_type() of an ip.
+ *
+ * Copyright (c) 2004 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/ip.h>
+#include <net/route.h>
+
+#include <linux/netfilter_ipv4/ipt_addrtype.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("iptables addrtype match");
+
+static inline int match_type(u_int32_t addr, u_int16_t mask)
+{
+ return !!(mask & (1 << inet_addr_type(addr)));
+}
+
+static int match(const struct sk_buff *skb, const struct net_device *in,
+ const struct net_device *out, const void *matchinfo,
+ int offset, int *hotdrop)
+{
+ const struct ipt_addrtype_info *info = matchinfo;
+ const struct iphdr *iph = skb->nh.iph;
+ int ret = 1;
+
+ if (info->source)
+ ret &= match_type(iph->saddr, info->source)^info->invert_source;
+ if (info->dest)
+ ret &= match_type(iph->daddr, info->dest)^info->invert_dest;
+
+ return ret;
+}
+
+static int checkentry(const char *tablename, const struct ipt_ip *ip,
+ void *matchinfo, unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ if (matchsize != IPT_ALIGN(sizeof(struct ipt_addrtype_info))) {
+ printk(KERN_ERR "ipt_addrtype: invalid size (%u != %Zu)\n",
+ matchsize, IPT_ALIGN(sizeof(struct ipt_addrtype_info)));
+ return 0;
+ }
+
+ return 1;
+}
+
+static struct ipt_match addrtype_match = {
+ .name = "addrtype",
+ .match = match,
+ .checkentry = checkentry,
+ .me = THIS_MODULE
+};
+
+static int __init init(void)
+{
+ return ipt_register_match(&addrtype_match);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_match(&addrtype_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c
new file mode 100644
index 00000000000..a0fea847cb7
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_ah.c
@@ -0,0 +1,117 @@
+/* Kernel module to match AH parameters. */
+/* (C) 1999-2000 Yon Uriarte <yon@astaro.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+
+#include <linux/netfilter_ipv4/ipt_ah.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>");
+MODULE_DESCRIPTION("iptables AH SPI match module");
+
+#ifdef DEBUG_CONNTRACK
+#define duprintf(format, args...) printk(format , ## args)
+#else
+#define duprintf(format, args...)
+#endif
+
+/* Returns 1 if the spi is matched by the range, 0 otherwise */
+static inline int
+spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, int invert)
+{
+ int r=0;
+ duprintf("ah spi_match:%c 0x%x <= 0x%x <= 0x%x",invert? '!':' ',
+ min,spi,max);
+ r=(spi >= min && spi <= max) ^ invert;
+ duprintf(" result %s\n",r? "PASS" : "FAILED");
+ return r;
+}
+
+static int
+match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ struct ip_auth_hdr _ahdr, *ah;
+ const struct ipt_ah *ahinfo = matchinfo;
+
+ /* Must not be a fragment. */
+ if (offset)
+ return 0;
+
+ ah = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
+ sizeof(_ahdr), &_ahdr);
+ if (ah == NULL) {
+ /* We've been asked to examine this packet, and we
+ * can't. Hence, no choice but to drop.
+ */
+ duprintf("Dropping evil AH tinygram.\n");
+ *hotdrop = 1;
+ return 0;
+ }
+
+ return spi_match(ahinfo->spis[0], ahinfo->spis[1],
+ ntohl(ah->spi),
+ !!(ahinfo->invflags & IPT_AH_INV_SPI));
+}
+
+/* Called when user tries to insert an entry of this type. */
+static int
+checkentry(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchinfosize,
+ unsigned int hook_mask)
+{
+ const struct ipt_ah *ahinfo = matchinfo;
+
+ /* Must specify proto == AH, and no unknown invflags */
+ if (ip->proto != IPPROTO_AH || (ip->invflags & IPT_INV_PROTO)) {
+ duprintf("ipt_ah: Protocol %u != %u\n", ip->proto,
+ IPPROTO_AH);
+ return 0;
+ }
+ if (matchinfosize != IPT_ALIGN(sizeof(struct ipt_ah))) {
+ duprintf("ipt_ah: matchsize %u != %u\n",
+ matchinfosize, IPT_ALIGN(sizeof(struct ipt_ah)));
+ return 0;
+ }
+ if (ahinfo->invflags & ~IPT_AH_INV_MASK) {
+ duprintf("ipt_ah: unknown flags %X\n",
+ ahinfo->invflags);
+ return 0;
+ }
+
+ return 1;
+}
+
+static struct ipt_match ah_match = {
+ .name = "ah",
+ .match = &match,
+ .checkentry = &checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ return ipt_register_match(&ah_match);
+}
+
+static void __exit cleanup(void)
+{
+ ipt_unregister_match(&ah_match);
+}
+
+module_init(init);
+module_exit(cleanup);
diff --git a/net/ipv4/netfilter/ipt_comment.c b/net/ipv4/netfilter/ipt_comment.c
new file mode 100644
index 00000000000..6b76a1ea524
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_comment.c
@@ -0,0 +1,59 @@
+/*
+ * Implements a dummy match to allow attaching comments to rules
+ *
+ * 2003-05-13 Brad Fisher (brad@info-link.net)
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_comment.h>
+
+MODULE_AUTHOR("Brad Fisher <brad@info-link.net>");
+MODULE_DESCRIPTION("iptables comment match module");
+MODULE_LICENSE("GPL");
+
+static int
+match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ /* We always match */
+ return 1;
+}
+
+static int
+checkentry(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ /* Check the size */
+ if (matchsize != IPT_ALIGN(sizeof(struct ipt_comment_info)))
+ return 0;
+ return 1;
+}
+
+static struct ipt_match comment_match = {
+ .name = "comment",
+ .match = match,
+ .checkentry = checkentry,
+ .me = THIS_MODULE
+};
+
+static int __init init(void)
+{
+ return ipt_register_match(&comment_match);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_match(&comment_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_connmark.c b/net/ipv4/netfilter/ipt_connmark.c
new file mode 100644
index 00000000000..2706f96cea5
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_connmark.c
@@ -0,0 +1,81 @@
+/* This kernel module matches connection mark values set by the
+ * CONNMARK target
+ *
+ * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com>
+ * by Henrik Nordstrom <hno@marasystems.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+MODULE_AUTHOR("Henrik Nordstrom <hno@marasytems.com>");
+MODULE_DESCRIPTION("IP tables connmark match module");
+MODULE_LICENSE("GPL");
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_connmark.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+
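+/* Rough usage sketch (option syntax as exposed by the userspace connmark
+ * extension): "-m connmark --mark 0x1/0xff" matches packets whose conntrack
+ * mark, masked with 0xff, equals 0x1; info->invert negates the result.
+ */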
+static int
+match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ const struct ipt_connmark_info *info = matchinfo;
+ enum ip_conntrack_info ctinfo;
+ struct ip_conntrack *ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo);
+ if (!ct)
+ return 0;
+
+ return ((ct->mark & info->mask) == info->mark) ^ info->invert;
+}
+
+static int
+checkentry(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ if (matchsize != IPT_ALIGN(sizeof(struct ipt_connmark_info)))
+ return 0;
+
+ return 1;
+}
+
+static struct ipt_match connmark_match = {
+ .name = "connmark",
+ .match = &match,
+ .checkentry = &checkentry,
+ .me = THIS_MODULE
+};
+
+static int __init init(void)
+{
+ return ipt_register_match(&connmark_match);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_match(&connmark_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_conntrack.c b/net/ipv4/netfilter/ipt_conntrack.c
new file mode 100644
index 00000000000..c1d22801b7c
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_conntrack.c
@@ -0,0 +1,136 @@
+/* Kernel module to match connection tracking information.
+ * Superset of Rusty's minimalistic state match.
+ *
+ * (C) 2001 Marc Boucher (marc@mbsi.ca).
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_conntrack.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
+MODULE_DESCRIPTION("iptables connection tracking match module");
+
+static int
+match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ const struct ipt_conntrack_info *sinfo = matchinfo;
+ struct ip_conntrack *ct;
+ enum ip_conntrack_info ctinfo;
+ unsigned int statebit;
+
+ ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo);
+
+#define FWINV(bool,invflg) ((bool) ^ !!(sinfo->invflags & invflg))
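+ /* FWINV(test, flag): the result of 'test', inverted when the corresponding
+ * inversion bit is set in sinfo->invflags. */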
+
+ if (ct == &ip_conntrack_untracked)
+ statebit = IPT_CONNTRACK_STATE_UNTRACKED;
+ else if (ct)
+ statebit = IPT_CONNTRACK_STATE_BIT(ctinfo);
+ else
+ statebit = IPT_CONNTRACK_STATE_INVALID;
+
+ if(sinfo->flags & IPT_CONNTRACK_STATE) {
+ if (ct) {
+ if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip !=
+ ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip)
+ statebit |= IPT_CONNTRACK_STATE_SNAT;
+
+ if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip !=
+ ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip)
+ statebit |= IPT_CONNTRACK_STATE_DNAT;
+ }
+
+ if (FWINV((statebit & sinfo->statemask) == 0, IPT_CONNTRACK_STATE))
+ return 0;
+ }
+
+ if(sinfo->flags & IPT_CONNTRACK_PROTO) {
+ if (!ct || FWINV(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.protonum, IPT_CONNTRACK_PROTO))
+ return 0;
+ }
+
+ if(sinfo->flags & IPT_CONNTRACK_ORIGSRC) {
+ if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip&sinfo->sipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].src.ip, IPT_CONNTRACK_ORIGSRC))
+ return 0;
+ }
+
+ if(sinfo->flags & IPT_CONNTRACK_ORIGDST) {
+ if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip&sinfo->dipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.ip, IPT_CONNTRACK_ORIGDST))
+ return 0;
+ }
+
+ if(sinfo->flags & IPT_CONNTRACK_REPLSRC) {
+ if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip&sinfo->sipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].src.ip, IPT_CONNTRACK_REPLSRC))
+ return 0;
+ }
+
+ if(sinfo->flags & IPT_CONNTRACK_REPLDST) {
+ if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip&sinfo->dipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].dst.ip, IPT_CONNTRACK_REPLDST))
+ return 0;
+ }
+
+ if(sinfo->flags & IPT_CONNTRACK_STATUS) {
+ if (!ct || FWINV((ct->status & sinfo->statusmask) == 0, IPT_CONNTRACK_STATUS))
+ return 0;
+ }
+
+ if(sinfo->flags & IPT_CONNTRACK_EXPIRES) {
+ unsigned long expires;
+
+ if(!ct)
+ return 0;
+
+ expires = timer_pending(&ct->timeout) ? (ct->timeout.expires - jiffies)/HZ : 0;
+
+ if (FWINV(!(expires >= sinfo->expires_min && expires <= sinfo->expires_max), IPT_CONNTRACK_EXPIRES))
+ return 0;
+ }
+
+ return 1;
+}
+
+static int check(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ if (matchsize != IPT_ALIGN(sizeof(struct ipt_conntrack_info)))
+ return 0;
+
+ return 1;
+}
+
+static struct ipt_match conntrack_match = {
+ .name = "conntrack",
+ .match = &match,
+ .checkentry = &check,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ need_ip_conntrack();
+ return ipt_register_match(&conntrack_match);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_match(&conntrack_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_dscp.c b/net/ipv4/netfilter/ipt_dscp.c
new file mode 100644
index 00000000000..5df52a64a5d
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_dscp.c
@@ -0,0 +1,63 @@
+/* IP tables module for matching the value of the IPv4 DSCP field
+ *
+ * ipt_dscp.c,v 1.3 2002/08/05 19:00:21 laforge Exp
+ *
+ * (C) 2002 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter_ipv4/ipt_dscp.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("iptables DSCP matching module");
+MODULE_LICENSE("GPL");
+
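+/* The DSCP value lives in the upper six bits of the former TOS byte: the
+ * rule's dscp value is shifted into place and compared against
+ * iph->tos & IPT_DSCP_MASK. */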
+static int match(const struct sk_buff *skb, const struct net_device *in,
+ const struct net_device *out, const void *matchinfo,
+ int offset, int *hotdrop)
+{
+ const struct ipt_dscp_info *info = matchinfo;
+ const struct iphdr *iph = skb->nh.iph;
+
+ u_int8_t sh_dscp = ((info->dscp << IPT_DSCP_SHIFT) & IPT_DSCP_MASK);
+
+ return ((iph->tos&IPT_DSCP_MASK) == sh_dscp) ^ info->invert;
+}
+
+static int checkentry(const char *tablename, const struct ipt_ip *ip,
+ void *matchinfo, unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ if (matchsize != IPT_ALIGN(sizeof(struct ipt_dscp_info)))
+ return 0;
+
+ return 1;
+}
+
+static struct ipt_match dscp_match = {
+ .name = "dscp",
+ .match = &match,
+ .checkentry = &checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ return ipt_register_match(&dscp_match);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_match(&dscp_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c
new file mode 100644
index 00000000000..b6f7181e89c
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_ecn.c
@@ -0,0 +1,131 @@
+/* IP tables module for matching the value of the IPv4 and TCP ECN bits
+ *
+ * ipt_ecn.c,v 1.3 2002/05/29 15:09:00 laforge Exp
+ *
+ * (C) 2002 by Harald Welte <laforge@gnumonks.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/tcp.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_ecn.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("iptables ECN matching module");
+MODULE_LICENSE("GPL");
+
+static inline int match_ip(const struct sk_buff *skb,
+ const struct ipt_ecn_info *einfo)
+{
+ return ((skb->nh.iph->tos&IPT_ECN_IP_MASK) == einfo->ip_ect);
+}
+
+static inline int match_tcp(const struct sk_buff *skb,
+ const struct ipt_ecn_info *einfo,
+ int *hotdrop)
+{
+ struct tcphdr _tcph, *th;
+
+ /* In practice, TCP match does this, so can't fail. But let's
+ * be good citizens.
+ */
+ th = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
+ sizeof(_tcph), &_tcph);
+ if (th == NULL) {
+ *hotdrop = 0;
+ return 0;
+ }
+
+ if (einfo->operation & IPT_ECN_OP_MATCH_ECE) {
+ if (einfo->invert & IPT_ECN_OP_MATCH_ECE) {
+ if (th->ece == 1)
+ return 0;
+ } else {
+ if (th->ece == 0)
+ return 0;
+ }
+ }
+
+ if (einfo->operation & IPT_ECN_OP_MATCH_CWR) {
+ if (einfo->invert & IPT_ECN_OP_MATCH_CWR) {
+ if (th->cwr == 1)
+ return 0;
+ } else {
+ if (th->cwr == 0)
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+static int match(const struct sk_buff *skb, const struct net_device *in,
+ const struct net_device *out, const void *matchinfo,
+ int offset, int *hotdrop)
+{
+ const struct ipt_ecn_info *info = matchinfo;
+
+ if (info->operation & IPT_ECN_OP_MATCH_IP)
+ if (!match_ip(skb, info))
+ return 0;
+
+ if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) {
+ if (skb->nh.iph->protocol != IPPROTO_TCP)
+ return 0;
+ if (!match_tcp(skb, info, hotdrop))
+ return 0;
+ }
+
+ return 1;
+}
+
+static int checkentry(const char *tablename, const struct ipt_ip *ip,
+ void *matchinfo, unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ const struct ipt_ecn_info *info = matchinfo;
+
+ if (matchsize != IPT_ALIGN(sizeof(struct ipt_ecn_info)))
+ return 0;
+
+ if (info->operation & IPT_ECN_OP_MATCH_MASK)
+ return 0;
+
+ if (info->invert & IPT_ECN_OP_MATCH_MASK)
+ return 0;
+
+ if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)
+ && ip->proto != IPPROTO_TCP) {
+ printk(KERN_WARNING "ipt_ecn: can't match TCP bits in rule for"
+ " non-tcp packets\n");
+ return 0;
+ }
+
+ return 1;
+}
+
+static struct ipt_match ecn_match = {
+ .name = "ecn",
+ .match = &match,
+ .checkentry = &checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ return ipt_register_match(&ecn_match);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_match(&ecn_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_esp.c b/net/ipv4/netfilter/ipt_esp.c
new file mode 100644
index 00000000000..e1d0dd31e11
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_esp.c
@@ -0,0 +1,118 @@
+/* Kernel module to match ESP parameters. */
+
+/* (C) 1999-2000 Yon Uriarte <yon@astaro.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+
+#include <linux/netfilter_ipv4/ipt_esp.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>");
+MODULE_DESCRIPTION("iptables ESP SPI match module");
+
+#ifdef DEBUG_CONNTRACK
+#define duprintf(format, args...) printk(format , ## args)
+#else
+#define duprintf(format, args...)
+#endif
+
+/* Returns 1 if the spi is matched by the range, 0 otherwise */
+static inline int
+spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, int invert)
+{
+ int r=0;
+ duprintf("esp spi_match:%c 0x%x <= 0x%x <= 0x%x",invert? '!':' ',
+ min,spi,max);
+ r=(spi >= min && spi <= max) ^ invert;
+ duprintf(" result %s\n",r? "PASS" : "FAILED");
+ return r;
+}
+
+static int
+match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ struct ip_esp_hdr _esp, *eh;
+ const struct ipt_esp *espinfo = matchinfo;
+
+ /* Must not be a fragment. */
+ if (offset)
+ return 0;
+
+ eh = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
+ sizeof(_esp), &_esp);
+ if (eh == NULL) {
+ /* We've been asked to examine this packet, and we
+ * can't. Hence, no choice but to drop.
+ */
+ duprintf("Dropping evil ESP tinygram.\n");
+ *hotdrop = 1;
+ return 0;
+ }
+
+ return spi_match(espinfo->spis[0], espinfo->spis[1],
+ ntohl(eh->spi),
+ !!(espinfo->invflags & IPT_ESP_INV_SPI));
+}
+
+/* Called when user tries to insert an entry of this type. */
+static int
+checkentry(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchinfosize,
+ unsigned int hook_mask)
+{
+ const struct ipt_esp *espinfo = matchinfo;
+
+ /* Must specify proto == ESP, and no unknown invflags */
+ if (ip->proto != IPPROTO_ESP || (ip->invflags & IPT_INV_PROTO)) {
+ duprintf("ipt_esp: Protocol %u != %u\n", ip->proto,
+ IPPROTO_ESP);
+ return 0;
+ }
+ if (matchinfosize != IPT_ALIGN(sizeof(struct ipt_esp))) {
+ duprintf("ipt_esp: matchsize %u != %u\n",
+ matchinfosize, IPT_ALIGN(sizeof(struct ipt_esp)));
+ return 0;
+ }
+ if (espinfo->invflags & ~IPT_ESP_INV_MASK) {
+ duprintf("ipt_esp: unknown flags %X\n",
+ espinfo->invflags);
+ return 0;
+ }
+
+ return 1;
+}
+
+static struct ipt_match esp_match = {
+ .name = "esp",
+ .match = &match,
+ .checkentry = &checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ return ipt_register_match(&esp_match);
+}
+
+static void __exit cleanup(void)
+{
+ ipt_unregister_match(&esp_match);
+}
+
+module_init(init);
+module_exit(cleanup);
diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c
new file mode 100644
index 00000000000..f1937190cd7
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_hashlimit.c
@@ -0,0 +1,731 @@
+/* iptables match extension to limit the number of packets per second
+ * separately for each hashbucket (sourceip/sourceport/dstip/dstport)
+ *
+ * (C) 2003-2004 by Harald Welte <laforge@netfilter.org>
+ *
+ * $Id: ipt_hashlimit.c 3244 2004-10-20 16:24:29Z laforge@netfilter.org $
+ *
+ * Development of this code was funded by Astaro AG, http://www.astaro.com/
+ *
+ * based on ipt_limit.c by:
+ * Jérôme de Vivie <devivie@info.enserb.u-bordeaux.fr>
+ * Hervé Eychenne <eychenne@info.enserb.u-bordeaux.fr>
+ * Rusty Russell <rusty@rustcorp.com.au>
+ *
+ * The general idea is to create a hash table for every dstip and have a
+ * separate limit counter per tuple. This way you can do something like 'limit
+ * the number of syn packets for each of my internal addresses'.
+ *
+ * Ideally this would just be implemented as a general 'hash' match, which would
+ * allow us to attach any iptables target to its hash buckets. But this is
+ * not possible in the current iptables architecture. As always, pkttables for
+ * 2.7.x will help ;)
+ */
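+/* Rough usage sketch (option names as exposed by the userspace hashlimit
+ * extension):
+ *
+ *   iptables -A INPUT -p tcp --syn -m hashlimit --hashlimit 10/sec \
+ *     --hashlimit-mode srcip --hashlimit-name synflood -j ACCEPT
+ *
+ * keeps one token bucket per source address and exports the table as
+ * /proc/net/ipt_hashlimit/synflood.
+ */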
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/random.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/sctp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/list.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_hashlimit.h>
+#include <linux/netfilter_ipv4/lockhelp.h>
+
+/* FIXME: this is just for IP_NF_ASSERT */
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("iptables match for limiting per hash-bucket");
+
+/* need to declare this at the top */
+static struct proc_dir_entry *hashlimit_procdir;
+static struct file_operations dl_file_ops;
+
+/* hash table crap */
+
+struct dsthash_dst {
+ u_int32_t src_ip;
+ u_int32_t dst_ip;
+ /* ports have to be consecutive !!! */
+ u_int16_t src_port;
+ u_int16_t dst_port;
+};
+
+struct dsthash_ent {
+ /* static / read-only parts in the beginning */
+ struct hlist_node node;
+ struct dsthash_dst dst;
+
+ /* modified structure members in the end */
+ unsigned long expires; /* precalculated expiry time */
+ struct {
+ unsigned long prev; /* last modification */
+ u_int32_t credit;
+ u_int32_t credit_cap, cost;
+ } rateinfo;
+};
+
+struct ipt_hashlimit_htable {
+ struct hlist_node node; /* global list of all htables */
+ atomic_t use;
+
+ struct hashlimit_cfg cfg; /* config */
+
+ /* used internally */
+ spinlock_t lock; /* lock for list_head */
+ u_int32_t rnd; /* random seed for hash */
+ struct timer_list timer; /* timer for gc */
+ atomic_t count; /* number entries in table */
+
+ /* seq_file stuff */
+ struct proc_dir_entry *pde;
+
+ struct hlist_head hash[0]; /* hashtable itself */
+};
+
+static DECLARE_LOCK(hashlimit_lock); /* protects htables list */
+static DECLARE_MUTEX(hlimit_mutex); /* additional checkentry protection */
+static HLIST_HEAD(hashlimit_htables);
+static kmem_cache_t *hashlimit_cachep;
+
+static inline int dst_cmp(const struct dsthash_ent *ent, struct dsthash_dst *b)
+{
+ return (ent->dst.dst_ip == b->dst_ip
+ && ent->dst.dst_port == b->dst_port
+ && ent->dst.src_port == b->src_port
+ && ent->dst.src_ip == b->src_ip);
+}
+
+static inline u_int32_t
+hash_dst(const struct ipt_hashlimit_htable *ht, const struct dsthash_dst *dst)
+{
+ return (jhash_3words(dst->dst_ip, (dst->dst_port<<16 | dst->src_port),
+ dst->src_ip, ht->rnd) % ht->cfg.size);
+}
+
+static inline struct dsthash_ent *
+__dsthash_find(const struct ipt_hashlimit_htable *ht, struct dsthash_dst *dst)
+{
+ struct dsthash_ent *ent;
+ struct hlist_node *pos;
+ u_int32_t hash = hash_dst(ht, dst);
+
+ if (!hlist_empty(&ht->hash[hash]))
+ hlist_for_each_entry(ent, pos, &ht->hash[hash], node) {
+ if (dst_cmp(ent, dst)) {
+ return ent;
+ }
+ }
+
+ return NULL;
+}
+
+/* allocate dsthash_ent, initialize dst, put in htable and lock it */
+static struct dsthash_ent *
+__dsthash_alloc_init(struct ipt_hashlimit_htable *ht, struct dsthash_dst *dst)
+{
+ struct dsthash_ent *ent;
+
+ /* initialize hash with random val at the time we allocate
+ * the first hashtable entry */
+ if (!ht->rnd)
+ get_random_bytes(&ht->rnd, 4);
+
+ if (ht->cfg.max &&
+ atomic_read(&ht->count) >= ht->cfg.max) {
+ /* FIXME: do something. question is what.. */
+ if (net_ratelimit())
+ printk(KERN_WARNING
+ "ipt_hashlimit: max count of %u reached\n",
+ ht->cfg.max);
+ return NULL;
+ }
+
+ ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC);
+ if (!ent) {
+ if (net_ratelimit())
+ printk(KERN_ERR
+ "ipt_hashlimit: can't allocate dsthash_ent\n");
+ return NULL;
+ }
+
+ atomic_inc(&ht->count);
+
+ ent->dst.dst_ip = dst->dst_ip;
+ ent->dst.dst_port = dst->dst_port;
+ ent->dst.src_ip = dst->src_ip;
+ ent->dst.src_port = dst->src_port;
+
+ hlist_add_head(&ent->node, &ht->hash[hash_dst(ht, dst)]);
+
+ return ent;
+}
+
+static inline void
+__dsthash_free(struct ipt_hashlimit_htable *ht, struct dsthash_ent *ent)
+{
+ hlist_del(&ent->node);
+ kmem_cache_free(hashlimit_cachep, ent);
+ atomic_dec(&ht->count);
+}
+static void htable_gc(unsigned long htlong);
+
+static int htable_create(struct ipt_hashlimit_info *minfo)
+{
+ int i;
+ unsigned int size;
+ struct ipt_hashlimit_htable *hinfo;
+
+ if (minfo->cfg.size)
+ size = minfo->cfg.size;
+ else {
+ size = (((num_physpages << PAGE_SHIFT) / 16384)
+ / sizeof(struct list_head));
+ if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
+ size = 8192;
+ if (size < 16)
+ size = 16;
+ }
+ /* FIXME: don't use vmalloc() here or anywhere else -HW */
+ hinfo = vmalloc(sizeof(struct ipt_hashlimit_htable)
+ + (sizeof(struct list_head) * size));
+ if (!hinfo) {
+ printk(KERN_ERR "ipt_hashlimit: Unable to create hashtable\n");
+ return -1;
+ }
+ minfo->hinfo = hinfo;
+
+ /* copy match config into hashtable config */
+ memcpy(&hinfo->cfg, &minfo->cfg, sizeof(hinfo->cfg));
+ hinfo->cfg.size = size;
+ if (!hinfo->cfg.max)
+ hinfo->cfg.max = 8 * hinfo->cfg.size;
+ else if (hinfo->cfg.max < hinfo->cfg.size)
+ hinfo->cfg.max = hinfo->cfg.size;
+
+ for (i = 0; i < hinfo->cfg.size; i++)
+ INIT_HLIST_HEAD(&hinfo->hash[i]);
+
+ atomic_set(&hinfo->count, 0);
+ atomic_set(&hinfo->use, 1);
+ hinfo->rnd = 0;
+ spin_lock_init(&hinfo->lock);
+ hinfo->pde = create_proc_entry(minfo->name, 0, hashlimit_procdir);
+ if (!hinfo->pde) {
+ vfree(hinfo);
+ return -1;
+ }
+ hinfo->pde->proc_fops = &dl_file_ops;
+ hinfo->pde->data = hinfo;
+
+ init_timer(&hinfo->timer);
+ hinfo->timer.expires = jiffies + msecs_to_jiffies(hinfo->cfg.gc_interval);
+ hinfo->timer.data = (unsigned long )hinfo;
+ hinfo->timer.function = htable_gc;
+ add_timer(&hinfo->timer);
+
+ LOCK_BH(&hashlimit_lock);
+ hlist_add_head(&hinfo->node, &hashlimit_htables);
+ UNLOCK_BH(&hashlimit_lock);
+
+ return 0;
+}
+
+static int select_all(struct ipt_hashlimit_htable *ht, struct dsthash_ent *he)
+{
+ return 1;
+}
+
+static int select_gc(struct ipt_hashlimit_htable *ht, struct dsthash_ent *he)
+{
+ return (jiffies >= he->expires);
+}
+
+static void htable_selective_cleanup(struct ipt_hashlimit_htable *ht,
+ int (*select)(struct ipt_hashlimit_htable *ht,
+ struct dsthash_ent *he))
+{
+ int i;
+
+ IP_NF_ASSERT(ht->cfg.size && ht->cfg.max);
+
+ /* lock hash table and iterate over it */
+ spin_lock_bh(&ht->lock);
+ for (i = 0; i < ht->cfg.size; i++) {
+ struct dsthash_ent *dh;
+ struct hlist_node *pos, *n;
+ hlist_for_each_entry_safe(dh, pos, n, &ht->hash[i], node) {
+ if ((*select)(ht, dh))
+ __dsthash_free(ht, dh);
+ }
+ }
+ spin_unlock_bh(&ht->lock);
+}
+
+/* hash table garbage collector, run by timer */
+static void htable_gc(unsigned long htlong)
+{
+ struct ipt_hashlimit_htable *ht = (struct ipt_hashlimit_htable *)htlong;
+
+ htable_selective_cleanup(ht, select_gc);
+
+ /* re-add the timer accordingly */
+ ht->timer.expires = jiffies + msecs_to_jiffies(ht->cfg.gc_interval);
+ add_timer(&ht->timer);
+}
+
+static void htable_destroy(struct ipt_hashlimit_htable *hinfo)
+{
+ /* remove timer, if it is pending */
+ if (timer_pending(&hinfo->timer))
+ del_timer(&hinfo->timer);
+
+ /* remove proc entry */
+ remove_proc_entry(hinfo->pde->name, hashlimit_procdir);
+
+ htable_selective_cleanup(hinfo, select_all);
+ vfree(hinfo);
+}
+
+static struct ipt_hashlimit_htable *htable_find_get(char *name)
+{
+ struct ipt_hashlimit_htable *hinfo;
+ struct hlist_node *pos;
+
+ LOCK_BH(&hashlimit_lock);
+ hlist_for_each_entry(hinfo, pos, &hashlimit_htables, node) {
+ if (!strcmp(name, hinfo->pde->name)) {
+ atomic_inc(&hinfo->use);
+ UNLOCK_BH(&hashlimit_lock);
+ return hinfo;
+ }
+ }
+ UNLOCK_BH(&hashlimit_lock);
+
+ return NULL;
+}
+
+static void htable_put(struct ipt_hashlimit_htable *hinfo)
+{
+ if (atomic_dec_and_test(&hinfo->use)) {
+ LOCK_BH(&hashlimit_lock);
+ hlist_del(&hinfo->node);
+ UNLOCK_BH(&hashlimit_lock);
+ htable_destroy(hinfo);
+ }
+}
+
+
+/* The algorithm used is the Simple Token Bucket Filter (TBF)
+ * see net/sched/sch_tbf.c in the linux source tree
+ */
+
+/* Rusty: This is my (non-mathematically-inclined) understanding of
+ this algorithm. The `average rate' in jiffies becomes your initial
+ amount of credit `credit' and the most credit you can ever have
+ `credit_cap'. The `peak rate' becomes the cost of passing the
+ test, `cost'.
+
+ `prev' tracks the last packet hit: you gain one credit per jiffy.
+ If your credit balance would exceed `credit_cap', the extra credit is
+ discarded. Every time the match passes, you lose `cost' credits;
+ if you don't have that many, the test fails.
+
+ See Alexey's formal explanation in net/sched/sch_tbf.c.
+
+ To get the maximum range, we multiply by this factor (ie. you get N
+ credits per jiffy). We want to allow a rate as low as 1 per day
+ (slowest userspace tool allows), which means
+ CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie.
+*/
+#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24))
+
+/* Repeated shift and or gives us all 1s, final shift and add 1 gives
+ * us the power of 2 below the theoretical max, so GCC simply does a
+ * shift. */
+#define _POW2_BELOW2(x) ((x)|((x)>>1))
+#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2))
+#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4))
+#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8))
+#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
+#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
+
+#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
+
+/* Precision saver. */
+static inline u_int32_t
+user2credits(u_int32_t user)
+{
+ /* If multiplying would overflow... */
+ if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
+ /* Divide first. */
+ return (user / IPT_HASHLIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
+
+ return (user * HZ * CREDITS_PER_JIFFY) / IPT_HASHLIMIT_SCALE;
+}
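+
+/* Rough worked example, assuming HZ=100 (CREDITS_PER_JIFFY = 256) and the
+ * usual IPT_HASHLIMIT_SCALE of 10000: a rate of 5/sec is passed down by
+ * userspace as avg = 10000/5 = 2000, so cost = user2credits(2000) =
+ * 2000*100*256/10000 = 5120 credits per match, while credit refills at
+ * 100*256 = 25600 credits per second, i.e. at most 5 matches per second
+ * once the initial burst (user2credits(avg * burst)) has been spent.
+ */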
+
+static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now)
+{
+ dh->rateinfo.credit += (now - xchg(&dh->rateinfo.prev, now))
+ * CREDITS_PER_JIFFY;
+ if (dh->rateinfo.credit > dh->rateinfo.credit_cap)
+ dh->rateinfo.credit = dh->rateinfo.credit_cap;
+}
+
+static inline int get_ports(const struct sk_buff *skb, int offset,
+ u16 ports[2])
+{
+ union {
+ struct tcphdr th;
+ struct udphdr uh;
+ sctp_sctphdr_t sctph;
+ } hdr_u, *ptr_u;
+
+ /* Must not be a fragment. */
+ if (offset)
+ return 1;
+
+ /* Must be big enough to read ports (both UDP and TCP have
+ them at the start). */
+ ptr_u = skb_header_pointer(skb, skb->nh.iph->ihl*4, 8, &hdr_u);
+ if (!ptr_u)
+ return 1;
+
+ switch (skb->nh.iph->protocol) {
+ case IPPROTO_TCP:
+ ports[0] = ptr_u->th.source;
+ ports[1] = ptr_u->th.dest;
+ break;
+ case IPPROTO_UDP:
+ ports[0] = ptr_u->uh.source;
+ ports[1] = ptr_u->uh.dest;
+ break;
+ case IPPROTO_SCTP:
+ ports[0] = ptr_u->sctph.source;
+ ports[1] = ptr_u->sctph.dest;
+ break;
+ default:
+ /* all other protocols don't support per-port hash
+ * buckets */
+ ports[0] = ports[1] = 0;
+ break;
+ }
+
+ return 0;
+}
+
+
+static int
+hashlimit_match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ struct ipt_hashlimit_info *r =
+ ((struct ipt_hashlimit_info *)matchinfo)->u.master;
+ struct ipt_hashlimit_htable *hinfo = r->hinfo;
+ unsigned long now = jiffies;
+ struct dsthash_ent *dh;
+ struct dsthash_dst dst;
+
+ /* build 'dst' according to hinfo->cfg and current packet */
+ memset(&dst, 0, sizeof(dst));
+ if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DIP)
+ dst.dst_ip = skb->nh.iph->daddr;
+ if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SIP)
+ dst.src_ip = skb->nh.iph->saddr;
+ if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DPT
+ ||hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SPT) {
+ u_int16_t ports[2];
+ if (get_ports(skb, offset, ports)) {
+ /* We've been asked to examine this packet, and we
+ can't. Hence, no choice but to drop. */
+ *hotdrop = 1;
+ return 0;
+ }
+ if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SPT)
+ dst.src_port = ports[0];
+ if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DPT)
+ dst.dst_port = ports[1];
+ }
+
+ spin_lock_bh(&hinfo->lock);
+ dh = __dsthash_find(hinfo, &dst);
+ if (!dh) {
+ dh = __dsthash_alloc_init(hinfo, &dst);
+
+ if (!dh) {
+ /* enomem... don't match == DROP */
+ if (net_ratelimit())
+ printk(KERN_ERR "%s: ENOMEM\n", __FUNCTION__);
+ spin_unlock_bh(&hinfo->lock);
+ return 0;
+ }
+
+ dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire);
+
+ dh->rateinfo.prev = jiffies;
+ dh->rateinfo.credit = user2credits(hinfo->cfg.avg *
+ hinfo->cfg.burst);
+ dh->rateinfo.credit_cap = user2credits(hinfo->cfg.avg *
+ hinfo->cfg.burst);
+ dh->rateinfo.cost = user2credits(hinfo->cfg.avg);
+
+ spin_unlock_bh(&hinfo->lock);
+ return 1;
+ }
+
+ /* update expiration timeout */
+ dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire);
+
+ rateinfo_recalc(dh, now);
+ if (dh->rateinfo.credit >= dh->rateinfo.cost) {
+ /* We're underlimit. */
+ dh->rateinfo.credit -= dh->rateinfo.cost;
+ spin_unlock_bh(&hinfo->lock);
+ return 1;
+ }
+
+ spin_unlock_bh(&hinfo->lock);
+
+ /* default case: we're overlimit, thus don't match */
+ return 0;
+}
+
+static int
+hashlimit_checkentry(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ struct ipt_hashlimit_info *r = matchinfo;
+
+ if (matchsize != IPT_ALIGN(sizeof(struct ipt_hashlimit_info)))
+ return 0;
+
+ /* Check for overflow. */
+ if (r->cfg.burst == 0
+ || user2credits(r->cfg.avg * r->cfg.burst) <
+ user2credits(r->cfg.avg)) {
+ printk(KERN_ERR "ipt_hashlimit: Overflow, try lower: %u/%u\n",
+ r->cfg.avg, r->cfg.burst);
+ return 0;
+ }
+
+ if (r->cfg.mode == 0
+ || r->cfg.mode > (IPT_HASHLIMIT_HASH_DPT
+ |IPT_HASHLIMIT_HASH_DIP
+ |IPT_HASHLIMIT_HASH_SIP
+ |IPT_HASHLIMIT_HASH_SPT))
+ return 0;
+
+ if (!r->cfg.gc_interval)
+ return 0;
+
+ if (!r->cfg.expire)
+ return 0;
+
+ /* This is the best we've got: We cannot release and re-grab lock,
+ * since checkentry() is called before ip_tables.c grabs ipt_mutex.
+ * We also cannot grab the hashtable spinlock, since htable_create will
+ * call vmalloc, and that can sleep. And we cannot just re-search
+ * the list of htable's in htable_create(), since then we would
+ * create duplicate proc files. -HW */
+ down(&hlimit_mutex);
+ r->hinfo = htable_find_get(r->name);
+ if (!r->hinfo && (htable_create(r) != 0)) {
+ up(&hlimit_mutex);
+ return 0;
+ }
+ up(&hlimit_mutex);
+
+ /* Ugly hack: For SMP, we only want to use one set */
+ r->u.master = r;
+
+ return 1;
+}
+
+static void
+hashlimit_destroy(void *matchinfo, unsigned int matchsize)
+{
+ struct ipt_hashlimit_info *r = (struct ipt_hashlimit_info *) matchinfo;
+
+ htable_put(r->hinfo);
+}
+
+static struct ipt_match ipt_hashlimit = {
+ .name = "hashlimit",
+ .match = hashlimit_match,
+ .checkentry = hashlimit_checkentry,
+ .destroy = hashlimit_destroy,
+ .me = THIS_MODULE
+};
+
+/* PROC stuff */
+
+static void *dl_seq_start(struct seq_file *s, loff_t *pos)
+{
+ struct proc_dir_entry *pde = s->private;
+ struct ipt_hashlimit_htable *htable = pde->data;
+ unsigned int *bucket;
+
+ spin_lock_bh(&htable->lock);
+ if (*pos >= htable->cfg.size)
+ return NULL;
+
+ bucket = kmalloc(sizeof(unsigned int), GFP_ATOMIC);
+ if (!bucket)
+ return ERR_PTR(-ENOMEM);
+
+ *bucket = *pos;
+ return bucket;
+}
+
+static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ struct proc_dir_entry *pde = s->private;
+ struct ipt_hashlimit_htable *htable = pde->data;
+ unsigned int *bucket = (unsigned int *)v;
+
+ *pos = ++(*bucket);
+ if (*pos >= htable->cfg.size) {
+ kfree(v);
+ return NULL;
+ }
+ return bucket;
+}
+
+static void dl_seq_stop(struct seq_file *s, void *v)
+{
+ struct proc_dir_entry *pde = s->private;
+ struct ipt_hashlimit_htable *htable = pde->data;
+ unsigned int *bucket = (unsigned int *)v;
+
+ kfree(bucket);
+
+ spin_unlock_bh(&htable->lock);
+}
+
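+/* Each entry is printed as one line:
+ * <secs until expiry> <src>:<sport>-><dst>:<dport> <credit> <credit_cap> <cost>
+ */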
+static inline int dl_seq_real_show(struct dsthash_ent *ent, struct seq_file *s)
+{
+ /* recalculate to show accurate numbers */
+ rateinfo_recalc(ent, jiffies);
+
+ return seq_printf(s, "%ld %u.%u.%u.%u:%u->%u.%u.%u.%u:%u %u %u %u\n",
+ (long)(ent->expires - jiffies)/HZ,
+ NIPQUAD(ent->dst.src_ip), ntohs(ent->dst.src_port),
+ NIPQUAD(ent->dst.dst_ip), ntohs(ent->dst.dst_port),
+ ent->rateinfo.credit, ent->rateinfo.credit_cap,
+ ent->rateinfo.cost);
+}
+
+static int dl_seq_show(struct seq_file *s, void *v)
+{
+ struct proc_dir_entry *pde = s->private;
+ struct ipt_hashlimit_htable *htable = pde->data;
+ unsigned int *bucket = (unsigned int *)v;
+ struct dsthash_ent *ent;
+ struct hlist_node *pos;
+
+ if (!hlist_empty(&htable->hash[*bucket]))
+ hlist_for_each_entry(ent, pos, &htable->hash[*bucket], node) {
+ if (dl_seq_real_show(ent, s)) {
+ /* buffer was filled and unable to print that tuple */
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static struct seq_operations dl_seq_ops = {
+ .start = dl_seq_start,
+ .next = dl_seq_next,
+ .stop = dl_seq_stop,
+ .show = dl_seq_show
+};
+
+static int dl_proc_open(struct inode *inode, struct file *file)
+{
+ int ret = seq_open(file, &dl_seq_ops);
+
+ if (!ret) {
+ struct seq_file *sf = file->private_data;
+ sf->private = PDE(inode);
+ }
+ return ret;
+}
+
+static struct file_operations dl_file_ops = {
+ .owner = THIS_MODULE,
+ .open = dl_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release
+};
+
+static int init_or_fini(int fini)
+{
+ int ret = 0;
+
+ if (fini)
+ goto cleanup;
+
+ if (ipt_register_match(&ipt_hashlimit)) {
+ ret = -EINVAL;
+ goto cleanup_nothing;
+ }
+
+ hashlimit_cachep = kmem_cache_create("ipt_hashlimit",
+ sizeof(struct dsthash_ent), 0,
+ 0, NULL, NULL);
+ if (!hashlimit_cachep) {
+ printk(KERN_ERR "Unable to create ipt_hashlimit slab cache\n");
+ ret = -ENOMEM;
+ goto cleanup_unreg_match;
+ }
+
+ hashlimit_procdir = proc_mkdir("ipt_hashlimit", proc_net);
+ if (!hashlimit_procdir) {
+ printk(KERN_ERR "Unable to create proc dir entry\n");
+ ret = -ENOMEM;
+ goto cleanup_free_slab;
+ }
+
+ return ret;
+
+cleanup:
+ remove_proc_entry("ipt_hashlimit", proc_net);
+cleanup_free_slab:
+ kmem_cache_destroy(hashlimit_cachep);
+cleanup_unreg_match:
+ ipt_unregister_match(&ipt_hashlimit);
+cleanup_nothing:
+ return ret;
+
+}
+
+static int __init init(void)
+{
+ return init_or_fini(0);
+}
+
+static void __exit fini(void)
+{
+ init_or_fini(1);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_helper.c b/net/ipv4/netfilter/ipt_helper.c
new file mode 100644
index 00000000000..33fdf364d3d
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_helper.c
@@ -0,0 +1,113 @@
+/* iptables module to match on related connections */
+/*
+ * (C) 2001 Martin Josefsson <gandalf@wlug.westbo.se>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * 19 Mar 2002 Harald Welte <laforge@gnumonks.org>:
+ * - Port to newnat infrastructure
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_helper.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Martin Josefsson <gandalf@netfilter.org>");
+MODULE_DESCRIPTION("iptables helper match module");
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
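+/* Example: "-m helper --helper ftp" matches packets belonging to a connection
+ * that was expected by a master connection handled by the "ftp" conntrack
+ * helper; an empty helper name matches any connection whose master has a
+ * helper at all, and info->invert flips the result.
+ */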
+static int
+match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ const struct ipt_helper_info *info = matchinfo;
+ struct ip_conntrack *ct;
+ enum ip_conntrack_info ctinfo;
+ int ret = info->invert;
+
+ ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo);
+ if (!ct) {
+ DEBUGP("ipt_helper: Eek! invalid conntrack?\n");
+ return ret;
+ }
+
+ if (!ct->master) {
+ DEBUGP("ipt_helper: conntrack %p has no master\n", ct);
+ return ret;
+ }
+
+ READ_LOCK(&ip_conntrack_lock);
+ if (!ct->master->helper) {
+ DEBUGP("ipt_helper: master ct %p has no helper\n",
+ ct->master);
+ goto out_unlock;
+ }
+
+ DEBUGP("master's name = %s , info->name = %s\n",
+ ct->master->helper->name, info->name);
+
+ if (info->name[0] == '\0')
+ ret ^= 1;
+ else
+ ret ^= !strncmp(ct->master->helper->name, info->name,
+ strlen(ct->master->helper->name));
+out_unlock:
+ READ_UNLOCK(&ip_conntrack_lock);
+ return ret;
+}
+
+static int check(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ struct ipt_helper_info *info = matchinfo;
+
+ info->name[29] = '\0';
+
+ /* verify size */
+ if (matchsize != IPT_ALIGN(sizeof(struct ipt_helper_info)))
+ return 0;
+
+ return 1;
+}
+
+static struct ipt_match helper_match = {
+ .name = "helper",
+ .match = &match,
+ .checkentry = &check,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ need_ip_conntrack();
+ return ipt_register_match(&helper_match);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_match(&helper_match);
+}
+
+module_init(init);
+module_exit(fini);
+
diff --git a/net/ipv4/netfilter/ipt_iprange.c b/net/ipv4/netfilter/ipt_iprange.c
new file mode 100644
index 00000000000..b835b7b2e56
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_iprange.c
@@ -0,0 +1,99 @@
+/*
+ * iptables module to match IP address ranges
+ *
+ * (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_iprange.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("iptables arbitrary IP range match module");
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+static int
+match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset, int *hotdrop)
+{
+ const struct ipt_iprange_info *info = matchinfo;
+ const struct iphdr *iph = skb->nh.iph;
+
+ if (info->flags & IPRANGE_SRC) {
+ if (((ntohl(iph->saddr) < ntohl(info->src.min_ip))
+ || (ntohl(iph->saddr) > ntohl(info->src.max_ip)))
+ ^ !!(info->flags & IPRANGE_SRC_INV)) {
+ DEBUGP("src IP %u.%u.%u.%u NOT in range %s"
+ "%u.%u.%u.%u-%u.%u.%u.%u\n",
+ NIPQUAD(iph->saddr),
+ info->flags & IPRANGE_SRC_INV ? "(INV) " : "",
+ NIPQUAD(info->src.min_ip),
+ NIPQUAD(info->src.max_ip));
+ return 0;
+ }
+ }
+ if (info->flags & IPRANGE_DST) {
+ if (((ntohl(iph->daddr) < ntohl(info->dst.min_ip))
+ || (ntohl(iph->daddr) > ntohl(info->dst.max_ip)))
+ ^ !!(info->flags & IPRANGE_DST_INV)) {
+ DEBUGP("dst IP %u.%u.%u.%u NOT in range %s"
+ "%u.%u.%u.%u-%u.%u.%u.%u\n",
+ NIPQUAD(iph->daddr),
+ info->flags & IPRANGE_DST_INV ? "(INV) " : "",
+ NIPQUAD(info->dst.min_ip),
+ NIPQUAD(info->dst.max_ip));
+ return 0;
+ }
+ }
+ return 1;
+}
+
+static int check(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ /* verify size */
+ if (matchsize != IPT_ALIGN(sizeof(struct ipt_iprange_info)))
+ return 0;
+
+ return 1;
+}
+
+static struct ipt_match iprange_match =
+{
+ .list = { NULL, NULL },
+ .name = "iprange",
+ .match = &match,
+ .checkentry = &check,
+ .destroy = NULL,
+ .me = THIS_MODULE
+};
+
+static int __init init(void)
+{
+ return ipt_register_match(&iprange_match);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_match(&iprange_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_length.c b/net/ipv4/netfilter/ipt_length.c
new file mode 100644
index 00000000000..4eabcfbda9d
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_length.c
@@ -0,0 +1,64 @@
+/* Kernel module to match packet length. */
+/* (C) 1999-2001 James Morris <jmorris@intercode.com.au>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter_ipv4/ipt_length.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
+MODULE_DESCRIPTION("IP tables packet length matching module");
+MODULE_LICENSE("GPL");
+
+static int
+match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ const struct ipt_length_info *info = matchinfo;
+ u_int16_t pktlen = ntohs(skb->nh.iph->tot_len);
+
+ return (pktlen >= info->min && pktlen <= info->max) ^ info->invert;
+}
+
+static int
+checkentry(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ if (matchsize != IPT_ALIGN(sizeof(struct ipt_length_info)))
+ return 0;
+
+ return 1;
+}
+
+static struct ipt_match length_match = {
+ .name = "length",
+ .match = &match,
+ .checkentry = &checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ return ipt_register_match(&length_match);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_match(&length_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_limit.c b/net/ipv4/netfilter/ipt_limit.c
new file mode 100644
index 00000000000..0c24dcc703a
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_limit.c
@@ -0,0 +1,157 @@
+/* Kernel module to control the rate
+ *
+ * 2 September 1999: Changed from the target RATE to the match
+ * `limit', removed logging. Did I mention that
+ * Alexey is a fucking genius?
+ * Rusty Russell (rusty@rustcorp.com.au). */
+
+/* (C) 1999 Jérôme de Vivie <devivie@info.enserb.u-bordeaux.fr>
+ * (C) 1999 Hervé Eychenne <eychenne@info.enserb.u-bordeaux.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_limit.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Herve Eychenne <rv@wallfire.org>");
+MODULE_DESCRIPTION("iptables rate limit match");
+
+/* The algorithm used is the Simple Token Bucket Filter (TBF)
+ * see net/sched/sch_tbf.c in the linux source tree
+ */
+
+static DEFINE_SPINLOCK(limit_lock);
+
+/* Rusty: This is my (non-mathematically-inclined) understanding of
+ this algorithm. The `average rate' in jiffies becomes your initial
+ amount of credit `credit' and the most credit you can ever have
+ `credit_cap'. The `peak rate' becomes the cost of passing the
+ test, `cost'.
+
+ `prev' tracks the last packet hit: you gain one credit per jiffy.
+ If your credit balance would exceed `credit_cap', the extra credit is
+ discarded. Every time the match passes, you lose `cost' credits;
+ if you don't have that many, the test fails.
+
+ See Alexey's formal explanation in net/sched/sch_tbf.c.
+
+ To get the maximum range, we multiply by this factor (ie. you get N
+ credits per jiffy). We want to allow a rate as low as 1 per day
+ (slowest userspace tool allows), which means
+ CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32. ie. */
+#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24))
+
+/* Repeated shift and or gives us all 1s, final shift and add 1 gives
+ * us the power of 2 below the theoretical max, so GCC simply does a
+ * shift. */
+#define _POW2_BELOW2(x) ((x)|((x)>>1))
+#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2))
+#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4))
+#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8))
+#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
+#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
+
+#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
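+
+/* Example, assuming HZ=100: MAX_CPJ = 0xFFFFFFFF/8640000 = 497; smearing the
+ * bits of 497 gives 511, so POW2_BELOW32(497) = (511 >> 1) + 1 = 256, the
+ * largest power of two not exceeding MAX_CPJ. */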
+
+static int
+ipt_limit_match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ struct ipt_rateinfo *r = ((struct ipt_rateinfo *)matchinfo)->master;
+ unsigned long now = jiffies;
+
+ spin_lock_bh(&limit_lock);
+ r->credit += (now - xchg(&r->prev, now)) * CREDITS_PER_JIFFY;
+ if (r->credit > r->credit_cap)
+ r->credit = r->credit_cap;
+
+ if (r->credit >= r->cost) {
+ /* We're not limited. */
+ r->credit -= r->cost;
+ spin_unlock_bh(&limit_lock);
+ return 1;
+ }
+
+ spin_unlock_bh(&limit_lock);
+ return 0;
+}
+
+/* Precision saver. */
+static u_int32_t
+user2credits(u_int32_t user)
+{
+ /* If multiplying would overflow... */
+ if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
+ /* Divide first. */
+ return (user / IPT_LIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
+
+ return (user * HZ * CREDITS_PER_JIFFY) / IPT_LIMIT_SCALE;
+}
+
+static int
+ipt_limit_checkentry(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ struct ipt_rateinfo *r = matchinfo;
+
+ if (matchsize != IPT_ALIGN(sizeof(struct ipt_rateinfo)))
+ return 0;
+
+ /* Check for overflow. */
+ if (r->burst == 0
+ || user2credits(r->avg * r->burst) < user2credits(r->avg)) {
+ printk("Overflow in ipt_limit, try lower: %u/%u\n",
+ r->avg, r->burst);
+ return 0;
+ }
+
+ /* User avg in seconds * IPT_LIMIT_SCALE: convert to jiffies *
+ 128. */
+ r->prev = jiffies;
+ r->credit = user2credits(r->avg * r->burst); /* Credits full. */
+ r->credit_cap = user2credits(r->avg * r->burst); /* Credits full. */
+ r->cost = user2credits(r->avg);
+
+ /* For SMP, we only want to use one set of counters. */
+ r->master = r;
+
+ return 1;
+}
+
+static struct ipt_match ipt_limit_reg = {
+ .name = "limit",
+ .match = ipt_limit_match,
+ .checkentry = ipt_limit_checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ if (ipt_register_match(&ipt_limit_reg))
+ return -EINVAL;
+ return 0;
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_match(&ipt_limit_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_mac.c b/net/ipv4/netfilter/ipt_mac.c
new file mode 100644
index 00000000000..11a459e33f2
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_mac.c
@@ -0,0 +1,79 @@
+/* Kernel module to match MAC address parameters. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/if_ether.h>
+
+#include <linux/netfilter_ipv4/ipt_mac.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("iptables mac matching module");
+
+static int
+match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ const struct ipt_mac_info *info = matchinfo;
+
+ /* Is mac pointer valid? */
+ return (skb->mac.raw >= skb->head
+ && (skb->mac.raw + ETH_HLEN) <= skb->data
+ /* If so, compare... */
+ && ((memcmp(eth_hdr(skb)->h_source, info->srcaddr, ETH_ALEN)
+ == 0) ^ info->invert));
+}
+
+static int
+ipt_mac_checkentry(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ /* FORWARD isn't always valid, but it's nice to be able to do --RR */
+ if (hook_mask
+ & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_IN)
+ | (1 << NF_IP_FORWARD))) {
+ printk("ipt_mac: only valid for PRE_ROUTING, LOCAL_IN or FORWARD.\n");
+ return 0;
+ }
+
+ if (matchsize != IPT_ALIGN(sizeof(struct ipt_mac_info)))
+ return 0;
+
+ return 1;
+}
+
+static struct ipt_match mac_match = {
+ .name = "mac",
+ .match = &match,
+ .checkentry = &ipt_mac_checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ return ipt_register_match(&mac_match);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_match(&mac_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_mark.c b/net/ipv4/netfilter/ipt_mark.c
new file mode 100644
index 00000000000..8955728127b
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_mark.c
@@ -0,0 +1,64 @@
+/* Kernel module to match NFMARK values. */
+
+/* (C) 1999-2001 Marc Boucher <marc@mbsi.ca>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter_ipv4/ipt_mark.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
+MODULE_DESCRIPTION("iptables mark matching module");
+
+static int
+match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ const struct ipt_mark_info *info = matchinfo;
+
+ return ((skb->nfmark & info->mask) == info->mark) ^ info->invert;
+}
+
+static int
+checkentry(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ if (matchsize != IPT_ALIGN(sizeof(struct ipt_mark_info)))
+ return 0;
+
+ return 1;
+}
+
+static struct ipt_match mark_match = {
+ .name = "mark",
+ .match = &match,
+ .checkentry = &checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ return ipt_register_match(&mark_match);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_match(&mark_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_multiport.c b/net/ipv4/netfilter/ipt_multiport.c
new file mode 100644
index 00000000000..99e8188162e
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_multiport.c
@@ -0,0 +1,212 @@
+/* Kernel module to match one of a list of TCP/UDP ports: ports are in
+ the same place so we can treat them as equal. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/udp.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter_ipv4/ipt_multiport.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("iptables multiple port match module");
+
+#if 0
+#define duprintf(format, args...) printk(format , ## args)
+#else
+#define duprintf(format, args...)
+#endif
+
+/* Returns 1 if the port is matched by the test, 0 otherwise. */
+static inline int
+ports_match(const u_int16_t *portlist, enum ipt_multiport_flags flags,
+ u_int8_t count, u_int16_t src, u_int16_t dst)
+{
+ unsigned int i;
+ for (i=0; i<count; i++) {
+ if (flags != IPT_MULTIPORT_DESTINATION
+ && portlist[i] == src)
+ return 1;
+
+ if (flags != IPT_MULTIPORT_SOURCE
+ && portlist[i] == dst)
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Returns 1 if the port is matched by the test, 0 otherwise. */
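+/* Revision 1 adds ranges and inversion: a non-zero pflags[i] marks
+ * ports[i]..ports[i+1] as an inclusive range, so userspace can encode
+ * something like "80,1024:2048" roughly as ports = {80, 1024, 2048},
+ * pflags = {0, 1, 0}; minfo->invert flips the final result. */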
+static inline int
+ports_match_v1(const struct ipt_multiport_v1 *minfo,
+ u_int16_t src, u_int16_t dst)
+{
+ unsigned int i;
+ u_int16_t s, e;
+
+ for (i=0; i < minfo->count; i++) {
+ s = minfo->ports[i];
+
+ if (minfo->pflags[i]) {
+ /* range port matching */
+ e = minfo->ports[++i];
+ duprintf("src or dst matches with %d-%d?\n", s, e);
+
+ if (minfo->flags == IPT_MULTIPORT_SOURCE
+ && src >= s && src <= e)
+ return 1 ^ minfo->invert;
+ if (minfo->flags == IPT_MULTIPORT_DESTINATION
+ && dst >= s && dst <= e)
+ return 1 ^ minfo->invert;
+ if (minfo->flags == IPT_MULTIPORT_EITHER
+ && ((dst >= s && dst <= e)
+ || (src >= s && src <= e)))
+ return 1 ^ minfo->invert;
+ } else {
+ /* exact port matching */
+ duprintf("src or dst matches with %d?\n", s);
+
+ if (minfo->flags == IPT_MULTIPORT_SOURCE
+ && src == s)
+ return 1 ^ minfo->invert;
+ if (minfo->flags == IPT_MULTIPORT_DESTINATION
+ && dst == s)
+ return 1 ^ minfo->invert;
+ if (minfo->flags == IPT_MULTIPORT_EITHER
+ && (src == s || dst == s))
+ return 1 ^ minfo->invert;
+ }
+ }
+
+ return minfo->invert;
+}
+
+static int
+match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ u16 _ports[2], *pptr;
+ const struct ipt_multiport *multiinfo = matchinfo;
+
+ if (offset)
+ return 0;
+
+ pptr = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
+ sizeof(_ports), _ports);
+ if (pptr == NULL) {
+ /* We've been asked to examine this packet, and we
+ * can't. Hence, no choice but to drop.
+ */
+ duprintf("ipt_multiport:"
+ " Dropping evil offset=0 tinygram.\n");
+ *hotdrop = 1;
+ return 0;
+ }
+
+ return ports_match(multiinfo->ports,
+ multiinfo->flags, multiinfo->count,
+ ntohs(pptr[0]), ntohs(pptr[1]));
+}
+
+static int
+match_v1(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ u16 _ports[2], *pptr;
+ const struct ipt_multiport_v1 *multiinfo = matchinfo;
+
+ if (offset)
+ return 0;
+
+ pptr = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
+ sizeof(_ports), _ports);
+ if (pptr == NULL) {
+ /* We've been asked to examine this packet, and we
+ * can't. Hence, no choice but to drop.
+ */
+ duprintf("ipt_multiport:"
+ " Dropping evil offset=0 tinygram.\n");
+ *hotdrop = 1;
+ return 0;
+ }
+
+ return ports_match_v1(multiinfo, ntohs(pptr[0]), ntohs(pptr[1]));
+}
+
+/* Called when user tries to insert an entry of this type. */
+static int
+checkentry(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ return (matchsize == IPT_ALIGN(sizeof(struct ipt_multiport)));
+}
+
+static int
+checkentry_v1(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ return (matchsize == IPT_ALIGN(sizeof(struct ipt_multiport_v1)));
+}
+
+static struct ipt_match multiport_match = {
+ .name = "multiport",
+ .revision = 0,
+ .match = &match,
+ .checkentry = &checkentry,
+ .me = THIS_MODULE,
+};
+
+static struct ipt_match multiport_match_v1 = {
+ .name = "multiport",
+ .revision = 1,
+ .match = &match_v1,
+ .checkentry = &checkentry_v1,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ int err;
+
+ err = ipt_register_match(&multiport_match);
+ if (!err) {
+ err = ipt_register_match(&multiport_match_v1);
+ if (err)
+ ipt_unregister_match(&multiport_match);
+ }
+
+ return err;
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_match(&multiport_match);
+ ipt_unregister_match(&multiport_match_v1);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c
new file mode 100644
index 00000000000..3b9065e0638
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_owner.c
@@ -0,0 +1,217 @@
+/* Kernel module to match various things tied to sockets associated with
+ locally generated outgoing packets. */
+
+/* (C) 2000 Marc Boucher <marc@mbsi.ca>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/file.h>
+#include <net/sock.h>
+
+#include <linux/netfilter_ipv4/ipt_owner.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
+MODULE_DESCRIPTION("iptables owner match");
+
+static int
+match_comm(const struct sk_buff *skb, const char *comm)
+{
+ struct task_struct *g, *p;
+ struct files_struct *files;
+ int i;
+
+ read_lock(&tasklist_lock);
+ do_each_thread(g, p) {
+ if(strncmp(p->comm, comm, sizeof(p->comm)))
+ continue;
+
+ task_lock(p);
+ files = p->files;
+ if(files) {
+ spin_lock(&files->file_lock);
+ for (i=0; i < files->max_fds; i++) {
+ if (fcheck_files(files, i) ==
+ skb->sk->sk_socket->file) {
+ spin_unlock(&files->file_lock);
+ task_unlock(p);
+ read_unlock(&tasklist_lock);
+ return 1;
+ }
+ }
+ spin_unlock(&files->file_lock);
+ }
+ task_unlock(p);
+ } while_each_thread(g, p);
+ read_unlock(&tasklist_lock);
+ return 0;
+}
+
+static int
+match_pid(const struct sk_buff *skb, pid_t pid)
+{
+ struct task_struct *p;
+ struct files_struct *files;
+ int i;
+
+ read_lock(&tasklist_lock);
+ p = find_task_by_pid(pid);
+ if (!p)
+ goto out;
+ task_lock(p);
+ files = p->files;
+ if(files) {
+ spin_lock(&files->file_lock);
+ for (i=0; i < files->max_fds; i++) {
+ if (fcheck_files(files, i) ==
+ skb->sk->sk_socket->file) {
+ spin_unlock(&files->file_lock);
+ task_unlock(p);
+ read_unlock(&tasklist_lock);
+ return 1;
+ }
+ }
+ spin_unlock(&files->file_lock);
+ }
+ task_unlock(p);
+out:
+ read_unlock(&tasklist_lock);
+ return 0;
+}
+
+static int
+match_sid(const struct sk_buff *skb, pid_t sid)
+{
+ struct task_struct *g, *p;
+ struct file *file = skb->sk->sk_socket->file;
+ int i, found=0;
+
+ read_lock(&tasklist_lock);
+ do_each_thread(g, p) {
+ struct files_struct *files;
+ if (p->signal->session != sid)
+ continue;
+
+ task_lock(p);
+ files = p->files;
+ if (files) {
+ spin_lock(&files->file_lock);
+ for (i=0; i < files->max_fds; i++) {
+ if (fcheck_files(files, i) == file) {
+ found = 1;
+ break;
+ }
+ }
+ spin_unlock(&files->file_lock);
+ }
+ task_unlock(p);
+ if (found)
+ goto out;
+ } while_each_thread(g, p);
+out:
+ read_unlock(&tasklist_lock);
+
+ return found;
+}
+
+static int
+match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ const struct ipt_owner_info *info = matchinfo;
+
+ if (!skb->sk || !skb->sk->sk_socket || !skb->sk->sk_socket->file)
+ return 0;
+
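+	/* Each check below uses the pattern (mismatch) ^ (invert bit): the
+	 * rule fails when the attribute differs and the check is not
+	 * inverted, or when it matches and the check is inverted. */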
+ if(info->match & IPT_OWNER_UID) {
+ if ((skb->sk->sk_socket->file->f_uid != info->uid) ^
+ !!(info->invert & IPT_OWNER_UID))
+ return 0;
+ }
+
+ if(info->match & IPT_OWNER_GID) {
+ if ((skb->sk->sk_socket->file->f_gid != info->gid) ^
+ !!(info->invert & IPT_OWNER_GID))
+ return 0;
+ }
+
+ if(info->match & IPT_OWNER_PID) {
+ if (!match_pid(skb, info->pid) ^
+ !!(info->invert & IPT_OWNER_PID))
+ return 0;
+ }
+
+ if(info->match & IPT_OWNER_SID) {
+ if (!match_sid(skb, info->sid) ^
+ !!(info->invert & IPT_OWNER_SID))
+ return 0;
+ }
+
+ if(info->match & IPT_OWNER_COMM) {
+ if (!match_comm(skb, info->comm) ^
+ !!(info->invert & IPT_OWNER_COMM))
+ return 0;
+ }
+
+ return 1;
+}
+
+static int
+checkentry(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ if (hook_mask
+ & ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_POST_ROUTING))) {
+ printk("ipt_owner: only valid for LOCAL_OUT or POST_ROUTING.\n");
+ return 0;
+ }
+
+ if (matchsize != IPT_ALIGN(sizeof(struct ipt_owner_info))) {
+ printk("Matchsize %u != %Zu\n", matchsize,
+ IPT_ALIGN(sizeof(struct ipt_owner_info)));
+ return 0;
+ }
+#ifdef CONFIG_SMP
+ /* files->file_lock can not be used in a BH */
+ if (((struct ipt_owner_info *)matchinfo)->match
+ & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) {
+ printk("ipt_owner: pid, sid and command matching is broken "
+ "on SMP.\n");
+ return 0;
+ }
+#endif
+ return 1;
+}
+
+static struct ipt_match owner_match = {
+ .name = "owner",
+ .match = &match,
+ .checkentry = &checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ return ipt_register_match(&owner_match);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_match(&owner_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_physdev.c b/net/ipv4/netfilter/ipt_physdev.c
new file mode 100644
index 00000000000..1a53924041f
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_physdev.c
@@ -0,0 +1,134 @@
+/* Kernel module to match the bridge port in and
+ * out device for IP packets coming into contact with a bridge. */
+
+/* (C) 2001-2003 Bart De Schuymer <bdschuym@pandora.be>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4/ipt_physdev.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_bridge.h>
+#define MATCH 1
+#define NOMATCH 0
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
+MODULE_DESCRIPTION("iptables bridge physical device match module");
+
+static int
+match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ int i;
+ static const char nulldevname[IFNAMSIZ];
+ const struct ipt_physdev_info *info = matchinfo;
+ unsigned int ret;
+ const char *indev, *outdev;
+ struct nf_bridge_info *nf_bridge;
+
+ /* Not a bridged IP packet or no info available yet:
+ * LOCAL_OUT/mangle and LOCAL_OUT/nat don't know if
+ * the destination device will be a bridge. */
+ if (!(nf_bridge = skb->nf_bridge)) {
+ /* Return MATCH if the invert flags of the used options are on */
+ if ((info->bitmask & IPT_PHYSDEV_OP_BRIDGED) &&
+ !(info->invert & IPT_PHYSDEV_OP_BRIDGED))
+ return NOMATCH;
+ if ((info->bitmask & IPT_PHYSDEV_OP_ISIN) &&
+ !(info->invert & IPT_PHYSDEV_OP_ISIN))
+ return NOMATCH;
+ if ((info->bitmask & IPT_PHYSDEV_OP_ISOUT) &&
+ !(info->invert & IPT_PHYSDEV_OP_ISOUT))
+ return NOMATCH;
+ if ((info->bitmask & IPT_PHYSDEV_OP_IN) &&
+ !(info->invert & IPT_PHYSDEV_OP_IN))
+ return NOMATCH;
+ if ((info->bitmask & IPT_PHYSDEV_OP_OUT) &&
+ !(info->invert & IPT_PHYSDEV_OP_OUT))
+ return NOMATCH;
+ return MATCH;
+ }
+
+ /* This only makes sense in the FORWARD and POSTROUTING chains */
+ if ((info->bitmask & IPT_PHYSDEV_OP_BRIDGED) &&
+ (!!(nf_bridge->mask & BRNF_BRIDGED) ^
+ !(info->invert & IPT_PHYSDEV_OP_BRIDGED)))
+ return NOMATCH;
+
+ if ((info->bitmask & IPT_PHYSDEV_OP_ISIN &&
+ (!nf_bridge->physindev ^ !!(info->invert & IPT_PHYSDEV_OP_ISIN))) ||
+ (info->bitmask & IPT_PHYSDEV_OP_ISOUT &&
+ (!nf_bridge->physoutdev ^ !!(info->invert & IPT_PHYSDEV_OP_ISOUT))))
+ return NOMATCH;
+
+ if (!(info->bitmask & IPT_PHYSDEV_OP_IN))
+ goto match_outdev;
+ indev = nf_bridge->physindev ? nf_bridge->physindev->name : nulldevname;
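+	/* Compare the bridge port name word-by-word against the rule's
+	 * physindev pattern; only the bytes selected by in_mask take part,
+	 * which is how userspace expresses prefix patterns such as "eth+". */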
+ for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned int); i++) {
+ ret |= (((const unsigned int *)indev)[i]
+ ^ ((const unsigned int *)info->physindev)[i])
+ & ((const unsigned int *)info->in_mask)[i];
+ }
+
+ if ((ret == 0) ^ !(info->invert & IPT_PHYSDEV_OP_IN))
+ return NOMATCH;
+
+match_outdev:
+ if (!(info->bitmask & IPT_PHYSDEV_OP_OUT))
+ return MATCH;
+ outdev = nf_bridge->physoutdev ?
+ nf_bridge->physoutdev->name : nulldevname;
+ for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned int); i++) {
+ ret |= (((const unsigned int *)outdev)[i]
+ ^ ((const unsigned int *)info->physoutdev)[i])
+ & ((const unsigned int *)info->out_mask)[i];
+ }
+
+ return (ret != 0) ^ !(info->invert & IPT_PHYSDEV_OP_OUT);
+}
+
+static int
+checkentry(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ const struct ipt_physdev_info *info = matchinfo;
+
+ if (matchsize != IPT_ALIGN(sizeof(struct ipt_physdev_info)))
+ return 0;
+ if (!(info->bitmask & IPT_PHYSDEV_OP_MASK) ||
+ info->bitmask & ~IPT_PHYSDEV_OP_MASK)
+ return 0;
+ return 1;
+}
+
+static struct ipt_match physdev_match = {
+ .name = "physdev",
+ .match = &match,
+ .checkentry = &checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ return ipt_register_match(&physdev_match);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_match(&physdev_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_pkttype.c b/net/ipv4/netfilter/ipt_pkttype.c
new file mode 100644
index 00000000000..8ddb1dc5e5a
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_pkttype.c
@@ -0,0 +1,70 @@
+/* (C) 1999-2001 Michal Ludvig <michal@logix.cz>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+
+#include <linux/netfilter_ipv4/ipt_pkttype.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Michal Ludvig <michal@logix.cz>");
+MODULE_DESCRIPTION("IP tables match to match on linklayer packet type");
+
+static int match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ const struct ipt_pkttype_info *info = matchinfo;
+
+ return (skb->pkt_type == info->pkttype) ^ info->invert;
+}
+
+static int checkentry(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+/*
+ if (hook_mask
+ & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_IN)
+ | (1 << NF_IP_FORWARD))) {
+ printk("ipt_pkttype: only valid for PRE_ROUTING, LOCAL_IN or FORWARD.\n");
+ return 0;
+ }
+*/
+ if (matchsize != IPT_ALIGN(sizeof(struct ipt_pkttype_info)))
+ return 0;
+
+ return 1;
+}
+
+static struct ipt_match pkttype_match = {
+ .name = "pkttype",
+ .match = &match,
+ .checkentry = &checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ return ipt_register_match(&pkttype_match);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_match(&pkttype_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_realm.c b/net/ipv4/netfilter/ipt_realm.c
new file mode 100644
index 00000000000..54a6897ebaa
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_realm.c
@@ -0,0 +1,76 @@
+/* IP tables module for matching the routing realm
+ *
+ * $Id: ipt_realm.c,v 1.3 2004/03/05 13:25:40 laforge Exp $
+ *
+ * (C) 2003 by Sampsa Ranta <sampsa@netsonic.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <net/route.h>
+
+#include <linux/netfilter_ipv4/ipt_realm.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_AUTHOR("Sampsa Ranta <sampsa@netsonic.fi>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("iptables realm match");
+
+static int
+match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ const struct ipt_realm_info *info = matchinfo;
+ struct dst_entry *dst = skb->dst;
+
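+	/* dst->tclassid carries the routing realm assigned by the FIB
+	 * (configured with the "realm" keyword of ip route/ip rule);
+	 * info->mask allows matching on only part of it. */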
+ return (info->id == (dst->tclassid & info->mask)) ^ info->invert;
+}
+
+static int check(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ if (hook_mask
+ & ~((1 << NF_IP_POST_ROUTING) | (1 << NF_IP_FORWARD) |
+ (1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_LOCAL_IN))) {
+ printk("ipt_realm: only valid for POST_ROUTING, LOCAL_OUT, "
+ "LOCAL_IN or FORWARD.\n");
+ return 0;
+ }
+ if (matchsize != IPT_ALIGN(sizeof(struct ipt_realm_info))) {
+ printk("ipt_realm: invalid matchsize.\n");
+ return 0;
+ }
+ return 1;
+}
+
+static struct ipt_match realm_match = {
+ .name = "realm",
+ .match = match,
+ .checkentry = check,
+ .me = THIS_MODULE
+};
+
+static int __init init(void)
+{
+ return ipt_register_match(&realm_match);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_match(&realm_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c
new file mode 100644
index 00000000000..25ab9fabdcb
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_recent.c
@@ -0,0 +1,1002 @@
+/* Kernel module to check if the source address has been seen recently. */
+/* Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org */
+/* Author: Stephen Frost <sfrost@snowman.net> */
+/* Project Page: http://snowman.net/projects/ipt_recent/ */
+/* This software is distributed under the terms of the GPL, Version 2 */
+/* This copyright does not cover user programs that use kernel services
+ * by normal system calls. */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <asm/uaccess.h>
+#include <linux/ctype.h>
+#include <linux/ip.h>
+#include <linux/vmalloc.h>
+#include <linux/moduleparam.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_recent.h>
+
+#undef DEBUG
+#define HASH_LOG 9
+
+/* Defaults, these can be overridden on the module command-line. */
+static int ip_list_tot = 100;
+static int ip_pkt_list_tot = 20;
+static int ip_list_hash_size = 0;
+static int ip_list_perms = 0644;
+#ifdef DEBUG
+static int debug = 1;
+#endif
+
+static char version[] =
+KERN_INFO RECENT_NAME " " RECENT_VER ": Stephen Frost <sfrost@snowman.net>. http://snowman.net/projects/ipt_recent/\n";
+
+MODULE_AUTHOR("Stephen Frost <sfrost@snowman.net>");
+MODULE_DESCRIPTION("IP tables recently seen matching module " RECENT_VER);
+MODULE_LICENSE("GPL");
+module_param(ip_list_tot, int, 0400);
+module_param(ip_pkt_list_tot, int, 0400);
+module_param(ip_list_hash_size, int, 0400);
+module_param(ip_list_perms, int, 0400);
+#ifdef DEBUG
+module_param(debug, int, 0600);
+MODULE_PARM_DESC(debug,"debugging level, defaults to 1");
+#endif
+MODULE_PARM_DESC(ip_list_tot,"number of IPs to remember per list");
+MODULE_PARM_DESC(ip_pkt_list_tot,"number of packets per IP to remember");
+MODULE_PARM_DESC(ip_list_hash_size,"size of hash table used to look up IPs");
+MODULE_PARM_DESC(ip_list_perms,"permissions on /proc/net/ipt_recent/* files");
+
+/* Structure of our list of recently seen addresses. */
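+/* last_pkts points into a shared per-table block holding ip_pkt_list_tot
+ * timestamps for each entry; hash_entry and time_pos are this entry's
+ * indices into the table's hash_table and time_info arrays. */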
+struct recent_ip_list {
+ u_int32_t addr;
+ u_int8_t ttl;
+ unsigned long last_seen;
+ unsigned long *last_pkts;
+ u_int32_t oldest_pkt;
+ u_int32_t hash_entry;
+ u_int32_t time_pos;
+};
+
+struct time_info_list {
+ u_int32_t position;
+ u_int32_t time;
+};
+
+/* Structure of our linked list of tables of recent lists. */
+struct recent_ip_tables {
+ char name[IPT_RECENT_NAME_LEN];
+ int count;
+ int time_pos;
+ struct recent_ip_list *table;
+ struct recent_ip_tables *next;
+ spinlock_t list_lock;
+ int *hash_table;
+ struct time_info_list *time_info;
+#ifdef CONFIG_PROC_FS
+ struct proc_dir_entry *status_proc;
+#endif /* CONFIG_PROC_FS */
+};
+
+/* Our current list of addresses we have recently seen.
+ * Only added to on a --set, and only updated on --set || --update
+ */
+static struct recent_ip_tables *r_tables = NULL;
+
+/* We protect the linked list of tables (r_tables) with this spinlock so
+ * two processors are not modifying it at the same time.
+ */
+static DEFINE_SPINLOCK(recent_lock);
+
+#ifdef CONFIG_PROC_FS
+/* Our /proc/net/ipt_recent entry */
+static struct proc_dir_entry *proc_net_ipt_recent = NULL;
+#endif
+
+/* Function declaration for later. */
+static int
+match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop);
+
+/* Function to hash a given address into the hash table of table_size size */
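+/* The address is XOR-folded onto itself in HASH_LOG-bit chunks and then
+ * masked with (table_size - 1); the mask only behaves like a modulus when
+ * table_size is a power of two, which the default sizing in init() ensures. */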
+static int hash_func(unsigned int addr, int table_size)
+{
+ int result = 0;
+ unsigned int value = addr;
+ do { result ^= value; } while((value >>= HASH_LOG));
+
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": %d = hash_func(%u,%d)\n",
+ result & (table_size - 1),
+ addr,
+ table_size);
+#endif
+
+ return(result & (table_size - 1));
+}
+
+#ifdef CONFIG_PROC_FS
+/* This is the function which produces the output for our /proc output
+ * interface which lists each IP address, the last seen time and the
+ * other recent times the address was seen.
+ */
+
+static int ip_recent_get_info(char *buffer, char **start, off_t offset, int length, int *eof, void *data)
+{
+ int len = 0, count, last_len = 0, pkt_count;
+ off_t pos = 0;
+ off_t begin = 0;
+ struct recent_ip_tables *curr_table;
+
+ curr_table = (struct recent_ip_tables*) data;
+
+ spin_lock_bh(&curr_table->list_lock);
+ for(count = 0; count < ip_list_tot; count++) {
+ if(!curr_table->table[count].addr) continue;
+ last_len = len;
+ len += sprintf(buffer+len,"src=%u.%u.%u.%u ",NIPQUAD(curr_table->table[count].addr));
+ len += sprintf(buffer+len,"ttl: %u ",curr_table->table[count].ttl);
+ len += sprintf(buffer+len,"last_seen: %lu ",curr_table->table[count].last_seen);
+ len += sprintf(buffer+len,"oldest_pkt: %u ",curr_table->table[count].oldest_pkt);
+ len += sprintf(buffer+len,"last_pkts: %lu",curr_table->table[count].last_pkts[0]);
+ for(pkt_count = 1; pkt_count < ip_pkt_list_tot; pkt_count++) {
+ if(!curr_table->table[count].last_pkts[pkt_count]) break;
+ len += sprintf(buffer+len,", %lu",curr_table->table[count].last_pkts[pkt_count]);
+ }
+ len += sprintf(buffer+len,"\n");
+ pos = begin + len;
+ if(pos < offset) { len = 0; begin = pos; }
+ if(pos > offset + length) { len = last_len; break; }
+ }
+
+ *start = buffer + (offset - begin);
+ len -= (offset - begin);
+ if(len > length) len = length;
+
+ spin_unlock_bh(&curr_table->list_lock);
+ return len;
+}
+
+/* ip_recent_ctrl provides an interface for users to modify the table
+ * directly. This allows adding entries, removing entries, and
+ * flushing the entire table.
+ * This is done by opening up the appropriate table for writing and
+ * sending one of:
+ * xx.xx.xx.xx -- Add entry to table with current time
+ * +xx.xx.xx.xx -- Add entry to table with current time
+ * -xx.xx.xx.xx -- Remove entry from table
+ * clear -- Flush table, remove all entries
+ */
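+/* Illustrative usage (the table name comes from the rule's --name option;
+ * "DEFAULT" below is just an example):
+ *   echo +192.168.0.1 > /proc/net/ipt_recent/DEFAULT    add or update entry
+ *   echo -192.168.0.1 > /proc/net/ipt_recent/DEFAULT    remove entry
+ *   echo clear > /proc/net/ipt_recent/DEFAULT           flush the table
+ */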
+
+static int ip_recent_ctrl(struct file *file, const char __user *input, unsigned long size, void *data)
+{
+ static const u_int32_t max[4] = { 0xffffffff, 0xffffff, 0xffff, 0xff };
+ u_int32_t val;
+ int base, used = 0;
+ char c, *cp;
+ union iaddr {
+ uint8_t bytes[4];
+ uint32_t word;
+ } res;
+ uint8_t *pp = res.bytes;
+ int digit;
+
+ char buffer[20];
+ int len, check_set = 0, count;
+ u_int32_t addr = 0;
+ struct sk_buff *skb;
+ struct ipt_recent_info *info;
+ struct recent_ip_tables *curr_table;
+
+ curr_table = (struct recent_ip_tables*) data;
+
+ if(size > 20) len = 20; else len = size;
+
+ if(copy_from_user(buffer,input,len)) return -EFAULT;
+
+ if(len < 20) buffer[len] = '\0';
+
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl len: %d, input: `%.20s'\n",len,buffer);
+#endif
+
+ cp = buffer;
+ while(isspace(*cp)) { cp++; used++; if(used >= len-5) return used; }
+
+ /* Check if we are asked to flush the entire table */
+ if(!memcmp(cp,"clear",5)) {
+ used += 5;
+ spin_lock_bh(&curr_table->list_lock);
+ curr_table->time_pos = 0;
+ for(count = 0; count < ip_list_hash_size; count++) {
+ curr_table->hash_table[count] = -1;
+ }
+ for(count = 0; count < ip_list_tot; count++) {
+ curr_table->table[count].last_seen = 0;
+ curr_table->table[count].addr = 0;
+ curr_table->table[count].ttl = 0;
+			memset(curr_table->table[count].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long));
+ curr_table->table[count].oldest_pkt = 0;
+ curr_table->table[count].time_pos = 0;
+ curr_table->time_info[count].position = count;
+ curr_table->time_info[count].time = 0;
+ }
+ spin_unlock_bh(&curr_table->list_lock);
+ return used;
+ }
+
+ check_set = IPT_RECENT_SET;
+ switch(*cp) {
+ case '+': check_set = IPT_RECENT_SET; cp++; used++; break;
+ case '-': check_set = IPT_RECENT_REMOVE; cp++; used++; break;
+ default: if(!isdigit(*cp)) return (used+1); break;
+ }
+
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl cp: `%c', check_set: %d\n",*cp,check_set);
+#endif
+ /* Get addr (effectively inet_aton()) */
+ /* Shamelessly stolen from libc, a function in the kernel for doing
+ * this would, of course, be greatly preferred, but our options appear
+ * to be rather limited, so we will just do it ourselves here.
+ */
+ res.word = 0;
+
+ c = *cp;
+ for(;;) {
+ if(!isdigit(c)) return used;
+ val = 0; base = 10; digit = 0;
+ if(c == '0') {
+ c = *++cp;
+ if(c == 'x' || c == 'X') base = 16, c = *++cp;
+ else { base = 8; digit = 1; }
+ }
+ for(;;) {
+ if(isascii(c) && isdigit(c)) {
+ if(base == 8 && (c == '8' || c == '0')) return used;
+ val = (val * base) + (c - '0');
+ c = *++cp;
+ digit = 1;
+ } else if(base == 16 && isascii(c) && isxdigit(c)) {
+ val = (val << 4) | (c + 10 - (islower(c) ? 'a' : 'A'));
+ c = *++cp;
+ digit = 1;
+ } else break;
+ }
+ if(c == '.') {
+ if(pp > res.bytes + 2 || val > 0xff) return used;
+ *pp++ = val;
+ c = *++cp;
+ } else break;
+ }
+ used = cp - buffer;
+ if(c != '\0' && (!isascii(c) || !isspace(c))) return used;
+ if(c == '\n') used++;
+ if(!digit) return used;
+
+ if(val > max[pp - res.bytes]) return used;
+ addr = res.word | htonl(val);
+
+ if(!addr && check_set == IPT_RECENT_SET) return used;
+
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl c: %c, addr: %u used: %d\n",c,addr,used);
+#endif
+
+ /* Set up and just call match */
+ info = kmalloc(sizeof(struct ipt_recent_info),GFP_KERNEL);
+ if(!info) { return -ENOMEM; }
+ info->seconds = 0;
+ info->hit_count = 0;
+ info->check_set = check_set;
+ info->invert = 0;
+ info->side = IPT_RECENT_SOURCE;
+ strncpy(info->name,curr_table->name,IPT_RECENT_NAME_LEN);
+ info->name[IPT_RECENT_NAME_LEN-1] = '\0';
+
+ skb = kmalloc(sizeof(struct sk_buff),GFP_KERNEL);
+ if (!skb) {
+ used = -ENOMEM;
+ goto out_free_info;
+ }
+ skb->nh.iph = kmalloc(sizeof(struct iphdr),GFP_KERNEL);
+ if (!skb->nh.iph) {
+ used = -ENOMEM;
+ goto out_free_skb;
+ }
+
+ skb->nh.iph->saddr = addr;
+ skb->nh.iph->daddr = 0;
+ /* Clear ttl since we have no way of knowing it */
+ skb->nh.iph->ttl = 0;
+ match(skb,NULL,NULL,info,0,NULL);
+
+ kfree(skb->nh.iph);
+out_free_skb:
+ kfree(skb);
+out_free_info:
+ kfree(info);
+
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": Leaving ip_recent_ctrl addr: %u used: %d\n",addr,used);
+#endif
+ return used;
+}
+
+#endif /* CONFIG_PROC_FS */
+
+/* 'match' is our primary function, called by the kernel whenever a rule is
+ * hit with our module as an option to it.
+ * What this function does depends on what was specifically asked of it by
+ * the user:
+ * --set -- Add or update last seen time of the source address of the packet
+ * -- matchinfo->check_set == IPT_RECENT_SET
+ * --rcheck -- Just check if the source address is in the list
+ * -- matchinfo->check_set == IPT_RECENT_CHECK
+ * --update -- If the source address is in the list, update last_seen
+ * -- matchinfo->check_set == IPT_RECENT_UPDATE
+ * --remove -- If the source address is in the list, remove it
+ * -- matchinfo->check_set == IPT_RECENT_REMOVE
+ * --seconds -- Option to --rcheck/--update, only match if last_seen within seconds
+ * -- matchinfo->seconds
+ * --hitcount -- Option to --rcheck/--update, only match if seen hitcount times
+ * -- matchinfo->hit_count
+ * --seconds and --hitcount can be combined
+ */
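+/* Illustrative iptables rules (assumed userspace syntax, not part of this
+ * file):
+ *   iptables -A INPUT -m recent --name blocked --update --seconds 60 -j DROP
+ *   iptables -A INPUT -m recent --name blocked --set
+ */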
+static int
+match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ int pkt_count, hits_found, ans;
+ unsigned long now;
+ const struct ipt_recent_info *info = matchinfo;
+ u_int32_t addr = 0, time_temp;
+ u_int8_t ttl = skb->nh.iph->ttl;
+ int *hash_table;
+ int orig_hash_result, hash_result, temp, location = 0, time_loc, end_collision_chain = -1;
+ struct time_info_list *time_info;
+ struct recent_ip_tables *curr_table;
+ struct recent_ip_tables *last_table;
+ struct recent_ip_list *r_list;
+
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": match() called\n");
+#endif
+
+ /* Default is false ^ info->invert */
+ ans = info->invert;
+
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": match(): name = '%s'\n",info->name);
+#endif
+
+	/* If out != NULL then routing has already been done and the TTL has
+	 * been decremented; add it back here so we match on the TTL the
+	 * packet had before routing. */
+ if(out) ttl++;
+
+ /* Find the right table */
+ spin_lock_bh(&recent_lock);
+ curr_table = r_tables;
+ while( (last_table = curr_table) && strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (curr_table = curr_table->next) );
+
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": match(): table found('%s')\n",info->name);
+#endif
+
+ spin_unlock_bh(&recent_lock);
+
+ /* Table with this name not found, match impossible */
+ if(!curr_table) { return ans; }
+
+ /* Make sure no one is changing the list while we work with it */
+ spin_lock_bh(&curr_table->list_lock);
+
+ r_list = curr_table->table;
+ if(info->side == IPT_RECENT_DEST) addr = skb->nh.iph->daddr; else addr = skb->nh.iph->saddr;
+
+ if(!addr) {
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": match() address (%u) invalid, leaving.\n",addr);
+#endif
+ spin_unlock_bh(&curr_table->list_lock);
+ return ans;
+ }
+
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": match(): checking table, addr: %u, ttl: %u, orig_ttl: %u\n",addr,ttl,skb->nh.iph->ttl);
+#endif
+
+ /* Get jiffies now in case they changed while we were waiting for a lock */
+ now = jiffies;
+ hash_table = curr_table->hash_table;
+ time_info = curr_table->time_info;
+
+ orig_hash_result = hash_result = hash_func(addr,ip_list_hash_size);
+ /* Hash entry at this result used */
+ /* Check for TTL match if requested. If TTL is zero then a match would never
+ * happen, so match regardless of existing TTL in that case. Zero means the
+ * entry was added via the /proc interface anyway, so we will just use the
+ * first TTL we get for that IP address. */
+ if(info->check_set & IPT_RECENT_TTL) {
+ while(hash_table[hash_result] != -1 && !(r_list[hash_table[hash_result]].addr == addr &&
+ (!r_list[hash_table[hash_result]].ttl || r_list[hash_table[hash_result]].ttl == ttl))) {
+ /* Collision in hash table */
+ hash_result = (hash_result + 1) % ip_list_hash_size;
+ }
+ } else {
+ while(hash_table[hash_result] != -1 && r_list[hash_table[hash_result]].addr != addr) {
+ /* Collision in hash table */
+ hash_result = (hash_result + 1) % ip_list_hash_size;
+ }
+ }
+
+ if(hash_table[hash_result] == -1 && !(info->check_set & IPT_RECENT_SET)) {
+ /* IP not in list and not asked to SET */
+ spin_unlock_bh(&curr_table->list_lock);
+ return ans;
+ }
+
+ /* Check if we need to handle the collision, do not need to on REMOVE */
+ if(orig_hash_result != hash_result && !(info->check_set & IPT_RECENT_REMOVE)) {
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision in hash table. (or: %d,hr: %d,oa: %u,ha: %u)\n",
+ orig_hash_result,
+ hash_result,
+ r_list[hash_table[orig_hash_result]].addr,
+ addr);
+#endif
+
+ /* We had a collision.
+ * orig_hash_result is where we started, hash_result is where we ended up.
+ * So, swap them because we are likely to see the same guy again sooner */
+#ifdef DEBUG
+ if(debug) {
+ printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[orig_hash_result] = %d\n",hash_table[orig_hash_result]);
+ printk(KERN_INFO RECENT_NAME ": match(): Collision; r_list[hash_table[orig_hash_result]].hash_entry = %d\n",
+ r_list[hash_table[orig_hash_result]].hash_entry);
+ }
+#endif
+
+ r_list[hash_table[orig_hash_result]].hash_entry = hash_result;
+
+
+ temp = hash_table[orig_hash_result];
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[hash_result] = %d\n",hash_table[hash_result]);
+#endif
+ hash_table[orig_hash_result] = hash_table[hash_result];
+ hash_table[hash_result] = temp;
+ temp = hash_result;
+ hash_result = orig_hash_result;
+ orig_hash_result = temp;
+ time_info[r_list[hash_table[orig_hash_result]].time_pos].position = hash_table[orig_hash_result];
+ if(hash_table[hash_result] != -1) {
+ r_list[hash_table[hash_result]].hash_entry = hash_result;
+ time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result];
+ }
+
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision handled.\n");
+#endif
+ }
+
+ if(hash_table[hash_result] == -1) {
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": match(): New table entry. (hr: %d,ha: %u)\n",
+ hash_result, addr);
+#endif
+
+ /* New item found and IPT_RECENT_SET, so we need to add it */
+ location = time_info[curr_table->time_pos].position;
+ hash_table[r_list[location].hash_entry] = -1;
+ hash_table[hash_result] = location;
+		memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long));
+ r_list[location].time_pos = curr_table->time_pos;
+ r_list[location].addr = addr;
+ r_list[location].ttl = ttl;
+ r_list[location].last_seen = now;
+ r_list[location].oldest_pkt = 1;
+ r_list[location].last_pkts[0] = now;
+ r_list[location].hash_entry = hash_result;
+ time_info[curr_table->time_pos].time = r_list[location].last_seen;
+ curr_table->time_pos = (curr_table->time_pos + 1) % ip_list_tot;
+
+ ans = !info->invert;
+ } else {
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": match(): Existing table entry. (hr: %d,ha: %u)\n",
+ hash_result,
+ addr);
+#endif
+
+ /* Existing item found */
+ location = hash_table[hash_result];
+ /* We have a match on address, now to make sure it meets all requirements for a
+ * full match. */
+ if(info->check_set & IPT_RECENT_CHECK || info->check_set & IPT_RECENT_UPDATE) {
+ if(!info->seconds && !info->hit_count) ans = !info->invert; else ans = info->invert;
+ if(info->seconds && !info->hit_count) {
+ if(time_before_eq(now,r_list[location].last_seen+info->seconds*HZ)) ans = !info->invert; else ans = info->invert;
+ }
+ if(info->seconds && info->hit_count) {
+ for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) {
+ if(time_before_eq(now,r_list[location].last_pkts[pkt_count]+info->seconds*HZ)) hits_found++;
+ }
+ if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert;
+ }
+ if(info->hit_count && !info->seconds) {
+ for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) {
+ if(r_list[location].last_pkts[pkt_count] == 0) break;
+ hits_found++;
+ }
+ if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert;
+ }
+ }
+#ifdef DEBUG
+ if(debug) {
+ if(ans)
+ printk(KERN_INFO RECENT_NAME ": match(): match addr: %u\n",addr);
+ else
+ printk(KERN_INFO RECENT_NAME ": match(): no match addr: %u\n",addr);
+ }
+#endif
+
+ /* If and only if we have been asked to SET, or to UPDATE (on match) do we add the
+ * current timestamp to the last_seen. */
+ if((info->check_set & IPT_RECENT_SET && (ans = !info->invert)) || (info->check_set & IPT_RECENT_UPDATE && ans)) {
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": match(): SET or UPDATE; updating time info.\n");
+#endif
+ /* Have to update our time info */
+ time_loc = r_list[location].time_pos;
+ time_info[time_loc].time = now;
+ time_info[time_loc].position = location;
+ while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) {
+ time_temp = time_info[time_loc].time;
+ time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time;
+ time_info[(time_loc+1)%ip_list_tot].time = time_temp;
+ time_temp = time_info[time_loc].position;
+ time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position;
+ time_info[(time_loc+1)%ip_list_tot].position = time_temp;
+ r_list[time_info[time_loc].position].time_pos = time_loc;
+ r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot;
+ time_loc = (time_loc+1) % ip_list_tot;
+ }
+ r_list[location].time_pos = time_loc;
+ r_list[location].ttl = ttl;
+ r_list[location].last_pkts[r_list[location].oldest_pkt] = now;
+			r_list[location].oldest_pkt = (r_list[location].oldest_pkt + 1) % ip_pkt_list_tot;
+ r_list[location].last_seen = now;
+ }
+ /* If we have been asked to remove the entry from the list, just set it to 0 */
+ if(info->check_set & IPT_RECENT_REMOVE) {
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; clearing entry (or: %d, hr: %d).\n",orig_hash_result,hash_result);
+#endif
+ /* Check if this is part of a collision chain */
+ while(hash_table[(orig_hash_result+1) % ip_list_hash_size] != -1) {
+ orig_hash_result++;
+ if(hash_func(r_list[hash_table[orig_hash_result]].addr,ip_list_hash_size) == hash_result) {
+ /* Found collision chain, how deep does this rabbit hole go? */
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; found collision chain.\n");
+#endif
+ end_collision_chain = orig_hash_result;
+ }
+ }
+ if(end_collision_chain != -1) {
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; part of collision chain, moving to end.\n");
+#endif
+ /* Part of a collision chain, swap it with the end of the chain
+ * before removing. */
+ r_list[hash_table[end_collision_chain]].hash_entry = hash_result;
+ temp = hash_table[end_collision_chain];
+ hash_table[end_collision_chain] = hash_table[hash_result];
+ hash_table[hash_result] = temp;
+ time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result];
+ hash_result = end_collision_chain;
+ r_list[hash_table[hash_result]].hash_entry = hash_result;
+ time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result];
+ }
+ location = hash_table[hash_result];
+ hash_table[r_list[location].hash_entry] = -1;
+ time_loc = r_list[location].time_pos;
+ time_info[time_loc].time = 0;
+ time_info[time_loc].position = location;
+ while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) {
+ time_temp = time_info[time_loc].time;
+ time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time;
+ time_info[(time_loc+1)%ip_list_tot].time = time_temp;
+ time_temp = time_info[time_loc].position;
+ time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position;
+ time_info[(time_loc+1)%ip_list_tot].position = time_temp;
+ r_list[time_info[time_loc].position].time_pos = time_loc;
+ r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot;
+ time_loc = (time_loc+1) % ip_list_tot;
+ }
+ r_list[location].time_pos = time_loc;
+ r_list[location].last_seen = 0;
+ r_list[location].addr = 0;
+ r_list[location].ttl = 0;
+			memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long));
+ r_list[location].oldest_pkt = 0;
+ ans = !info->invert;
+ }
+ spin_unlock_bh(&curr_table->list_lock);
+ return ans;
+ }
+
+ spin_unlock_bh(&curr_table->list_lock);
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": match() left.\n");
+#endif
+ return ans;
+}
+
+/* This function verifies that the rule passed in by the userspace iptables
+ * command is correct.
+ * If the rule is valid we check whether the table it names already exists;
+ * if not, it is created.
+ */
+static int
+checkentry(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ int flag = 0, c;
+ unsigned long *hold;
+ const struct ipt_recent_info *info = matchinfo;
+ struct recent_ip_tables *curr_table, *find_table, *last_table;
+
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() entered.\n");
+#endif
+
+ if (matchsize != IPT_ALIGN(sizeof(struct ipt_recent_info))) return 0;
+
+ /* seconds and hit_count only valid for CHECK/UPDATE */
+ if(info->check_set & IPT_RECENT_SET) { flag++; if(info->seconds || info->hit_count) return 0; }
+ if(info->check_set & IPT_RECENT_REMOVE) { flag++; if(info->seconds || info->hit_count) return 0; }
+ if(info->check_set & IPT_RECENT_CHECK) flag++;
+ if(info->check_set & IPT_RECENT_UPDATE) flag++;
+
+ /* One and only one of these should ever be set */
+ if(flag != 1) return 0;
+
+ /* Name must be set to something */
+ if(!info->name || !info->name[0]) return 0;
+
+ /* Things look good, create a list for this if it does not exist */
+ /* Lock the linked list while we play with it */
+ spin_lock_bh(&recent_lock);
+
+ /* Look for an entry with this name already created */
+ /* Finds the end of the list and the entry before the end if current name does not exist */
+ find_table = r_tables;
+ while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) );
+
+ /* If a table already exists just increment the count on that table and return */
+ if(find_table) {
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), incrementing count.\n",info->name);
+#endif
+ find_table->count++;
+ spin_unlock_bh(&recent_lock);
+ return 1;
+ }
+
+ spin_unlock_bh(&recent_lock);
+
+ /* Table with this name not found */
+ /* Allocate memory for new linked list item */
+
+#ifdef DEBUG
+ if(debug) {
+ printk(KERN_INFO RECENT_NAME ": checkentry: no table found (%s)\n",info->name);
+		printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for linked list entry.\n",sizeof(struct recent_ip_tables));
+ }
+#endif
+
+ curr_table = vmalloc(sizeof(struct recent_ip_tables));
+ if(curr_table == NULL) return 0;
+
+ spin_lock_init(&curr_table->list_lock);
+ curr_table->next = NULL;
+ curr_table->count = 1;
+ curr_table->time_pos = 0;
+ strncpy(curr_table->name,info->name,IPT_RECENT_NAME_LEN);
+ curr_table->name[IPT_RECENT_NAME_LEN-1] = '\0';
+
+ /* Allocate memory for this table and the list of packets in each entry. */
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for table (%s).\n",
+ sizeof(struct recent_ip_list)*ip_list_tot,
+ info->name);
+#endif
+
+ curr_table->table = vmalloc(sizeof(struct recent_ip_list)*ip_list_tot);
+ if(curr_table->table == NULL) { vfree(curr_table); return 0; }
+ memset(curr_table->table,0,sizeof(struct recent_ip_list)*ip_list_tot);
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for pkt_list.\n",
+ sizeof(u_int32_t)*ip_pkt_list_tot*ip_list_tot);
+#endif
+
+	hold = vmalloc(sizeof(unsigned long)*ip_pkt_list_tot*ip_list_tot);
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: After pkt_list allocation.\n");
+#endif
+ if(hold == NULL) {
+ printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for pkt_list.\n");
+ vfree(curr_table->table);
+ vfree(curr_table);
+ return 0;
+ }
+ for(c = 0; c < ip_list_tot; c++) {
+ curr_table->table[c].last_pkts = hold + c*ip_pkt_list_tot;
+ }
+
+ /* Allocate memory for the hash table */
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for hash_table.\n",
+ sizeof(int)*ip_list_hash_size);
+#endif
+
+ curr_table->hash_table = vmalloc(sizeof(int)*ip_list_hash_size);
+ if(!curr_table->hash_table) {
+ printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for hash_table.\n");
+ vfree(hold);
+ vfree(curr_table->table);
+ vfree(curr_table);
+ return 0;
+ }
+
+ for(c = 0; c < ip_list_hash_size; c++) {
+ curr_table->hash_table[c] = -1;
+ }
+
+ /* Allocate memory for the time info */
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for time_info.\n",
+ sizeof(struct time_info_list)*ip_list_tot);
+#endif
+
+ curr_table->time_info = vmalloc(sizeof(struct time_info_list)*ip_list_tot);
+ if(!curr_table->time_info) {
+ printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for time_info.\n");
+ vfree(curr_table->hash_table);
+ vfree(hold);
+ vfree(curr_table->table);
+ vfree(curr_table);
+ return 0;
+ }
+ for(c = 0; c < ip_list_tot; c++) {
+ curr_table->time_info[c].position = c;
+ curr_table->time_info[c].time = 0;
+ }
+
+ /* Put the new table in place */
+ spin_lock_bh(&recent_lock);
+ find_table = r_tables;
+ while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) );
+
+ /* If a table already exists just increment the count on that table and return */
+ if(find_table) {
+ find_table->count++;
+ spin_unlock_bh(&recent_lock);
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), created by other process.\n",info->name);
+#endif
+ vfree(curr_table->time_info);
+ vfree(curr_table->hash_table);
+ vfree(hold);
+ vfree(curr_table->table);
+ vfree(curr_table);
+ return 1;
+ }
+ if(!last_table) r_tables = curr_table; else last_table->next = curr_table;
+
+ spin_unlock_bh(&recent_lock);
+
+#ifdef CONFIG_PROC_FS
+ /* Create our proc 'status' entry. */
+ curr_table->status_proc = create_proc_entry(curr_table->name, ip_list_perms, proc_net_ipt_recent);
+ if (!curr_table->status_proc) {
+ printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for /proc entry.\n");
+ /* Destroy the created table */
+ spin_lock_bh(&recent_lock);
+ last_table = NULL;
+ curr_table = r_tables;
+ if(!curr_table) {
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, no tables.\n");
+#endif
+ spin_unlock_bh(&recent_lock);
+ return 0;
+ }
+ while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) );
+ if(!curr_table) {
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, table already destroyed.\n");
+#endif
+ spin_unlock_bh(&recent_lock);
+ return 0;
+ }
+ if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next;
+ spin_unlock_bh(&recent_lock);
+ vfree(curr_table->time_info);
+ vfree(curr_table->hash_table);
+ vfree(hold);
+ vfree(curr_table->table);
+ vfree(curr_table);
+ return 0;
+ }
+
+ curr_table->status_proc->owner = THIS_MODULE;
+ curr_table->status_proc->data = curr_table;
+ wmb();
+ curr_table->status_proc->read_proc = ip_recent_get_info;
+ curr_table->status_proc->write_proc = ip_recent_ctrl;
+#endif /* CONFIG_PROC_FS */
+
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() left.\n");
+#endif
+
+ return 1;
+}
+
+/* This function is called in the event that a rule matching this module is
+ * removed.
+ * When this happens we need to check if there are no other rules matching
+ * the table given. If that is the case then we remove the table and clean
+ * up its memory.
+ */
+static void
+destroy(void *matchinfo, unsigned int matchsize)
+{
+ const struct ipt_recent_info *info = matchinfo;
+ struct recent_ip_tables *curr_table, *last_table;
+
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": destroy() entered.\n");
+#endif
+
+ if(matchsize != IPT_ALIGN(sizeof(struct ipt_recent_info))) return;
+
+ /* Lock the linked list while we play with it */
+ spin_lock_bh(&recent_lock);
+
+ /* Look for an entry with this name already created */
+ /* Finds the end of the list and the entry before the end if current name does not exist */
+ last_table = NULL;
+ curr_table = r_tables;
+ if(!curr_table) {
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": destroy() No tables found, leaving.\n");
+#endif
+ spin_unlock_bh(&recent_lock);
+ return;
+ }
+ while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) );
+
+ /* If a table does not exist then do nothing and return */
+ if(!curr_table) {
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table not found, leaving.\n");
+#endif
+ spin_unlock_bh(&recent_lock);
+ return;
+ }
+
+ curr_table->count--;
+
+	/* If count is still non-zero then there are still rules referencing it so we do nothing */
+ if(curr_table->count) {
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, non-zero count, leaving.\n");
+#endif
+ spin_unlock_bh(&recent_lock);
+ return;
+ }
+
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, zero count, removing.\n");
+#endif
+
+ /* Count must be zero so we remove this table from the list */
+ if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next;
+
+ spin_unlock_bh(&recent_lock);
+
+	/* Take and release the per-table lock so that any late-runners still
+	 * using this table after we removed it from the list have finished,
+	 * then free everything. */
+ spin_lock_bh(&curr_table->list_lock);
+ spin_unlock_bh(&curr_table->list_lock);
+
+#ifdef CONFIG_PROC_FS
+ if(curr_table->status_proc) remove_proc_entry(curr_table->name,proc_net_ipt_recent);
+#endif /* CONFIG_PROC_FS */
+ vfree(curr_table->table[0].last_pkts);
+ vfree(curr_table->table);
+ vfree(curr_table->hash_table);
+ vfree(curr_table->time_info);
+ vfree(curr_table);
+
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": destroy() left.\n");
+#endif
+
+ return;
+}
+
+/* This is the structure we pass to ipt_register to register our
+ * module with iptables.
+ */
+static struct ipt_match recent_match = {
+ .name = "recent",
+ .match = &match,
+ .checkentry = &checkentry,
+ .destroy = &destroy,
+ .me = THIS_MODULE
+};
+
+/* Kernel module initialization. */
+static int __init init(void)
+{
+ int err, count;
+
+ printk(version);
+#ifdef CONFIG_PROC_FS
+ proc_net_ipt_recent = proc_mkdir("ipt_recent",proc_net);
+ if(!proc_net_ipt_recent) return -ENOMEM;
+#endif
+
+ if(ip_list_hash_size && ip_list_hash_size <= ip_list_tot) {
+ printk(KERN_WARNING RECENT_NAME ": ip_list_hash_size too small, resetting to default.\n");
+ ip_list_hash_size = 0;
+ }
+
+ if(!ip_list_hash_size) {
+ ip_list_hash_size = ip_list_tot*3;
+ count = 2*2;
+ while(ip_list_hash_size > count) count = count*2;
+ ip_list_hash_size = count;
+ }
+
+#ifdef DEBUG
+ if(debug) printk(KERN_INFO RECENT_NAME ": ip_list_hash_size: %d\n",ip_list_hash_size);
+#endif
+
+ err = ipt_register_match(&recent_match);
+ if (err)
+ remove_proc_entry("ipt_recent", proc_net);
+ return err;
+}
+
+/* Kernel module destruction. */
+static void __exit fini(void)
+{
+ ipt_unregister_match(&recent_match);
+
+ remove_proc_entry("ipt_recent",proc_net);
+}
+
+/* Register our module with the kernel. */
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_sctp.c b/net/ipv4/netfilter/ipt_sctp.c
new file mode 100644
index 00000000000..fe2b327bcaa
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_sctp.c
@@ -0,0 +1,203 @@
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/ip.h>
+#include <linux/sctp.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_sctp.h>
+
+#ifdef DEBUG_SCTP
+#define duprintf(format, args...) printk(format , ## args)
+#else
+#define duprintf(format, args...)
+#endif
+
+#define SCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \
+ || (!!((invflag) & (option)) ^ (cond)))
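+/* SCCHECK(cond, option, flag, invflag) is true when 'option' was not
+ * requested in 'flag' at all, or when 'cond' holds after applying the
+ * per-option inversion bit taken from 'invflag'. */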
+
+static int
+match_flags(const struct ipt_sctp_flag_info *flag_info,
+ const int flag_count,
+ u_int8_t chunktype,
+ u_int8_t chunkflags)
+{
+ int i;
+
+ for (i = 0; i < flag_count; i++) {
+ if (flag_info[i].chunktype == chunktype) {
+ return (chunkflags & flag_info[i].flag_mask) == flag_info[i].flag;
+ }
+ }
+
+ return 1;
+}
+
+static int
+match_packet(const struct sk_buff *skb,
+ const u_int32_t *chunkmap,
+ int chunk_match_type,
+ const struct ipt_sctp_flag_info *flag_info,
+ const int flag_count,
+ int *hotdrop)
+{
+ int offset;
+ u_int32_t chunkmapcopy[256 / sizeof (u_int32_t)];
+ sctp_chunkhdr_t _sch, *sch;
+
+#ifdef DEBUG_SCTP
+ int i = 0;
+#endif
+
+ if (chunk_match_type == SCTP_CHUNK_MATCH_ALL) {
+ SCTP_CHUNKMAP_COPY(chunkmapcopy, chunkmap);
+ }
+
+ offset = skb->nh.iph->ihl * 4 + sizeof (sctp_sctphdr_t);
+ do {
+ sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch);
+ if (sch == NULL) {
+ duprintf("Dropping invalid SCTP packet.\n");
+ *hotdrop = 1;
+ return 0;
+ }
+
+ duprintf("Chunk num: %d\toffset: %d\ttype: %d\tlength: %d\tflags: %x\n",
+ ++i, offset, sch->type, htons(sch->length), sch->flags);
+
+ offset += (htons(sch->length) + 3) & ~3;
+
+ duprintf("skb->len: %d\toffset: %d\n", skb->len, offset);
+
+ if (SCTP_CHUNKMAP_IS_SET(chunkmap, sch->type)) {
+ switch (chunk_match_type) {
+ case SCTP_CHUNK_MATCH_ANY:
+ if (match_flags(flag_info, flag_count,
+ sch->type, sch->flags)) {
+ return 1;
+ }
+ break;
+
+ case SCTP_CHUNK_MATCH_ALL:
+ if (match_flags(flag_info, flag_count,
+ sch->type, sch->flags)) {
+ SCTP_CHUNKMAP_CLEAR(chunkmapcopy, sch->type);
+ }
+ break;
+
+ case SCTP_CHUNK_MATCH_ONLY:
+ if (!match_flags(flag_info, flag_count,
+ sch->type, sch->flags)) {
+ return 0;
+ }
+ break;
+ }
+ } else {
+ switch (chunk_match_type) {
+ case SCTP_CHUNK_MATCH_ONLY:
+ return 0;
+ }
+ }
+ } while (offset < skb->len);
+
+ switch (chunk_match_type) {
+ case SCTP_CHUNK_MATCH_ALL:
+ return SCTP_CHUNKMAP_IS_CLEAR(chunkmap);
+ case SCTP_CHUNK_MATCH_ANY:
+ return 0;
+ case SCTP_CHUNK_MATCH_ONLY:
+ return 1;
+ }
+
+ /* This will never be reached, but required to stop compiler whine */
+ return 0;
+}
+
+static int
+match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ const struct ipt_sctp_info *info;
+ sctp_sctphdr_t _sh, *sh;
+
+ info = (const struct ipt_sctp_info *)matchinfo;
+
+ if (offset) {
+ duprintf("Dropping non-first fragment.. FIXME\n");
+ return 0;
+ }
+
+ sh = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_sh), &_sh);
+ if (sh == NULL) {
+		duprintf("Dropping evil SCTP offset=0 tinygram.\n");
+ *hotdrop = 1;
+ return 0;
+ }
+ duprintf("spt: %d\tdpt: %d\n", ntohs(sh->source), ntohs(sh->dest));
+
+ return SCCHECK(((ntohs(sh->source) >= info->spts[0])
+ && (ntohs(sh->source) <= info->spts[1])),
+ IPT_SCTP_SRC_PORTS, info->flags, info->invflags)
+ && SCCHECK(((ntohs(sh->dest) >= info->dpts[0])
+ && (ntohs(sh->dest) <= info->dpts[1])),
+ IPT_SCTP_DEST_PORTS, info->flags, info->invflags)
+ && SCCHECK(match_packet(skb, info->chunkmap, info->chunk_match_type,
+ info->flag_info, info->flag_count,
+ hotdrop),
+ IPT_SCTP_CHUNK_TYPES, info->flags, info->invflags);
+}
+
+static int
+checkentry(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ const struct ipt_sctp_info *info;
+
+ info = (const struct ipt_sctp_info *)matchinfo;
+
+ return ip->proto == IPPROTO_SCTP
+ && !(ip->invflags & IPT_INV_PROTO)
+ && matchsize == IPT_ALIGN(sizeof(struct ipt_sctp_info))
+ && !(info->flags & ~IPT_SCTP_VALID_FLAGS)
+ && !(info->invflags & ~IPT_SCTP_VALID_FLAGS)
+ && !(info->invflags & ~info->flags)
+ && ((!(info->flags & IPT_SCTP_CHUNK_TYPES)) ||
+ (info->chunk_match_type &
+ (SCTP_CHUNK_MATCH_ALL
+ | SCTP_CHUNK_MATCH_ANY
+ | SCTP_CHUNK_MATCH_ONLY)));
+}
+
+static struct ipt_match sctp_match = {
+	.name = "sctp",
+	.match = &match,
+	.checkentry = &checkentry,
+	.me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ return ipt_register_match(&sctp_match);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_match(&sctp_match);
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Kiran Kumar Immidi");
+MODULE_DESCRIPTION("Match for SCTP protocol packets");
+
diff --git a/net/ipv4/netfilter/ipt_state.c b/net/ipv4/netfilter/ipt_state.c
new file mode 100644
index 00000000000..b1511b97ea5
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_state.c
@@ -0,0 +1,74 @@
+/* Kernel module to match connection tracking information. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_state.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
+MODULE_DESCRIPTION("iptables connection tracking state match module");
+
+static int
+match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ const struct ipt_state_info *sinfo = matchinfo;
+ enum ip_conntrack_info ctinfo;
+ unsigned int statebit;
+
+ if (skb->nfct == &ip_conntrack_untracked.ct_general)
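+	/* Untracked packets map to the UNTRACKED state and packets without a
+	 * conntrack entry to INVALID; everything else is translated from the
+	 * conntrack info into the matching NEW/ESTABLISHED/RELATED state bit. */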
+ statebit = IPT_STATE_UNTRACKED;
+ else if (!ip_conntrack_get(skb, &ctinfo))
+ statebit = IPT_STATE_INVALID;
+ else
+ statebit = IPT_STATE_BIT(ctinfo);
+
+ return (sinfo->statemask & statebit);
+}
+
+static int check(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ if (matchsize != IPT_ALIGN(sizeof(struct ipt_state_info)))
+ return 0;
+
+ return 1;
+}
+
+static struct ipt_match state_match = {
+ .name = "state",
+ .match = &match,
+ .checkentry = &check,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ need_ip_conntrack();
+ return ipt_register_match(&state_match);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_match(&state_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_tcpmss.c b/net/ipv4/netfilter/ipt_tcpmss.c
new file mode 100644
index 00000000000..4dc9b16ab4a
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_tcpmss.c
@@ -0,0 +1,127 @@
+/* Kernel module to match TCP MSS values. */
+
+/* Copyright (C) 2000 Marc Boucher <marc@mbsi.ca>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter_ipv4/ipt_tcpmss.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+#define TH_SYN 0x02
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
+MODULE_DESCRIPTION("iptables TCP MSS match module");
+
+/* Returns 1 if the mss option is set and matched by the range, 0 otherwise */
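+/* On the wire the MSS option is: kind=TCPOPT_MSS (2), length=TCPOLEN_MSS (4),
+ * then the 16-bit MSS value in network byte order, which is what the
+ * (op[i+2] << 8) | op[i+3] reconstruction below relies on. */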
+static inline int
+mssoption_match(u_int16_t min, u_int16_t max,
+ const struct sk_buff *skb,
+ int invert,
+ int *hotdrop)
+{
+ struct tcphdr _tcph, *th;
+ /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */
+ u8 _opt[15 * 4 - sizeof(_tcph)], *op;
+ unsigned int i, optlen;
+
+ /* If we don't have the whole header, drop packet. */
+ th = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
+ sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ goto dropit;
+
+ /* Malformed. */
+ if (th->doff*4 < sizeof(*th))
+ goto dropit;
+
+ optlen = th->doff*4 - sizeof(*th);
+ if (!optlen)
+ goto out;
+
+ /* Truncated options. */
+ op = skb_header_pointer(skb, skb->nh.iph->ihl * 4 + sizeof(*th),
+ optlen, _opt);
+ if (op == NULL)
+ goto dropit;
+
+ for (i = 0; i < optlen; ) {
+ if (op[i] == TCPOPT_MSS
+ && (optlen - i) >= TCPOLEN_MSS
+ && op[i+1] == TCPOLEN_MSS) {
+ u_int16_t mssval;
+
+ mssval = (op[i+2] << 8) | op[i+3];
+
+ return (mssval >= min && mssval <= max) ^ invert;
+ }
+ if (op[i] < 2) i++;
+ else i += op[i+1]?:1;
+ }
+out:
+ return invert;
+
+ dropit:
+ *hotdrop = 1;
+ return 0;
+}
+
+static int
+match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ const struct ipt_tcpmss_match_info *info = matchinfo;
+
+ return mssoption_match(info->mss_min, info->mss_max, skb,
+ info->invert, hotdrop);
+}
+
+static int
+checkentry(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ if (matchsize != IPT_ALIGN(sizeof(struct ipt_tcpmss_match_info)))
+ return 0;
+
+ /* Must specify -p tcp */
+ if (ip->proto != IPPROTO_TCP || (ip->invflags & IPT_INV_PROTO)) {
+ printk("tcpmss: Only works on TCP packets\n");
+ return 0;
+ }
+
+ return 1;
+}
+
+static struct ipt_match tcpmss_match = {
+ .name = "tcpmss",
+ .match = &match,
+ .checkentry = &checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ return ipt_register_match(&tcpmss_match);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_match(&tcpmss_match);
+}
+
+module_init(init);
+module_exit(fini);
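
mssoption_match() above walks the TCP option list byte by byte: kinds 0 (EOL) and
1 (NOP) are single bytes, every other option carries a length byte, and MSS
(kind 2, length 4) stores its value big-endian in the following two bytes. A
self-contained sketch of the same walk over a plain buffer, without any skb
handling and with made-up sample data:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define TCPOPT_EOL  0
#define TCPOPT_NOP  1
#define TCPOPT_MSS  2
#define TCPOLEN_MSS 4

/* Return the MSS value if the option is present, or -1 otherwise. */
static int find_mss(const uint8_t *opt, size_t optlen)
{
	size_t i = 0;

	while (i < optlen) {
		if (opt[i] == TCPOPT_MSS && optlen - i >= TCPOLEN_MSS &&
		    opt[i + 1] == TCPOLEN_MSS)
			return (opt[i + 2] << 8) | opt[i + 3];

		if (opt[i] < 2) {		/* EOL and NOP have no length byte */
			i++;
		} else {
			if (i + 1 >= optlen)	/* truncated option list */
				break;
			i += opt[i + 1] ? opt[i + 1] : 1;
		}
	}
	return -1;
}

int main(void)
{
	/* NOP, NOP, MSS = 1460 (0x05b4) */
	const uint8_t opts[] = { 1, 1, 2, 4, 0x05, 0xb4 };

	printf("mss = %d\n", find_mss(opts, sizeof(opts)));
	return 0;
}
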
diff --git a/net/ipv4/netfilter/ipt_tos.c b/net/ipv4/netfilter/ipt_tos.c
new file mode 100644
index 00000000000..086a1bb61e3
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_tos.c
@@ -0,0 +1,64 @@
+/* Kernel module to match TOS values. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter_ipv4/ipt_tos.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("iptables TOS match module");
+
+static int
+match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ const struct ipt_tos_info *info = matchinfo;
+
+ return (skb->nh.iph->tos == info->tos) ^ info->invert;
+}
+
+static int
+checkentry(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ if (matchsize != IPT_ALIGN(sizeof(struct ipt_tos_info)))
+ return 0;
+
+ return 1;
+}
+
+static struct ipt_match tos_match = {
+ .name = "tos",
+ .match = &match,
+ .checkentry = &checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ return ipt_register_match(&tos_match);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_match(&tos_match);
+}
+
+module_init(init);
+module_exit(fini);
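
Both this match and the tcpmss match use the (condition) ^ invert idiom: the
comparison result is flipped when the rule was negated with '!'. A tiny
standalone illustration:

#include <stdio.h>

/* invert == 0 keeps the comparison result, invert == 1 flips it. */
static int tos_match(unsigned char pkt_tos, unsigned char rule_tos, int invert)
{
	return (pkt_tos == rule_tos) ^ invert;
}

int main(void)
{
	printf("%d %d\n", tos_match(0x10, 0x10, 0), tos_match(0x10, 0x10, 1)); /* 1 0 */
	return 0;
}
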
diff --git a/net/ipv4/netfilter/ipt_ttl.c b/net/ipv4/netfilter/ipt_ttl.c
new file mode 100644
index 00000000000..219aa9de88c
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_ttl.c
@@ -0,0 +1,79 @@
+/* IP tables module for matching the value of the TTL
+ *
+ * ipt_ttl.c,v 1.5 2000/11/13 11:16:08 laforge Exp
+ *
+ * (C) 2000,2001 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter_ipv4/ipt_ttl.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("IP tables TTL matching module");
+MODULE_LICENSE("GPL");
+
+static int match(const struct sk_buff *skb, const struct net_device *in,
+ const struct net_device *out, const void *matchinfo,
+ int offset, int *hotdrop)
+{
+ const struct ipt_ttl_info *info = matchinfo;
+
+ switch (info->mode) {
+ case IPT_TTL_EQ:
+ return (skb->nh.iph->ttl == info->ttl);
+ break;
+ case IPT_TTL_NE:
+ return (!(skb->nh.iph->ttl == info->ttl));
+ break;
+ case IPT_TTL_LT:
+ return (skb->nh.iph->ttl < info->ttl);
+ break;
+ case IPT_TTL_GT:
+ return (skb->nh.iph->ttl > info->ttl);
+ break;
+ default:
+ printk(KERN_WARNING "ipt_ttl: unknown mode %d\n",
+ info->mode);
+ return 0;
+ }
+
+ return 0;
+}
+
+static int checkentry(const char *tablename, const struct ipt_ip *ip,
+ void *matchinfo, unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ if (matchsize != IPT_ALIGN(sizeof(struct ipt_ttl_info)))
+ return 0;
+
+ return 1;
+}
+
+static struct ipt_match ttl_match = {
+ .name = "ttl",
+ .match = &match,
+ .checkentry = &checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ return ipt_register_match(&ttl_match);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_match(&ttl_match);
+
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
new file mode 100644
index 00000000000..260a4f0a2a9
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -0,0 +1,194 @@
+/*
+ * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x.
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("iptables filter table");
+
+#define FILTER_VALID_HOOKS ((1 << NF_IP_LOCAL_IN) | (1 << NF_IP_FORWARD) | (1 << NF_IP_LOCAL_OUT))
+
+static struct
+{
+ struct ipt_replace repl;
+ struct ipt_standard entries[3];
+ struct ipt_error term;
+} initial_table __initdata
+= { { "filter", FILTER_VALID_HOOKS, 4,
+ sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
+ { [NF_IP_LOCAL_IN] = 0,
+ [NF_IP_FORWARD] = sizeof(struct ipt_standard),
+ [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 },
+ { [NF_IP_LOCAL_IN] = 0,
+ [NF_IP_FORWARD] = sizeof(struct ipt_standard),
+ [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 },
+ 0, NULL, { } },
+ {
+ /* LOCAL_IN */
+ { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+ 0,
+ sizeof(struct ipt_entry),
+ sizeof(struct ipt_standard),
+ 0, { 0, 0 }, { } },
+ { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
+ -NF_ACCEPT - 1 } },
+ /* FORWARD */
+ { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+ 0,
+ sizeof(struct ipt_entry),
+ sizeof(struct ipt_standard),
+ 0, { 0, 0 }, { } },
+ { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
+ -NF_ACCEPT - 1 } },
+ /* LOCAL_OUT */
+ { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+ 0,
+ sizeof(struct ipt_entry),
+ sizeof(struct ipt_standard),
+ 0, { 0, 0 }, { } },
+ { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
+ -NF_ACCEPT - 1 } }
+ },
+ /* ERROR */
+ { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+ 0,
+ sizeof(struct ipt_entry),
+ sizeof(struct ipt_error),
+ 0, { 0, 0 }, { } },
+ { { { { IPT_ALIGN(sizeof(struct ipt_error_target)), IPT_ERROR_TARGET } },
+ { } },
+ "ERROR"
+ }
+ }
+};
+
+static struct ipt_table packet_filter = {
+ .name = "filter",
+ .valid_hooks = FILTER_VALID_HOOKS,
+ .lock = RW_LOCK_UNLOCKED,
+ .me = THIS_MODULE
+};
+
+/* The work comes in here from netfilter.c. */
+static unsigned int
+ipt_hook(unsigned int hook,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ return ipt_do_table(pskb, hook, in, out, &packet_filter, NULL);
+}
+
+static unsigned int
+ipt_local_out_hook(unsigned int hook,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ /* root is playing with raw sockets. */
+ if ((*pskb)->len < sizeof(struct iphdr)
+ || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) {
+ if (net_ratelimit())
+ printk("ipt_hook: happy cracking.\n");
+ return NF_ACCEPT;
+ }
+
+ return ipt_do_table(pskb, hook, in, out, &packet_filter, NULL);
+}
+
+static struct nf_hook_ops ipt_ops[] = {
+ {
+ .hook = ipt_hook,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_LOCAL_IN,
+ .priority = NF_IP_PRI_FILTER,
+ },
+ {
+ .hook = ipt_hook,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_FORWARD,
+ .priority = NF_IP_PRI_FILTER,
+ },
+ {
+ .hook = ipt_local_out_hook,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_LOCAL_OUT,
+ .priority = NF_IP_PRI_FILTER,
+ },
+};
+
+/* Default to forward because I got too much mail already. */
+static int forward = NF_ACCEPT;
+module_param(forward, bool, 0000);
+
+static int __init init(void)
+{
+ int ret;
+
+ if (forward < 0 || forward > NF_MAX_VERDICT) {
+ printk("iptables forward must be 0 or 1\n");
+ return -EINVAL;
+ }
+
+ /* Entry 1 is the FORWARD hook */
+ initial_table.entries[1].target.verdict = -forward - 1;
+
+ /* Register table */
+ ret = ipt_register_table(&packet_filter, &initial_table.repl);
+ if (ret < 0)
+ return ret;
+
+ /* Register hooks */
+ ret = nf_register_hook(&ipt_ops[0]);
+ if (ret < 0)
+ goto cleanup_table;
+
+ ret = nf_register_hook(&ipt_ops[1]);
+ if (ret < 0)
+ goto cleanup_hook0;
+
+ ret = nf_register_hook(&ipt_ops[2]);
+ if (ret < 0)
+ goto cleanup_hook1;
+
+ return ret;
+
+ cleanup_hook1:
+ nf_unregister_hook(&ipt_ops[1]);
+ cleanup_hook0:
+ nf_unregister_hook(&ipt_ops[0]);
+ cleanup_table:
+ ipt_unregister_table(&packet_filter);
+
+ return ret;
+}
+
+static void __exit fini(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++)
+ nf_unregister_hook(&ipt_ops[i]);
+
+ ipt_unregister_table(&packet_filter);
+}
+
+module_init(init);
+module_exit(fini);
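
init() above uses the conventional goto-based unwinding: each successfully
registered resource gets its own cleanup label, so a later failure releases
exactly what was set up so far, in reverse order. A generic sketch of the
pattern with hypothetical resource names:

#include <stdio.h>

static int acquire(const char *name)  { printf("acquire %s\n", name); return 0; }
static void release(const char *name) { printf("release %s\n", name); }

/* Same shape as init(): unwind in reverse order when a step fails. */
static int setup(void)
{
	int ret;

	ret = acquire("table");
	if (ret < 0)
		return ret;

	ret = acquire("hook0");
	if (ret < 0)
		goto cleanup_table;

	ret = acquire("hook1");
	if (ret < 0)
		goto cleanup_hook0;

	return 0;

 cleanup_hook0:
	release("hook0");
 cleanup_table:
	release("table");
	return ret;
}

int main(void)
{
	return setup();
}
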
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
new file mode 100644
index 00000000000..160eb11b6e2
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -0,0 +1,260 @@
+/*
+ * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x.
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Extended to all five netfilter hooks by Brad Chapman & Harald Welte
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/route.h>
+#include <linux/ip.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("iptables mangle table");
+
+#define MANGLE_VALID_HOOKS ((1 << NF_IP_PRE_ROUTING) | \
+ (1 << NF_IP_LOCAL_IN) | \
+ (1 << NF_IP_FORWARD) | \
+ (1 << NF_IP_LOCAL_OUT) | \
+ (1 << NF_IP_POST_ROUTING))
+
+/* Ouch - five different hooks? Maybe this should be a config option..... -- BC */
+static struct
+{
+ struct ipt_replace repl;
+ struct ipt_standard entries[5];
+ struct ipt_error term;
+} initial_table __initdata
+= { { "mangle", MANGLE_VALID_HOOKS, 6,
+ sizeof(struct ipt_standard) * 5 + sizeof(struct ipt_error),
+ { [NF_IP_PRE_ROUTING] = 0,
+ [NF_IP_LOCAL_IN] = sizeof(struct ipt_standard),
+ [NF_IP_FORWARD] = sizeof(struct ipt_standard) * 2,
+ [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 3,
+ [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 4 },
+ { [NF_IP_PRE_ROUTING] = 0,
+ [NF_IP_LOCAL_IN] = sizeof(struct ipt_standard),
+ [NF_IP_FORWARD] = sizeof(struct ipt_standard) * 2,
+ [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 3,
+ [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 4 },
+ 0, NULL, { } },
+ {
+ /* PRE_ROUTING */
+ { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+ 0,
+ sizeof(struct ipt_entry),
+ sizeof(struct ipt_standard),
+ 0, { 0, 0 }, { } },
+ { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
+ -NF_ACCEPT - 1 } },
+ /* LOCAL_IN */
+ { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+ 0,
+ sizeof(struct ipt_entry),
+ sizeof(struct ipt_standard),
+ 0, { 0, 0 }, { } },
+ { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
+ -NF_ACCEPT - 1 } },
+ /* FORWARD */
+ { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+ 0,
+ sizeof(struct ipt_entry),
+ sizeof(struct ipt_standard),
+ 0, { 0, 0 }, { } },
+ { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
+ -NF_ACCEPT - 1 } },
+ /* LOCAL_OUT */
+ { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+ 0,
+ sizeof(struct ipt_entry),
+ sizeof(struct ipt_standard),
+ 0, { 0, 0 }, { } },
+ { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
+ -NF_ACCEPT - 1 } },
+ /* POST_ROUTING */
+ { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+ 0,
+ sizeof(struct ipt_entry),
+ sizeof(struct ipt_standard),
+ 0, { 0, 0 }, { } },
+ { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
+ -NF_ACCEPT - 1 } },
+ },
+ /* ERROR */
+ { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+ 0,
+ sizeof(struct ipt_entry),
+ sizeof(struct ipt_error),
+ 0, { 0, 0 }, { } },
+ { { { { IPT_ALIGN(sizeof(struct ipt_error_target)), IPT_ERROR_TARGET } },
+ { } },
+ "ERROR"
+ }
+ }
+};
+
+static struct ipt_table packet_mangler = {
+ .name = "mangle",
+ .valid_hooks = MANGLE_VALID_HOOKS,
+ .lock = RW_LOCK_UNLOCKED,
+ .me = THIS_MODULE,
+};
+
+/* The work comes in here from netfilter.c. */
+static unsigned int
+ipt_route_hook(unsigned int hook,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ return ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL);
+}
+
+static unsigned int
+ipt_local_hook(unsigned int hook,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ unsigned int ret;
+ u_int8_t tos;
+ u_int32_t saddr, daddr;
+ unsigned long nfmark;
+
+ /* root is playing with raw sockets. */
+ if ((*pskb)->len < sizeof(struct iphdr)
+ || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) {
+ if (net_ratelimit())
+ printk("ipt_hook: happy cracking.\n");
+ return NF_ACCEPT;
+ }
+
+ /* Save things which could affect route */
+ nfmark = (*pskb)->nfmark;
+ saddr = (*pskb)->nh.iph->saddr;
+ daddr = (*pskb)->nh.iph->daddr;
+ tos = (*pskb)->nh.iph->tos;
+
+ ret = ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL);
+ /* Reroute for ANY change. */
+ if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE
+ && ((*pskb)->nh.iph->saddr != saddr
+ || (*pskb)->nh.iph->daddr != daddr
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ || (*pskb)->nfmark != nfmark
+#endif
+ || (*pskb)->nh.iph->tos != tos))
+ return ip_route_me_harder(pskb) == 0 ? ret : NF_DROP;
+
+ return ret;
+}
+
+static struct nf_hook_ops ipt_ops[] = {
+ {
+ .hook = ipt_route_hook,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_PRE_ROUTING,
+ .priority = NF_IP_PRI_MANGLE,
+ },
+ {
+ .hook = ipt_route_hook,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_LOCAL_IN,
+ .priority = NF_IP_PRI_MANGLE,
+ },
+ {
+ .hook = ipt_route_hook,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_FORWARD,
+ .priority = NF_IP_PRI_MANGLE,
+ },
+ {
+ .hook = ipt_local_hook,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_LOCAL_OUT,
+ .priority = NF_IP_PRI_MANGLE,
+ },
+ {
+ .hook = ipt_route_hook,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_POST_ROUTING,
+ .priority = NF_IP_PRI_MANGLE,
+ },
+};
+
+static int __init init(void)
+{
+ int ret;
+
+ /* Register table */
+ ret = ipt_register_table(&packet_mangler, &initial_table.repl);
+ if (ret < 0)
+ return ret;
+
+ /* Register hooks */
+ ret = nf_register_hook(&ipt_ops[0]);
+ if (ret < 0)
+ goto cleanup_table;
+
+ ret = nf_register_hook(&ipt_ops[1]);
+ if (ret < 0)
+ goto cleanup_hook0;
+
+ ret = nf_register_hook(&ipt_ops[2]);
+ if (ret < 0)
+ goto cleanup_hook1;
+
+ ret = nf_register_hook(&ipt_ops[3]);
+ if (ret < 0)
+ goto cleanup_hook2;
+
+ ret = nf_register_hook(&ipt_ops[4]);
+ if (ret < 0)
+ goto cleanup_hook3;
+
+ return ret;
+
+ cleanup_hook3:
+ nf_unregister_hook(&ipt_ops[3]);
+ cleanup_hook2:
+ nf_unregister_hook(&ipt_ops[2]);
+ cleanup_hook1:
+ nf_unregister_hook(&ipt_ops[1]);
+ cleanup_hook0:
+ nf_unregister_hook(&ipt_ops[0]);
+ cleanup_table:
+ ipt_unregister_table(&packet_mangler);
+
+ return ret;
+}
+
+static void __exit fini(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++)
+ nf_unregister_hook(&ipt_ops[i]);
+
+ ipt_unregister_table(&packet_mangler);
+}
+
+module_init(init);
+module_exit(fini);
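
ipt_local_hook() above snapshots every field that can influence routing (source,
destination, TOS, and the fwmark when CONFIG_IP_ROUTE_FWMARK is set) before
running the table, then forces a re-route if any of them changed. A compact
sketch of that snapshot-and-compare idea; the struct and field names are
hypothetical:

#include <stdio.h>
#include <stdint.h>

struct pkt {
	uint32_t saddr, daddr;
	uint8_t  tos;
	unsigned long mark;
};

/* Return 1 when any routing-relevant field changed. */
static int needs_reroute(const struct pkt *before, const struct pkt *after)
{
	return before->saddr != after->saddr ||
	       before->daddr != after->daddr ||
	       before->tos   != after->tos   ||
	       before->mark  != after->mark;
}

int main(void)
{
	struct pkt before = { 0x0a000001, 0x0a000002, 0x10, 0 };
	struct pkt after  = before;

	after.tos = 0x00;	/* a mangle rule rewrote the TOS field */
	printf("reroute: %d\n", needs_reroute(&before, &after));	/* 1 */
	return 0;
}
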
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
new file mode 100644
index 00000000000..01b4a3c814d
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -0,0 +1,156 @@
+/*
+ * 'raw' table, which is the very first table, hooked in at PRE_ROUTING and LOCAL_OUT.
+ *
+ * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ */
+#include <linux/module.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+#define RAW_VALID_HOOKS ((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT))
+
+static struct
+{
+ struct ipt_replace repl;
+ struct ipt_standard entries[2];
+ struct ipt_error term;
+} initial_table __initdata = {
+ .repl = {
+ .name = "raw",
+ .valid_hooks = RAW_VALID_HOOKS,
+ .num_entries = 3,
+ .size = sizeof(struct ipt_standard) * 2 + sizeof(struct ipt_error),
+ .hook_entry = {
+ [NF_IP_PRE_ROUTING] = 0,
+ [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) },
+ .underflow = {
+ [NF_IP_PRE_ROUTING] = 0,
+ [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) },
+ },
+ .entries = {
+ /* PRE_ROUTING */
+ {
+ .entry = {
+ .target_offset = sizeof(struct ipt_entry),
+ .next_offset = sizeof(struct ipt_standard),
+ },
+ .target = {
+ .target = {
+ .u = {
+ .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)),
+ },
+ },
+ .verdict = -NF_ACCEPT - 1,
+ },
+ },
+
+ /* LOCAL_OUT */
+ {
+ .entry = {
+ .target_offset = sizeof(struct ipt_entry),
+ .next_offset = sizeof(struct ipt_standard),
+ },
+ .target = {
+ .target = {
+ .u = {
+ .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)),
+ },
+ },
+ .verdict = -NF_ACCEPT - 1,
+ },
+ },
+ },
+ /* ERROR */
+ .term = {
+ .entry = {
+ .target_offset = sizeof(struct ipt_entry),
+ .next_offset = sizeof(struct ipt_error),
+ },
+ .target = {
+ .target = {
+ .u = {
+ .user = {
+ .target_size = IPT_ALIGN(sizeof(struct ipt_error_target)),
+ .name = IPT_ERROR_TARGET,
+ },
+ },
+ },
+ .errorname = "ERROR",
+ },
+ }
+};
+
+static struct ipt_table packet_raw = {
+ .name = "raw",
+ .valid_hooks = RAW_VALID_HOOKS,
+ .lock = RW_LOCK_UNLOCKED,
+ .me = THIS_MODULE
+};
+
+/* The work comes in here from netfilter.c. */
+static unsigned int
+ipt_hook(unsigned int hook,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ return ipt_do_table(pskb, hook, in, out, &packet_raw, NULL);
+}
+
+/* 'raw' is the very first table. */
+static struct nf_hook_ops ipt_ops[] = {
+ {
+ .hook = ipt_hook,
+ .pf = PF_INET,
+ .hooknum = NF_IP_PRE_ROUTING,
+ .priority = NF_IP_PRI_RAW
+ },
+ {
+ .hook = ipt_hook,
+ .pf = PF_INET,
+ .hooknum = NF_IP_LOCAL_OUT,
+ .priority = NF_IP_PRI_RAW
+ },
+};
+
+static int __init init(void)
+{
+ int ret;
+
+ /* Register table */
+ ret = ipt_register_table(&packet_raw, &initial_table.repl);
+ if (ret < 0)
+ return ret;
+
+ /* Register hooks */
+ ret = nf_register_hook(&ipt_ops[0]);
+ if (ret < 0)
+ goto cleanup_table;
+
+ ret = nf_register_hook(&ipt_ops[1]);
+ if (ret < 0)
+ goto cleanup_hook0;
+
+ return ret;
+
+ cleanup_hook0:
+ nf_unregister_hook(&ipt_ops[0]);
+ cleanup_table:
+ ipt_unregister_table(&packet_raw);
+
+ return ret;
+}
+
+static void __exit fini(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++)
+ nf_unregister_hook(&ipt_ops[i]);
+
+ ipt_unregister_table(&packet_raw);
+}
+
+module_init(init);
+module_exit(fini);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
new file mode 100644
index 00000000000..912bbcc7f41
--- /dev/null
+++ b/net/ipv4/proc.c
@@ -0,0 +1,382 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * This file implements the various access functions for the
+ * PROC file system. It is mainly used for debugging and
+ * statistics.
+ *
+ * Version: $Id: proc.c,v 1.45 2001/05/16 16:45:35 davem Exp $
+ *
+ * Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Gerald J. Heim, <heim@peanuts.informatik.uni-tuebingen.de>
+ * Fred Baumgarten, <dc6iq@insu1.etec.uni-karlsruhe.de>
+ * Erik Schoenfelder, <schoenfr@ibr.cs.tu-bs.de>
+ *
+ * Fixes:
+ * Alan Cox : UDP sockets show the rxqueue/txqueue
+ * using hint flag for the netinfo.
+ * Pauline Middelink : identd support
+ * Alan Cox : Make /proc safer.
+ * Erik Schoenfelder : /proc/net/snmp
+ * Alan Cox : Handle dead sockets properly.
+ * Gerhard Koerting : Show both timers
+ * Alan Cox : Allow inode to be NULL (kernel socket)
+ * Andi Kleen : Add support for open_requests and
+ *				split functions for more readability.
+ * Andi Kleen : Add support for /proc/net/netstat
+ * Arnaldo C. Melo : Convert to seq_file
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/types.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <net/sock.h>
+#include <net/raw.h>
+
+static int fold_prot_inuse(struct proto *proto)
+{
+ int res = 0;
+ int cpu;
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++)
+ res += proto->stats[cpu].inuse;
+
+ return res;
+}
+
+/*
+ * Report socket allocation statistics [mea@utu.fi]
+ */
+static int sockstat_seq_show(struct seq_file *seq, void *v)
+{
+ /* From net/socket.c */
+ extern void socket_seq_show(struct seq_file *seq);
+
+ socket_seq_show(seq);
+ seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n",
+ fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count),
+ tcp_tw_count, atomic_read(&tcp_sockets_allocated),
+ atomic_read(&tcp_memory_allocated));
+ seq_printf(seq, "UDP: inuse %d\n", fold_prot_inuse(&udp_prot));
+ seq_printf(seq, "RAW: inuse %d\n", fold_prot_inuse(&raw_prot));
+ seq_printf(seq, "FRAG: inuse %d memory %d\n", ip_frag_nqueues,
+ atomic_read(&ip_frag_mem));
+ return 0;
+}
+
+static int sockstat_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, sockstat_seq_show, NULL);
+}
+
+static struct file_operations sockstat_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = sockstat_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static unsigned long
+fold_field(void *mib[], int offt)
+{
+ unsigned long res = 0;
+ int i;
+
+ for (i = 0; i < NR_CPUS; i++) {
+ if (!cpu_possible(i))
+ continue;
+ res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt);
+ res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt);
+ }
+ return res;
+}
+
+/* snmp items */
+static struct snmp_mib snmp4_ipstats_list[] = {
+ SNMP_MIB_ITEM("InReceives", IPSTATS_MIB_INRECEIVES),
+ SNMP_MIB_ITEM("InHdrErrors", IPSTATS_MIB_INHDRERRORS),
+ SNMP_MIB_ITEM("InAddrErrors", IPSTATS_MIB_INADDRERRORS),
+ SNMP_MIB_ITEM("ForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS),
+ SNMP_MIB_ITEM("InUnknownProtos", IPSTATS_MIB_INUNKNOWNPROTOS),
+ SNMP_MIB_ITEM("InDiscards", IPSTATS_MIB_INDISCARDS),
+ SNMP_MIB_ITEM("InDelivers", IPSTATS_MIB_INDELIVERS),
+ SNMP_MIB_ITEM("OutRequests", IPSTATS_MIB_OUTREQUESTS),
+ SNMP_MIB_ITEM("OutDiscards", IPSTATS_MIB_OUTDISCARDS),
+ SNMP_MIB_ITEM("OutNoRoutes", IPSTATS_MIB_OUTNOROUTES),
+ SNMP_MIB_ITEM("ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT),
+ SNMP_MIB_ITEM("ReasmReqds", IPSTATS_MIB_REASMREQDS),
+ SNMP_MIB_ITEM("ReasmOKs", IPSTATS_MIB_REASMOKS),
+ SNMP_MIB_ITEM("ReasmFails", IPSTATS_MIB_REASMFAILS),
+ SNMP_MIB_ITEM("FragOKs", IPSTATS_MIB_FRAGOKS),
+ SNMP_MIB_ITEM("FragFails", IPSTATS_MIB_FRAGFAILS),
+ SNMP_MIB_ITEM("FragCreates", IPSTATS_MIB_FRAGCREATES),
+ SNMP_MIB_SENTINEL
+};
+
+static struct snmp_mib snmp4_icmp_list[] = {
+ SNMP_MIB_ITEM("InMsgs", ICMP_MIB_INMSGS),
+ SNMP_MIB_ITEM("InErrors", ICMP_MIB_INERRORS),
+ SNMP_MIB_ITEM("InDestUnreachs", ICMP_MIB_INDESTUNREACHS),
+ SNMP_MIB_ITEM("InTimeExcds", ICMP_MIB_INTIMEEXCDS),
+ SNMP_MIB_ITEM("InParmProbs", ICMP_MIB_INPARMPROBS),
+ SNMP_MIB_ITEM("InSrcQuenchs", ICMP_MIB_INSRCQUENCHS),
+ SNMP_MIB_ITEM("InRedirects", ICMP_MIB_INREDIRECTS),
+ SNMP_MIB_ITEM("InEchos", ICMP_MIB_INECHOS),
+ SNMP_MIB_ITEM("InEchoReps", ICMP_MIB_INECHOREPS),
+ SNMP_MIB_ITEM("InTimestamps", ICMP_MIB_INTIMESTAMPS),
+ SNMP_MIB_ITEM("InTimestampReps", ICMP_MIB_INTIMESTAMPREPS),
+ SNMP_MIB_ITEM("InAddrMasks", ICMP_MIB_INADDRMASKS),
+ SNMP_MIB_ITEM("InAddrMaskReps", ICMP_MIB_INADDRMASKREPS),
+ SNMP_MIB_ITEM("OutMsgs", ICMP_MIB_OUTMSGS),
+ SNMP_MIB_ITEM("OutErrors", ICMP_MIB_OUTERRORS),
+ SNMP_MIB_ITEM("OutDestUnreachs", ICMP_MIB_OUTDESTUNREACHS),
+ SNMP_MIB_ITEM("OutTimeExcds", ICMP_MIB_OUTTIMEEXCDS),
+ SNMP_MIB_ITEM("OutParmProbs", ICMP_MIB_OUTPARMPROBS),
+ SNMP_MIB_ITEM("OutSrcQuenchs", ICMP_MIB_OUTSRCQUENCHS),
+ SNMP_MIB_ITEM("OutRedirects", ICMP_MIB_OUTREDIRECTS),
+ SNMP_MIB_ITEM("OutEchos", ICMP_MIB_OUTECHOS),
+ SNMP_MIB_ITEM("OutEchoReps", ICMP_MIB_OUTECHOREPS),
+ SNMP_MIB_ITEM("OutTimestamps", ICMP_MIB_OUTTIMESTAMPS),
+ SNMP_MIB_ITEM("OutTimestampReps", ICMP_MIB_OUTTIMESTAMPREPS),
+ SNMP_MIB_ITEM("OutAddrMasks", ICMP_MIB_OUTADDRMASKS),
+ SNMP_MIB_ITEM("OutAddrMaskReps", ICMP_MIB_OUTADDRMASKREPS),
+ SNMP_MIB_SENTINEL
+};
+
+static struct snmp_mib snmp4_tcp_list[] = {
+ SNMP_MIB_ITEM("RtoAlgorithm", TCP_MIB_RTOALGORITHM),
+ SNMP_MIB_ITEM("RtoMin", TCP_MIB_RTOMIN),
+ SNMP_MIB_ITEM("RtoMax", TCP_MIB_RTOMAX),
+ SNMP_MIB_ITEM("MaxConn", TCP_MIB_MAXCONN),
+ SNMP_MIB_ITEM("ActiveOpens", TCP_MIB_ACTIVEOPENS),
+ SNMP_MIB_ITEM("PassiveOpens", TCP_MIB_PASSIVEOPENS),
+ SNMP_MIB_ITEM("AttemptFails", TCP_MIB_ATTEMPTFAILS),
+ SNMP_MIB_ITEM("EstabResets", TCP_MIB_ESTABRESETS),
+ SNMP_MIB_ITEM("CurrEstab", TCP_MIB_CURRESTAB),
+ SNMP_MIB_ITEM("InSegs", TCP_MIB_INSEGS),
+ SNMP_MIB_ITEM("OutSegs", TCP_MIB_OUTSEGS),
+ SNMP_MIB_ITEM("RetransSegs", TCP_MIB_RETRANSSEGS),
+ SNMP_MIB_ITEM("InErrs", TCP_MIB_INERRS),
+ SNMP_MIB_ITEM("OutRsts", TCP_MIB_OUTRSTS),
+ SNMP_MIB_SENTINEL
+};
+
+static struct snmp_mib snmp4_udp_list[] = {
+ SNMP_MIB_ITEM("InDatagrams", UDP_MIB_INDATAGRAMS),
+ SNMP_MIB_ITEM("NoPorts", UDP_MIB_NOPORTS),
+ SNMP_MIB_ITEM("InErrors", UDP_MIB_INERRORS),
+ SNMP_MIB_ITEM("OutDatagrams", UDP_MIB_OUTDATAGRAMS),
+ SNMP_MIB_SENTINEL
+};
+
+static struct snmp_mib snmp4_net_list[] = {
+ SNMP_MIB_ITEM("SyncookiesSent", LINUX_MIB_SYNCOOKIESSENT),
+ SNMP_MIB_ITEM("SyncookiesRecv", LINUX_MIB_SYNCOOKIESRECV),
+ SNMP_MIB_ITEM("SyncookiesFailed", LINUX_MIB_SYNCOOKIESFAILED),
+ SNMP_MIB_ITEM("EmbryonicRsts", LINUX_MIB_EMBRYONICRSTS),
+ SNMP_MIB_ITEM("PruneCalled", LINUX_MIB_PRUNECALLED),
+ SNMP_MIB_ITEM("RcvPruned", LINUX_MIB_RCVPRUNED),
+ SNMP_MIB_ITEM("OfoPruned", LINUX_MIB_OFOPRUNED),
+ SNMP_MIB_ITEM("OutOfWindowIcmps", LINUX_MIB_OUTOFWINDOWICMPS),
+ SNMP_MIB_ITEM("LockDroppedIcmps", LINUX_MIB_LOCKDROPPEDICMPS),
+ SNMP_MIB_ITEM("ArpFilter", LINUX_MIB_ARPFILTER),
+ SNMP_MIB_ITEM("TW", LINUX_MIB_TIMEWAITED),
+ SNMP_MIB_ITEM("TWRecycled", LINUX_MIB_TIMEWAITRECYCLED),
+ SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED),
+ SNMP_MIB_ITEM("PAWSPassive", LINUX_MIB_PAWSPASSIVEREJECTED),
+ SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED),
+ SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED),
+ SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS),
+ SNMP_MIB_ITEM("DelayedACKLocked", LINUX_MIB_DELAYEDACKLOCKED),
+ SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST),
+ SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS),
+ SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS),
+ SNMP_MIB_ITEM("TCPPrequeued", LINUX_MIB_TCPPREQUEUED),
+ SNMP_MIB_ITEM("TCPDirectCopyFromBacklog", LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG),
+ SNMP_MIB_ITEM("TCPDirectCopyFromPrequeue", LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE),
+ SNMP_MIB_ITEM("TCPPrequeueDropped", LINUX_MIB_TCPPREQUEUEDROPPED),
+ SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS),
+ SNMP_MIB_ITEM("TCPHPHitsToUser", LINUX_MIB_TCPHPHITSTOUSER),
+ SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS),
+ SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS),
+ SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY),
+ SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY),
+ SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING),
+ SNMP_MIB_ITEM("TCPFACKReorder", LINUX_MIB_TCPFACKREORDER),
+ SNMP_MIB_ITEM("TCPSACKReorder", LINUX_MIB_TCPSACKREORDER),
+ SNMP_MIB_ITEM("TCPRenoReorder", LINUX_MIB_TCPRENOREORDER),
+ SNMP_MIB_ITEM("TCPTSReorder", LINUX_MIB_TCPTSREORDER),
+ SNMP_MIB_ITEM("TCPFullUndo", LINUX_MIB_TCPFULLUNDO),
+ SNMP_MIB_ITEM("TCPPartialUndo", LINUX_MIB_TCPPARTIALUNDO),
+ SNMP_MIB_ITEM("TCPDSACKUndo", LINUX_MIB_TCPDSACKUNDO),
+ SNMP_MIB_ITEM("TCPLossUndo", LINUX_MIB_TCPLOSSUNDO),
+ SNMP_MIB_ITEM("TCPLoss", LINUX_MIB_TCPLOSS),
+ SNMP_MIB_ITEM("TCPLostRetransmit", LINUX_MIB_TCPLOSTRETRANSMIT),
+ SNMP_MIB_ITEM("TCPRenoFailures", LINUX_MIB_TCPRENOFAILURES),
+ SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES),
+ SNMP_MIB_ITEM("TCPLossFailures", LINUX_MIB_TCPLOSSFAILURES),
+ SNMP_MIB_ITEM("TCPFastRetrans", LINUX_MIB_TCPFASTRETRANS),
+ SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS),
+ SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS),
+ SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS),
+ SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
+ SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
+ SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED),
+ SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED),
+ SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT),
+ SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
+ SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),
+ SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV),
+ SNMP_MIB_ITEM("TCPAbortOnSyn", LINUX_MIB_TCPABORTONSYN),
+ SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA),
+ SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE),
+ SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY),
+ SNMP_MIB_ITEM("TCPAbortOnTimeout", LINUX_MIB_TCPABORTONTIMEOUT),
+ SNMP_MIB_ITEM("TCPAbortOnLinger", LINUX_MIB_TCPABORTONLINGER),
+ SNMP_MIB_ITEM("TCPAbortFailed", LINUX_MIB_TCPABORTFAILED),
+ SNMP_MIB_ITEM("TCPMemoryPressures", LINUX_MIB_TCPMEMORYPRESSURES),
+ SNMP_MIB_SENTINEL
+};
+
+/*
+ * Called from the PROCfs module. This outputs /proc/net/snmp.
+ */
+static int snmp_seq_show(struct seq_file *seq, void *v)
+{
+ int i;
+
+ seq_puts(seq, "Ip: Forwarding DefaultTTL");
+
+ for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
+ seq_printf(seq, " %s", snmp4_ipstats_list[i].name);
+
+ seq_printf(seq, "\nIp: %d %d",
+ ipv4_devconf.forwarding ? 1 : 2, sysctl_ip_default_ttl);
+
+ for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
+ seq_printf(seq, " %lu",
+ fold_field((void **) ip_statistics,
+ snmp4_ipstats_list[i].entry));
+
+ seq_puts(seq, "\nIcmp:");
+ for (i = 0; snmp4_icmp_list[i].name != NULL; i++)
+ seq_printf(seq, " %s", snmp4_icmp_list[i].name);
+
+ seq_puts(seq, "\nIcmp:");
+ for (i = 0; snmp4_icmp_list[i].name != NULL; i++)
+ seq_printf(seq, " %lu",
+ fold_field((void **) icmp_statistics,
+ snmp4_icmp_list[i].entry));
+
+ seq_puts(seq, "\nTcp:");
+ for (i = 0; snmp4_tcp_list[i].name != NULL; i++)
+ seq_printf(seq, " %s", snmp4_tcp_list[i].name);
+
+ seq_puts(seq, "\nTcp:");
+ for (i = 0; snmp4_tcp_list[i].name != NULL; i++) {
+ /* MaxConn field is signed, RFC 2012 */
+ if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
+ seq_printf(seq, " %ld",
+ fold_field((void **) tcp_statistics,
+ snmp4_tcp_list[i].entry));
+ else
+ seq_printf(seq, " %lu",
+ fold_field((void **) tcp_statistics,
+ snmp4_tcp_list[i].entry));
+ }
+
+ seq_puts(seq, "\nUdp:");
+ for (i = 0; snmp4_udp_list[i].name != NULL; i++)
+ seq_printf(seq, " %s", snmp4_udp_list[i].name);
+
+ seq_puts(seq, "\nUdp:");
+ for (i = 0; snmp4_udp_list[i].name != NULL; i++)
+ seq_printf(seq, " %lu",
+ fold_field((void **) udp_statistics,
+ snmp4_udp_list[i].entry));
+
+ seq_putc(seq, '\n');
+ return 0;
+}
+
+static int snmp_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, snmp_seq_show, NULL);
+}
+
+static struct file_operations snmp_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = snmp_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/*
+ * Output /proc/net/netstat
+ */
+static int netstat_seq_show(struct seq_file *seq, void *v)
+{
+ int i;
+
+ seq_puts(seq, "TcpExt:");
+ for (i = 0; snmp4_net_list[i].name != NULL; i++)
+ seq_printf(seq, " %s", snmp4_net_list[i].name);
+
+ seq_puts(seq, "\nTcpExt:");
+ for (i = 0; snmp4_net_list[i].name != NULL; i++)
+ seq_printf(seq, " %lu",
+ fold_field((void **) net_statistics,
+ snmp4_net_list[i].entry));
+
+ seq_putc(seq, '\n');
+ return 0;
+}
+
+static int netstat_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, netstat_seq_show, NULL);
+}
+
+static struct file_operations netstat_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = netstat_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+int __init ip_misc_proc_init(void)
+{
+ int rc = 0;
+
+ if (!proc_net_fops_create("netstat", S_IRUGO, &netstat_seq_fops))
+ goto out_netstat;
+
+ if (!proc_net_fops_create("snmp", S_IRUGO, &snmp_seq_fops))
+ goto out_snmp;
+
+ if (!proc_net_fops_create("sockstat", S_IRUGO, &sockstat_seq_fops))
+ goto out_sockstat;
+out:
+ return rc;
+out_sockstat:
+ proc_net_remove("snmp");
+out_snmp:
+ proc_net_remove("netstat");
+out_netstat:
+ rc = -ENOMEM;
+ goto out;
+}
+
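
fold_field() above sums one MIB counter across every possible CPU, adding the
two per-CPU copies (mib[0] and mib[1]) that the SNMP statistics keep. A
user-space sketch of the folding step, simplified to a single counter block per
CPU:

#include <stdio.h>

#define NCPUS   4
#define NFIELDS 3

/* One counter block per CPU; the kernel keeps two (mib[0] and mib[1]). */
static unsigned long mib[NCPUS][NFIELDS];

static unsigned long fold_field(int field)
{
	unsigned long sum = 0;
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++)
		sum += mib[cpu][field];
	return sum;
}

int main(void)
{
	mib[0][1] = 10;
	mib[2][1] = 5;
	mib[3][1] = 1;
	printf("field 1 total: %lu\n", fold_field(1));	/* 16 */
	return 0;
}
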
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
new file mode 100644
index 00000000000..90a587cacaa
--- /dev/null
+++ b/net/ipv4/protocol.c
@@ -0,0 +1,101 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * INET protocol dispatch tables.
+ *
+ * Version: $Id: protocol.c,v 1.14 2001/05/18 02:25:49 davem Exp $
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *
+ * Fixes:
+ * Alan Cox : Ahah! udp icmp errors don't work because
+ * udp_err is never called!
+ * Alan Cox : Added new fields for init and ready for
+ * proper fragmentation (_NO_ 4K limits!)
+ * Richard Colella : Hang on hash collision
+ * Vince Laviano : Modified inet_del_protocol() to correctly
+ * maintain copy bit.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/config.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/timer.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/ipip.h>
+#include <linux/igmp.h>
+
+struct net_protocol *inet_protos[MAX_INET_PROTOS];
+static DEFINE_SPINLOCK(inet_proto_lock);
+
+/*
+ * Add a protocol handler to the hash tables
+ */
+
+int inet_add_protocol(struct net_protocol *prot, unsigned char protocol)
+{
+ int hash, ret;
+
+ hash = protocol & (MAX_INET_PROTOS - 1);
+
+ spin_lock_bh(&inet_proto_lock);
+ if (inet_protos[hash]) {
+ ret = -1;
+ } else {
+ inet_protos[hash] = prot;
+ ret = 0;
+ }
+ spin_unlock_bh(&inet_proto_lock);
+
+ return ret;
+}
+
+/*
+ * Remove a protocol from the hash tables.
+ */
+
+int inet_del_protocol(struct net_protocol *prot, unsigned char protocol)
+{
+ int hash, ret;
+
+ hash = protocol & (MAX_INET_PROTOS - 1);
+
+ spin_lock_bh(&inet_proto_lock);
+ if (inet_protos[hash] == prot) {
+ inet_protos[hash] = NULL;
+ ret = 0;
+ } else {
+ ret = -1;
+ }
+ spin_unlock_bh(&inet_proto_lock);
+
+ synchronize_net();
+
+ return ret;
+}
+
+EXPORT_SYMBOL(inet_add_protocol);
+EXPORT_SYMBOL(inet_del_protocol);
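
inet_add_protocol()/inet_del_protocol() above manage a fixed-size dispatch table
indexed by protocol & (MAX_INET_PROTOS - 1), and refuse to overwrite an occupied
slot. A standalone sketch of that registration logic with simplified types:

#include <stdio.h>
#include <stddef.h>

#define MAX_PROTOS 256			/* power of two, so masking works */

typedef int (*proto_handler)(const void *pkt);

static proto_handler protos[MAX_PROTOS];

static int add_protocol(proto_handler h, unsigned char protocol)
{
	int hash = protocol & (MAX_PROTOS - 1);

	if (protos[hash])
		return -1;		/* slot already taken */
	protos[hash] = h;
	return 0;
}

static int del_protocol(proto_handler h, unsigned char protocol)
{
	int hash = protocol & (MAX_PROTOS - 1);

	if (protos[hash] != h)
		return -1;		/* someone else owns the slot */
	protos[hash] = NULL;
	return 0;
}

static int dummy_handler(const void *pkt) { (void)pkt; return 0; }

int main(void)
{
	printf("add:       %d\n", add_protocol(dummy_handler, 17));	/*  0 */
	printf("add again: %d\n", add_protocol(dummy_handler, 17));	/* -1 */
	printf("del:       %d\n", del_protocol(dummy_handler, 17));	/*  0 */
	return 0;
}
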
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
new file mode 100644
index 00000000000..93624a32eb9
--- /dev/null
+++ b/net/ipv4/raw.c
@@ -0,0 +1,888 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * RAW - implementation of IP "raw" sockets.
+ *
+ * Version: $Id: raw.c,v 1.64 2002/02/01 22:01:04 davem Exp $
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *
+ * Fixes:
+ * Alan Cox : verify_area() fixed up
+ * Alan Cox : ICMP error handling
+ * Alan Cox : EMSGSIZE if you send too big a packet
+ * Alan Cox : Now uses generic datagrams and shared
+ * skbuff library. No more peek crashes,
+ * no more backlogs
+ * Alan Cox : Checks sk->broadcast.
+ * Alan Cox : Uses skb_free_datagram/skb_copy_datagram
+ * Alan Cox : Raw passes ip options too
+ * Alan Cox : Setsocketopt added
+ * Alan Cox : Fixed error return for broadcasts
+ * Alan Cox : Removed wake_up calls
+ * Alan Cox : Use ttl/tos
+ * Alan Cox : Cleaned up old debugging
+ * Alan Cox : Use new kernel side addresses
+ * Arnt Gulbrandsen : Fixed MSG_DONTROUTE in raw sockets.
+ * Alan Cox : BSD style RAW socket demultiplexing.
+ * Alan Cox : Beginnings of mrouted support.
+ * Alan Cox : Added IP_HDRINCL option.
+ * Alan Cox : Skip broadcast check if BSDism set.
+ * David S. Miller : New socket lookup architecture.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/atomic.h>
+#include <asm/byteorder.h>
+#include <asm/current.h>
+#include <asm/uaccess.h>
+#include <asm/ioctls.h>
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/aio.h>
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/sockios.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/mroute.h>
+#include <linux/netdevice.h>
+#include <linux/in_route.h>
+#include <linux/route.h>
+#include <linux/tcp.h>
+#include <linux/skbuff.h>
+#include <net/dst.h>
+#include <net/sock.h>
+#include <linux/gfp.h>
+#include <linux/ip.h>
+#include <linux/net.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <net/snmp.h>
+#include <net/inet_common.h>
+#include <net/checksum.h>
+#include <net/xfrm.h>
+#include <linux/rtnetlink.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+struct hlist_head raw_v4_htable[RAWV4_HTABLE_SIZE];
+DEFINE_RWLOCK(raw_v4_lock);
+
+static void raw_v4_hash(struct sock *sk)
+{
+ struct hlist_head *head = &raw_v4_htable[inet_sk(sk)->num &
+ (RAWV4_HTABLE_SIZE - 1)];
+
+ write_lock_bh(&raw_v4_lock);
+ sk_add_node(sk, head);
+ sock_prot_inc_use(sk->sk_prot);
+ write_unlock_bh(&raw_v4_lock);
+}
+
+static void raw_v4_unhash(struct sock *sk)
+{
+ write_lock_bh(&raw_v4_lock);
+ if (sk_del_node_init(sk))
+ sock_prot_dec_use(sk->sk_prot);
+ write_unlock_bh(&raw_v4_lock);
+}
+
+struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num,
+ unsigned long raddr, unsigned long laddr,
+ int dif)
+{
+ struct hlist_node *node;
+
+ sk_for_each_from(sk, node) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (inet->num == num &&
+ !(inet->daddr && inet->daddr != raddr) &&
+ !(inet->rcv_saddr && inet->rcv_saddr != laddr) &&
+ !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
+ goto found; /* gotcha */
+ }
+ sk = NULL;
+found:
+ return sk;
+}
+
+/*
+ * 0 - deliver
+ * 1 - block
+ */
+static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
+{
+ int type;
+
+ if (!pskb_may_pull(skb, sizeof(struct icmphdr)))
+ return 1;
+
+ type = skb->h.icmph->type;
+ if (type < 32) {
+ __u32 data = raw_sk(sk)->filter.data;
+
+ return ((1 << type) & data) != 0;
+ }
+
+ /* Do not block unknown ICMP types */
+ return 0;
+}
+
+/* IP input processing comes here for RAW socket delivery.
+ * Caller owns SKB, so we must make clones.
+ *
+ * RFC 1122: SHOULD pass TOS value up to the transport layer.
+ * -> It does. And not only TOS, but all IP header.
+ */
+void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
+{
+ struct sock *sk;
+ struct hlist_head *head;
+
+ read_lock(&raw_v4_lock);
+ head = &raw_v4_htable[hash];
+ if (hlist_empty(head))
+ goto out;
+ sk = __raw_v4_lookup(__sk_head(head), iph->protocol,
+ iph->saddr, iph->daddr,
+ skb->dev->ifindex);
+
+ while (sk) {
+ if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) {
+ struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
+
+ /* Not releasing hash table! */
+ if (clone)
+ raw_rcv(sk, clone);
+ }
+ sk = __raw_v4_lookup(sk_next(sk), iph->protocol,
+ iph->saddr, iph->daddr,
+ skb->dev->ifindex);
+ }
+out:
+ read_unlock(&raw_v4_lock);
+}
+
+void raw_err (struct sock *sk, struct sk_buff *skb, u32 info)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ int type = skb->h.icmph->type;
+ int code = skb->h.icmph->code;
+ int err = 0;
+ int harderr = 0;
+
+ /* Report error on raw socket, if:
+ 1. User requested ip_recverr.
+ 2. Socket is connected (otherwise the error indication
+	   is useless without ip_recverr and error is hard).
+ */
+ if (!inet->recverr && sk->sk_state != TCP_ESTABLISHED)
+ return;
+
+ switch (type) {
+ default:
+ case ICMP_TIME_EXCEEDED:
+ err = EHOSTUNREACH;
+ break;
+ case ICMP_SOURCE_QUENCH:
+ return;
+ case ICMP_PARAMETERPROB:
+ err = EPROTO;
+ harderr = 1;
+ break;
+ case ICMP_DEST_UNREACH:
+ err = EHOSTUNREACH;
+ if (code > NR_ICMP_UNREACH)
+ break;
+ err = icmp_err_convert[code].errno;
+ harderr = icmp_err_convert[code].fatal;
+ if (code == ICMP_FRAG_NEEDED) {
+ harderr = inet->pmtudisc != IP_PMTUDISC_DONT;
+ err = EMSGSIZE;
+ }
+ }
+
+ if (inet->recverr) {
+ struct iphdr *iph = (struct iphdr*)skb->data;
+ u8 *payload = skb->data + (iph->ihl << 2);
+
+ if (inet->hdrincl)
+ payload = skb->data;
+ ip_icmp_error(sk, skb, err, 0, info, payload);
+ }
+
+ if (inet->recverr || harderr) {
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+ }
+}
+
+static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
+{
+ /* Charge it to the socket. */
+
+ if (sock_queue_rcv_skb(sk, skb) < 0) {
+ /* FIXME: increment a raw drops counter here */
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+
+ return NET_RX_SUCCESS;
+}
+
+int raw_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+
+ skb_push(skb, skb->data - skb->nh.raw);
+
+ raw_rcv_skb(sk, skb);
+ return 0;
+}
+
+static int raw_send_hdrinc(struct sock *sk, void *from, int length,
+ struct rtable *rt,
+ unsigned int flags)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ int hh_len;
+ struct iphdr *iph;
+ struct sk_buff *skb;
+ int err;
+
+ if (length > rt->u.dst.dev->mtu) {
+ ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport,
+ rt->u.dst.dev->mtu);
+ return -EMSGSIZE;
+ }
+ if (flags&MSG_PROBE)
+ goto out;
+
+ hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
+
+ skb = sock_alloc_send_skb(sk, length+hh_len+15,
+ flags&MSG_DONTWAIT, &err);
+ if (skb == NULL)
+ goto error;
+ skb_reserve(skb, hh_len);
+
+ skb->priority = sk->sk_priority;
+ skb->dst = dst_clone(&rt->u.dst);
+
+ skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length);
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ skb->h.raw = skb->nh.raw;
+ err = memcpy_fromiovecend((void *)iph, from, 0, length);
+ if (err)
+ goto error_fault;
+
+ /* We don't modify invalid header */
+ if (length >= sizeof(*iph) && iph->ihl * 4 <= length) {
+ if (!iph->saddr)
+ iph->saddr = rt->rt_src;
+ iph->check = 0;
+ iph->tot_len = htons(length);
+ if (!iph->id)
+ ip_select_ident(iph, &rt->u.dst, NULL);
+
+ iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+ }
+
+ err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
+ dst_output);
+ if (err > 0)
+ err = inet->recverr ? net_xmit_errno(err) : 0;
+ if (err)
+ goto error;
+out:
+ return 0;
+
+error_fault:
+ err = -EFAULT;
+ kfree_skb(skb);
+error:
+ IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
+ return err;
+}
+
+static void raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
+{
+ struct iovec *iov;
+ u8 __user *type = NULL;
+ u8 __user *code = NULL;
+ int probed = 0;
+ int i;
+
+ if (!msg->msg_iov)
+ return;
+
+ for (i = 0; i < msg->msg_iovlen; i++) {
+ iov = &msg->msg_iov[i];
+ if (!iov)
+ continue;
+
+ switch (fl->proto) {
+ case IPPROTO_ICMP:
+ /* check if one-byte field is readable or not. */
+ if (iov->iov_base && iov->iov_len < 1)
+ break;
+
+ if (!type) {
+ type = iov->iov_base;
+ /* check if code field is readable or not. */
+ if (iov->iov_len > 1)
+ code = type + 1;
+ } else if (!code)
+ code = iov->iov_base;
+
+ if (type && code) {
+ get_user(fl->fl_icmp_type, type);
+ __get_user(fl->fl_icmp_code, code);
+ probed = 1;
+ }
+ break;
+ default:
+ probed = 1;
+ break;
+ }
+ if (probed)
+ break;
+ }
+}
+
+static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+ size_t len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct ipcm_cookie ipc;
+ struct rtable *rt = NULL;
+ int free = 0;
+ u32 daddr;
+ u32 saddr;
+ u8 tos;
+ int err;
+
+ err = -EMSGSIZE;
+ if (len < 0 || len > 0xFFFF)
+ goto out;
+
+ /*
+ * Check the flags.
+ */
+
+ err = -EOPNOTSUPP;
+ if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message */
+ goto out; /* compatibility */
+
+ /*
+ * Get and verify the address.
+ */
+
+ if (msg->msg_namelen) {
+ struct sockaddr_in *usin = (struct sockaddr_in*)msg->msg_name;
+ err = -EINVAL;
+ if (msg->msg_namelen < sizeof(*usin))
+ goto out;
+ if (usin->sin_family != AF_INET) {
+ static int complained;
+ if (!complained++)
+ printk(KERN_INFO "%s forgot to set AF_INET in "
+ "raw sendmsg. Fix it!\n",
+ current->comm);
+ err = -EAFNOSUPPORT;
+ if (usin->sin_family)
+ goto out;
+ }
+ daddr = usin->sin_addr.s_addr;
+ /* ANK: I did not forget to get protocol from port field.
+ * I just do not know, who uses this weirdness.
+ * IP_HDRINCL is much more convenient.
+ */
+ } else {
+ err = -EDESTADDRREQ;
+ if (sk->sk_state != TCP_ESTABLISHED)
+ goto out;
+ daddr = inet->daddr;
+ }
+
+ ipc.addr = inet->saddr;
+ ipc.opt = NULL;
+ ipc.oif = sk->sk_bound_dev_if;
+
+ if (msg->msg_controllen) {
+ err = ip_cmsg_send(msg, &ipc);
+ if (err)
+ goto out;
+ if (ipc.opt)
+ free = 1;
+ }
+
+ saddr = ipc.addr;
+ ipc.addr = daddr;
+
+ if (!ipc.opt)
+ ipc.opt = inet->opt;
+
+ if (ipc.opt) {
+ err = -EINVAL;
+ /* Linux does not mangle headers on raw sockets,
+		 * so that IP options + IP_HDRINCL is nonsense.
+ */
+ if (inet->hdrincl)
+ goto done;
+ if (ipc.opt->srr) {
+ if (!daddr)
+ goto done;
+ daddr = ipc.opt->faddr;
+ }
+ }
+ tos = RT_CONN_FLAGS(sk);
+ if (msg->msg_flags & MSG_DONTROUTE)
+ tos |= RTO_ONLINK;
+
+ if (MULTICAST(daddr)) {
+ if (!ipc.oif)
+ ipc.oif = inet->mc_index;
+ if (!saddr)
+ saddr = inet->mc_addr;
+ }
+
+ {
+ struct flowi fl = { .oif = ipc.oif,
+ .nl_u = { .ip4_u =
+ { .daddr = daddr,
+ .saddr = saddr,
+ .tos = tos } },
+ .proto = inet->hdrincl ? IPPROTO_RAW :
+ sk->sk_protocol,
+ };
+ if (!inet->hdrincl)
+ raw_probe_proto_opt(&fl, msg);
+
+ err = ip_route_output_flow(&rt, &fl, sk, !(msg->msg_flags&MSG_DONTWAIT));
+ }
+ if (err)
+ goto done;
+
+ err = -EACCES;
+ if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST))
+ goto done;
+
+ if (msg->msg_flags & MSG_CONFIRM)
+ goto do_confirm;
+back_from_confirm:
+
+ if (inet->hdrincl)
+ err = raw_send_hdrinc(sk, msg->msg_iov, len,
+ rt, msg->msg_flags);
+
+ else {
+ if (!ipc.addr)
+ ipc.addr = rt->rt_dst;
+ lock_sock(sk);
+ err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0,
+ &ipc, rt, msg->msg_flags);
+ if (err)
+ ip_flush_pending_frames(sk);
+ else if (!(msg->msg_flags & MSG_MORE))
+ err = ip_push_pending_frames(sk);
+ release_sock(sk);
+ }
+done:
+ if (free)
+ kfree(ipc.opt);
+ ip_rt_put(rt);
+
+out: return err < 0 ? err : len;
+
+do_confirm:
+ dst_confirm(&rt->u.dst);
+ if (!(msg->msg_flags & MSG_PROBE) || len)
+ goto back_from_confirm;
+ err = 0;
+ goto done;
+}
+
+static void raw_close(struct sock *sk, long timeout)
+{
+ /*
+	 * Raw sockets may have direct kernel references. Kill them.
+ */
+ ip_ra_control(sk, 0, NULL);
+
+ sk_common_release(sk);
+}
+
+/* This gets rid of all the nasties in af_inet. -DaveM */
+static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
+ int ret = -EINVAL;
+ int chk_addr_ret;
+
+ if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in))
+ goto out;
+ chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr);
+ ret = -EADDRNOTAVAIL;
+ if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL &&
+ chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
+ goto out;
+ inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr;
+ if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
+ inet->saddr = 0; /* Use device */
+ sk_dst_reset(sk);
+ ret = 0;
+out: return ret;
+}
+
+/*
+ * This should be easy: if there is something there
+ * we return it, otherwise we block.
+ */
+
+static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+ size_t len, int noblock, int flags, int *addr_len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ size_t copied = 0;
+ int err = -EOPNOTSUPP;
+ struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
+ struct sk_buff *skb;
+
+ if (flags & MSG_OOB)
+ goto out;
+
+ if (addr_len)
+ *addr_len = sizeof(*sin);
+
+ if (flags & MSG_ERRQUEUE) {
+ err = ip_recv_error(sk, msg, len);
+ goto out;
+ }
+
+ skb = skb_recv_datagram(sk, flags, noblock, &err);
+ if (!skb)
+ goto out;
+
+ copied = skb->len;
+ if (len < copied) {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
+ }
+
+ err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+ if (err)
+ goto done;
+
+ sock_recv_timestamp(msg, sk, skb);
+
+ /* Copy the address. */
+ if (sin) {
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = skb->nh.iph->saddr;
+ memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+ }
+ if (inet->cmsg_flags)
+ ip_cmsg_recv(msg, skb);
+ if (flags & MSG_TRUNC)
+ copied = skb->len;
+done:
+ skb_free_datagram(sk, skb);
+out: return err ? err : copied;
+}
+
+static int raw_init(struct sock *sk)
+{
+ struct raw_sock *rp = raw_sk(sk);
+
+ if (inet_sk(sk)->num == IPPROTO_ICMP)
+ memset(&rp->filter, 0, sizeof(rp->filter));
+ return 0;
+}
+
+static int raw_seticmpfilter(struct sock *sk, char __user *optval, int optlen)
+{
+ if (optlen > sizeof(struct icmp_filter))
+ optlen = sizeof(struct icmp_filter);
+ if (copy_from_user(&raw_sk(sk)->filter, optval, optlen))
+ return -EFAULT;
+ return 0;
+}
+
+static int raw_geticmpfilter(struct sock *sk, char __user *optval, int __user *optlen)
+{
+ int len, ret = -EFAULT;
+
+ if (get_user(len, optlen))
+ goto out;
+ ret = -EINVAL;
+ if (len < 0)
+ goto out;
+ if (len > sizeof(struct icmp_filter))
+ len = sizeof(struct icmp_filter);
+ ret = -EFAULT;
+ if (put_user(len, optlen) ||
+ copy_to_user(optval, &raw_sk(sk)->filter, len))
+ goto out;
+ ret = 0;
+out: return ret;
+}
+
+static int raw_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int optlen)
+{
+ if (level != SOL_RAW)
+ return ip_setsockopt(sk, level, optname, optval, optlen);
+
+ if (optname == ICMP_FILTER) {
+ if (inet_sk(sk)->num != IPPROTO_ICMP)
+ return -EOPNOTSUPP;
+ else
+ return raw_seticmpfilter(sk, optval, optlen);
+ }
+ return -ENOPROTOOPT;
+}
+
+static int raw_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ if (level != SOL_RAW)
+ return ip_getsockopt(sk, level, optname, optval, optlen);
+
+ if (optname == ICMP_FILTER) {
+ if (inet_sk(sk)->num != IPPROTO_ICMP)
+ return -EOPNOTSUPP;
+ else
+ return raw_geticmpfilter(sk, optval, optlen);
+ }
+ return -ENOPROTOOPT;
+}
+
+static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case SIOCOUTQ: {
+ int amount = atomic_read(&sk->sk_wmem_alloc);
+ return put_user(amount, (int __user *)arg);
+ }
+ case SIOCINQ: {
+ struct sk_buff *skb;
+ int amount = 0;
+
+ spin_lock_irq(&sk->sk_receive_queue.lock);
+ skb = skb_peek(&sk->sk_receive_queue);
+ if (skb != NULL)
+ amount = skb->len;
+ spin_unlock_irq(&sk->sk_receive_queue.lock);
+ return put_user(amount, (int __user *)arg);
+ }
+
+ default:
+#ifdef CONFIG_IP_MROUTE
+ return ipmr_ioctl(sk, cmd, (void __user *)arg);
+#else
+ return -ENOIOCTLCMD;
+#endif
+ }
+}
+
+struct proto raw_prot = {
+ .name = "RAW",
+ .owner = THIS_MODULE,
+ .close = raw_close,
+ .connect = ip4_datagram_connect,
+ .disconnect = udp_disconnect,
+ .ioctl = raw_ioctl,
+ .init = raw_init,
+ .setsockopt = raw_setsockopt,
+ .getsockopt = raw_getsockopt,
+ .sendmsg = raw_sendmsg,
+ .recvmsg = raw_recvmsg,
+ .bind = raw_bind,
+ .backlog_rcv = raw_rcv_skb,
+ .hash = raw_v4_hash,
+ .unhash = raw_v4_unhash,
+ .obj_size = sizeof(struct raw_sock),
+};
+
+#ifdef CONFIG_PROC_FS
+struct raw_iter_state {
+ int bucket;
+};
+
+#define raw_seq_private(seq) ((struct raw_iter_state *)(seq)->private)
+
+static struct sock *raw_get_first(struct seq_file *seq)
+{
+ struct sock *sk;
+ struct raw_iter_state* state = raw_seq_private(seq);
+
+ for (state->bucket = 0; state->bucket < RAWV4_HTABLE_SIZE; ++state->bucket) {
+ struct hlist_node *node;
+
+ sk_for_each(sk, node, &raw_v4_htable[state->bucket])
+ if (sk->sk_family == PF_INET)
+ goto found;
+ }
+ sk = NULL;
+found:
+ return sk;
+}
+
+static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk)
+{
+ struct raw_iter_state* state = raw_seq_private(seq);
+
+ do {
+ sk = sk_next(sk);
+try_again:
+ ;
+ } while (sk && sk->sk_family != PF_INET);
+
+ if (!sk && ++state->bucket < RAWV4_HTABLE_SIZE) {
+ sk = sk_head(&raw_v4_htable[state->bucket]);
+ goto try_again;
+ }
+ return sk;
+}
+
+static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct sock *sk = raw_get_first(seq);
+
+ if (sk)
+ while (pos && (sk = raw_get_next(seq, sk)) != NULL)
+ --pos;
+ return pos ? NULL : sk;
+}
+
+static void *raw_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ read_lock(&raw_v4_lock);
+ return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct sock *sk;
+
+ if (v == SEQ_START_TOKEN)
+ sk = raw_get_first(seq);
+ else
+ sk = raw_get_next(seq, v);
+ ++*pos;
+ return sk;
+}
+
+static void raw_seq_stop(struct seq_file *seq, void *v)
+{
+ read_unlock(&raw_v4_lock);
+}
+
+static __inline__ char *get_raw_sock(struct sock *sp, char *tmpbuf, int i)
+{
+ struct inet_sock *inet = inet_sk(sp);
+ unsigned int dest = inet->daddr,
+ src = inet->rcv_saddr;
+ __u16 destp = 0,
+ srcp = inet->num;
+
+ sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
+ " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p",
+ i, src, srcp, dest, destp, sp->sk_state,
+ atomic_read(&sp->sk_wmem_alloc),
+ atomic_read(&sp->sk_rmem_alloc),
+ 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
+ atomic_read(&sp->sk_refcnt), sp);
+ return tmpbuf;
+}
+
+static int raw_seq_show(struct seq_file *seq, void *v)
+{
+ char tmpbuf[129];
+
+ if (v == SEQ_START_TOKEN)
+ seq_printf(seq, "%-127s\n",
+ " sl local_address rem_address st tx_queue "
+ "rx_queue tr tm->when retrnsmt uid timeout "
+ "inode");
+ else {
+ struct raw_iter_state *state = raw_seq_private(seq);
+
+ seq_printf(seq, "%-127s\n",
+ get_raw_sock(v, tmpbuf, state->bucket));
+ }
+ return 0;
+}
+
+static struct seq_operations raw_seq_ops = {
+ .start = raw_seq_start,
+ .next = raw_seq_next,
+ .stop = raw_seq_stop,
+ .show = raw_seq_show,
+};
+
+static int raw_seq_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int rc = -ENOMEM;
+ struct raw_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+ if (!s)
+ goto out;
+ rc = seq_open(file, &raw_seq_ops);
+ if (rc)
+ goto out_kfree;
+
+ seq = file->private_data;
+ seq->private = s;
+ memset(s, 0, sizeof(*s));
+out:
+ return rc;
+out_kfree:
+ kfree(s);
+ goto out;
+}
+
+static struct file_operations raw_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = raw_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+int __init raw_proc_init(void)
+{
+ if (!proc_net_fops_create("raw", S_IRUGO, &raw_seq_fops))
+ return -ENOMEM;
+ return 0;
+}
+
+void __init raw_proc_exit(void)
+{
+ proc_net_remove("raw");
+}
+#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
new file mode 100644
index 00000000000..9f91a116d91
--- /dev/null
+++ b/net/ipv4/route.c
@@ -0,0 +1,3177 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * ROUTE - implementation of the IP router.
+ *
+ * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
+ * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Fixes:
+ * Alan Cox : Verify area fixes.
+ * Alan Cox : cli() protects routing changes
+ * Rui Oliveira : ICMP routing table updates
+ * (rco@di.uminho.pt) Routing table insertion and update
+ * Linus Torvalds : Rewrote bits to be sensible
+ * Alan Cox : Added BSD route gw semantics
+ * Alan Cox : Super /proc >4K
+ * Alan Cox : MTU in route table
+ * Alan Cox : MSS actually. Also added the window
+ * clamper.
+ * Sam Lantinga : Fixed route matching in rt_del()
+ * Alan Cox : Routing cache support.
+ * Alan Cox : Removed compatibility cruft.
+ * Alan Cox : RTF_REJECT support.
+ * Alan Cox : TCP irtt support.
+ * Jonathan Naylor : Added Metric support.
+ * Miquel van Smoorenburg : BSD API fixes.
+ * Miquel van Smoorenburg : Metrics.
+ * Alan Cox : Use __u32 properly
+ * Alan Cox : Aligned routing errors more closely with BSD
+ * though our system is still very different.
+ * Alan Cox : Faster /proc handling
+ * Alexey Kuznetsov : Massive rework to support tree based routing,
+ * routing caches and better behaviour.
+ *
+ * Olaf Erb : irtt wasn't being copied right.
+ * Bjorn Ekwall : Kerneld route support.
+ * Alan Cox : Multicast fixed (I hope)
+ * Pavel Krauz : Limited broadcast fixed
+ * Mike McLagan : Routing by source
+ * Alexey Kuznetsov : End of old history. Split to fib.c and
+ * route.c and rewritten from scratch.
+ * Andi Kleen : Load-limit warning messages.
+ * Vitaly E. Lavrov : Transparent proxy revived after year coma.
+ * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
+ * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
+ * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
+ * Marc Boucher : routing by fwmark
+ * Robert Olsson : Added rt_cache statistics
+ * Arnaldo C. Melo : Convert proc stuff to seq_file
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/pkt_sched.h>
+#include <linux/mroute.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/random.h>
+#include <linux/jhash.h>
+#include <linux/rcupdate.h>
+#include <linux/times.h>
+#include <net/protocol.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/inetpeer.h>
+#include <net/sock.h>
+#include <net/ip_fib.h>
+#include <net/arp.h>
+#include <net/tcp.h>
+#include <net/icmp.h>
+#include <net/xfrm.h>
+#include <net/ip_mp_alg.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+
+#define RT_FL_TOS(oldflp) \
+ ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
+
+#define IP_MAX_MTU 0xFFF0
+
+#define RT_GC_TIMEOUT (300*HZ)
+
+static int ip_rt_min_delay = 2 * HZ;
+static int ip_rt_max_delay = 10 * HZ;
+static int ip_rt_max_size;
+static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
+static int ip_rt_gc_interval = 60 * HZ;
+static int ip_rt_gc_min_interval = HZ / 2;
+static int ip_rt_redirect_number = 9;
+static int ip_rt_redirect_load = HZ / 50;
+static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
+static int ip_rt_error_cost = HZ;
+static int ip_rt_error_burst = 5 * HZ;
+static int ip_rt_gc_elasticity = 8;
+static int ip_rt_mtu_expires = 10 * 60 * HZ;
+static int ip_rt_min_pmtu = 512 + 20 + 20;
+static int ip_rt_min_advmss = 256;
+static int ip_rt_secret_interval = 10 * 60 * HZ;
+static unsigned long rt_deadline;
+
+#define RTprint(a...) printk(KERN_DEBUG a)
+
+static struct timer_list rt_flush_timer;
+static struct timer_list rt_periodic_timer;
+static struct timer_list rt_secret_timer;
+
+/*
+ * Interface to generic destination cache.
+ */
+
+static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
+static void ipv4_dst_destroy(struct dst_entry *dst);
+static void ipv4_dst_ifdown(struct dst_entry *dst,
+ struct net_device *dev, int how);
+static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
+static void ipv4_link_failure(struct sk_buff *skb);
+static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
+static int rt_garbage_collect(void);
+
+
+static struct dst_ops ipv4_dst_ops = {
+ .family = AF_INET,
+ .protocol = __constant_htons(ETH_P_IP),
+ .gc = rt_garbage_collect,
+ .check = ipv4_dst_check,
+ .destroy = ipv4_dst_destroy,
+ .ifdown = ipv4_dst_ifdown,
+ .negative_advice = ipv4_negative_advice,
+ .link_failure = ipv4_link_failure,
+ .update_pmtu = ip_rt_update_pmtu,
+ .entry_size = sizeof(struct rtable),
+};
+
+#define ECN_OR_COST(class) TC_PRIO_##class
+
+__u8 ip_tos2prio[16] = {
+ TC_PRIO_BESTEFFORT,
+ ECN_OR_COST(FILLER),
+ TC_PRIO_BESTEFFORT,
+ ECN_OR_COST(BESTEFFORT),
+ TC_PRIO_BULK,
+ ECN_OR_COST(BULK),
+ TC_PRIO_BULK,
+ ECN_OR_COST(BULK),
+ TC_PRIO_INTERACTIVE,
+ ECN_OR_COST(INTERACTIVE),
+ TC_PRIO_INTERACTIVE,
+ ECN_OR_COST(INTERACTIVE),
+ TC_PRIO_INTERACTIVE_BULK,
+ ECN_OR_COST(INTERACTIVE_BULK),
+ TC_PRIO_INTERACTIVE_BULK,
+ ECN_OR_COST(INTERACTIVE_BULK)
+};
+
+
+/*
+ * Route cache.
+ */
+
+/* The locking scheme is rather straightforward:
+ *
+ * 1) Read-Copy Update protects the buckets of the central route hash.
+ * 2) Only writers remove entries, and they hold the lock
+ * as they look at rtable reference counts.
+ * 3) Only readers acquire references to rtable entries,
+ * they do so with atomic increments and with the
+ * lock held.
+ */
+
+struct rt_hash_bucket {
+ struct rtable *chain;
+ spinlock_t lock;
+} __attribute__((__aligned__(8)));
+
+static struct rt_hash_bucket *rt_hash_table;
+static unsigned rt_hash_mask;
+static int rt_hash_log;
+static unsigned int rt_hash_rnd;
+
+struct rt_cache_stat *rt_cache_stat;
+
+static int rt_intern_hash(unsigned hash, struct rtable *rth,
+ struct rtable **res);
+
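+/*
+ * Hash the (daddr, saddr, tos) triple into a bucket index, mixing in the
+ * periodically regenerated seed rt_hash_rnd to keep chain placement
+ * unpredictable.
+ */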
+static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
+{
+ return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
+ & rt_hash_mask);
+}
+
+#ifdef CONFIG_PROC_FS
+struct rt_cache_iter_state {
+ int bucket;
+};
+
+static struct rtable *rt_cache_get_first(struct seq_file *seq)
+{
+ struct rtable *r = NULL;
+ struct rt_cache_iter_state *st = seq->private;
+
+ for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
+ rcu_read_lock_bh();
+ r = rt_hash_table[st->bucket].chain;
+ if (r)
+ break;
+ rcu_read_unlock_bh();
+ }
+ return r;
+}
+
+static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
+{
+ struct rt_cache_iter_state *st = rcu_dereference(seq->private);
+
+ r = r->u.rt_next;
+ while (!r) {
+ rcu_read_unlock_bh();
+ if (--st->bucket < 0)
+ break;
+ rcu_read_lock_bh();
+ r = rt_hash_table[st->bucket].chain;
+ }
+ return r;
+}
+
+static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct rtable *r = rt_cache_get_first(seq);
+
+ if (r)
+ while (pos && (r = rt_cache_get_next(seq, r)))
+ --pos;
+ return pos ? NULL : r;
+}
+
+static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct rtable *r = NULL;
+
+ if (v == SEQ_START_TOKEN)
+ r = rt_cache_get_first(seq);
+ else
+ r = rt_cache_get_next(seq, v);
+ ++*pos;
+ return r;
+}
+
+static void rt_cache_seq_stop(struct seq_file *seq, void *v)
+{
+ if (v && v != SEQ_START_TOKEN)
+ rcu_read_unlock_bh();
+}
+
+static int rt_cache_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN)
+ seq_printf(seq, "%-127s\n",
+ "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
+ "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
+ "HHUptod\tSpecDst");
+ else {
+ struct rtable *r = v;
+ char temp[256];
+
+ sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
+ "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
+ r->u.dst.dev ? r->u.dst.dev->name : "*",
+ (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
+ r->rt_flags, atomic_read(&r->u.dst.__refcnt),
+ r->u.dst.__use, 0, (unsigned long)r->rt_src,
+ (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
+ (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
+ dst_metric(&r->u.dst, RTAX_WINDOW),
+ (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
+ dst_metric(&r->u.dst, RTAX_RTTVAR)),
+ r->fl.fl4_tos,
+ r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
+ r->u.dst.hh ? (r->u.dst.hh->hh_output ==
+ dev_queue_xmit) : 0,
+ r->rt_spec_dst);
+ seq_printf(seq, "%-127s\n", temp);
+ }
+ return 0;
+}
+
+static struct seq_operations rt_cache_seq_ops = {
+ .start = rt_cache_seq_start,
+ .next = rt_cache_seq_next,
+ .stop = rt_cache_seq_stop,
+ .show = rt_cache_seq_show,
+};
+
+static int rt_cache_seq_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int rc = -ENOMEM;
+ struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+ if (!s)
+ goto out;
+ rc = seq_open(file, &rt_cache_seq_ops);
+ if (rc)
+ goto out_kfree;
+ seq = file->private_data;
+ seq->private = s;
+ memset(s, 0, sizeof(*s));
+out:
+ return rc;
+out_kfree:
+ kfree(s);
+ goto out;
+}
+
+static struct file_operations rt_cache_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = rt_cache_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+
+static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ int cpu;
+
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
+ if (!cpu_possible(cpu))
+ continue;
+ *pos = cpu+1;
+ return per_cpu_ptr(rt_cache_stat, cpu);
+ }
+ return NULL;
+}
+
+static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ int cpu;
+
+ for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
+ if (!cpu_possible(cpu))
+ continue;
+ *pos = cpu+1;
+ return per_cpu_ptr(rt_cache_stat, cpu);
+ }
+ return NULL;
+
+}
+
+static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
+{
+
+}
+
+static int rt_cpu_seq_show(struct seq_file *seq, void *v)
+{
+ struct rt_cache_stat *st = v;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq, "entries in_hit in_slow_tot in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
+ return 0;
+ }
+
+ seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
+ " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
+ atomic_read(&ipv4_dst_ops.entries),
+ st->in_hit,
+ st->in_slow_tot,
+ st->in_slow_mc,
+ st->in_no_route,
+ st->in_brd,
+ st->in_martian_dst,
+ st->in_martian_src,
+
+ st->out_hit,
+ st->out_slow_tot,
+ st->out_slow_mc,
+
+ st->gc_total,
+ st->gc_ignored,
+ st->gc_goal_miss,
+ st->gc_dst_overflow,
+ st->in_hlist_search,
+ st->out_hlist_search
+ );
+ return 0;
+}
+
+static struct seq_operations rt_cpu_seq_ops = {
+ .start = rt_cpu_seq_start,
+ .next = rt_cpu_seq_next,
+ .stop = rt_cpu_seq_stop,
+ .show = rt_cpu_seq_show,
+};
+
+
+static int rt_cpu_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &rt_cpu_seq_ops);
+}
+
+static struct file_operations rt_cpu_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = rt_cpu_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+#endif /* CONFIG_PROC_FS */
+
+static __inline__ void rt_free(struct rtable *rt)
+{
+ multipath_remove(rt);
+ call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
+}
+
+static __inline__ void rt_drop(struct rtable *rt)
+{
+ multipath_remove(rt);
+ ip_rt_put(rt);
+ call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
+}
+
+static __inline__ int rt_fast_clean(struct rtable *rth)
+{
+ /* Kill broadcast/multicast entries very aggressively if they
+ collide in the hash table with more useful entries */
+ return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
+ rth->fl.iif && rth->u.rt_next;
+}
+
+static __inline__ int rt_valuable(struct rtable *rth)
+{
+ return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
+ rth->u.dst.expires;
+}
+
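+/*
+ * An entry may be expired only when it is unreferenced.  It is then
+ * eligible if its hard expiry time has passed, or once it is older than
+ * tmo1 (tmo2 for "valuable" entries); colliding broadcast/multicast
+ * entries (rt_fast_clean) are not protected by the tmo1 grace period.
+ */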
+static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
+{
+ unsigned long age;
+ int ret = 0;
+
+ if (atomic_read(&rth->u.dst.__refcnt))
+ goto out;
+
+ ret = 1;
+ if (rth->u.dst.expires &&
+ time_after_eq(jiffies, rth->u.dst.expires))
+ goto out;
+
+ age = jiffies - rth->u.dst.lastuse;
+ ret = 0;
+ if ((age <= tmo1 && !rt_fast_clean(rth)) ||
+ (age <= tmo2 && rt_valuable(rth)))
+ goto out;
+ ret = 1;
+out: return ret;
+}
+
+/* Bits of score are:
+ * 31: very valuable
+ * 30: not quite useless
+ * 29..0: usage counter
+ */
+static inline u32 rt_score(struct rtable *rt)
+{
+ u32 score = jiffies - rt->u.dst.lastuse;
+
+ score = ~score & ~(3<<30);
+
+ if (rt_valuable(rt))
+ score |= (1<<31);
+
+ if (!rt->fl.iif ||
+ !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
+ score |= (1<<30);
+
+ return score;
+}
+
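+/*
+ * Two cached routes describe the same flow when their IPv4 flow keys
+ * and their input/output interface indices all match.
+ */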
+static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
+{
+ return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
+ fl1->oif == fl2->oif &&
+ fl1->iif == fl2->iif;
+}
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
+ struct rtable *expentry,
+ int *removed_count)
+{
+ int passedexpired = 0;
+ struct rtable **nextstep = NULL;
+ struct rtable **rthp = chain_head;
+ struct rtable *rth;
+
+ if (removed_count)
+ *removed_count = 0;
+
+ while ((rth = *rthp) != NULL) {
+ if (rth == expentry)
+ passedexpired = 1;
+
+ if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 &&
+ compare_keys(&(*rthp)->fl, &expentry->fl)) {
+ if (*rthp == expentry) {
+ *rthp = rth->u.rt_next;
+ continue;
+ } else {
+ *rthp = rth->u.rt_next;
+ rt_free(rth);
+ if (removed_count)
+ ++(*removed_count);
+ }
+ } else {
+ if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
+ passedexpired && !nextstep)
+ nextstep = &rth->u.rt_next;
+
+ rthp = &rth->u.rt_next;
+ }
+ }
+
+ rt_free(expentry);
+ if (removed_count)
+ ++(*removed_count);
+
+ return nextstep;
+}
+#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
+
+
+/* This runs via a timer and thus is always in BH context. */
+static void rt_check_expire(unsigned long dummy)
+{
+ static int rover;
+ int i = rover, t;
+ struct rtable *rth, **rthp;
+ unsigned long now = jiffies;
+
+ for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
+ t -= ip_rt_gc_timeout) {
+ unsigned long tmo = ip_rt_gc_timeout;
+
+ i = (i + 1) & rt_hash_mask;
+ rthp = &rt_hash_table[i].chain;
+
+ spin_lock(&rt_hash_table[i].lock);
+ while ((rth = *rthp) != NULL) {
+ if (rth->u.dst.expires) {
+ /* Entry is expired even if it is in use */
+ if (time_before_eq(now, rth->u.dst.expires)) {
+ tmo >>= 1;
+ rthp = &rth->u.rt_next;
+ continue;
+ }
+ } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
+ tmo >>= 1;
+ rthp = &rth->u.rt_next;
+ continue;
+ }
+
+ /* Clean up aged-off entries. */
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+ /* remove all related balanced entries if necessary */
+ if (rth->u.dst.flags & DST_BALANCED) {
+ rthp = rt_remove_balanced_route(
+ &rt_hash_table[i].chain,
+ rth, NULL);
+ if (!rthp)
+ break;
+ } else {
+ *rthp = rth->u.rt_next;
+ rt_free(rth);
+ }
+#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
+ *rthp = rth->u.rt_next;
+ rt_free(rth);
+#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
+ }
+ spin_unlock(&rt_hash_table[i].lock);
+
+ /* Fallback loop breaker. */
+ if (time_after(jiffies, now))
+ break;
+ }
+ rover = i;
+ mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
+}
+
+/* This can run from both BH and non-BH contexts, the latter
+ * in the case of a forced flush event.
+ */
+static void rt_run_flush(unsigned long dummy)
+{
+ int i;
+ struct rtable *rth, *next;
+
+ rt_deadline = 0;
+
+ get_random_bytes(&rt_hash_rnd, 4);
+
+ for (i = rt_hash_mask; i >= 0; i--) {
+ spin_lock_bh(&rt_hash_table[i].lock);
+ rth = rt_hash_table[i].chain;
+ if (rth)
+ rt_hash_table[i].chain = NULL;
+ spin_unlock_bh(&rt_hash_table[i].lock);
+
+ for (; rth; rth = next) {
+ next = rth->u.rt_next;
+ rt_free(rth);
+ }
+ }
+}
+
+static DEFINE_SPINLOCK(rt_flush_lock);
+
+void rt_cache_flush(int delay)
+{
+ unsigned long now = jiffies;
+ int user_mode = !in_softirq();
+
+ if (delay < 0)
+ delay = ip_rt_min_delay;
+
+ /* flush existing multipath state */
+ multipath_flush();
+
+ spin_lock_bh(&rt_flush_lock);
+
+ if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
+ long tmo = (long)(rt_deadline - now);
+
+ /* If the flush timer is already running
+ and the flush request is not immediate (delay > 0):
+
+ if the deadline has not been reached yet, extend the timer to "delay",
+ otherwise fire it at the deadline.
+ */
+
+ if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
+ tmo = 0;
+
+ if (delay > tmo)
+ delay = tmo;
+ }
+
+ if (delay <= 0) {
+ spin_unlock_bh(&rt_flush_lock);
+ rt_run_flush(0);
+ return;
+ }
+
+ if (rt_deadline == 0)
+ rt_deadline = now + ip_rt_max_delay;
+
+ mod_timer(&rt_flush_timer, now+delay);
+ spin_unlock_bh(&rt_flush_lock);
+}
+
+static void rt_secret_rebuild(unsigned long dummy)
+{
+ unsigned long now = jiffies;
+
+ rt_cache_flush(0);
+ mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
+}
+
+/*
+ Short description of GC goals.
+
+ We want an algorithm that keeps the routing cache at an equilibrium
+ point, where the number of aged-off entries stays approximately equal
+ to the number of newly generated ones.
+
+ The current expiration strength is the variable "expire".
+ We try to adjust it dynamically, so that when the network is idle
+ "expire" is large enough to keep plenty of warm entries, and when the
+ load increases it shrinks to limit the cache size.
+ */
+
+static int rt_garbage_collect(void)
+{
+ static unsigned long expire = RT_GC_TIMEOUT;
+ static unsigned long last_gc;
+ static int rover;
+ static int equilibrium;
+ struct rtable *rth, **rthp;
+ unsigned long now = jiffies;
+ int goal;
+
+ /*
+ * Garbage collection is pretty expensive,
+ * do not make it too frequently.
+ */
+
+ RT_CACHE_STAT_INC(gc_total);
+
+ if (now - last_gc < ip_rt_gc_min_interval &&
+ atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
+ RT_CACHE_STAT_INC(gc_ignored);
+ goto out;
+ }
+
+ /* Calculate the number of entries we want to expire now. */
+ goal = atomic_read(&ipv4_dst_ops.entries) -
+ (ip_rt_gc_elasticity << rt_hash_log);
+ if (goal <= 0) {
+ if (equilibrium < ipv4_dst_ops.gc_thresh)
+ equilibrium = ipv4_dst_ops.gc_thresh;
+ goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
+ if (goal > 0) {
+ equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
+ goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
+ }
+ } else {
+ /* We are in a dangerous area. Try to reduce the cache really
+ * aggressively.
+ */
+ goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
+ equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
+ }
+
+ if (now - last_gc >= ip_rt_gc_min_interval)
+ last_gc = now;
+
+ if (goal <= 0) {
+ equilibrium += goal;
+ goto work_done;
+ }
+
+ do {
+ int i, k;
+
+ for (i = rt_hash_mask, k = rover; i >= 0; i--) {
+ unsigned long tmo = expire;
+
+ k = (k + 1) & rt_hash_mask;
+ rthp = &rt_hash_table[k].chain;
+ spin_lock_bh(&rt_hash_table[k].lock);
+ while ((rth = *rthp) != NULL) {
+ if (!rt_may_expire(rth, tmo, expire)) {
+ tmo >>= 1;
+ rthp = &rth->u.rt_next;
+ continue;
+ }
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+ /* remove all related balanced entries
+ * if necessary
+ */
+ if (rth->u.dst.flags & DST_BALANCED) {
+ int r;
+
+ rthp = rt_remove_balanced_route(
+ &rt_hash_table[k].chain,
+ rth,
+ &r);
+ goal -= r;
+ if (!rthp)
+ break;
+ } else {
+ *rthp = rth->u.rt_next;
+ rt_free(rth);
+ goal--;
+ }
+#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
+ *rthp = rth->u.rt_next;
+ rt_free(rth);
+ goal--;
+#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
+ }
+ spin_unlock_bh(&rt_hash_table[k].lock);
+ if (goal <= 0)
+ break;
+ }
+ rover = k;
+
+ if (goal <= 0)
+ goto work_done;
+
+ /* The goal was not achieved. We stop the process if:
+
+ - expire has been reduced to zero (otherwise expire is halved),
+ - the table is not full,
+ - we are called from interrupt context.
+ The jiffies check is just a fallback/debug loop breaker;
+ we will not spin here for long in any case.
+ */
+
+ RT_CACHE_STAT_INC(gc_goal_miss);
+
+ if (expire == 0)
+ break;
+
+ expire >>= 1;
+#if RT_CACHE_DEBUG >= 2
+ printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
+ atomic_read(&ipv4_dst_ops.entries), goal, i);
+#endif
+
+ if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
+ goto out;
+ } while (!in_softirq() && time_before_eq(jiffies, now));
+
+ if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
+ goto out;
+ if (net_ratelimit())
+ printk(KERN_WARNING "dst cache overflow\n");
+ RT_CACHE_STAT_INC(gc_dst_overflow);
+ return 1;
+
+work_done:
+ expire += ip_rt_gc_min_interval;
+ if (expire > ip_rt_gc_timeout ||
+ atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
+ expire = ip_rt_gc_timeout;
+#if RT_CACHE_DEBUG >= 2
+ printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
+ atomic_read(&ipv4_dst_ops.entries), goal, rover);
+#endif
+out: return 0;
+}
+
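+/*
+ * Insert "rt" into its hash chain.  If an equivalent entry already exists,
+ * it is moved to the head of the chain and returned instead; otherwise the
+ * lowest-scoring unreferenced entry may be evicted, the route is bound to
+ * its ARP neighbour when needed, and "rt" becomes the new chain head.
+ */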
+static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
+{
+ struct rtable *rth, **rthp;
+ unsigned long now;
+ struct rtable *cand, **candp;
+ u32 min_score;
+ int chain_length;
+ int attempts = !in_softirq();
+
+restart:
+ chain_length = 0;
+ min_score = ~(u32)0;
+ cand = NULL;
+ candp = NULL;
+ now = jiffies;
+
+ rthp = &rt_hash_table[hash].chain;
+
+ spin_lock_bh(&rt_hash_table[hash].lock);
+ while ((rth = *rthp) != NULL) {
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+ if (!(rth->u.dst.flags & DST_BALANCED) &&
+ compare_keys(&rth->fl, &rt->fl)) {
+#else
+ if (compare_keys(&rth->fl, &rt->fl)) {
+#endif
+ /* Put it first */
+ *rthp = rth->u.rt_next;
+ /*
+ * Since lookup is lockfree, the deletion
+ * must be visible to another weakly ordered CPU before
+ * the insertion at the start of the hash chain.
+ */
+ rcu_assign_pointer(rth->u.rt_next,
+ rt_hash_table[hash].chain);
+ /*
+ * Since lookup is lockfree, the update writes
+ * must be ordered for consistency on SMP.
+ */
+ rcu_assign_pointer(rt_hash_table[hash].chain, rth);
+
+ rth->u.dst.__use++;
+ dst_hold(&rth->u.dst);
+ rth->u.dst.lastuse = now;
+ spin_unlock_bh(&rt_hash_table[hash].lock);
+
+ rt_drop(rt);
+ *rp = rth;
+ return 0;
+ }
+
+ if (!atomic_read(&rth->u.dst.__refcnt)) {
+ u32 score = rt_score(rth);
+
+ if (score <= min_score) {
+ cand = rth;
+ candp = rthp;
+ min_score = score;
+ }
+ }
+
+ chain_length++;
+
+ rthp = &rth->u.rt_next;
+ }
+
+ if (cand) {
+ /* ip_rt_gc_elasticity used to be the average chain length;
+ * when it is exceeded, gc becomes really aggressive.
+ *
+ * The second limit is less certain. At the moment it allows
+ * only 2 entries per bucket. We will see.
+ */
+ if (chain_length > ip_rt_gc_elasticity) {
+ *candp = cand->u.rt_next;
+ rt_free(cand);
+ }
+ }
+
+ /* Try to bind the route to an ARP neighbour only if it is an output
+ route or a unicast forwarding path.
+ */
+ if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
+ int err = arp_bind_neighbour(&rt->u.dst);
+ if (err) {
+ spin_unlock_bh(&rt_hash_table[hash].lock);
+
+ if (err != -ENOBUFS) {
+ rt_drop(rt);
+ return err;
+ }
+
+ /* The neighbour tables are full and nothing
+ can be released. Try to shrink the route cache;
+ it most likely holds some neighbour records.
+ */
+ if (attempts-- > 0) {
+ int saved_elasticity = ip_rt_gc_elasticity;
+ int saved_int = ip_rt_gc_min_interval;
+ ip_rt_gc_elasticity = 1;
+ ip_rt_gc_min_interval = 0;
+ rt_garbage_collect();
+ ip_rt_gc_min_interval = saved_int;
+ ip_rt_gc_elasticity = saved_elasticity;
+ goto restart;
+ }
+
+ if (net_ratelimit())
+ printk(KERN_WARNING "Neighbour table overflow.\n");
+ rt_drop(rt);
+ return -ENOBUFS;
+ }
+ }
+
+ rt->u.rt_next = rt_hash_table[hash].chain;
+#if RT_CACHE_DEBUG >= 2
+ if (rt->u.rt_next) {
+ struct rtable *trt;
+ printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
+ NIPQUAD(rt->rt_dst));
+ for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
+ printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
+ printk("\n");
+ }
+#endif
+ rt_hash_table[hash].chain = rt;
+ spin_unlock_bh(&rt_hash_table[hash].lock);
+ *rp = rt;
+ return 0;
+}
+
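+/*
+ * Attach an inet_peer entry for the route's destination.  The spinlock only
+ * resolves races between concurrent binders; the loser drops its peer again.
+ */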
+void rt_bind_peer(struct rtable *rt, int create)
+{
+ static DEFINE_SPINLOCK(rt_peer_lock);
+ struct inet_peer *peer;
+
+ peer = inet_getpeer(rt->rt_dst, create);
+
+ spin_lock_bh(&rt_peer_lock);
+ if (rt->peer == NULL) {
+ rt->peer = peer;
+ peer = NULL;
+ }
+ spin_unlock_bh(&rt_peer_lock);
+ if (peer)
+ inet_putpeer(peer);
+}
+
+/*
+ * Peer allocation may fail only in serious out-of-memory conditions. However,
+ * we can still generate some output.
+ * Random ID selection looks a bit dangerous because we have no way to
+ * guarantee that the selected ID is unique within a reasonable period of time.
+ * But a broken packet identifier may be better than no packet at all.
+ */
+static void ip_select_fb_ident(struct iphdr *iph)
+{
+ static DEFINE_SPINLOCK(ip_fb_id_lock);
+ static u32 ip_fallback_id;
+ u32 salt;
+
+ spin_lock_bh(&ip_fb_id_lock);
+ salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
+ iph->id = htons(salt & 0xFFFF);
+ ip_fallback_id = salt;
+ spin_unlock_bh(&ip_fb_id_lock);
+}
+
+void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
+{
+ struct rtable *rt = (struct rtable *) dst;
+
+ if (rt) {
+ if (rt->peer == NULL)
+ rt_bind_peer(rt, 1);
+
+ /* If a peer is attached to the destination, it is never detached,
+ so we do not need to grab a lock to dereference it.
+ */
+ if (rt->peer) {
+ iph->id = htons(inet_getid(rt->peer, more));
+ return;
+ }
+ } else
+ printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));
+
+ ip_select_fb_ident(iph);
+}
+
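+/* Drop the caller's reference and unlink "rt" from its hash chain. */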
+static void rt_del(unsigned hash, struct rtable *rt)
+{
+ struct rtable **rthp;
+
+ spin_lock_bh(&rt_hash_table[hash].lock);
+ ip_rt_put(rt);
+ for (rthp = &rt_hash_table[hash].chain; *rthp;
+ rthp = &(*rthp)->u.rt_next)
+ if (*rthp == rt) {
+ *rthp = rt->u.rt_next;
+ rt_free(rt);
+ break;
+ }
+ spin_unlock_bh(&rt_hash_table[hash].lock);
+}
+
+void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
+ u32 saddr, u8 tos, struct net_device *dev)
+{
+ int i, k;
+ struct in_device *in_dev = in_dev_get(dev);
+ struct rtable *rth, **rthp;
+ u32 skeys[2] = { saddr, 0 };
+ int ikeys[2] = { dev->ifindex, 0 };
+
+ tos &= IPTOS_RT_MASK;
+
+ if (!in_dev)
+ return;
+
+ if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
+ || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
+ goto reject_redirect;
+
+ if (!IN_DEV_SHARED_MEDIA(in_dev)) {
+ if (!inet_addr_onlink(in_dev, new_gw, old_gw))
+ goto reject_redirect;
+ if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
+ goto reject_redirect;
+ } else {
+ if (inet_addr_type(new_gw) != RTN_UNICAST)
+ goto reject_redirect;
+ }
+
+ for (i = 0; i < 2; i++) {
+ for (k = 0; k < 2; k++) {
+ unsigned hash = rt_hash_code(daddr,
+ skeys[i] ^ (ikeys[k] << 5),
+ tos);
+
+ rthp=&rt_hash_table[hash].chain;
+
+ rcu_read_lock();
+ while ((rth = rcu_dereference(*rthp)) != NULL) {
+ struct rtable *rt;
+
+ if (rth->fl.fl4_dst != daddr ||
+ rth->fl.fl4_src != skeys[i] ||
+ rth->fl.fl4_tos != tos ||
+ rth->fl.oif != ikeys[k] ||
+ rth->fl.iif != 0) {
+ rthp = &rth->u.rt_next;
+ continue;
+ }
+
+ if (rth->rt_dst != daddr ||
+ rth->rt_src != saddr ||
+ rth->u.dst.error ||
+ rth->rt_gateway != old_gw ||
+ rth->u.dst.dev != dev)
+ break;
+
+ dst_hold(&rth->u.dst);
+ rcu_read_unlock();
+
+ rt = dst_alloc(&ipv4_dst_ops);
+ if (rt == NULL) {
+ ip_rt_put(rth);
+ in_dev_put(in_dev);
+ return;
+ }
+
+ /* Copy all the information. */
+ *rt = *rth;
+ INIT_RCU_HEAD(&rt->u.dst.rcu_head);
+ rt->u.dst.__use = 1;
+ atomic_set(&rt->u.dst.__refcnt, 1);
+ rt->u.dst.child = NULL;
+ if (rt->u.dst.dev)
+ dev_hold(rt->u.dst.dev);
+ if (rt->idev)
+ in_dev_hold(rt->idev);
+ rt->u.dst.obsolete = 0;
+ rt->u.dst.lastuse = jiffies;
+ rt->u.dst.path = &rt->u.dst;
+ rt->u.dst.neighbour = NULL;
+ rt->u.dst.hh = NULL;
+ rt->u.dst.xfrm = NULL;
+
+ rt->rt_flags |= RTCF_REDIRECTED;
+
+ /* Gateway is different ... */
+ rt->rt_gateway = new_gw;
+
+ /* Redirect received -> path was valid */
+ dst_confirm(&rth->u.dst);
+
+ if (rt->peer)
+ atomic_inc(&rt->peer->refcnt);
+
+ if (arp_bind_neighbour(&rt->u.dst) ||
+ !(rt->u.dst.neighbour->nud_state &
+ NUD_VALID)) {
+ if (rt->u.dst.neighbour)
+ neigh_event_send(rt->u.dst.neighbour, NULL);
+ ip_rt_put(rth);
+ rt_drop(rt);
+ goto do_next;
+ }
+
+ rt_del(hash, rth);
+ if (!rt_intern_hash(hash, rt, &rt))
+ ip_rt_put(rt);
+ goto do_next;
+ }
+ rcu_read_unlock();
+ do_next:
+ ;
+ }
+ }
+ in_dev_put(in_dev);
+ return;
+
+reject_redirect:
+#ifdef CONFIG_IP_ROUTE_VERBOSE
+ if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
+ printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
+ "%u.%u.%u.%u ignored.\n"
+ " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
+ "tos %02x\n",
+ NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
+ NIPQUAD(saddr), NIPQUAD(daddr), tos);
+#endif
+ in_dev_put(in_dev);
+}
+
+static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
+{
+ struct rtable *rt = (struct rtable*)dst;
+ struct dst_entry *ret = dst;
+
+ if (rt) {
+ if (dst->obsolete) {
+ ip_rt_put(rt);
+ ret = NULL;
+ } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
+ rt->u.dst.expires) {
+ unsigned hash = rt_hash_code(rt->fl.fl4_dst,
+ rt->fl.fl4_src ^
+ (rt->fl.oif << 5),
+ rt->fl.fl4_tos);
+#if RT_CACHE_DEBUG >= 1
+ printk(KERN_DEBUG "ip_rt_advice: redirect to "
+ "%u.%u.%u.%u/%02x dropped\n",
+ NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
+#endif
+ rt_del(hash, rt);
+ ret = NULL;
+ }
+ }
+ return ret;
+}
+
+/*
+ * Algorithm:
+ * 1. The first ip_rt_redirect_number redirects are sent
+ * with exponential backoff, then we stop sending them at all,
+ * assuming that the host ignores our redirects.
+ * 2. If we did not see packets requiring redirects
+ * during ip_rt_redirect_silence, we assume that the host has
+ * forgotten the redirected route and start sending redirects again.
+ *
+ * This algorithm is much cheaper and more intelligent than dumb load limiting
+ * in icmp.c.
+ *
+ * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
+ * and "frag. need" (breaks PMTU discovery) in icmp.c.
+ */
+
+void ip_rt_send_redirect(struct sk_buff *skb)
+{
+ struct rtable *rt = (struct rtable*)skb->dst;
+ struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
+
+ if (!in_dev)
+ return;
+
+ if (!IN_DEV_TX_REDIRECTS(in_dev))
+ goto out;
+
+ /* No redirected packets during ip_rt_redirect_silence;
+ * reset the algorithm.
+ */
+ if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
+ rt->u.dst.rate_tokens = 0;
+
+ /* Too many ignored redirects; do not send anything.
+ * Set u.dst.rate_last to the time of the last seen redirected packet.
+ */
+ if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
+ rt->u.dst.rate_last = jiffies;
+ goto out;
+ }
+
+ /* Check for load limit; set rate_last to the latest sent
+ * redirect.
+ */
+ if (time_after(jiffies,
+ (rt->u.dst.rate_last +
+ (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
+ icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
+ rt->u.dst.rate_last = jiffies;
+ ++rt->u.dst.rate_tokens;
+#ifdef CONFIG_IP_ROUTE_VERBOSE
+ if (IN_DEV_LOG_MARTIANS(in_dev) &&
+ rt->u.dst.rate_tokens == ip_rt_redirect_number &&
+ net_ratelimit())
+ printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
+ "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
+ NIPQUAD(rt->rt_src), rt->rt_iif,
+ NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
+#endif
+ }
+out:
+ in_dev_put(in_dev);
+}
+
+static int ip_error(struct sk_buff *skb)
+{
+ struct rtable *rt = (struct rtable*)skb->dst;
+ unsigned long now;
+ int code;
+
+ switch (rt->u.dst.error) {
+ case EINVAL:
+ default:
+ goto out;
+ case EHOSTUNREACH:
+ code = ICMP_HOST_UNREACH;
+ break;
+ case ENETUNREACH:
+ code = ICMP_NET_UNREACH;
+ break;
+ case EACCES:
+ code = ICMP_PKT_FILTERED;
+ break;
+ }
+
+ now = jiffies;
+ rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
+ if (rt->u.dst.rate_tokens > ip_rt_error_burst)
+ rt->u.dst.rate_tokens = ip_rt_error_burst;
+ rt->u.dst.rate_last = now;
+ if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
+ rt->u.dst.rate_tokens -= ip_rt_error_cost;
+ icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
+ }
+
+out: kfree_skb(skb);
+ return 0;
+}
+
+/*
+ * The last two values are not from the RFC but
+ * are needed for AMPRnet AX.25 paths.
+ */
+
+static unsigned short mtu_plateau[] =
+{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
+
+static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
+ if (old_mtu > mtu_plateau[i])
+ return mtu_plateau[i];
+ return 68;
+}
+
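+/*
+ * Process an ICMP "fragmentation needed" report: for every matching cached
+ * route, lower the path MTU (falling back to the plateau table when the
+ * reported MTU is unusable), clamp it to ip_rt_min_pmtu and let the learned
+ * value time out after ip_rt_mtu_expires.
+ */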
+unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
+{
+ int i;
+ unsigned short old_mtu = ntohs(iph->tot_len);
+ struct rtable *rth;
+ u32 skeys[2] = { iph->saddr, 0, };
+ u32 daddr = iph->daddr;
+ u8 tos = iph->tos & IPTOS_RT_MASK;
+ unsigned short est_mtu = 0;
+
+ if (ipv4_config.no_pmtu_disc)
+ return 0;
+
+ for (i = 0; i < 2; i++) {
+ unsigned hash = rt_hash_code(daddr, skeys[i], tos);
+
+ rcu_read_lock();
+ for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
+ rth = rcu_dereference(rth->u.rt_next)) {
+ if (rth->fl.fl4_dst == daddr &&
+ rth->fl.fl4_src == skeys[i] &&
+ rth->rt_dst == daddr &&
+ rth->rt_src == iph->saddr &&
+ rth->fl.fl4_tos == tos &&
+ rth->fl.iif == 0 &&
+ !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
+ unsigned short mtu = new_mtu;
+
+ if (new_mtu < 68 || new_mtu >= old_mtu) {
+
+ /* BSD 4.2 compatibility hack :-( */
+ if (mtu == 0 &&
+ old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
+ old_mtu >= 68 + (iph->ihl << 2))
+ old_mtu -= iph->ihl << 2;
+
+ mtu = guess_mtu(old_mtu);
+ }
+ if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
+ if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
+ dst_confirm(&rth->u.dst);
+ if (mtu < ip_rt_min_pmtu) {
+ mtu = ip_rt_min_pmtu;
+ rth->u.dst.metrics[RTAX_LOCK-1] |=
+ (1 << RTAX_MTU);
+ }
+ rth->u.dst.metrics[RTAX_MTU-1] = mtu;
+ dst_set_expires(&rth->u.dst,
+ ip_rt_mtu_expires);
+ }
+ est_mtu = mtu;
+ }
+ }
+ }
+ rcu_read_unlock();
+ }
+ return est_mtu ? : new_mtu;
+}
+
+static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
+{
+ if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
+ !(dst_metric_locked(dst, RTAX_MTU))) {
+ if (mtu < ip_rt_min_pmtu) {
+ mtu = ip_rt_min_pmtu;
+ dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
+ }
+ dst->metrics[RTAX_MTU-1] = mtu;
+ dst_set_expires(dst, ip_rt_mtu_expires);
+ }
+}
+
+static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
+{
+ return NULL;
+}
+
+static void ipv4_dst_destroy(struct dst_entry *dst)
+{
+ struct rtable *rt = (struct rtable *) dst;
+ struct inet_peer *peer = rt->peer;
+ struct in_device *idev = rt->idev;
+
+ if (peer) {
+ rt->peer = NULL;
+ inet_putpeer(peer);
+ }
+
+ if (idev) {
+ rt->idev = NULL;
+ in_dev_put(idev);
+ }
+}
+
+static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
+ int how)
+{
+ struct rtable *rt = (struct rtable *) dst;
+ struct in_device *idev = rt->idev;
+ if (dev != &loopback_dev && idev && idev->dev == dev) {
+ struct in_device *loopback_idev = in_dev_get(&loopback_dev);
+ if (loopback_idev) {
+ rt->idev = loopback_idev;
+ in_dev_put(idev);
+ }
+ }
+}
+
+static void ipv4_link_failure(struct sk_buff *skb)
+{
+ struct rtable *rt;
+
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
+
+ rt = (struct rtable *) skb->dst;
+ if (rt)
+ dst_set_expires(&rt->u.dst, 0);
+}
+
+static int ip_rt_bug(struct sk_buff *skb)
+{
+ printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
+ NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
+ skb->dev ? skb->dev->name : "?");
+ kfree_skb(skb);
+ return 0;
+}
+
+/*
+ We do not cache the source address of the outgoing interface,
+ because it is used only by the IP RR, TS and SRR options,
+ so it is out of the fast path.
+
+ BTW remember: "addr" is allowed to be unaligned
+ in IP options!
+ */
+
+void ip_rt_get_source(u8 *addr, struct rtable *rt)
+{
+ u32 src;
+ struct fib_result res;
+
+ if (rt->fl.iif == 0)
+ src = rt->rt_src;
+ else if (fib_lookup(&rt->fl, &res) == 0) {
+ src = FIB_RES_PREFSRC(res);
+ fib_res_put(&res);
+ } else
+ src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
+ RT_SCOPE_UNIVERSE);
+ memcpy(addr, &src, 4);
+}
+
+#ifdef CONFIG_NET_CLS_ROUTE
+static void set_class_tag(struct rtable *rt, u32 tag)
+{
+ if (!(rt->u.dst.tclassid & 0xFFFF))
+ rt->u.dst.tclassid |= tag & 0xFFFF;
+ if (!(rt->u.dst.tclassid & 0xFFFF0000))
+ rt->u.dst.tclassid |= tag & 0xFFFF0000;
+}
+#endif
+
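+/*
+ * Fill in the gateway, the metrics (MTU, hoplimit, advmss, ...) and the
+ * classification tag of a freshly built cache entry from the FIB result.
+ */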
+static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
+{
+ struct fib_info *fi = res->fi;
+
+ if (fi) {
+ if (FIB_RES_GW(*res) &&
+ FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
+ rt->rt_gateway = FIB_RES_GW(*res);
+ memcpy(rt->u.dst.metrics, fi->fib_metrics,
+ sizeof(rt->u.dst.metrics));
+ if (fi->fib_mtu == 0) {
+ rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
+ if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
+ rt->rt_gateway != rt->rt_dst &&
+ rt->u.dst.dev->mtu > 576)
+ rt->u.dst.metrics[RTAX_MTU-1] = 576;
+ }
+#ifdef CONFIG_NET_CLS_ROUTE
+ rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
+#endif
+ } else
+ rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
+
+ if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
+ rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
+ if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
+ rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
+ if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
+ rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
+ ip_rt_min_advmss);
+ if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
+ rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
+
+#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ set_class_tag(rt, fib_rules_tclass(res));
+#endif
+ set_class_tag(rt, itag);
+#endif
+ rt->rt_type = res->type;
+}
+
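+/*
+ * Build and cache a route for an incoming multicast packet.  Locally joined
+ * groups are delivered through ip_local_deliver(); other groups may be
+ * handed to the multicast forwarding code when it is configured.
+ */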
+static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
+ u8 tos, struct net_device *dev, int our)
+{
+ unsigned hash;
+ struct rtable *rth;
+ u32 spec_dst;
+ struct in_device *in_dev = in_dev_get(dev);
+ u32 itag = 0;
+
+ /* Primary sanity checks. */
+
+ if (in_dev == NULL)
+ return -EINVAL;
+
+ if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
+ skb->protocol != htons(ETH_P_IP))
+ goto e_inval;
+
+ if (ZERONET(saddr)) {
+ if (!LOCAL_MCAST(daddr))
+ goto e_inval;
+ spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
+ } else if (fib_validate_source(saddr, 0, tos, 0,
+ dev, &spec_dst, &itag) < 0)
+ goto e_inval;
+
+ rth = dst_alloc(&ipv4_dst_ops);
+ if (!rth)
+ goto e_nobufs;
+
+ rth->u.dst.output= ip_rt_bug;
+
+ atomic_set(&rth->u.dst.__refcnt, 1);
+ rth->u.dst.flags= DST_HOST;
+ if (in_dev->cnf.no_policy)
+ rth->u.dst.flags |= DST_NOPOLICY;
+ rth->fl.fl4_dst = daddr;
+ rth->rt_dst = daddr;
+ rth->fl.fl4_tos = tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ rth->fl.fl4_fwmark= skb->nfmark;
+#endif
+ rth->fl.fl4_src = saddr;
+ rth->rt_src = saddr;
+#ifdef CONFIG_NET_CLS_ROUTE
+ rth->u.dst.tclassid = itag;
+#endif
+ rth->rt_iif =
+ rth->fl.iif = dev->ifindex;
+ rth->u.dst.dev = &loopback_dev;
+ dev_hold(rth->u.dst.dev);
+ rth->idev = in_dev_get(rth->u.dst.dev);
+ rth->fl.oif = 0;
+ rth->rt_gateway = daddr;
+ rth->rt_spec_dst= spec_dst;
+ rth->rt_type = RTN_MULTICAST;
+ rth->rt_flags = RTCF_MULTICAST;
+ if (our) {
+ rth->u.dst.input= ip_local_deliver;
+ rth->rt_flags |= RTCF_LOCAL;
+ }
+
+#ifdef CONFIG_IP_MROUTE
+ if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
+ rth->u.dst.input = ip_mr_input;
+#endif
+ RT_CACHE_STAT_INC(in_slow_mc);
+
+ in_dev_put(in_dev);
+ hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
+ return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
+
+e_nobufs:
+ in_dev_put(in_dev);
+ return -ENOBUFS;
+
+e_inval:
+ in_dev_put(in_dev);
+ return -EINVAL;
+}
+
+
+static void ip_handle_martian_source(struct net_device *dev,
+ struct in_device *in_dev,
+ struct sk_buff *skb,
+ u32 daddr,
+ u32 saddr)
+{
+ RT_CACHE_STAT_INC(in_martian_src);
+#ifdef CONFIG_IP_ROUTE_VERBOSE
+ if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
+ /*
+ * RFC 1812 recommendation: if the source is martian,
+ * the only hint we have is the MAC header.
+ */
+ printk(KERN_WARNING "martian source %u.%u.%u.%u from "
+ "%u.%u.%u.%u, on dev %s\n",
+ NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
+ if (dev->hard_header_len) {
+ int i;
+ unsigned char *p = skb->mac.raw;
+ printk(KERN_WARNING "ll header: ");
+ for (i = 0; i < dev->hard_header_len; i++, p++) {
+ printk("%02x", *p);
+ if (i < (dev->hard_header_len - 1))
+ printk(":");
+ }
+ printk("\n");
+ }
+ }
+#endif
+}
+
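+/*
+ * Build (but do not hash) a forwarding cache entry for an input route:
+ * validate the source address, decide whether a redirect should be sent,
+ * and wire dst.input/dst.output to ip_forward()/ip_output().
+ */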
+static inline int __mkroute_input(struct sk_buff *skb,
+ struct fib_result* res,
+ struct in_device *in_dev,
+ u32 daddr, u32 saddr, u32 tos,
+ struct rtable **result)
+{
+
+ struct rtable *rth;
+ int err;
+ struct in_device *out_dev;
+ unsigned flags = 0;
+ u32 spec_dst, itag;
+
+ /* get a working reference to the output device */
+ out_dev = in_dev_get(FIB_RES_DEV(*res));
+ if (out_dev == NULL) {
+ if (net_ratelimit())
+ printk(KERN_CRIT "Bug in ip_route_input" \
+ "_slow(). Please, report\n");
+ return -EINVAL;
+ }
+
+
+ err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
+ in_dev->dev, &spec_dst, &itag);
+ if (err < 0) {
+ ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
+ saddr);
+
+ err = -EINVAL;
+ goto cleanup;
+ }
+
+ if (err)
+ flags |= RTCF_DIRECTSRC;
+
+ if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
+ (IN_DEV_SHARED_MEDIA(out_dev) ||
+ inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
+ flags |= RTCF_DOREDIRECT;
+
+ if (skb->protocol != htons(ETH_P_IP)) {
+ /* Not IP (i.e. ARP). Do not create a route if it is
+ * invalid for proxy ARP. DNAT routes are always valid.
+ */
+ if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
+ err = -EINVAL;
+ goto cleanup;
+ }
+ }
+
+
+ rth = dst_alloc(&ipv4_dst_ops);
+ if (!rth) {
+ err = -ENOBUFS;
+ goto cleanup;
+ }
+
+ rth->u.dst.flags= DST_HOST;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+ if (res->fi->fib_nhs > 1)
+ rth->u.dst.flags |= DST_BALANCED;
+#endif
+ if (in_dev->cnf.no_policy)
+ rth->u.dst.flags |= DST_NOPOLICY;
+ if (in_dev->cnf.no_xfrm)
+ rth->u.dst.flags |= DST_NOXFRM;
+ rth->fl.fl4_dst = daddr;
+ rth->rt_dst = daddr;
+ rth->fl.fl4_tos = tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ rth->fl.fl4_fwmark= skb->nfmark;
+#endif
+ rth->fl.fl4_src = saddr;
+ rth->rt_src = saddr;
+ rth->rt_gateway = daddr;
+ rth->rt_iif =
+ rth->fl.iif = in_dev->dev->ifindex;
+ rth->u.dst.dev = (out_dev)->dev;
+ dev_hold(rth->u.dst.dev);
+ rth->idev = in_dev_get(rth->u.dst.dev);
+ rth->fl.oif = 0;
+ rth->rt_spec_dst= spec_dst;
+
+ rth->u.dst.input = ip_forward;
+ rth->u.dst.output = ip_output;
+
+ rt_set_nexthop(rth, res, itag);
+
+ rth->rt_flags = flags;
+
+ *result = rth;
+ err = 0;
+ cleanup:
+ /* release the working reference to the output device */
+ in_dev_put(out_dev);
+ return err;
+}
+
+static inline int ip_mkroute_input_def(struct sk_buff *skb,
+ struct fib_result* res,
+ const struct flowi *fl,
+ struct in_device *in_dev,
+ u32 daddr, u32 saddr, u32 tos)
+{
+ struct rtable* rth;
+ int err;
+ unsigned hash;
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
+ fib_select_multipath(fl, res);
+#endif
+
+ /* create a routing cache entry */
+ err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
+ if (err)
+ return err;
+ atomic_set(&rth->u.dst.__refcnt, 1);
+
+ /* put it into the cache */
+ hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
+ return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
+}
+
+static inline int ip_mkroute_input(struct sk_buff *skb,
+ struct fib_result* res,
+ const struct flowi *fl,
+ struct in_device *in_dev,
+ u32 daddr, u32 saddr, u32 tos)
+{
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+ struct rtable* rth;
+ unsigned char hop, hopcount, lasthop;
+ int err = -EINVAL;
+ unsigned int hash;
+
+ if (res->fi)
+ hopcount = res->fi->fib_nhs;
+ else
+ hopcount = 1;
+
+ lasthop = hopcount - 1;
+
+ /* distinguish between multipath and singlepath */
+ if (hopcount < 2)
+ return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
+ saddr, tos);
+
+ /* add all alternatives to the routing cache */
+ for (hop = 0; hop < hopcount; hop++) {
+ res->nh_sel = hop;
+
+ /* create a routing cache entry */
+ err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
+ &rth);
+ if (err)
+ return err;
+
+ /* put it into the cache */
+ hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
+ err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
+ if (err)
+ return err;
+
+ /* forward hop information to multipath impl. */
+ multipath_set_nhinfo(rth,
+ FIB_RES_NETWORK(*res),
+ FIB_RES_NETMASK(*res),
+ res->prefixlen,
+ &FIB_RES_NH(*res));
+
+ /* the reference count is handled outside only for the
+ * last hop
+ */
+ if (hop == lasthop)
+ atomic_set(&(skb->dst->__refcnt), 1);
+ }
+ return err;
+#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
+ return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
+#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
+}
+
+
+/*
+ * NOTE. We drop all packets that have a local source
+ * address, because every properly looped-back packet
+ * must already have the correct destination attached by the output routine.
+ *
+ * Such an approach solves two big problems:
+ * 1. Non-simplex devices are handled properly.
+ * 2. IP spoofing attempts are filtered with a 100% guarantee.
+ */
+
+static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
+ u8 tos, struct net_device *dev)
+{
+ struct fib_result res;
+ struct in_device *in_dev = in_dev_get(dev);
+ struct flowi fl = { .nl_u = { .ip4_u =
+ { .daddr = daddr,
+ .saddr = saddr,
+ .tos = tos,
+ .scope = RT_SCOPE_UNIVERSE,
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ .fwmark = skb->nfmark
+#endif
+ } },
+ .iif = dev->ifindex };
+ unsigned flags = 0;
+ u32 itag = 0;
+ struct rtable * rth;
+ unsigned hash;
+ u32 spec_dst;
+ int err = -EINVAL;
+ int free_res = 0;
+
+ /* IP on this device is disabled. */
+
+ if (!in_dev)
+ goto out;
+
+ /* Check for the weirdest martians, which cannot be detected
+ by fib_lookup.
+ */
+
+ if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
+ goto martian_source;
+
+ if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
+ goto brd_input;
+
+ /* Accept zero addresses only for limited broadcast;
+ * I do not even know whether to fix this or not. Waiting for complaints :-)
+ */
+ if (ZERONET(saddr))
+ goto martian_source;
+
+ if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
+ goto martian_destination;
+
+ /*
+ * Now we are ready to route packet.
+ */
+ if ((err = fib_lookup(&fl, &res)) != 0) {
+ if (!IN_DEV_FORWARD(in_dev))
+ goto e_inval;
+ goto no_route;
+ }
+ free_res = 1;
+
+ RT_CACHE_STAT_INC(in_slow_tot);
+
+ if (res.type == RTN_BROADCAST)
+ goto brd_input;
+
+ if (res.type == RTN_LOCAL) {
+ int result;
+ result = fib_validate_source(saddr, daddr, tos,
+ loopback_dev.ifindex,
+ dev, &spec_dst, &itag);
+ if (result < 0)
+ goto martian_source;
+ if (result)
+ flags |= RTCF_DIRECTSRC;
+ spec_dst = daddr;
+ goto local_input;
+ }
+
+ if (!IN_DEV_FORWARD(in_dev))
+ goto e_inval;
+ if (res.type != RTN_UNICAST)
+ goto martian_destination;
+
+ err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
+ if (err == -ENOBUFS)
+ goto e_nobufs;
+ if (err == -EINVAL)
+ goto e_inval;
+
+done:
+ in_dev_put(in_dev);
+ if (free_res)
+ fib_res_put(&res);
+out: return err;
+
+brd_input:
+ if (skb->protocol != htons(ETH_P_IP))
+ goto e_inval;
+
+ if (ZERONET(saddr))
+ spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
+ else {
+ err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
+ &itag);
+ if (err < 0)
+ goto martian_source;
+ if (err)
+ flags |= RTCF_DIRECTSRC;
+ }
+ flags |= RTCF_BROADCAST;
+ res.type = RTN_BROADCAST;
+ RT_CACHE_STAT_INC(in_brd);
+
+local_input:
+ rth = dst_alloc(&ipv4_dst_ops);
+ if (!rth)
+ goto e_nobufs;
+
+ rth->u.dst.output= ip_rt_bug;
+
+ atomic_set(&rth->u.dst.__refcnt, 1);
+ rth->u.dst.flags= DST_HOST;
+ if (in_dev->cnf.no_policy)
+ rth->u.dst.flags |= DST_NOPOLICY;
+ rth->fl.fl4_dst = daddr;
+ rth->rt_dst = daddr;
+ rth->fl.fl4_tos = tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ rth->fl.fl4_fwmark= skb->nfmark;
+#endif
+ rth->fl.fl4_src = saddr;
+ rth->rt_src = saddr;
+#ifdef CONFIG_NET_CLS_ROUTE
+ rth->u.dst.tclassid = itag;
+#endif
+ rth->rt_iif =
+ rth->fl.iif = dev->ifindex;
+ rth->u.dst.dev = &loopback_dev;
+ dev_hold(rth->u.dst.dev);
+ rth->idev = in_dev_get(rth->u.dst.dev);
+ rth->rt_gateway = daddr;
+ rth->rt_spec_dst= spec_dst;
+ rth->u.dst.input= ip_local_deliver;
+ rth->rt_flags = flags|RTCF_LOCAL;
+ if (res.type == RTN_UNREACHABLE) {
+ rth->u.dst.input= ip_error;
+ rth->u.dst.error= -err;
+ rth->rt_flags &= ~RTCF_LOCAL;
+ }
+ rth->rt_type = res.type;
+ hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
+ err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
+ goto done;
+
+no_route:
+ RT_CACHE_STAT_INC(in_no_route);
+ spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
+ res.type = RTN_UNREACHABLE;
+ goto local_input;
+
+ /*
+ * Do not cache martian addresses: they should be logged (RFC1812)
+ */
+martian_destination:
+ RT_CACHE_STAT_INC(in_martian_dst);
+#ifdef CONFIG_IP_ROUTE_VERBOSE
+ if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
+ printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
+ "%u.%u.%u.%u, dev %s\n",
+ NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
+#endif
+e_inval:
+ err = -EINVAL;
+ goto done;
+
+e_nobufs:
+ err = -ENOBUFS;
+ goto done;
+
+martian_source:
+ ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
+ goto e_inval;
+}
+
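+/*
+ * Fast path for input routing: look the flow up in the route cache under
+ * RCU.  On a miss, multicast destinations are recognized here and everything
+ * else falls back to ip_route_input_slow().
+ */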
+int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
+ u8 tos, struct net_device *dev)
+{
+ struct rtable * rth;
+ unsigned hash;
+ int iif = dev->ifindex;
+
+ tos &= IPTOS_RT_MASK;
+ hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
+
+ rcu_read_lock();
+ for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
+ rth = rcu_dereference(rth->u.rt_next)) {
+ if (rth->fl.fl4_dst == daddr &&
+ rth->fl.fl4_src == saddr &&
+ rth->fl.iif == iif &&
+ rth->fl.oif == 0 &&
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ rth->fl.fl4_fwmark == skb->nfmark &&
+#endif
+ rth->fl.fl4_tos == tos) {
+ rth->u.dst.lastuse = jiffies;
+ dst_hold(&rth->u.dst);
+ rth->u.dst.__use++;
+ RT_CACHE_STAT_INC(in_hit);
+ rcu_read_unlock();
+ skb->dst = (struct dst_entry*)rth;
+ return 0;
+ }
+ RT_CACHE_STAT_INC(in_hlist_search);
+ }
+ rcu_read_unlock();
+
+ /* Multicast recognition logic has been moved from the route cache to here.
+ The problem was that too many Ethernet cards have broken/missing
+ hardware multicast filters :-( As a result, a host on a multicast
+ network acquires a lot of useless route cache entries, for example
+ from SDR messages from all over the world. Now we try to get rid of them.
+ Really, provided the software IP multicast filter is organized
+ reasonably (at least, hashed), this does not cause a slowdown
+ compared with route cache reject entries.
+ Note that multicast routers are not affected, because
+ a route cache entry is created eventually.
+ */
+ if (MULTICAST(daddr)) {
+ struct in_device *in_dev;
+
+ rcu_read_lock();
+ if ((in_dev = __in_dev_get(dev)) != NULL) {
+ int our = ip_check_mc(in_dev, daddr, saddr,
+ skb->nh.iph->protocol);
+ if (our
+#ifdef CONFIG_IP_MROUTE
+ || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
+#endif
+ ) {
+ rcu_read_unlock();
+ return ip_route_input_mc(skb, daddr, saddr,
+ tos, dev, our);
+ }
+ }
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ return ip_route_input_slow(skb, daddr, saddr, tos, dev);
+}
+
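+/*
+ * Build (but do not hash) a cache entry for an output route, classifying
+ * the destination as unicast, broadcast, multicast or local and selecting
+ * the matching output handler.
+ */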
+static inline int __mkroute_output(struct rtable **result,
+ struct fib_result* res,
+ const struct flowi *fl,
+ const struct flowi *oldflp,
+ struct net_device *dev_out,
+ unsigned flags)
+{
+ struct rtable *rth;
+ struct in_device *in_dev;
+ u32 tos = RT_FL_TOS(oldflp);
+ int err = 0;
+
+ if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
+ return -EINVAL;
+
+ if (fl->fl4_dst == 0xFFFFFFFF)
+ res->type = RTN_BROADCAST;
+ else if (MULTICAST(fl->fl4_dst))
+ res->type = RTN_MULTICAST;
+ else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
+ return -EINVAL;
+
+ if (dev_out->flags & IFF_LOOPBACK)
+ flags |= RTCF_LOCAL;
+
+ /* get a working reference to the inet device */
+ in_dev = in_dev_get(dev_out);
+ if (!in_dev)
+ return -EINVAL;
+
+ if (res->type == RTN_BROADCAST) {
+ flags |= RTCF_BROADCAST | RTCF_LOCAL;
+ if (res->fi) {
+ fib_info_put(res->fi);
+ res->fi = NULL;
+ }
+ } else if (res->type == RTN_MULTICAST) {
+ flags |= RTCF_MULTICAST|RTCF_LOCAL;
+ if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
+ oldflp->proto))
+ flags &= ~RTCF_LOCAL;
+ /* If a multicast route does not exist, use
+ the default one, but do not use a gateway in this case.
+ Yes, it is a hack.
+ */
+ if (res->fi && res->prefixlen < 4) {
+ fib_info_put(res->fi);
+ res->fi = NULL;
+ }
+ }
+
+
+ rth = dst_alloc(&ipv4_dst_ops);
+ if (!rth) {
+ err = -ENOBUFS;
+ goto cleanup;
+ }
+
+ rth->u.dst.flags= DST_HOST;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+ if (res->fi) {
+ rth->rt_multipath_alg = res->fi->fib_mp_alg;
+ if (res->fi->fib_nhs > 1)
+ rth->u.dst.flags |= DST_BALANCED;
+ }
+#endif
+ if (in_dev->cnf.no_xfrm)
+ rth->u.dst.flags |= DST_NOXFRM;
+ if (in_dev->cnf.no_policy)
+ rth->u.dst.flags |= DST_NOPOLICY;
+
+ rth->fl.fl4_dst = oldflp->fl4_dst;
+ rth->fl.fl4_tos = tos;
+ rth->fl.fl4_src = oldflp->fl4_src;
+ rth->fl.oif = oldflp->oif;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
+#endif
+ rth->rt_dst = fl->fl4_dst;
+ rth->rt_src = fl->fl4_src;
+ rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
+ /* get references to the devices that are to be held by the routing
+ cache entry */
+ rth->u.dst.dev = dev_out;
+ dev_hold(dev_out);
+ rth->idev = in_dev_get(dev_out);
+ rth->rt_gateway = fl->fl4_dst;
+ rth->rt_spec_dst= fl->fl4_src;
+
+ rth->u.dst.output=ip_output;
+
+ RT_CACHE_STAT_INC(out_slow_tot);
+
+ if (flags & RTCF_LOCAL) {
+ rth->u.dst.input = ip_local_deliver;
+ rth->rt_spec_dst = fl->fl4_dst;
+ }
+ if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
+ rth->rt_spec_dst = fl->fl4_src;
+ if (flags & RTCF_LOCAL &&
+ !(dev_out->flags & IFF_LOOPBACK)) {
+ rth->u.dst.output = ip_mc_output;
+ RT_CACHE_STAT_INC(out_slow_mc);
+ }
+#ifdef CONFIG_IP_MROUTE
+ if (res->type == RTN_MULTICAST) {
+ if (IN_DEV_MFORWARD(in_dev) &&
+ !LOCAL_MCAST(oldflp->fl4_dst)) {
+ rth->u.dst.input = ip_mr_input;
+ rth->u.dst.output = ip_mc_output;
+ }
+ }
+#endif
+ }
+
+ rt_set_nexthop(rth, res, 0);
+
+ rth->rt_flags = flags;
+
+ *result = rth;
+ cleanup:
+ /* release work reference to inet device */
+ in_dev_put(in_dev);
+
+ return err;
+}
+
+static inline int ip_mkroute_output_def(struct rtable **rp,
+ struct fib_result* res,
+ const struct flowi *fl,
+ const struct flowi *oldflp,
+ struct net_device *dev_out,
+ unsigned flags)
+{
+ struct rtable *rth;
+ int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
+ unsigned hash;
+ if (err == 0) {
+ u32 tos = RT_FL_TOS(oldflp);
+
+ atomic_set(&rth->u.dst.__refcnt, 1);
+
+ hash = rt_hash_code(oldflp->fl4_dst,
+ oldflp->fl4_src ^ (oldflp->oif << 5), tos);
+ err = rt_intern_hash(hash, rth, rp);
+ }
+
+ return err;
+}
+
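+/*
+ * With CONFIG_IP_ROUTE_MULTIPATH_CACHED, a fib result with more than one
+ * next hop gets one cache entry built and interned per next hop, and the
+ * hop information is handed to the multipath implementation; otherwise
+ * this falls through to the single-entry path above.
+ */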
+static inline int ip_mkroute_output(struct rtable** rp,
+ struct fib_result* res,
+ const struct flowi *fl,
+ const struct flowi *oldflp,
+ struct net_device *dev_out,
+ unsigned flags)
+{
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+ u32 tos = RT_FL_TOS(oldflp);
+ unsigned char hop;
+ unsigned hash;
+ int err = -EINVAL;
+ struct rtable *rth;
+
+ if (res->fi && res->fi->fib_nhs > 1) {
+ unsigned char hopcount = res->fi->fib_nhs;
+
+ for (hop = 0; hop < hopcount; hop++) {
+ struct net_device *dev2nexthop;
+
+ res->nh_sel = hop;
+
+ /* hold a work reference to the output device */
+ dev2nexthop = FIB_RES_DEV(*res);
+ dev_hold(dev2nexthop);
+
+ err = __mkroute_output(&rth, res, fl, oldflp,
+ dev2nexthop, flags);
+
+ if (err != 0)
+ goto cleanup;
+
+ hash = rt_hash_code(oldflp->fl4_dst,
+ oldflp->fl4_src ^
+ (oldflp->oif << 5), tos);
+ err = rt_intern_hash(hash, rth, rp);
+
+ /* forward hop information to multipath impl. */
+ multipath_set_nhinfo(rth,
+ FIB_RES_NETWORK(*res),
+ FIB_RES_NETMASK(*res),
+ res->prefixlen,
+ &FIB_RES_NH(*res));
+ cleanup:
+ /* release work reference to output device */
+ dev_put(dev2nexthop);
+
+ if (err != 0)
+ return err;
+ }
+ atomic_set(&(*rp)->u.dst.__refcnt, 1);
+ return err;
+ } else {
+ return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
+ flags);
+ }
+#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
+ return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
+#endif
+}
+
+/*
+ * Major route resolver routine.
+ */
+
+static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
+{
+ u32 tos = RT_FL_TOS(oldflp);
+ struct flowi fl = { .nl_u = { .ip4_u =
+ { .daddr = oldflp->fl4_dst,
+ .saddr = oldflp->fl4_src,
+ .tos = tos & IPTOS_RT_MASK,
+ .scope = ((tos & RTO_ONLINK) ?
+ RT_SCOPE_LINK :
+ RT_SCOPE_UNIVERSE),
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ .fwmark = oldflp->fl4_fwmark
+#endif
+ } },
+ .iif = loopback_dev.ifindex,
+ .oif = oldflp->oif };
+ struct fib_result res;
+ unsigned flags = 0;
+ struct net_device *dev_out = NULL;
+ int free_res = 0;
+ int err;
+
+
+ res.fi = NULL;
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ res.r = NULL;
+#endif
+
+ if (oldflp->fl4_src) {
+ err = -EINVAL;
+ if (MULTICAST(oldflp->fl4_src) ||
+ BADCLASS(oldflp->fl4_src) ||
+ ZERONET(oldflp->fl4_src))
+ goto out;
+
+ /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
+ dev_out = ip_dev_find(oldflp->fl4_src);
+ if (dev_out == NULL)
+ goto out;
+
+ /* I removed the check for oif == dev_out->oif here.
+ It was wrong for two reasons:
+ 1. ip_dev_find(saddr) can return the wrong iface, if saddr
+ is assigned to multiple interfaces.
+ 2. Moreover, we are allowed to send packets with a saddr
+ of another iface. --ANK
+ */
+
+ if (oldflp->oif == 0
+ && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
+ /* Special hack: user can direct multicasts
+ and limited broadcast via the necessary interface
+ without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
+ This hack is not just for fun, it allows
+ vic, vat and friends to work.
+ They bind a socket to loopback, set the ttl to zero
+ and expect that it will work.
+ From the viewpoint of the routing cache they are broken,
+ because we are not allowed to build a multicast path
+ with a loopback source addr (the routing cache
+ cannot know that the ttl is zero, so the packet
+ will not leave this host and the route is valid).
+ Luckily, this hack is a good workaround.
+ */
+
+ fl.oif = dev_out->ifindex;
+ goto make_route;
+ }
+ if (dev_out)
+ dev_put(dev_out);
+ dev_out = NULL;
+ }
+
+
+ if (oldflp->oif) {
+ dev_out = dev_get_by_index(oldflp->oif);
+ err = -ENODEV;
+ if (dev_out == NULL)
+ goto out;
+ if (__in_dev_get(dev_out) == NULL) {
+ dev_put(dev_out);
+ goto out; /* Wrong error code */
+ }
+
+ if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
+ if (!fl.fl4_src)
+ fl.fl4_src = inet_select_addr(dev_out, 0,
+ RT_SCOPE_LINK);
+ goto make_route;
+ }
+ if (!fl.fl4_src) {
+ if (MULTICAST(oldflp->fl4_dst))
+ fl.fl4_src = inet_select_addr(dev_out, 0,
+ fl.fl4_scope);
+ else if (!oldflp->fl4_dst)
+ fl.fl4_src = inet_select_addr(dev_out, 0,
+ RT_SCOPE_HOST);
+ }
+ }
+
+ if (!fl.fl4_dst) {
+ fl.fl4_dst = fl.fl4_src;
+ if (!fl.fl4_dst)
+ fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
+ if (dev_out)
+ dev_put(dev_out);
+ dev_out = &loopback_dev;
+ dev_hold(dev_out);
+ fl.oif = loopback_dev.ifindex;
+ res.type = RTN_LOCAL;
+ flags |= RTCF_LOCAL;
+ goto make_route;
+ }
+
+ if (fib_lookup(&fl, &res)) {
+ res.fi = NULL;
+ if (oldflp->oif) {
+ /* Apparently, the routing tables are wrong. Assume
+ that the destination is on-link.
+
+ WHY? DW.
+ Because we are allowed to send to an iface
+ even if it has NO routes and NO assigned
+ addresses. When an oif is specified, the routing
+ tables are looked up with only one purpose:
+ to see whether the destination is gatewayed, rather than
+ direct. Moreover, if MSG_DONTROUTE is set,
+ we send the packet, ignoring both the routing tables
+ and the ifaddr state. --ANK
+
+
+ We could do it even if the oif is unknown
+ (IPv6 probably does), but we do not.
+ */
+
+ if (fl.fl4_src == 0)
+ fl.fl4_src = inet_select_addr(dev_out, 0,
+ RT_SCOPE_LINK);
+ res.type = RTN_UNICAST;
+ goto make_route;
+ }
+ if (dev_out)
+ dev_put(dev_out);
+ err = -ENETUNREACH;
+ goto out;
+ }
+ free_res = 1;
+
+ if (res.type == RTN_LOCAL) {
+ if (!fl.fl4_src)
+ fl.fl4_src = fl.fl4_dst;
+ if (dev_out)
+ dev_put(dev_out);
+ dev_out = &loopback_dev;
+ dev_hold(dev_out);
+ fl.oif = dev_out->ifindex;
+ if (res.fi)
+ fib_info_put(res.fi);
+ res.fi = NULL;
+ flags |= RTCF_LOCAL;
+ goto make_route;
+ }
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (res.fi->fib_nhs > 1 && fl.oif == 0)
+ fib_select_multipath(&fl, &res);
+ else
+#endif
+ if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
+ fib_select_default(&fl, &res);
+
+ if (!fl.fl4_src)
+ fl.fl4_src = FIB_RES_PREFSRC(res);
+
+ if (dev_out)
+ dev_put(dev_out);
+ dev_out = FIB_RES_DEV(res);
+ dev_hold(dev_out);
+ fl.oif = dev_out->ifindex;
+
+
+make_route:
+ err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
+
+
+ if (free_res)
+ fib_res_put(&res);
+ if (dev_out)
+ dev_put(dev_out);
+out: return err;
+}
+
+int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
+{
+ unsigned hash;
+ struct rtable *rth;
+
+ hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
+
+ rcu_read_lock_bh();
+ for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
+ rth = rcu_dereference(rth->u.rt_next)) {
+ if (rth->fl.fl4_dst == flp->fl4_dst &&
+ rth->fl.fl4_src == flp->fl4_src &&
+ rth->fl.iif == 0 &&
+ rth->fl.oif == flp->oif &&
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ rth->fl.fl4_fwmark == flp->fl4_fwmark &&
+#endif
+ !((rth->fl.fl4_tos ^ flp->fl4_tos) &
+ (IPTOS_RT_MASK | RTO_ONLINK))) {
+
+ /* check for multipath routes and choose one if
+ * necessary
+ */
+ if (multipath_select_route(flp, rth, rp)) {
+ dst_hold(&(*rp)->u.dst);
+ RT_CACHE_STAT_INC(out_hit);
+ rcu_read_unlock_bh();
+ return 0;
+ }
+
+ rth->u.dst.lastuse = jiffies;
+ dst_hold(&rth->u.dst);
+ rth->u.dst.__use++;
+ RT_CACHE_STAT_INC(out_hit);
+ rcu_read_unlock_bh();
+ *rp = rth;
+ return 0;
+ }
+ RT_CACHE_STAT_INC(out_hlist_search);
+ }
+ rcu_read_unlock_bh();
+
+ return ip_route_output_slow(rp, flp);
+}
+
+int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
+{
+ int err;
+
+ if ((err = __ip_route_output_key(rp, flp)) != 0)
+ return err;
+
+ if (flp->proto) {
+ if (!flp->fl4_src)
+ flp->fl4_src = (*rp)->rt_src;
+ if (!flp->fl4_dst)
+ flp->fl4_dst = (*rp)->rt_dst;
+ return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
+ }
+
+ return 0;
+}
+
+int ip_route_output_key(struct rtable **rp, struct flowi *flp)
+{
+ return ip_route_output_flow(rp, flp, NULL, 0);
+}
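+
+/*
+ * Typical usage (illustrative sketch only; dst, src, tos and oif stand
+ * for the caller's flow parameters -- see cookie_v4_check() in
+ * net/ipv4/syncookies.c below for a real caller):
+ *
+ *	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
+ *						 .saddr = src,
+ *						 .tos   = tos } },
+ *			    .oif = oif };
+ *	struct rtable *rt;
+ *
+ *	if (ip_route_output_key(&rt, &fl))
+ *		return -EHOSTUNREACH;
+ *	... use rt->u.dst, then drop the reference with ip_rt_put(rt) ...
+ */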
+
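+/*
+ * Fill a netlink RTM_NEWROUTE message describing the route cache entry
+ * attached to skb->dst: destination, source, output interface, gateway,
+ * metrics and cache statistics.
+ */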
+static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
+ int nowait)
+{
+ struct rtable *rt = (struct rtable*)skb->dst;
+ struct rtmsg *r;
+ struct nlmsghdr *nlh;
+ unsigned char *b = skb->tail;
+ struct rta_cacheinfo ci;
+#ifdef CONFIG_IP_MROUTE
+ struct rtattr *eptr;
+#endif
+ nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
+ r = NLMSG_DATA(nlh);
+ nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
+ r->rtm_family = AF_INET;
+ r->rtm_dst_len = 32;
+ r->rtm_src_len = 0;
+ r->rtm_tos = rt->fl.fl4_tos;
+ r->rtm_table = RT_TABLE_MAIN;
+ r->rtm_type = rt->rt_type;
+ r->rtm_scope = RT_SCOPE_UNIVERSE;
+ r->rtm_protocol = RTPROT_UNSPEC;
+ r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
+ if (rt->rt_flags & RTCF_NOTIFY)
+ r->rtm_flags |= RTM_F_NOTIFY;
+ RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
+ if (rt->fl.fl4_src) {
+ r->rtm_src_len = 32;
+ RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
+ }
+ if (rt->u.dst.dev)
+ RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
+#ifdef CONFIG_NET_CLS_ROUTE
+ if (rt->u.dst.tclassid)
+ RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
+#endif
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+ if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
+ __u32 alg = rt->rt_multipath_alg;
+
+ RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
+ }
+#endif
+ if (rt->fl.iif)
+ RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
+ else if (rt->rt_src != rt->fl.fl4_src)
+ RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
+ if (rt->rt_dst != rt->rt_gateway)
+ RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
+ if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
+ goto rtattr_failure;
+ ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
+ ci.rta_used = rt->u.dst.__use;
+ ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
+ if (rt->u.dst.expires)
+ ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
+ else
+ ci.rta_expires = 0;
+ ci.rta_error = rt->u.dst.error;
+ ci.rta_id = ci.rta_ts = ci.rta_tsage = 0;
+ if (rt->peer) {
+ ci.rta_id = rt->peer->ip_id_count;
+ if (rt->peer->tcp_ts_stamp) {
+ ci.rta_ts = rt->peer->tcp_ts;
+ ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
+ }
+ }
+#ifdef CONFIG_IP_MROUTE
+ eptr = (struct rtattr*)skb->tail;
+#endif
+ RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
+ if (rt->fl.iif) {
+#ifdef CONFIG_IP_MROUTE
+ u32 dst = rt->rt_dst;
+
+ if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
+ ipv4_devconf.mc_forwarding) {
+ int err = ipmr_get_route(skb, r, nowait);
+ if (err <= 0) {
+ if (!nowait) {
+ if (err == 0)
+ return 0;
+ goto nlmsg_failure;
+ } else {
+ if (err == -EMSGSIZE)
+ goto nlmsg_failure;
+ ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
+ }
+ }
+ } else
+#endif
+ RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
+ }
+
+ nlh->nlmsg_len = skb->tail - b;
+ return skb->len;
+
+nlmsg_failure:
+rtattr_failure:
+ skb_trim(skb, b - skb->data);
+ return -1;
+}
+
+int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
+{
+ struct rtattr **rta = arg;
+ struct rtmsg *rtm = NLMSG_DATA(nlh);
+ struct rtable *rt = NULL;
+ u32 dst = 0;
+ u32 src = 0;
+ int iif = 0;
+ int err = -ENOBUFS;
+ struct sk_buff *skb;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ goto out;
+
+ /* Reserve room for dummy headers; this skb can pass
+ through a good chunk of the routing engine.
+ */
+ skb->mac.raw = skb->data;
+ skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
+
+ if (rta[RTA_SRC - 1])
+ memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
+ if (rta[RTA_DST - 1])
+ memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
+ if (rta[RTA_IIF - 1])
+ memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
+
+ if (iif) {
+ struct net_device *dev = __dev_get_by_index(iif);
+ err = -ENODEV;
+ if (!dev)
+ goto out_free;
+ skb->protocol = htons(ETH_P_IP);
+ skb->dev = dev;
+ local_bh_disable();
+ err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
+ local_bh_enable();
+ rt = (struct rtable*)skb->dst;
+ if (!err && rt->u.dst.error)
+ err = -rt->u.dst.error;
+ } else {
+ struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
+ .saddr = src,
+ .tos = rtm->rtm_tos } } };
+ int oif = 0;
+ if (rta[RTA_OIF - 1])
+ memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
+ fl.oif = oif;
+ err = ip_route_output_key(&rt, &fl);
+ }
+ if (err)
+ goto out_free;
+
+ skb->dst = &rt->u.dst;
+ if (rtm->rtm_flags & RTM_F_NOTIFY)
+ rt->rt_flags |= RTCF_NOTIFY;
+
+ NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
+
+ err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
+ RTM_NEWROUTE, 0);
+ if (!err)
+ goto out_free;
+ if (err < 0) {
+ err = -EMSGSIZE;
+ goto out_free;
+ }
+
+ err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
+ if (err > 0)
+ err = 0;
+out: return err;
+
+out_free:
+ kfree_skb(skb);
+ goto out;
+}
+
+int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct rtable *rt;
+ int h, s_h;
+ int idx, s_idx;
+
+ s_h = cb->args[0];
+ s_idx = idx = cb->args[1];
+ for (h = 0; h <= rt_hash_mask; h++) {
+ if (h < s_h) continue;
+ if (h > s_h)
+ s_idx = 0;
+ rcu_read_lock_bh();
+ for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
+ rt = rcu_dereference(rt->u.rt_next), idx++) {
+ if (idx < s_idx)
+ continue;
+ skb->dst = dst_clone(&rt->u.dst);
+ if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWROUTE, 1) <= 0) {
+ dst_release(xchg(&skb->dst, NULL));
+ rcu_read_unlock_bh();
+ goto done;
+ }
+ dst_release(xchg(&skb->dst, NULL));
+ }
+ rcu_read_unlock_bh();
+ }
+
+done:
+ cb->args[0] = h;
+ cb->args[1] = idx;
+ return skb->len;
+}
+
+void ip_rt_multicast_event(struct in_device *in_dev)
+{
+ rt_cache_flush(0);
+}
+
+#ifdef CONFIG_SYSCTL
+static int flush_delay;
+
+static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
+ struct file *filp, void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ if (write) {
+ proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+ rt_cache_flush(flush_delay);
+ return 0;
+ }
+
+ return -EINVAL;
+}
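+
+/*
+ * Example (illustrative): the value written to
+ * /proc/sys/net/ipv4/route/flush is passed to rt_cache_flush() as the
+ * delay, so
+ *
+ *	echo 0 > /proc/sys/net/ipv4/route/flush
+ *
+ * requests an immediate flush of the routing cache.
+ */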
+
+static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
+ int __user *name,
+ int nlen,
+ void __user *oldval,
+ size_t __user *oldlenp,
+ void __user *newval,
+ size_t newlen,
+ void **context)
+{
+ int delay;
+ if (newlen != sizeof(int))
+ return -EINVAL;
+ if (get_user(delay, (int __user *)newval))
+ return -EFAULT;
+ rt_cache_flush(delay);
+ return 0;
+}
+
+ctl_table ipv4_route_table[] = {
+ {
+ .ctl_name = NET_IPV4_ROUTE_FLUSH,
+ .procname = "flush",
+ .data = &flush_delay,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &ipv4_sysctl_rtcache_flush,
+ .strategy = &ipv4_sysctl_rtcache_flush_strategy,
+ },
+ {
+ .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
+ .procname = "min_delay",
+ .data = &ip_rt_min_delay,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
+ .procname = "max_delay",
+ .data = &ip_rt_max_delay,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
+ .procname = "gc_thresh",
+ .data = &ipv4_dst_ops.gc_thresh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
+ .procname = "max_size",
+ .data = &ip_rt_max_size,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ /* Deprecated. Use gc_min_interval_ms */
+
+ .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
+ .procname = "gc_min_interval",
+ .data = &ip_rt_gc_min_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
+ .procname = "gc_min_interval_ms",
+ .data = &ip_rt_gc_min_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_ms_jiffies,
+ .strategy = &sysctl_ms_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
+ .procname = "gc_timeout",
+ .data = &ip_rt_gc_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
+ .procname = "gc_interval",
+ .data = &ip_rt_gc_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
+ .procname = "redirect_load",
+ .data = &ip_rt_redirect_load,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
+ .procname = "redirect_number",
+ .data = &ip_rt_redirect_number,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
+ .procname = "redirect_silence",
+ .data = &ip_rt_redirect_silence,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
+ .procname = "error_cost",
+ .data = &ip_rt_error_cost,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
+ .procname = "error_burst",
+ .data = &ip_rt_error_burst,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
+ .procname = "gc_elasticity",
+ .data = &ip_rt_gc_elasticity,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
+ .procname = "mtu_expires",
+ .data = &ip_rt_mtu_expires,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies,
+ },
+ {
+ .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
+ .procname = "min_pmtu",
+ .data = &ip_rt_min_pmtu,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
+ .procname = "min_adv_mss",
+ .data = &ip_rt_min_advmss,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
+ .procname = "secret_interval",
+ .data = &ip_rt_secret_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies,
+ },
+ { .ctl_name = 0 }
+};
+#endif
+
+#ifdef CONFIG_NET_CLS_ROUTE
+struct ip_rt_acct *ip_rt_acct;
+
+/* This code sucks. But you should have seen it before! --RR */
+
+/* IP route accounting ptr for this logical cpu number. */
+#define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
+
+#ifdef CONFIG_PROC_FS
+static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
+ int length, int *eof, void *data)
+{
+ unsigned int i;
+
+ if ((offset & 3) || (length & 3))
+ return -EIO;
+
+ if (offset >= sizeof(struct ip_rt_acct) * 256) {
+ *eof = 1;
+ return 0;
+ }
+
+ if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
+ length = sizeof(struct ip_rt_acct) * 256 - offset;
+ *eof = 1;
+ }
+
+ offset /= sizeof(u32);
+
+ if (length > 0) {
+ u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
+ u32 *dst = (u32 *) buffer;
+
+ /* Copy first cpu. */
+ *start = buffer;
+ memcpy(dst, src, length);
+
+ /* Add the other cpus in, one int at a time */
+ for_each_cpu(i) {
+ unsigned int j;
+
+ src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
+
+ for (j = 0; j < length/4; j++)
+ dst[j] += src[j];
+ }
+ }
+ return length;
+}
+#endif /* CONFIG_PROC_FS */
+#endif /* CONFIG_NET_CLS_ROUTE */
+
+static __initdata unsigned long rhash_entries;
+static int __init set_rhash_entries(char *str)
+{
+ if (!str)
+ return 0;
+ rhash_entries = simple_strtoul(str, &str, 0);
+ return 1;
+}
+__setup("rhash_entries=", set_rhash_entries);
+
+int __init ip_rt_init(void)
+{
+ int i, order, goal, rc = 0;
+
+ rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
+ (jiffies ^ (jiffies >> 7)));
+
+#ifdef CONFIG_NET_CLS_ROUTE
+ for (order = 0;
+ (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
+ /* NOTHING */;
+ ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
+ if (!ip_rt_acct)
+ panic("IP: failed to allocate ip_rt_acct\n");
+ memset(ip_rt_acct, 0, PAGE_SIZE << order);
+#endif
+
+ ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
+ sizeof(struct rtable),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+
+ if (!ipv4_dst_ops.kmem_cachep)
+ panic("IP: failed to allocate ip_dst_cache\n");
+
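+ /* Size the route cache hash table: by default roughly one page of
+ buckets per 64MB of physical memory, overridable with the
+ "rhash_entries=" boot parameter. The bucket count is rounded
+ down to a power of two, and smaller allocation orders are tried
+ if the allocation fails.
+ */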
+ goal = num_physpages >> (26 - PAGE_SHIFT);
+ if (rhash_entries)
+ goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT;
+ for (order = 0; (1UL << order) < goal; order++)
+ /* NOTHING */;
+
+ do {
+ rt_hash_mask = (1UL << order) * PAGE_SIZE /
+ sizeof(struct rt_hash_bucket);
+ while (rt_hash_mask & (rt_hash_mask - 1))
+ rt_hash_mask--;
+ rt_hash_table = (struct rt_hash_bucket *)
+ __get_free_pages(GFP_ATOMIC, order);
+ } while (rt_hash_table == NULL && --order > 0);
+
+ if (!rt_hash_table)
+ panic("Failed to allocate IP route cache hash table\n");
+
+ printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
+ rt_hash_mask,
+ (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
+
+ for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
+ /* NOTHING */;
+
+ rt_hash_mask--;
+ for (i = 0; i <= rt_hash_mask; i++) {
+ spin_lock_init(&rt_hash_table[i].lock);
+ rt_hash_table[i].chain = NULL;
+ }
+
+ ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
+ ip_rt_max_size = (rt_hash_mask + 1) * 16;
+
+ rt_cache_stat = alloc_percpu(struct rt_cache_stat);
+ if (!rt_cache_stat)
+ return -ENOMEM;
+
+ devinet_init();
+ ip_fib_init();
+
+ init_timer(&rt_flush_timer);
+ rt_flush_timer.function = rt_run_flush;
+ init_timer(&rt_periodic_timer);
+ rt_periodic_timer.function = rt_check_expire;
+ init_timer(&rt_secret_timer);
+ rt_secret_timer.function = rt_secret_rebuild;
+
+ /* All the timers started at system startup tend
+ to synchronize. Perturb them a bit.
+ */
+ rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
+ ip_rt_gc_interval;
+ add_timer(&rt_periodic_timer);
+
+ rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
+ ip_rt_secret_interval;
+ add_timer(&rt_secret_timer);
+
+#ifdef CONFIG_PROC_FS
+ {
+ struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
+ if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
+ !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
+ proc_net_stat))) {
+ free_percpu(rt_cache_stat);
+ return -ENOMEM;
+ }
+ rtstat_pde->proc_fops = &rt_cpu_seq_fops;
+ }
+#ifdef CONFIG_NET_CLS_ROUTE
+ create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
+#endif
+#endif
+#ifdef CONFIG_XFRM
+ xfrm_init();
+ xfrm4_init();
+#endif
+ return rc;
+}
+
+EXPORT_SYMBOL(__ip_select_ident);
+EXPORT_SYMBOL(ip_route_input);
+EXPORT_SYMBOL(ip_route_output_key);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
new file mode 100644
index 00000000000..e923d2f021a
--- /dev/null
+++ b/net/ipv4/syncookies.c
@@ -0,0 +1,279 @@
+/*
+ * Syncookies implementation for the Linux kernel
+ *
+ * Copyright (C) 1997 Andi Kleen
+ * Based on ideas by D.J.Bernstein and Eric Schenk.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * $Id: syncookies.c,v 1.18 2002/02/01 22:01:04 davem Exp $
+ *
+ * Missing: IPv6 support.
+ */
+
+#include <linux/tcp.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/cryptohash.h>
+#include <linux/kernel.h>
+#include <net/tcp.h>
+
+extern int sysctl_tcp_syncookies;
+
+static __u32 syncookie_secret[2][16-3+SHA_DIGEST_WORDS];
+
+static __init int init_syncookies(void)
+{
+ get_random_bytes(syncookie_secret, sizeof(syncookie_secret));
+ return 0;
+}
+module_init(init_syncookies);
+
+#define COOKIEBITS 24 /* Upper bits store count */
+#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
+
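+/*
+ * Mix the connection 4-tuple, a counter and one of the two boot-time
+ * random secrets through a single SHA1 transform and return one word
+ * of the digest.
+ */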
+static u32 cookie_hash(u32 saddr, u32 daddr, u32 sport, u32 dport,
+ u32 count, int c)
+{
+ __u32 tmp[16 + 5 + SHA_WORKSPACE_WORDS];
+
+ memcpy(tmp + 3, syncookie_secret[c], sizeof(syncookie_secret[c]));
+ tmp[0] = saddr;
+ tmp[1] = daddr;
+ tmp[2] = (sport << 16) + dport;
+ tmp[3] = count;
+ sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
+
+ return tmp[17];
+}
+
+static __u32 secure_tcp_syn_cookie(__u32 saddr, __u32 daddr, __u16 sport,
+ __u16 dport, __u32 sseq, __u32 count,
+ __u32 data)
+{
+ /*
+ * Compute the secure sequence number.
+ * The output should be:
+ * HASH(sec1,saddr,sport,daddr,dport,sec1) + sseq + (count * 2^24)
+ * + (HASH(sec2,saddr,sport,daddr,dport,count,sec2) % 2^24).
+ * Where sseq is their sequence number and count increases every
+ * minute by 1.
+ * As an extra hack, we add a small "data" value that encodes the
+ * MSS into the second hash value.
+ */
+
+ return (cookie_hash(saddr, daddr, sport, dport, 0, 0) +
+ sseq + (count << COOKIEBITS) +
+ ((cookie_hash(saddr, daddr, sport, dport, count, 1) + data)
+ & COOKIEMASK));
+}
+
+/*
+ * This retrieves the small "data" value from the syncookie.
+ * If the syncookie is bad, the data returned will be out of
+ * range. This must be checked by the caller.
+ *
+ * The count value used to generate the cookie must be within
+ * "maxdiff" of the current (passed-in) "count". The return value
+ * is (__u32)-1 if this test fails.
+ */
+static __u32 check_tcp_syn_cookie(__u32 cookie, __u32 saddr, __u32 daddr,
+ __u16 sport, __u16 dport, __u32 sseq,
+ __u32 count, __u32 maxdiff)
+{
+ __u32 diff;
+
+ /* Strip away the layers from the cookie */
+ cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq;
+
+ /* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */
+ diff = (count - (cookie >> COOKIEBITS)) & ((__u32) - 1 >> COOKIEBITS);
+ if (diff >= maxdiff)
+ return (__u32)-1;
+
+ return (cookie -
+ cookie_hash(saddr, daddr, sport, dport, count - diff, 1))
+ & COOKIEMASK; /* Leaving the data behind */
+}
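+
+/*
+ * Worked example (illustrative): suppose the cookie was generated with
+ * count == 100 and data == 5. After subtracting
+ * cookie_hash(..., 0, 0) + sseq, what remains is
+ * (100 << COOKIEBITS) + ((cookie_hash(..., 100, 1) + 5) & COOKIEMASK).
+ * If the current count is 102 and maxdiff is 4, then diff == 2 and the
+ * check passes; subtracting cookie_hash(..., count - diff, 1), which is
+ * cookie_hash(..., 100, 1), and masking with COOKIEMASK recovers 5.
+ * cookie_check() below uses the recovered value as an index into msstab[].
+ */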
+
+/*
+ * This table has to be sorted and terminated with (__u16)-1.
+ * XXX generate a better table.
+ * Unresolved Issues: HIPPI with a 64k MSS is not well supported.
+ */
+static __u16 const msstab[] = {
+ 64 - 1,
+ 256 - 1,
+ 512 - 1,
+ 536 - 1,
+ 1024 - 1,
+ 1440 - 1,
+ 1460 - 1,
+ 4312 - 1,
+ (__u16)-1
+};
+/* The number doesn't include the -1 terminator */
+#define NUM_MSS (ARRAY_SIZE(msstab) - 1)
+
+/*
+ * Generate a syncookie. mssp points to the mss, which is returned
+ * rounded down to the value encoded in the cookie.
+ */
+__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int mssind;
+ const __u16 mss = *mssp;
+
+
+ tp->last_synq_overflow = jiffies;
+
+ /* XXX sort msstab[] by probability? Binary search? */
+ for (mssind = 0; mss > msstab[mssind + 1]; mssind++)
+ ;
+ *mssp = msstab[mssind] + 1;
+
+ NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESSENT);
+
+ return secure_tcp_syn_cookie(skb->nh.iph->saddr, skb->nh.iph->daddr,
+ skb->h.th->source, skb->h.th->dest,
+ ntohl(skb->h.th->seq),
+ jiffies / (HZ * 60), mssind);
+}
+
+/*
+ * This (misnamed) value is the maximum age of a syncookie that is permitted.
+ * Ideally it would depend on TCP_TIMEOUT_INIT and
+ * sysctl_tcp_retries1, but that is a rather complicated formula (exponential
+ * backoff) to compute at runtime, so it is currently hardcoded here.
+ */
+#define COUNTER_TRIES 4
+/*
+ * Check if an ACK sequence number is a valid syncookie.
+ * Return the decoded MSS if it is, or 0 if not.
+ */
+static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
+{
+ __u32 seq;
+ __u32 mssind;
+
+ seq = ntohl(skb->h.th->seq)-1;
+ mssind = check_tcp_syn_cookie(cookie,
+ skb->nh.iph->saddr, skb->nh.iph->daddr,
+ skb->h.th->source, skb->h.th->dest,
+ seq, jiffies / (HZ * 60), COUNTER_TRIES);
+
+ return mssind < NUM_MSS ? msstab[mssind] + 1 : 0;
+}
+
+extern struct or_calltable or_ipv4;
+
+static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
+ struct open_request *req,
+ struct dst_entry *dst)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sock *child;
+
+ child = tp->af_specific->syn_recv_sock(sk, skb, req, dst);
+ if (child)
+ tcp_acceptq_queue(sk, req, child);
+ else
+ tcp_openreq_free(req);
+
+ return child;
+}
+
+struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
+ struct ip_options *opt)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ __u32 cookie = ntohl(skb->h.th->ack_seq) - 1;
+ struct sock *ret = sk;
+ struct open_request *req;
+ int mss;
+ struct rtable *rt;
+ __u8 rcv_wscale;
+
+ if (!sysctl_tcp_syncookies || !skb->h.th->ack)
+ goto out;
+
+ if (time_after(jiffies, tp->last_synq_overflow + TCP_TIMEOUT_INIT) ||
+ (mss = cookie_check(skb, cookie)) == 0) {
+ NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESFAILED);
+ goto out;
+ }
+
+ NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESRECV);
+
+ req = tcp_openreq_alloc();
+ ret = NULL;
+ if (!req)
+ goto out;
+
+ req->rcv_isn = ntohl(skb->h.th->seq) - 1;
+ req->snt_isn = cookie;
+ req->mss = mss;
+ req->rmt_port = skb->h.th->source;
+ req->af.v4_req.loc_addr = skb->nh.iph->daddr;
+ req->af.v4_req.rmt_addr = skb->nh.iph->saddr;
+ req->class = &or_ipv4; /* for safety */
+ req->af.v4_req.opt = NULL;
+
+ /* We threw the options of the initial SYN away, so we hope
+ * the ACK carries the same options again (see RFC 1122 4.2.3.8)
+ */
+ if (opt && opt->optlen) {
+ int opt_size = sizeof(struct ip_options) + opt->optlen;
+
+ req->af.v4_req.opt = kmalloc(opt_size, GFP_ATOMIC);
+ if (req->af.v4_req.opt) {
+ if (ip_options_echo(req->af.v4_req.opt, skb)) {
+ kfree(req->af.v4_req.opt);
+ req->af.v4_req.opt = NULL;
+ }
+ }
+ }
+
+ req->snd_wscale = req->rcv_wscale = req->tstamp_ok = 0;
+ req->wscale_ok = req->sack_ok = 0;
+ req->expires = 0UL;
+ req->retrans = 0;
+
+ /*
+ * We need to look up the route here to get the correct
+ * window size. Ideally we would make sure that the window size
+ * hasn't changed since we received the original SYN, but there
+ * is no easy way to do this.
+ */
+ {
+ struct flowi fl = { .nl_u = { .ip4_u =
+ { .daddr = ((opt && opt->srr) ?
+ opt->faddr :
+ req->af.v4_req.rmt_addr),
+ .saddr = req->af.v4_req.loc_addr,
+ .tos = RT_CONN_FLAGS(sk) } },
+ .proto = IPPROTO_TCP,
+ .uli_u = { .ports =
+ { .sport = skb->h.th->dest,
+ .dport = skb->h.th->source } } };
+ if (ip_route_output_key(&rt, &fl)) {
+ tcp_openreq_free(req);
+ goto out;
+ }
+ }
+
+ /* Try to redo what tcp_v4_send_synack did. */
+ req->window_clamp = dst_metric(&rt->u.dst, RTAX_WINDOW);
+ tcp_select_initial_window(tcp_full_space(sk), req->mss,
+ &req->rcv_wnd, &req->window_clamp,
+ 0, &rcv_wscale);
+ /* BTW win scale with syncookies is 0 by definition */
+ req->rcv_wscale = rcv_wscale;
+
+ ret = get_cookie_sock(sk, skb, req, &rt->u.dst);
+out: return ret;
+}
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
new file mode 100644
index 00000000000..3aafb298c1c
--- /dev/null
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -0,0 +1,698 @@
+/*
+ * sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem.
+ *
+ * $Id: sysctl_net_ipv4.c,v 1.50 2001/10/20 00:00:11 davem Exp $
+ *
+ * Begun April 1, 1996, Mike Shaver.
+ * Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS]
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+#include <linux/config.h>
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/tcp.h>
+
+/* From af_inet.c */
+extern int sysctl_ip_nonlocal_bind;
+
+/* From icmp.c */
+extern int sysctl_icmp_echo_ignore_all;
+extern int sysctl_icmp_echo_ignore_broadcasts;
+extern int sysctl_icmp_ignore_bogus_error_responses;
+
+/* From ip_fragment.c */
+extern int sysctl_ipfrag_low_thresh;
+extern int sysctl_ipfrag_high_thresh;
+extern int sysctl_ipfrag_time;
+extern int sysctl_ipfrag_secret_interval;
+
+/* From ip_output.c */
+extern int sysctl_ip_dynaddr;
+
+/* From icmp.c */
+extern int sysctl_icmp_ratelimit;
+extern int sysctl_icmp_ratemask;
+
+/* From igmp.c */
+extern int sysctl_igmp_max_memberships;
+extern int sysctl_igmp_max_msf;
+
+/* From inetpeer.c */
+extern int inet_peer_threshold;
+extern int inet_peer_minttl;
+extern int inet_peer_maxttl;
+extern int inet_peer_gc_mintime;
+extern int inet_peer_gc_maxtime;
+
+#ifdef CONFIG_SYSCTL
+static int tcp_retr1_max = 255;
+static int ip_local_port_range_min[] = { 1, 1 };
+static int ip_local_port_range_max[] = { 65535, 65535 };
+#endif
+
+struct ipv4_config ipv4_config;
+
+extern ctl_table ipv4_route_table[];
+
+#ifdef CONFIG_SYSCTL
+
+static
+int ipv4_sysctl_forward(ctl_table *ctl, int write, struct file * filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int val = ipv4_devconf.forwarding;
+ int ret;
+
+ ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+
+ if (write && ipv4_devconf.forwarding != val)
+ inet_forward_change();
+
+ return ret;
+}
+
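+/*
+ * Binary sysctl (sys_sysctl) counterpart of ipv4_sysctl_forward(): copy
+ * the old forwarding value back to the caller if requested, install the
+ * new value and propagate the change to all interfaces via
+ * inet_forward_change().
+ */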
+static int ipv4_sysctl_forward_strategy(ctl_table *table,
+ int __user *name, int nlen,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen,
+ void **context)
+{
+ int *valp = table->data;
+ int new;
+
+ if (!newval || !newlen)
+ return 0;
+
+ if (newlen != sizeof(int))
+ return -EINVAL;
+
+ if (get_user(new, (int __user *)newval))
+ return -EFAULT;
+
+ if (new == *valp)
+ return 0;
+
+ if (oldval && oldlenp) {
+ size_t len;
+
+ if (get_user(len, oldlenp))
+ return -EFAULT;
+
+ if (len) {
+ if (len > table->maxlen)
+ len = table->maxlen;
+ if (copy_to_user(oldval, valp, len))
+ return -EFAULT;
+ if (put_user(len, oldlenp))
+ return -EFAULT;
+ }
+ }
+
+ *valp = new;
+ inet_forward_change();
+ return 1;
+}
+
+ctl_table ipv4_table[] = {
+ {
+ .ctl_name = NET_IPV4_TCP_TIMESTAMPS,
+ .procname = "tcp_timestamps",
+ .data = &sysctl_tcp_timestamps,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_IPV4_TCP_WINDOW_SCALING,
+ .procname = "tcp_window_scaling",
+ .data = &sysctl_tcp_window_scaling,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_IPV4_TCP_SACK,
+ .procname = "tcp_sack",
+ .data = &sysctl_tcp_sack,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_IPV4_TCP_RETRANS_COLLAPSE,
+ .procname = "tcp_retrans_collapse",
+ .data = &sysctl_tcp_retrans_collapse,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_IPV4_FORWARD,
+ .procname = "ip_forward",
+ .data = &ipv4_devconf.forwarding,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &ipv4_sysctl_forward,
+ .strategy = &ipv4_sysctl_forward_strategy
+ },
+ {
+ .ctl_name = NET_IPV4_DEFAULT_TTL,
+ .procname = "ip_default_ttl",
+ .data = &sysctl_ip_default_ttl,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &ipv4_doint_and_flush,
+ .strategy = &ipv4_doint_and_flush_strategy,
+ },
+ {
+ .ctl_name = NET_IPV4_AUTOCONFIG,
+ .procname = "ip_autoconfig",
+ .data = &ipv4_config.autoconfig,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_IPV4_NO_PMTU_DISC,
+ .procname = "ip_no_pmtu_disc",
+ .data = &ipv4_config.no_pmtu_disc,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_IPV4_NONLOCAL_BIND,
+ .procname = "ip_nonlocal_bind",
+ .data = &sysctl_ip_nonlocal_bind,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_IPV4_TCP_SYN_RETRIES,
+ .procname = "tcp_syn_retries",
+ .data = &sysctl_tcp_syn_retries,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_TCP_SYNACK_RETRIES,
+ .procname = "tcp_synack_retries",
+ .data = &sysctl_tcp_synack_retries,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_TCP_MAX_ORPHANS,
+ .procname = "tcp_max_orphans",
+ .data = &sysctl_tcp_max_orphans,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_TCP_MAX_TW_BUCKETS,
+ .procname = "tcp_max_tw_buckets",
+ .data = &sysctl_tcp_max_tw_buckets,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_IPV4_IPFRAG_HIGH_THRESH,
+ .procname = "ipfrag_high_thresh",
+ .data = &sysctl_ipfrag_high_thresh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_IPV4_IPFRAG_LOW_THRESH,
+ .procname = "ipfrag_low_thresh",
+ .data = &sysctl_ipfrag_low_thresh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_IPV4_DYNADDR,
+ .procname = "ip_dynaddr",
+ .data = &sysctl_ip_dynaddr,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_IPV4_IPFRAG_TIME,
+ .procname = "ipfrag_time",
+ .data = &sysctl_ipfrag_time,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies
+ },
+ {
+ .ctl_name = NET_IPV4_TCP_KEEPALIVE_TIME,
+ .procname = "tcp_keepalive_time",
+ .data = &sysctl_tcp_keepalive_time,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies
+ },
+ {
+ .ctl_name = NET_IPV4_TCP_KEEPALIVE_PROBES,
+ .procname = "tcp_keepalive_probes",
+ .data = &sysctl_tcp_keepalive_probes,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_IPV4_TCP_KEEPALIVE_INTVL,
+ .procname = "tcp_keepalive_intvl",
+ .data = &sysctl_tcp_keepalive_intvl,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies
+ },
+ {
+ .ctl_name = NET_IPV4_TCP_RETRIES1,
+ .procname = "tcp_retries1",
+ .data = &sysctl_tcp_retries1,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra2 = &tcp_retr1_max
+ },
+ {
+ .ctl_name = NET_IPV4_TCP_RETRIES2,
+ .procname = "tcp_retries2",
+ .data = &sysctl_tcp_retries2,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_IPV4_TCP_FIN_TIMEOUT,
+ .procname = "tcp_fin_timeout",
+ .data = &sysctl_tcp_fin_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies
+ },
+#ifdef CONFIG_SYN_COOKIES
+ {
+ .ctl_name = NET_TCP_SYNCOOKIES,
+ .procname = "tcp_syncookies",
+ .data = &sysctl_tcp_syncookies,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+#endif
+ {
+ .ctl_name = NET_TCP_TW_RECYCLE,
+ .procname = "tcp_tw_recycle",
+ .data = &sysctl_tcp_tw_recycle,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_TCP_ABORT_ON_OVERFLOW,
+ .procname = "tcp_abort_on_overflow",
+ .data = &sysctl_tcp_abort_on_overflow,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_TCP_STDURG,
+ .procname = "tcp_stdurg",
+ .data = &sysctl_tcp_stdurg,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_TCP_RFC1337,
+ .procname = "tcp_rfc1337",
+ .data = &sysctl_tcp_rfc1337,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_TCP_MAX_SYN_BACKLOG,
+ .procname = "tcp_max_syn_backlog",
+ .data = &sysctl_max_syn_backlog,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_IPV4_LOCAL_PORT_RANGE,
+ .procname = "ip_local_port_range",
+ .data = &sysctl_local_port_range,
+ .maxlen = sizeof(sysctl_local_port_range),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = ip_local_port_range_min,
+ .extra2 = ip_local_port_range_max
+ },
+ {
+ .ctl_name = NET_IPV4_ICMP_ECHO_IGNORE_ALL,
+ .procname = "icmp_echo_ignore_all",
+ .data = &sysctl_icmp_echo_ignore_all,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS,
+ .procname = "icmp_echo_ignore_broadcasts",
+ .data = &sysctl_icmp_echo_ignore_broadcasts,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES,
+ .procname = "icmp_ignore_bogus_error_responses",
+ .data = &sysctl_icmp_ignore_bogus_error_responses,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_IPV4_ROUTE,
+ .procname = "route",
+ .maxlen = 0,
+ .mode = 0555,
+ .child = ipv4_route_table
+ },
+#ifdef CONFIG_IP_MULTICAST
+ {
+ .ctl_name = NET_IPV4_IGMP_MAX_MEMBERSHIPS,
+ .procname = "igmp_max_memberships",
+ .data = &sysctl_igmp_max_memberships,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+
+#endif
+ {
+ .ctl_name = NET_IPV4_IGMP_MAX_MSF,
+ .procname = "igmp_max_msf",
+ .data = &sysctl_igmp_max_msf,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_IPV4_INET_PEER_THRESHOLD,
+ .procname = "inet_peer_threshold",
+ .data = &inet_peer_threshold,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_IPV4_INET_PEER_MINTTL,
+ .procname = "inet_peer_minttl",
+ .data = &inet_peer_minttl,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies
+ },
+ {
+ .ctl_name = NET_IPV4_INET_PEER_MAXTTL,
+ .procname = "inet_peer_maxttl",
+ .data = &inet_peer_maxttl,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies
+ },
+ {
+ .ctl_name = NET_IPV4_INET_PEER_GC_MINTIME,
+ .procname = "inet_peer_gc_mintime",
+ .data = &inet_peer_gc_mintime,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies
+ },
+ {
+ .ctl_name = NET_IPV4_INET_PEER_GC_MAXTIME,
+ .procname = "inet_peer_gc_maxtime",
+ .data = &inet_peer_gc_maxtime,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies
+ },
+ {
+ .ctl_name = NET_TCP_ORPHAN_RETRIES,
+ .procname = "tcp_orphan_retries",
+ .data = &sysctl_tcp_orphan_retries,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_TCP_FACK,
+ .procname = "tcp_fack",
+ .data = &sysctl_tcp_fack,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_TCP_REORDERING,
+ .procname = "tcp_reordering",
+ .data = &sysctl_tcp_reordering,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_TCP_ECN,
+ .procname = "tcp_ecn",
+ .data = &sysctl_tcp_ecn,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_TCP_DSACK,
+ .procname = "tcp_dsack",
+ .data = &sysctl_tcp_dsack,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_TCP_MEM,
+ .procname = "tcp_mem",
+ .data = &sysctl_tcp_mem,
+ .maxlen = sizeof(sysctl_tcp_mem),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_TCP_WMEM,
+ .procname = "tcp_wmem",
+ .data = &sysctl_tcp_wmem,
+ .maxlen = sizeof(sysctl_tcp_wmem),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_TCP_RMEM,
+ .procname = "tcp_rmem",
+ .data = &sysctl_tcp_rmem,
+ .maxlen = sizeof(sysctl_tcp_rmem),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_TCP_APP_WIN,
+ .procname = "tcp_app_win",
+ .data = &sysctl_tcp_app_win,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_TCP_ADV_WIN_SCALE,
+ .procname = "tcp_adv_win_scale",
+ .data = &sysctl_tcp_adv_win_scale,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_IPV4_ICMP_RATELIMIT,
+ .procname = "icmp_ratelimit",
+ .data = &sysctl_icmp_ratelimit,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_IPV4_ICMP_RATEMASK,
+ .procname = "icmp_ratemask",
+ .data = &sysctl_icmp_ratemask,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_TCP_TW_REUSE,
+ .procname = "tcp_tw_reuse",
+ .data = &sysctl_tcp_tw_reuse,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_TCP_FRTO,
+ .procname = "tcp_frto",
+ .data = &sysctl_tcp_frto,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_TCP_LOW_LATENCY,
+ .procname = "tcp_low_latency",
+ .data = &sysctl_tcp_low_latency,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_IPV4_IPFRAG_SECRET_INTERVAL,
+ .procname = "ipfrag_secret_interval",
+ .data = &sysctl_ipfrag_secret_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies
+ },
+ {
+ .ctl_name = NET_TCP_NO_METRICS_SAVE,
+ .procname = "tcp_no_metrics_save",
+ .data = &sysctl_tcp_nometrics_save,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_TCP_WESTWOOD,
+ .procname = "tcp_westwood",
+ .data = &sysctl_tcp_westwood,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_TCP_VEGAS,
+ .procname = "tcp_vegas_cong_avoid",
+ .data = &sysctl_tcp_vegas_cong_avoid,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_TCP_VEGAS_ALPHA,
+ .procname = "tcp_vegas_alpha",
+ .data = &sysctl_tcp_vegas_alpha,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_TCP_VEGAS_BETA,
+ .procname = "tcp_vegas_beta",
+ .data = &sysctl_tcp_vegas_beta,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_TCP_VEGAS_GAMMA,
+ .procname = "tcp_vegas_gamma",
+ .data = &sysctl_tcp_vegas_gamma,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_TCP_BIC,
+ .procname = "tcp_bic",
+ .data = &sysctl_tcp_bic,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_TCP_BIC_FAST_CONVERGENCE,
+ .procname = "tcp_bic_fast_convergence",
+ .data = &sysctl_tcp_bic_fast_convergence,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_TCP_BIC_LOW_WINDOW,
+ .procname = "tcp_bic_low_window",
+ .data = &sysctl_tcp_bic_low_window,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_TCP_MODERATE_RCVBUF,
+ .procname = "tcp_moderate_rcvbuf",
+ .data = &sysctl_tcp_moderate_rcvbuf,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_TCP_TSO_WIN_DIVISOR,
+ .procname = "tcp_tso_win_divisor",
+ .data = &sysctl_tcp_tso_win_divisor,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_TCP_BIC_BETA,
+ .procname = "tcp_bic_beta",
+ .data = &sysctl_tcp_bic_beta,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ { .ctl_name = 0 }
+};
+
+#endif /* CONFIG_SYSCTL */
+
+EXPORT_SYMBOL(ipv4_config);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
new file mode 100644
index 00000000000..5cff56af785
--- /dev/null
+++ b/net/ipv4/tcp.c
@@ -0,0 +1,2386 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Version: $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ * Corey Minyard <wf-rch!minyard@relay.EU.net>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ * Linus Torvalds, <torvalds@cs.helsinki.fi>
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Matthew Dillon, <dillon@apollo.west.oic.com>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ *
+ * Fixes:
+ * Alan Cox : Numerous verify_area() calls
+ * Alan Cox : Set the ACK bit on a reset
+ * Alan Cox : Stopped it crashing if it closed while
+ * sk->inuse=1 and was trying to connect
+ * (tcp_err()).
+ * Alan Cox : All icmp error handling was broken
+ * pointers passed where wrong and the
+ * socket was looked up backwards. Nobody
+ * tested any icmp error code obviously.
+ * Alan Cox : tcp_err() now handled properly. It
+ * wakes people on errors. poll
+ * behaves and the icmp error race
+ * has gone by moving it into sock.c
+ * Alan Cox : tcp_send_reset() fixed to work for
+ * everything not just packets for
+ * unknown sockets.
+ * Alan Cox : tcp option processing.
+ * Alan Cox : Reset tweaked (still not 100%) [Had
+ * syn rule wrong]
+ * Herp Rosmanith : More reset fixes
+ * Alan Cox : No longer acks invalid rst frames.
+ * Acking any kind of RST is right out.
+ * Alan Cox : Sets an ignore me flag on an rst
+ * receive otherwise odd bits of prattle
+ * escape still
+ * Alan Cox : Fixed another acking RST frame bug.
+ * Should stop LAN workplace lockups.
+ * Alan Cox : Some tidyups using the new skb list
+ * facilities
+ * Alan Cox : sk->keepopen now seems to work
+ * Alan Cox : Pulls options out correctly on accepts
+ * Alan Cox : Fixed assorted sk->rqueue->next errors
+ * Alan Cox : PSH doesn't end a TCP read. Switched a
+ * bit to skb ops.
+ * Alan Cox : Tidied tcp_data to avoid a potential
+ * nasty.
+ * Alan Cox : Added some better commenting, as the
+ * tcp is hard to follow
+ * Alan Cox : Removed incorrect check for 20 * psh
+ * Michael O'Reilly : ack < copied bug fix.
+ * Johannes Stille : Misc tcp fixes (not all in yet).
+ * Alan Cox : FIN with no memory -> CRASH
+ * Alan Cox : Added socket option proto entries.
+ * Also added awareness of them to accept.
+ * Alan Cox : Added TCP options (SOL_TCP)
+ * Alan Cox : Switched wakeup calls to callbacks,
+ * so the kernel can layer network
+ * sockets.
+ * Alan Cox : Use ip_tos/ip_ttl settings.
+ * Alan Cox : Handle FIN (more) properly (we hope).
+ * Alan Cox : RST frames sent on unsynchronised
+ * state ack error.
+ * Alan Cox : Put in missing check for SYN bit.
+ * Alan Cox : Added tcp_select_window() aka NET2E
+ * window non shrink trick.
+ * Alan Cox : Added a couple of small NET2E timer
+ * fixes
+ * Charles Hedrick : TCP fixes
+ * Toomas Tamm : TCP window fixes
+ * Alan Cox : Small URG fix to rlogin ^C ack fight
+ * Charles Hedrick : Rewrote most of it to actually work
+ * Linus : Rewrote tcp_read() and URG handling
+ * completely
+ * Gerhard Koerting: Fixed some missing timer handling
+ * Matthew Dillon : Reworked TCP machine states as per RFC
+ * Gerhard Koerting: PC/TCP workarounds
+ * Adam Caldwell : Assorted timer/timing errors
+ * Matthew Dillon : Fixed another RST bug
+ * Alan Cox : Move to kernel side addressing changes.
+ * Alan Cox : Beginning work on TCP fastpathing
+ * (not yet usable)
+ * Arnt Gulbrandsen: Turbocharged tcp_check() routine.
+ * Alan Cox : TCP fast path debugging
+ * Alan Cox : Window clamping
+ * Michael Riepe : Bug in tcp_check()
+ * Matt Dillon : More TCP improvements and RST bug fixes
+ * Matt Dillon : Yet more small nasties removed from the
+ * TCP code (Be very nice to this man if
+ * tcp finally works 100%) 8)
+ * Alan Cox : BSD accept semantics.
+ * Alan Cox : Reset on closedown bug.
+ * Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
+ * Michael Pall : Handle poll() after URG properly in
+ * all cases.
+ * Michael Pall : Undo the last fix in tcp_read_urg()
+ * (multi URG PUSH broke rlogin).
+ * Michael Pall : Fix the multi URG PUSH problem in
+ * tcp_readable(), poll() after URG
+ * works now.
+ * Michael Pall : recv(...,MSG_OOB) never blocks in the
+ * BSD api.
+ * Alan Cox : Changed the semantics of sk->socket to
+ * fix a race and a signal problem with
+ * accept() and async I/O.
+ * Alan Cox : Relaxed the rules on tcp_sendto().
+ * Yury Shevchuk : Really fixed accept() blocking problem.
+ * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
+ * clients/servers which listen in on
+ * fixed ports.
+ * Alan Cox : Cleaned the above up and shrank it to
+ * a sensible code size.
+ * Alan Cox : Self connect lockup fix.
+ * Alan Cox : No connect to multicast.
+ * Ross Biro : Close unaccepted children on master
+ * socket close.
+ * Alan Cox : Reset tracing code.
+ * Alan Cox : Spurious resets on shutdown.
+ * Alan Cox : Giant 15 minute/60 second timer error
+ * Alan Cox : Small whoops in polling before an
+ * accept.
+ * Alan Cox : Kept the state trace facility since
+ * it's handy for debugging.
+ * Alan Cox : More reset handler fixes.
+ * Alan Cox : Started rewriting the code based on
+ * the RFC's for other useful protocol
+ * references see: Comer, KA9Q NOS, and
+ * for a reference on the difference
+ * between specifications and how BSD
+ * works see the 4.4lite source.
+ * A.N.Kuznetsov : Don't time wait on completion of tidy
+ * close.
+ * Linus Torvalds : Fin/Shutdown & copied_seq changes.
+ * Linus Torvalds : Fixed BSD port reuse to work first syn
+ * Alan Cox : Reimplemented timers as per the RFC
+ * and using multiple timers for sanity.
+ * Alan Cox : Small bug fixes, and a lot of new
+ * comments.
+ * Alan Cox : Fixed dual reader crash by locking
+ * the buffers (much like datagram.c)
+ * Alan Cox : Fixed stuck sockets in probe. A probe
+ * now gets fed up of retrying without
+ * (even a no space) answer.
+ * Alan Cox : Extracted closing code better
+ * Alan Cox : Fixed the closing state machine to
+ * resemble the RFC.
+ * Alan Cox : More 'per spec' fixes.
+ * Jorge Cwik : Even faster checksumming.
+ * Alan Cox : tcp_data() doesn't ack illegal PSH
+ * only frames. At least one pc tcp stack
+ * generates them.
+ * Alan Cox : Cache last socket.
+ * Alan Cox : Per route irtt.
+ * Matt Day : poll()->select() match BSD precisely on error
+ * Alan Cox : New buffers
+ * Marc Tamsky : Various sk->prot->retransmits and
+ * sk->retransmits misupdating fixed.
+ * Fixed tcp_write_timeout: stuck close,
+ * and TCP syn retries gets used now.
+ * Mark Yarvis : In tcp_read_wakeup(), don't send an
+ * ack if state is TCP_CLOSED.
+ * Alan Cox : Look up device on a retransmit - routes may
+ * change. Doesn't yet cope with MSS shrink right
+ * but it's a start!
+ * Marc Tamsky : Closing in closing fixes.
+ * Mike Shaver : RFC1122 verifications.
+ * Alan Cox : rcv_saddr errors.
+ * Alan Cox : Block double connect().
+ * Alan Cox : Small hooks for enSKIP.
+ * Alexey Kuznetsov: Path MTU discovery.
+ * Alan Cox : Support soft errors.
+ * Alan Cox : Fix MTU discovery pathological case
+ * when the remote claims no mtu!
+ * Marc Tamsky : TCP_CLOSE fix.
+ * Colin (G3TNE) : Send a reset on syn ack replies in
+ * window but wrong (fixes NT lpd problems)
+ * Pedro Roque : Better TCP window handling, delayed ack.
+ * Joerg Reuter : No modification of locked buffers in
+ * tcp_do_retransmit()
+ * Eric Schenk : Changed receiver side silly window
+ * avoidance algorithm to BSD style
+ * algorithm. This doubles throughput
+ * against machines running Solaris,
+ * and seems to result in general
+ * improvement.
+ * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
+ * Willy Konynenberg : Transparent proxying support.
+ * Mike McLagan : Routing by source
+ * Keith Owens : Do proper merging with partial SKB's in
+ * tcp_do_sendmsg to avoid burstiness.
+ * Eric Schenk : Fix fast close down bug with
+ * shutdown() followed by close().
+ * Andi Kleen : Make poll agree with SIGIO
+ * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
+ * lingertime == 0 (RFC 793 ABORT Call)
+ * Hirokazu Takahashi : Use copy_from_user() instead of
+ * csum_and_copy_from_user() if possible.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Description of States:
+ *
+ * TCP_SYN_SENT sent a connection request, waiting for ack
+ *
+ * TCP_SYN_RECV received a connection request, sent ack,
+ * waiting for final ack in three-way handshake.
+ *
+ * TCP_ESTABLISHED connection established
+ *
+ * TCP_FIN_WAIT1 our side has shutdown, waiting to complete
+ * transmission of remaining buffered data
+ *
+ * TCP_FIN_WAIT2 all buffered data sent, waiting for remote
+ * to shutdown
+ *
+ * TCP_CLOSING both sides have shutdown but we still have
+ * data we have to finish sending
+ *
+ * TCP_TIME_WAIT timeout to catch resent junk before entering
+ * closed, can only be entered from FIN_WAIT2
+ * or CLOSING. Required because the other end
+ * may not have gotten our last ACK causing it
+ * to retransmit the data packet (which we ignore)
+ *
+ * TCP_CLOSE_WAIT remote side has shutdown and is waiting for
+ * us to finish writing our data and to shutdown
+ * (we have to close() to move on to LAST_ACK)
+ *
+ * TCP_LAST_ACK our side has shutdown after remote has
+ * shutdown. There may still be data in our
+ * buffer that we have to finish sending
+ *
+ * TCP_CLOSE socket is finished
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+#include <linux/smp_lock.h>
+#include <linux/fs.h>
+#include <linux/random.h>
+#include <linux/bootmem.h>
+
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/xfrm.h>
+#include <net/ip.h>
+
+
+#include <asm/uaccess.h>
+#include <asm/ioctls.h>
+
+int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
+
+DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
+
+kmem_cache_t *tcp_openreq_cachep;
+kmem_cache_t *tcp_bucket_cachep;
+kmem_cache_t *tcp_timewait_cachep;
+
+atomic_t tcp_orphan_count = ATOMIC_INIT(0);
+
+int sysctl_tcp_mem[3];
+int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
+int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
+
+EXPORT_SYMBOL(sysctl_tcp_mem);
+EXPORT_SYMBOL(sysctl_tcp_rmem);
+EXPORT_SYMBOL(sysctl_tcp_wmem);
+
+atomic_t tcp_memory_allocated; /* Current allocated memory. */
+atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
+
+EXPORT_SYMBOL(tcp_memory_allocated);
+EXPORT_SYMBOL(tcp_sockets_allocated);
+
+/*
+ * Pressure flag: try to collapse.
+ * Technical note: it is used by multiple contexts non-atomically.
+ * All of sk_stream_mem_schedule() is of this nature: accounting
+ * is strict, actions are advisory and have some latency.
+ */
+int tcp_memory_pressure;
+
+EXPORT_SYMBOL(tcp_memory_pressure);
+
+void tcp_enter_memory_pressure(void)
+{
+ if (!tcp_memory_pressure) {
+ NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
+ tcp_memory_pressure = 1;
+ }
+}
+
+EXPORT_SYMBOL(tcp_enter_memory_pressure);
+
+/*
+ * LISTEN is a special case for poll..
+ */
+static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
+ poll_table *wait)
+{
+ return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0;
+}
+
+/*
+ * Wait for a TCP event.
+ *
+ * Note that we don't need to lock the socket, as the upper poll layers
+ * take care of normal races (between the test and the event) and we don't
+ * go look at any of the socket buffers directly.
+ */
+unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
+{
+ unsigned int mask;
+ struct sock *sk = sock->sk;
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ poll_wait(file, sk->sk_sleep, wait);
+ if (sk->sk_state == TCP_LISTEN)
+ return tcp_listen_poll(sk, wait);
+
+ /* Socket is not locked. We are protected from async events
+ by the poll logic, and correct handling of state changes
+ made by other threads is impossible in any case.
+ */
+
+ mask = 0;
+ if (sk->sk_err)
+ mask = POLLERR;
+
+ /*
+ * POLLHUP is certainly not done right. But poll() doesn't
+ * have a notion of HUP in just one direction, and for a
+ * socket the read side is more interesting.
+ *
+ * Some poll() documentation says that POLLHUP is incompatible
+ * with the POLLOUT/POLLWR flags, so somebody should check this
+ * all. But careful, it tends to be safer to return too many
+ * bits than too few, and you can easily break real applications
+ * if you don't tell them that something has hung up!
+ *
+ * Check-me.
+ *
+ * Check number 1. POLLHUP is an _UNMASKABLE_ event (see UNIX98 and
+ * our fs/select.c). It means that after we received EOF,
+ * poll always returns immediately, making it impossible to poll() for
+ * write in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
+ * if and only if shutdown has been made in both directions.
+ * Actually, it is interesting to look at how Solaris and DUX
+ * solve this dilemma. I would prefer, if POLLHUP were maskable,
+ * then we could set it on SND_SHUTDOWN. BTW the examples given
+ * in Stevens' books assume exactly this behaviour, which explains
+ * why POLLHUP is incompatible with POLLOUT. --ANK
+ *
+ * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
+ * blocking on fresh not-connected or disconnected socket. --ANK
+ */
+ if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
+ mask |= POLLHUP;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ mask |= POLLIN | POLLRDNORM;
+
+ /* Connected? */
+ if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+ /* Potential race condition. If the read of tp below is
+ * reordered above the read of sk->sk_state, we can be spuriously
+ * woken in SYN_* states. */
+ if ((tp->rcv_nxt != tp->copied_seq) &&
+ (tp->urg_seq != tp->copied_seq ||
+ tp->rcv_nxt != tp->copied_seq + 1 ||
+ sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
+ mask |= POLLIN | POLLRDNORM;
+
+ if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
+ if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
+ mask |= POLLOUT | POLLWRNORM;
+ } else { /* send SIGIO later */
+ set_bit(SOCK_ASYNC_NOSPACE,
+ &sk->sk_socket->flags);
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+
+ /* Race breaker. If space is freed after
+ * wspace test but before the flags are set,
+ * IO signal will be lost.
+ */
+ if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
+ mask |= POLLOUT | POLLWRNORM;
+ }
+ }
+
+ if (tp->urg_data & TCP_URG_VALID)
+ mask |= POLLPRI;
+ }
+ return mask;
+}
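For context, here is a minimal user-space sketch of how the mask computed above surfaces through poll(2). The helper name wait_on_tcp_socket and the descriptor tcp_fd are illustrative only, and the socket is assumed to be a connected TCP socket.

#include <poll.h>
#include <stdio.h>

static void wait_on_tcp_socket(int tcp_fd)
{
        struct pollfd pfd = { .fd = tcp_fd, .events = POLLIN | POLLOUT | POLLPRI };

        if (poll(&pfd, 1, 5000) > 0) {
                if (pfd.revents & POLLIN)       /* data queued, or RCV_SHUTDOWN seen */
                        printf("readable\n");
                if (pfd.revents & POLLOUT)      /* write space >= sk_stream_min_wspace() */
                        printf("writable\n");
                if (pfd.revents & POLLPRI)      /* TCP_URG_VALID: urgent byte pending */
                        printf("urgent data\n");
                if (pfd.revents & POLLHUP)      /* both directions shut down, or TCP_CLOSE */
                        printf("hung up\n");
        }
}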
+
+int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int answ;
+
+ switch (cmd) {
+ case SIOCINQ:
+ if (sk->sk_state == TCP_LISTEN)
+ return -EINVAL;
+
+ lock_sock(sk);
+ if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
+ answ = 0;
+ else if (sock_flag(sk, SOCK_URGINLINE) ||
+ !tp->urg_data ||
+ before(tp->urg_seq, tp->copied_seq) ||
+ !before(tp->urg_seq, tp->rcv_nxt)) {
+ answ = tp->rcv_nxt - tp->copied_seq;
+
+ /* Subtract 1, if FIN is in queue. */
+ if (answ && !skb_queue_empty(&sk->sk_receive_queue))
+ answ -=
+ ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
+ } else
+ answ = tp->urg_seq - tp->copied_seq;
+ release_sock(sk);
+ break;
+ case SIOCATMARK:
+ answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
+ break;
+ case SIOCOUTQ:
+ if (sk->sk_state == TCP_LISTEN)
+ return -EINVAL;
+
+ if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
+ answ = 0;
+ else
+ answ = tp->write_seq - tp->snd_una;
+ break;
+ default:
+ return -ENOIOCTLCMD;
+ };
+
+ return put_user(answ, (int __user *)arg);
+}
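Seen from user space, the three commands handled above correspond to plain ioctl(2) calls. A minimal sketch, assuming a connected TCP socket; report_queues and tcp_fd are illustrative names.

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/sockios.h>      /* SIOCINQ, SIOCOUTQ */

static void report_queues(int tcp_fd)
{
        int inq = 0, outq = 0, atmark = 0;

        if (ioctl(tcp_fd, SIOCINQ, &inq) == 0)          /* readable bytes: rcv_nxt - copied_seq */
                printf("unread: %d\n", inq);
        if (ioctl(tcp_fd, SIOCOUTQ, &outq) == 0)        /* unacked bytes: write_seq - snd_una */
                printf("unsent/unacked: %d\n", outq);
        if (ioctl(tcp_fd, SIOCATMARK, &atmark) == 0)    /* positioned at the urgent mark? */
                printf("at mark: %d\n", atmark);
}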
+
+
+int tcp_listen_start(struct sock *sk)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_listen_opt *lopt;
+
+ sk->sk_max_ack_backlog = 0;
+ sk->sk_ack_backlog = 0;
+ tp->accept_queue = tp->accept_queue_tail = NULL;
+ rwlock_init(&tp->syn_wait_lock);
+ tcp_delack_init(tp);
+
+ lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
+ if (!lopt)
+ return -ENOMEM;
+
+ memset(lopt, 0, sizeof(struct tcp_listen_opt));
+ for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
+ if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
+ break;
+ get_random_bytes(&lopt->hash_rnd, 4);
+
+ write_lock_bh(&tp->syn_wait_lock);
+ tp->listen_opt = lopt;
+ write_unlock_bh(&tp->syn_wait_lock);
+
+ /* There is a race window here: we announce ourselves as listening,
+ * but this transition is still not validated by get_port().
+ * It is OK, because this socket enters the hash table only
+ * after validation is complete.
+ */
+ sk->sk_state = TCP_LISTEN;
+ if (!sk->sk_prot->get_port(sk, inet->num)) {
+ inet->sport = htons(inet->num);
+
+ sk_dst_reset(sk);
+ sk->sk_prot->hash(sk);
+
+ return 0;
+ }
+
+ sk->sk_state = TCP_CLOSE;
+ write_lock_bh(&tp->syn_wait_lock);
+ tp->listen_opt = NULL;
+ write_unlock_bh(&tp->syn_wait_lock);
+ kfree(lopt);
+ return -EADDRINUSE;
+}
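tcp_listen_start() is what listen(2) ultimately reaches for a TCP socket. Note that the SYN-queue size chosen via max_qlen_log above is derived from sysctl_max_syn_backlog, rounded up to a power of two with a minimum of 64, while the backlog argument to listen() separately caps the accept queue. A minimal sketch of the user-space side; start_listener is an illustrative helper.

#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int start_listener(unsigned short port)
{
        struct sockaddr_in addr;
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return -1;

        memset(&addr, 0, sizeof(addr));
        addr.sin_family = AF_INET;
        addr.sin_addr.s_addr = htonl(INADDR_ANY);
        addr.sin_port = htons(port);

        if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
            listen(fd, 128) < 0) {      /* listen() ends up in tcp_listen_start() */
                close(fd);
                return -1;
        }
        return fd;
}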
+
+/*
+ * This routine closes sockets which have been at least partially
+ * opened, but not yet accepted.
+ */
+
+static void tcp_listen_stop (struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_listen_opt *lopt = tp->listen_opt;
+ struct open_request *acc_req = tp->accept_queue;
+ struct open_request *req;
+ int i;
+
+ tcp_delete_keepalive_timer(sk);
+
+ /* make all the listen_opt local to us */
+ write_lock_bh(&tp->syn_wait_lock);
+ tp->listen_opt = NULL;
+ write_unlock_bh(&tp->syn_wait_lock);
+ tp->accept_queue = tp->accept_queue_tail = NULL;
+
+ if (lopt->qlen) {
+ for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
+ while ((req = lopt->syn_table[i]) != NULL) {
+ lopt->syn_table[i] = req->dl_next;
+ lopt->qlen--;
+ tcp_openreq_free(req);
+
+ /* Following the specs, it would be better either to send a FIN
+ * (and enter FIN-WAIT-1, i.e. a normal close)
+ * or to send an active reset (abort).
+ * Certainly, that is pretty dangerous during a synflood, but that is
+ * a bad justification for our negligence 8)
+ * To be honest, we are not able to implement either
+ * of the variants now. --ANK
+ */
+ }
+ }
+ }
+ BUG_TRAP(!lopt->qlen);
+
+ kfree(lopt);
+
+ while ((req = acc_req) != NULL) {
+ struct sock *child = req->sk;
+
+ acc_req = req->dl_next;
+
+ local_bh_disable();
+ bh_lock_sock(child);
+ BUG_TRAP(!sock_owned_by_user(child));
+ sock_hold(child);
+
+ tcp_disconnect(child, O_NONBLOCK);
+
+ sock_orphan(child);
+
+ atomic_inc(&tcp_orphan_count);
+
+ tcp_destroy_sock(child);
+
+ bh_unlock_sock(child);
+ local_bh_enable();
+ sock_put(child);
+
+ sk_acceptq_removed(sk);
+ tcp_openreq_fastfree(req);
+ }
+ BUG_TRAP(!sk->sk_ack_backlog);
+}
+
+static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
+{
+ TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
+ tp->pushed_seq = tp->write_seq;
+}
+
+static inline int forced_push(struct tcp_sock *tp)
+{
+ return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
+}
+
+static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
+ struct sk_buff *skb)
+{
+ skb->csum = 0;
+ TCP_SKB_CB(skb)->seq = tp->write_seq;
+ TCP_SKB_CB(skb)->end_seq = tp->write_seq;
+ TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
+ TCP_SKB_CB(skb)->sacked = 0;
+ skb_header_release(skb);
+ __skb_queue_tail(&sk->sk_write_queue, skb);
+ sk_charge_skb(sk, skb);
+ if (!sk->sk_send_head)
+ sk->sk_send_head = skb;
+ else if (tp->nonagle&TCP_NAGLE_PUSH)
+ tp->nonagle &= ~TCP_NAGLE_PUSH;
+}
+
+static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
+ struct sk_buff *skb)
+{
+ if (flags & MSG_OOB) {
+ tp->urg_mode = 1;
+ tp->snd_up = tp->write_seq;
+ TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
+ }
+}
+
+static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
+ int mss_now, int nonagle)
+{
+ if (sk->sk_send_head) {
+ struct sk_buff *skb = sk->sk_write_queue.prev;
+ if (!(flags & MSG_MORE) || forced_push(tp))
+ tcp_mark_push(tp, skb);
+ tcp_mark_urg(tp, flags, skb);
+ __tcp_push_pending_frames(sk, tp, mss_now,
+ (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
+ }
+}
+
+static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
+ size_t psize, int flags)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int mss_now;
+ int err;
+ ssize_t copied;
+ long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+
+ /* Wait for a connection to finish. */
+ if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+ if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
+ goto out_err;
+
+ clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+ mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+ copied = 0;
+
+ err = -EPIPE;
+ if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
+ goto do_error;
+
+ while (psize > 0) {
+ struct sk_buff *skb = sk->sk_write_queue.prev;
+ struct page *page = pages[poffset / PAGE_SIZE];
+ int copy, i, can_coalesce;
+ int offset = poffset % PAGE_SIZE;
+ int size = min_t(size_t, psize, PAGE_SIZE - offset);
+
+ if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) {
+new_segment:
+ if (!sk_stream_memory_free(sk))
+ goto wait_for_sndbuf;
+
+ skb = sk_stream_alloc_pskb(sk, 0, 0,
+ sk->sk_allocation);
+ if (!skb)
+ goto wait_for_memory;
+
+ skb_entail(sk, tp, skb);
+ copy = mss_now;
+ }
+
+ if (copy > size)
+ copy = size;
+
+ i = skb_shinfo(skb)->nr_frags;
+ can_coalesce = skb_can_coalesce(skb, i, page, offset);
+ if (!can_coalesce && i >= MAX_SKB_FRAGS) {
+ tcp_mark_push(tp, skb);
+ goto new_segment;
+ }
+ if (sk->sk_forward_alloc < copy &&
+ !sk_stream_mem_schedule(sk, copy, 0))
+ goto wait_for_memory;
+
+ if (can_coalesce) {
+ skb_shinfo(skb)->frags[i - 1].size += copy;
+ } else {
+ get_page(page);
+ skb_fill_page_desc(skb, i, page, offset, copy);
+ }
+
+ skb->len += copy;
+ skb->data_len += copy;
+ skb->truesize += copy;
+ sk->sk_wmem_queued += copy;
+ sk->sk_forward_alloc -= copy;
+ skb->ip_summed = CHECKSUM_HW;
+ tp->write_seq += copy;
+ TCP_SKB_CB(skb)->end_seq += copy;
+ skb_shinfo(skb)->tso_segs = 0;
+
+ if (!copied)
+ TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
+
+ copied += copy;
+ poffset += copy;
+ if (!(psize -= copy))
+ goto out;
+
+ if (skb->len != mss_now || (flags & MSG_OOB))
+ continue;
+
+ if (forced_push(tp)) {
+ tcp_mark_push(tp, skb);
+ __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
+ } else if (skb == sk->sk_send_head)
+ tcp_push_one(sk, mss_now);
+ continue;
+
+wait_for_sndbuf:
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+ if (copied)
+ tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
+
+ if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
+ goto do_error;
+
+ mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+ }
+
+out:
+ if (copied)
+ tcp_push(sk, tp, flags, mss_now, tp->nonagle);
+ return copied;
+
+do_error:
+ if (copied)
+ goto out;
+out_err:
+ return sk_stream_error(sk, flags, err);
+}
+
+ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
+ size_t size, int flags)
+{
+ ssize_t res;
+ struct sock *sk = sock->sk;
+
+#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
+
+ if (!(sk->sk_route_caps & NETIF_F_SG) ||
+ !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
+ return sock_no_sendpage(sock, page, offset, size, flags);
+
+#undef TCP_ZC_CSUM_FLAGS
+
+ lock_sock(sk);
+ TCP_CHECK_TIMER(sk);
+ res = do_tcp_sendpages(sk, &page, offset, size, flags);
+ TCP_CHECK_TIMER(sk);
+ release_sock(sk);
+ return res;
+}
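In practice this sendpage path is what sendfile(2) over a TCP socket uses when the outgoing route advertises scatter-gather and checksum offload; otherwise the sock_no_sendpage() fallback above goes through the ordinary sendmsg path. A user-space sketch of a caller; send_file_over_tcp is an illustrative name and error handling is minimal.

#include <fcntl.h>
#include <sys/sendfile.h>
#include <sys/stat.h>
#include <unistd.h>

static int send_file_over_tcp(int tcp_fd, const char *path)
{
        struct stat st;
        off_t off = 0;
        int file_fd = open(path, O_RDONLY);

        if (file_fd < 0)
                return -1;
        if (fstat(file_fd, &st) < 0) {
                close(file_fd);
                return -1;
        }
        while (off < st.st_size) {
                ssize_t n = sendfile(tcp_fd, file_fd, &off, st.st_size - off);
                if (n <= 0)
                        break;          /* error, or nothing more could be sent */
        }
        close(file_fd);
        return off == st.st_size ? 0 : -1;
}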
+
+#define TCP_PAGE(sk) (sk->sk_sndmsg_page)
+#define TCP_OFF(sk) (sk->sk_sndmsg_off)
+
+static inline int select_size(struct sock *sk, struct tcp_sock *tp)
+{
+ int tmp = tp->mss_cache_std;
+
+ if (sk->sk_route_caps & NETIF_F_SG) {
+ int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
+
+ if (tmp >= pgbreak &&
+ tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
+ tmp = pgbreak;
+ }
+ return tmp;
+}
+
+int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+ size_t size)
+{
+ struct iovec *iov;
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ int iovlen, flags;
+ int mss_now;
+ int err, copied;
+ long timeo;
+
+ lock_sock(sk);
+ TCP_CHECK_TIMER(sk);
+
+ flags = msg->msg_flags;
+ timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+
+ /* Wait for a connection to finish. */
+ if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+ if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
+ goto out_err;
+
+ /* This should be in poll */
+ clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+ mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+
+ /* Ok commence sending. */
+ iovlen = msg->msg_iovlen;
+ iov = msg->msg_iov;
+ copied = 0;
+
+ err = -EPIPE;
+ if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
+ goto do_error;
+
+ while (--iovlen >= 0) {
+ int seglen = iov->iov_len;
+ unsigned char __user *from = iov->iov_base;
+
+ iov++;
+
+ while (seglen > 0) {
+ int copy;
+
+ skb = sk->sk_write_queue.prev;
+
+ if (!sk->sk_send_head ||
+ (copy = mss_now - skb->len) <= 0) {
+
+new_segment:
+ /* Allocate new segment. If the interface is SG,
+ * allocate skb fitting to single page.
+ */
+ if (!sk_stream_memory_free(sk))
+ goto wait_for_sndbuf;
+
+ skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
+ 0, sk->sk_allocation);
+ if (!skb)
+ goto wait_for_memory;
+
+ /*
+ * Check whether we can use HW checksum.
+ */
+ if (sk->sk_route_caps &
+ (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
+ NETIF_F_HW_CSUM))
+ skb->ip_summed = CHECKSUM_HW;
+
+ skb_entail(sk, tp, skb);
+ copy = mss_now;
+ }
+
+ /* Try to append data to the end of skb. */
+ if (copy > seglen)
+ copy = seglen;
+
+ /* Where to copy to? */
+ if (skb_tailroom(skb) > 0) {
+ /* We have some space in skb head. Superb! */
+ if (copy > skb_tailroom(skb))
+ copy = skb_tailroom(skb);
+ if ((err = skb_add_data(skb, from, copy)) != 0)
+ goto do_fault;
+ } else {
+ int merge = 0;
+ int i = skb_shinfo(skb)->nr_frags;
+ struct page *page = TCP_PAGE(sk);
+ int off = TCP_OFF(sk);
+
+ if (skb_can_coalesce(skb, i, page, off) &&
+ off != PAGE_SIZE) {
+ /* We can extend the last page
+ * fragment. */
+ merge = 1;
+ } else if (i == MAX_SKB_FRAGS ||
+ (!i &&
+ !(sk->sk_route_caps & NETIF_F_SG))) {
+ /* Need to add new fragment and cannot
+ * do this because interface is non-SG,
+ * or because all the page slots are
+ * busy. */
+ tcp_mark_push(tp, skb);
+ goto new_segment;
+ } else if (page) {
+ /* If page is cached, align
+ * offset to L1 cache boundary
+ */
+ off = (off + L1_CACHE_BYTES - 1) &
+ ~(L1_CACHE_BYTES - 1);
+ if (off == PAGE_SIZE) {
+ put_page(page);
+ TCP_PAGE(sk) = page = NULL;
+ }
+ }
+
+ if (!page) {
+ /* Allocate new cache page. */
+ if (!(page = sk_stream_alloc_page(sk)))
+ goto wait_for_memory;
+ off = 0;
+ }
+
+ if (copy > PAGE_SIZE - off)
+ copy = PAGE_SIZE - off;
+
+ /* Time to copy data. We are close to
+ * the end! */
+ err = skb_copy_to_page(sk, from, skb, page,
+ off, copy);
+ if (err) {
+ /* If this page was new, give it to the
+ * socket so it does not get leaked.
+ */
+ if (!TCP_PAGE(sk)) {
+ TCP_PAGE(sk) = page;
+ TCP_OFF(sk) = 0;
+ }
+ goto do_error;
+ }
+
+ /* Update the skb. */
+ if (merge) {
+ skb_shinfo(skb)->frags[i - 1].size +=
+ copy;
+ } else {
+ skb_fill_page_desc(skb, i, page, off, copy);
+ if (TCP_PAGE(sk)) {
+ get_page(page);
+ } else if (off + copy < PAGE_SIZE) {
+ get_page(page);
+ TCP_PAGE(sk) = page;
+ }
+ }
+
+ TCP_OFF(sk) = off + copy;
+ }
+
+ if (!copied)
+ TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
+
+ tp->write_seq += copy;
+ TCP_SKB_CB(skb)->end_seq += copy;
+ skb_shinfo(skb)->tso_segs = 0;
+
+ from += copy;
+ copied += copy;
+ if ((seglen -= copy) == 0 && iovlen == 0)
+ goto out;
+
+ if (skb->len != mss_now || (flags & MSG_OOB))
+ continue;
+
+ if (forced_push(tp)) {
+ tcp_mark_push(tp, skb);
+ __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
+ } else if (skb == sk->sk_send_head)
+ tcp_push_one(sk, mss_now);
+ continue;
+
+wait_for_sndbuf:
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+ if (copied)
+ tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
+
+ if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
+ goto do_error;
+
+ mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+ }
+ }
+
+out:
+ if (copied)
+ tcp_push(sk, tp, flags, mss_now, tp->nonagle);
+ TCP_CHECK_TIMER(sk);
+ release_sock(sk);
+ return copied;
+
+do_fault:
+ if (!skb->len) {
+ if (sk->sk_send_head == skb)
+ sk->sk_send_head = NULL;
+ __skb_unlink(skb, skb->list);
+ sk_stream_free_skb(sk, skb);
+ }
+
+do_error:
+ if (copied)
+ goto out;
+out_err:
+ err = sk_stream_error(sk, flags, err);
+ TCP_CHECK_TIMER(sk);
+ release_sock(sk);
+ return err;
+}
+
+/*
+ * Handle reading urgent data. BSD has very simple semantics for
+ * this, no blocking and very strange errors 8)
+ */
+
+static int tcp_recv_urg(struct sock *sk, long timeo,
+ struct msghdr *msg, int len, int flags,
+ int *addr_len)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* No URG data to read. */
+ if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
+ tp->urg_data == TCP_URG_READ)
+ return -EINVAL; /* Yes this is right ! */
+
+ if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
+ return -ENOTCONN;
+
+ if (tp->urg_data & TCP_URG_VALID) {
+ int err = 0;
+ char c = tp->urg_data;
+
+ if (!(flags & MSG_PEEK))
+ tp->urg_data = TCP_URG_READ;
+
+ /* Read urgent data. */
+ msg->msg_flags |= MSG_OOB;
+
+ if (len > 0) {
+ if (!(flags & MSG_TRUNC))
+ err = memcpy_toiovec(msg->msg_iov, &c, 1);
+ len = 1;
+ } else
+ msg->msg_flags |= MSG_TRUNC;
+
+ return err ? -EFAULT : len;
+ }
+
+ if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
+ return 0;
+
+ /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
+ * the available implementations agree in this case:
+ * this call should never block, independent of the
+ * blocking state of the socket.
+ * Mike <pall@rz.uni-karlsruhe.de>
+ */
+ return -EAGAIN;
+}
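From user space, the return values above map onto recv(2) with MSG_OOB roughly as in this sketch; read_urgent_byte is an illustrative helper, and EINVAL covers both the SO_OOBINLINE case and an already-consumed mark.

#include <errno.h>
#include <stdio.h>
#include <sys/socket.h>

static void read_urgent_byte(int tcp_fd)
{
        char c;
        ssize_t n = recv(tcp_fd, &c, 1, MSG_OOB);

        if (n == 1)
                printf("urgent byte: 0x%02x\n", (unsigned char)c);
        else if (n < 0 && errno == EAGAIN)
                printf("no urgent data yet\n");         /* never blocks, as noted above */
        else if (n < 0 && errno == EINVAL)
                printf("inline OOB, or mark already read\n");
}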
+
+/* Clean up the receive buffer for full frames taken by the user,
+ * then send an ACK if necessary. COPIED is the number of bytes
+ * tcp_recvmsg has given to the user so far; it speeds up the
+ * calculation of whether or not we must ACK for the sake of
+ * a window update.
+ */
+static void cleanup_rbuf(struct sock *sk, int copied)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int time_to_ack = 0;
+
+#if TCP_DEBUG
+ struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
+
+ BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
+#endif
+
+ if (tcp_ack_scheduled(tp)) {
+ /* Delayed ACKs frequently hit locked sockets during bulk
+ * receive. */
+ if (tp->ack.blocked ||
+ /* Once-per-two-segments ACK was not sent by tcp_input.c */
+ tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
+ /*
+ * If this read emptied the read buffer, we send an ACK if the
+ * connection is not bidirectional, the user drained the
+ * receive buffer, and there was a small segment
+ * in the queue.
+ */
+ (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
+ !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
+ time_to_ack = 1;
+ }
+
+ /* We send an ACK if we can now advertise a non-zero window
+ * which has been raised "significantly".
+ *
+ * Even if window raised up to infinity, do not send window open ACK
+ * in states, where we will not receive more. It is useless.
+ */
+ if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
+ __u32 rcv_window_now = tcp_receive_window(tp);
+
+ /* Optimize, __tcp_select_window() is not cheap. */
+ if (2*rcv_window_now <= tp->window_clamp) {
+ __u32 new_window = __tcp_select_window(sk);
+
+ /* Send an ACK now if this read freed lots of space
+ * in our buffer. new_window is the window we could advertise
+ * now; do so if it is not less than the current one.
+ * "Lots" means "at least twice" here.
+ */
+ if (new_window && new_window >= 2 * rcv_window_now)
+ time_to_ack = 1;
+ }
+ }
+ if (time_to_ack)
+ tcp_send_ack(sk);
+}
+
+static void tcp_prequeue_process(struct sock *sk)
+{
+ struct sk_buff *skb;
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue));
+
+ /* RX process wants to run with disabled BHs, though it is not
+ * necessary */
+ local_bh_disable();
+ while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
+ sk->sk_backlog_rcv(sk, skb);
+ local_bh_enable();
+
+ /* Clear memory counter. */
+ tp->ucopy.memory = 0;
+}
+
+static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
+{
+ struct sk_buff *skb;
+ u32 offset;
+
+ skb_queue_walk(&sk->sk_receive_queue, skb) {
+ offset = seq - TCP_SKB_CB(skb)->seq;
+ if (skb->h.th->syn)
+ offset--;
+ if (offset < skb->len || skb->h.th->fin) {
+ *off = offset;
+ return skb;
+ }
+ }
+ return NULL;
+}
+
+/*
+ * This routine provides an alternative to tcp_recvmsg() for routines
+ * that would like to handle copying from skbuffs directly in 'sendfile'
+ * fashion.
+ * Note:
+ * - It is assumed that the socket was locked by the caller.
+ * - The routine does not block.
+ * - At present, there is no support for reading OOB data
+ * or for 'peeking' the socket using this routine
+ * (although both would be easy to implement).
+ */
+int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor)
+{
+ struct sk_buff *skb;
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 seq = tp->copied_seq;
+ u32 offset;
+ int copied = 0;
+
+ if (sk->sk_state == TCP_LISTEN)
+ return -ENOTCONN;
+ while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
+ if (offset < skb->len) {
+ size_t used, len;
+
+ len = skb->len - offset;
+ /* Stop reading if we hit a patch of urgent data */
+ if (tp->urg_data) {
+ u32 urg_offset = tp->urg_seq - seq;
+ if (urg_offset < len)
+ len = urg_offset;
+ if (!len)
+ break;
+ }
+ used = recv_actor(desc, skb, offset, len);
+ if (used <= len) {
+ seq += used;
+ copied += used;
+ offset += used;
+ }
+ if (offset != skb->len)
+ break;
+ }
+ if (skb->h.th->fin) {
+ sk_eat_skb(sk, skb);
+ ++seq;
+ break;
+ }
+ sk_eat_skb(sk, skb);
+ if (!desc->count)
+ break;
+ }
+ tp->copied_seq = seq;
+
+ tcp_rcv_space_adjust(sk);
+
+ /* Clean up data we have read: This will do ACK frames. */
+ if (copied)
+ cleanup_rbuf(sk, copied);
+ return copied;
+}
+
+/*
+ * This routine copies from a sock struct into the user buffer.
+ *
+ * Technical note: in 2.3 we work on a _locked_ socket, so
+ * tricks with *seq access order and skb->users are not required.
+ * Probably, the code can easily be improved even more.
+ */
+
+int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+ size_t len, int nonblock, int flags, int *addr_len)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int copied = 0;
+ u32 peek_seq;
+ u32 *seq;
+ unsigned long used;
+ int err;
+ int target; /* Read at least this many bytes */
+ long timeo;
+ struct task_struct *user_recv = NULL;
+
+ lock_sock(sk);
+
+ TCP_CHECK_TIMER(sk);
+
+ err = -ENOTCONN;
+ if (sk->sk_state == TCP_LISTEN)
+ goto out;
+
+ timeo = sock_rcvtimeo(sk, nonblock);
+
+ /* Urgent data needs to be handled specially. */
+ if (flags & MSG_OOB)
+ goto recv_urg;
+
+ seq = &tp->copied_seq;
+ if (flags & MSG_PEEK) {
+ peek_seq = tp->copied_seq;
+ seq = &peek_seq;
+ }
+
+ target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+
+ do {
+ struct sk_buff *skb;
+ u32 offset;
+
+ /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
+ if (tp->urg_data && tp->urg_seq == *seq) {
+ if (copied)
+ break;
+ if (signal_pending(current)) {
+ copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
+ break;
+ }
+ }
+
+ /* Next get a buffer. */
+
+ skb = skb_peek(&sk->sk_receive_queue);
+ do {
+ if (!skb)
+ break;
+
+ /* Now that we have two receive queues this
+ * shouldn't happen.
+ */
+ if (before(*seq, TCP_SKB_CB(skb)->seq)) {
+ printk(KERN_INFO "recvmsg bug: copied %X "
+ "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
+ break;
+ }
+ offset = *seq - TCP_SKB_CB(skb)->seq;
+ if (skb->h.th->syn)
+ offset--;
+ if (offset < skb->len)
+ goto found_ok_skb;
+ if (skb->h.th->fin)
+ goto found_fin_ok;
+ BUG_TRAP(flags & MSG_PEEK);
+ skb = skb->next;
+ } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
+
+ /* Well, if we have a backlog, try to process it now. */
+
+ if (copied >= target && !sk->sk_backlog.tail)
+ break;
+
+ if (copied) {
+ if (sk->sk_err ||
+ sk->sk_state == TCP_CLOSE ||
+ (sk->sk_shutdown & RCV_SHUTDOWN) ||
+ !timeo ||
+ signal_pending(current) ||
+ (flags & MSG_PEEK))
+ break;
+ } else {
+ if (sock_flag(sk, SOCK_DONE))
+ break;
+
+ if (sk->sk_err) {
+ copied = sock_error(sk);
+ break;
+ }
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ break;
+
+ if (sk->sk_state == TCP_CLOSE) {
+ if (!sock_flag(sk, SOCK_DONE)) {
+ /* This occurs when the user tries to read
+ * from a never-connected socket.
+ */
+ copied = -ENOTCONN;
+ break;
+ }
+ break;
+ }
+
+ if (!timeo) {
+ copied = -EAGAIN;
+ break;
+ }
+
+ if (signal_pending(current)) {
+ copied = sock_intr_errno(timeo);
+ break;
+ }
+ }
+
+ cleanup_rbuf(sk, copied);
+
+ if (tp->ucopy.task == user_recv) {
+ /* Install new reader */
+ if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
+ user_recv = current;
+ tp->ucopy.task = user_recv;
+ tp->ucopy.iov = msg->msg_iov;
+ }
+
+ tp->ucopy.len = len;
+
+ BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
+ (flags & (MSG_PEEK | MSG_TRUNC)));
+
+ /* Ugly... If prequeue is not empty, we have to
+ * process it before releasing socket, otherwise
+ * order will be broken at second iteration.
+ * More elegant solution is required!!!
+ *
+ * Look: we have the following (pseudo)queues:
+ *
+ * 1. packets in flight
+ * 2. backlog
+ * 3. prequeue
+ * 4. receive_queue
+ *
+ * Each queue can be processed only if the next ones
+ * are empty. At this point we have empty receive_queue.
+ * But prequeue _can_ be not empty after 2nd iteration,
+ * when we jumped to start of loop because backlog
+ * processing added something to receive_queue.
+ * We cannot release_sock(), because backlog contains
+ * packets arrived _after_ prequeued ones.
+ *
+ * In short, the algorithm is clear --- process all
+ * the queues in order. We could do it more directly,
+ * requeueing packets from the backlog to the prequeue if it
+ * is not empty. That is more elegant, but eats cycles,
+ * unfortunately.
+ */
+ if (skb_queue_len(&tp->ucopy.prequeue))
+ goto do_prequeue;
+
+ /* __ Set realtime policy in scheduler __ */
+ }
+
+ if (copied >= target) {
+ /* Do not sleep, just process backlog. */
+ release_sock(sk);
+ lock_sock(sk);
+ } else
+ sk_wait_data(sk, &timeo);
+
+ if (user_recv) {
+ int chunk;
+
+ /* __ Restore normal policy in scheduler __ */
+
+ if ((chunk = len - tp->ucopy.len) != 0) {
+ NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
+ len -= chunk;
+ copied += chunk;
+ }
+
+ if (tp->rcv_nxt == tp->copied_seq &&
+ skb_queue_len(&tp->ucopy.prequeue)) {
+do_prequeue:
+ tcp_prequeue_process(sk);
+
+ if ((chunk = len - tp->ucopy.len) != 0) {
+ NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
+ len -= chunk;
+ copied += chunk;
+ }
+ }
+ }
+ if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
+ if (net_ratelimit())
+ printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
+ current->comm, current->pid);
+ peek_seq = tp->copied_seq;
+ }
+ continue;
+
+ found_ok_skb:
+ /* Ok so how much can we use? */
+ used = skb->len - offset;
+ if (len < used)
+ used = len;
+
+ /* Do we have urgent data here? */
+ if (tp->urg_data) {
+ u32 urg_offset = tp->urg_seq - *seq;
+ if (urg_offset < used) {
+ if (!urg_offset) {
+ if (!sock_flag(sk, SOCK_URGINLINE)) {
+ ++*seq;
+ offset++;
+ used--;
+ if (!used)
+ goto skip_copy;
+ }
+ } else
+ used = urg_offset;
+ }
+ }
+
+ if (!(flags & MSG_TRUNC)) {
+ err = skb_copy_datagram_iovec(skb, offset,
+ msg->msg_iov, used);
+ if (err) {
+ /* Exception. Bailout! */
+ if (!copied)
+ copied = -EFAULT;
+ break;
+ }
+ }
+
+ *seq += used;
+ copied += used;
+ len -= used;
+
+ tcp_rcv_space_adjust(sk);
+
+skip_copy:
+ if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
+ tp->urg_data = 0;
+ tcp_fast_path_check(sk, tp);
+ }
+ if (used + offset < skb->len)
+ continue;
+
+ if (skb->h.th->fin)
+ goto found_fin_ok;
+ if (!(flags & MSG_PEEK))
+ sk_eat_skb(sk, skb);
+ continue;
+
+ found_fin_ok:
+ /* Process the FIN. */
+ ++*seq;
+ if (!(flags & MSG_PEEK))
+ sk_eat_skb(sk, skb);
+ break;
+ } while (len > 0);
+
+ if (user_recv) {
+ if (skb_queue_len(&tp->ucopy.prequeue)) {
+ int chunk;
+
+ tp->ucopy.len = copied > 0 ? len : 0;
+
+ tcp_prequeue_process(sk);
+
+ if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
+ NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
+ len -= chunk;
+ copied += chunk;
+ }
+ }
+
+ tp->ucopy.task = NULL;
+ tp->ucopy.len = 0;
+ }
+
+ /* According to UNIX98, msg_name/msg_namelen are ignored
+ * on a connected socket. I was just happy when I found this 8) --ANK
+ */
+
+ /* Clean up data we have read: This will do ACK frames. */
+ cleanup_rbuf(sk, copied);
+
+ TCP_CHECK_TIMER(sk);
+ release_sock(sk);
+ return copied;
+
+out:
+ TCP_CHECK_TIMER(sk);
+ release_sock(sk);
+ return err;
+
+recv_urg:
+ err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
+ goto out;
+}
+
+/*
+ * State processing on a close. This implements the state shift for
+ * sending our FIN frame. Note that we only send a FIN for some
+ * states. A shutdown() may have already sent the FIN, or we may be
+ * closed.
+ */
+
+static unsigned char new_state[16] = {
+ /* current state: new state: action: */
+ /* (Invalid) */ TCP_CLOSE,
+ /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
+ /* TCP_SYN_SENT */ TCP_CLOSE,
+ /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
+ /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1,
+ /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2,
+ /* TCP_TIME_WAIT */ TCP_CLOSE,
+ /* TCP_CLOSE */ TCP_CLOSE,
+ /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN,
+ /* TCP_LAST_ACK */ TCP_LAST_ACK,
+ /* TCP_LISTEN */ TCP_CLOSE,
+ /* TCP_CLOSING */ TCP_CLOSING,
+};
+
+static int tcp_close_state(struct sock *sk)
+{
+ int next = (int)new_state[sk->sk_state];
+ int ns = next & TCP_STATE_MASK;
+
+ tcp_set_state(sk, ns);
+
+ return next & TCP_ACTION_FIN;
+}
+
+/*
+ * Shut down the sending side of a connection. Much like close except
+ * that we don't shut down the receive side or mark the socket SOCK_DEAD.
+ */
+
+void tcp_shutdown(struct sock *sk, int how)
+{
+ /* We need to grab some memory, and put together a FIN,
+ * and then put it into the queue to be sent.
+ * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
+ */
+ if (!(how & SEND_SHUTDOWN))
+ return;
+
+ /* If we've already sent a FIN, or it's a closed state, skip this. */
+ if ((1 << sk->sk_state) &
+ (TCPF_ESTABLISHED | TCPF_SYN_SENT |
+ TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
+ /* Clear out any half completed packets. FIN if needed. */
+ if (tcp_close_state(sk))
+ tcp_send_fin(sk);
+ }
+}
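The usual user-space counterpart is the half-close idiom: shutdown(SHUT_WR) queues the FIN through the path above while the receive side stays open, so the peer's remaining data (and its FIN) can still be read. A minimal sketch; drain_after_half_close is an illustrative name.

#include <sys/socket.h>
#include <unistd.h>

static void drain_after_half_close(int tcp_fd)
{
        char buf[4096];
        ssize_t n;

        shutdown(tcp_fd, SHUT_WR);      /* SEND_SHUTDOWN: tcp_shutdown() sends our FIN */
        while ((n = read(tcp_fd, buf, sizeof(buf))) > 0)
                ;                       /* keep consuming data until the peer's FIN */
        close(tcp_fd);
}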
+
+/*
+ * At this point, there should be no process reference to this
+ * socket, and thus no user references at all. Therefore we
+ * can assume the socket waitqueue is inactive and nobody will
+ * try to jump onto it.
+ */
+void tcp_destroy_sock(struct sock *sk)
+{
+ BUG_TRAP(sk->sk_state == TCP_CLOSE);
+ BUG_TRAP(sock_flag(sk, SOCK_DEAD));
+
+ /* It cannot be in hash table! */
+ BUG_TRAP(sk_unhashed(sk));
+
+ /* If inet_sk(sk)->num is nonzero, it must be bound */
+ BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
+
+ sk->sk_prot->destroy(sk);
+
+ sk_stream_kill_queues(sk);
+
+ xfrm_sk_free_policy(sk);
+
+#ifdef INET_REFCNT_DEBUG
+ if (atomic_read(&sk->sk_refcnt) != 1) {
+ printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
+ sk, atomic_read(&sk->sk_refcnt));
+ }
+#endif
+
+ atomic_dec(&tcp_orphan_count);
+ sock_put(sk);
+}
+
+void tcp_close(struct sock *sk, long timeout)
+{
+ struct sk_buff *skb;
+ int data_was_unread = 0;
+
+ lock_sock(sk);
+ sk->sk_shutdown = SHUTDOWN_MASK;
+
+ if (sk->sk_state == TCP_LISTEN) {
+ tcp_set_state(sk, TCP_CLOSE);
+
+ /* Special case. */
+ tcp_listen_stop(sk);
+
+ goto adjudge_to_death;
+ }
+
+ /* We need to flush the recv. buffs. We do this only on the
+ * descriptor close, not protocol-sourced closes, because the
+ * reader process may not have drained the data yet!
+ */
+ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+ u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
+ skb->h.th->fin;
+ data_was_unread += len;
+ __kfree_skb(skb);
+ }
+
+ sk_stream_mem_reclaim(sk);
+
+ /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
+ * 3.10, we send a RST here because data was lost. To
+ * witness the awful effects of the old behavior of always
+ * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
+ * a bulk GET in an FTP client, suspend the process, wait
+ * for the client to advertise a zero window, then kill -9
+ * the FTP client, wheee... Note: timeout is always zero
+ * in such a case.
+ */
+ if (data_was_unread) {
+ /* Unread data was tossed, zap the connection. */
+ NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
+ tcp_set_state(sk, TCP_CLOSE);
+ tcp_send_active_reset(sk, GFP_KERNEL);
+ } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
+ /* Check zero linger _after_ checking for unread data. */
+ sk->sk_prot->disconnect(sk, 0);
+ NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
+ } else if (tcp_close_state(sk)) {
+ /* We FIN if the application ate all the data before
+ * zapping the connection.
+ */
+
+ /* RED-PEN. Formally speaking, we have broken TCP state
+ * machine. State transitions:
+ *
+ * TCP_ESTABLISHED -> TCP_FIN_WAIT1
+ * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
+ * TCP_CLOSE_WAIT -> TCP_LAST_ACK
+ *
+ * are legal only when FIN has been sent (i.e. in window),
+ * rather than queued out of window. Purists blame.
+ *
+ * F.e. "RFC state" is ESTABLISHED,
+ * if Linux state is FIN-WAIT-1, but FIN is still not sent.
+ *
+ * The visible deviations are that sometimes
+ * we enter the time-wait state when it is not really required
+ * (harmless), and do not send active resets when they are
+ * required by the specs (TCP_ESTABLISHED and TCP_CLOSE_WAIT, when
+ * they look like CLOSING or LAST_ACK to Linux).
+ * Probably, I missed some more small holes.
+ * --ANK
+ */
+ tcp_send_fin(sk);
+ }
+
+ sk_stream_wait_close(sk, timeout);
+
+adjudge_to_death:
+ /* It is the last release_sock in its life. It will remove backlog. */
+ release_sock(sk);
+
+
+ /* Now socket is owned by kernel and we acquire BH lock
+ to finish close. No need to check for user refs.
+ */
+ local_bh_disable();
+ bh_lock_sock(sk);
+ BUG_TRAP(!sock_owned_by_user(sk));
+
+ sock_hold(sk);
+ sock_orphan(sk);
+
+ /* This is a (useful) BSD-violating departure from the RFC. There is a
+ * problem with TCP as specified, in that the other end could
+ * keep a socket open forever with no application left at this end.
+ * We use a 3 minute timeout (about the same as BSD) then kill
+ * our end. If they send after that then tough - BUT: long enough
+ * that we won't make the old "4*rto = almost no time - whoops
+ * reset" mistake.
+ *
+ * Nope, it was not a mistake. It is really the desired behaviour,
+ * e.g. on http servers, where such sockets are useless but
+ * consume significant resources. Let's do it with the special
+ * linger2 option. --ANK
+ */
+
+ if (sk->sk_state == TCP_FIN_WAIT2) {
+ struct tcp_sock *tp = tcp_sk(sk);
+ if (tp->linger2 < 0) {
+ tcp_set_state(sk, TCP_CLOSE);
+ tcp_send_active_reset(sk, GFP_ATOMIC);
+ NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
+ } else {
+ int tmo = tcp_fin_time(tp);
+
+ if (tmo > TCP_TIMEWAIT_LEN) {
+ tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
+ } else {
+ atomic_inc(&tcp_orphan_count);
+ tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
+ goto out;
+ }
+ }
+ }
+ if (sk->sk_state != TCP_CLOSE) {
+ sk_stream_mem_reclaim(sk);
+ if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
+ (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
+ atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
+ if (net_ratelimit())
+ printk(KERN_INFO "TCP: too many orphaned "
+ "sockets\n");
+ tcp_set_state(sk, TCP_CLOSE);
+ tcp_send_active_reset(sk, GFP_ATOMIC);
+ NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
+ }
+ }
+ atomic_inc(&tcp_orphan_count);
+
+ if (sk->sk_state == TCP_CLOSE)
+ tcp_destroy_sock(sk);
+ /* Otherwise, socket is reprieved until protocol close. */
+
+out:
+ bh_unlock_sock(sk);
+ local_bh_enable();
+ sock_put(sk);
+}
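The zero-linger branch above is what SO_LINGER with l_onoff = 1 and l_linger = 0 selects from user space: close() then aborts the connection with a RST instead of going through the FIN handshake and TIME_WAIT. A minimal sketch; abort_connection is an illustrative name.

#include <sys/socket.h>
#include <unistd.h>

static void abort_connection(int tcp_fd)
{
        struct linger lg = { .l_onoff = 1, .l_linger = 0 };

        /* Matches the sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime case above. */
        setsockopt(tcp_fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
        close(tcp_fd);
}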
+
+/* These states need RST on ABORT according to RFC793 */
+
+static inline int tcp_need_reset(int state)
+{
+ return (1 << state) &
+ (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
+ TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
+}
+
+int tcp_disconnect(struct sock *sk, int flags)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ int err = 0;
+ int old_state = sk->sk_state;
+
+ if (old_state != TCP_CLOSE)
+ tcp_set_state(sk, TCP_CLOSE);
+
+ /* ABORT function of RFC793 */
+ if (old_state == TCP_LISTEN) {
+ tcp_listen_stop(sk);
+ } else if (tcp_need_reset(old_state) ||
+ (tp->snd_nxt != tp->write_seq &&
+ (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
+ /* The last check adjusts for the discrepancy between Linux and the
+ * RFC states
+ */
+ tcp_send_active_reset(sk, gfp_any());
+ sk->sk_err = ECONNRESET;
+ } else if (old_state == TCP_SYN_SENT)
+ sk->sk_err = ECONNRESET;
+
+ tcp_clear_xmit_timers(sk);
+ __skb_queue_purge(&sk->sk_receive_queue);
+ sk_stream_writequeue_purge(sk);
+ __skb_queue_purge(&tp->out_of_order_queue);
+
+ inet->dport = 0;
+
+ if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
+ inet_reset_saddr(sk);
+
+ sk->sk_shutdown = 0;
+ sock_reset_flag(sk, SOCK_DONE);
+ tp->srtt = 0;
+ if ((tp->write_seq += tp->max_window + 2) == 0)
+ tp->write_seq = 1;
+ tp->backoff = 0;
+ tp->snd_cwnd = 2;
+ tp->probes_out = 0;
+ tp->packets_out = 0;
+ tp->snd_ssthresh = 0x7fffffff;
+ tp->snd_cwnd_cnt = 0;
+ tcp_set_ca_state(tp, TCP_CA_Open);
+ tcp_clear_retrans(tp);
+ tcp_delack_init(tp);
+ sk->sk_send_head = NULL;
+ tp->rx_opt.saw_tstamp = 0;
+ tcp_sack_reset(&tp->rx_opt);
+ __sk_dst_reset(sk);
+
+ BUG_TRAP(!inet->num || tp->bind_hash);
+
+ sk->sk_error_report(sk);
+ return err;
+}
+
+/*
+ * Wait for an incoming connection, avoid race
+ * conditions. This must be called with the socket locked.
+ */
+static int wait_for_connect(struct sock *sk, long timeo)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ DEFINE_WAIT(wait);
+ int err;
+
+ /*
+ * True wake-one mechanism for incoming connections: only
+ * one process gets woken up, not the 'whole herd'.
+ * Since we do not 'race & poll' for established sockets
+ * anymore, the common case will execute the loop only once.
+ *
+ * Subtle issue: "add_wait_queue_exclusive()" will be added
+ * after any current non-exclusive waiters, and we know that
+ * it will always _stay_ after any new non-exclusive waiters
+ * because all non-exclusive waiters are added at the
+ * beginning of the wait-queue. As such, it's ok to "drop"
+ * our exclusiveness temporarily when we get woken up without
+ * having to remove and re-insert us on the wait queue.
+ */
+ for (;;) {
+ prepare_to_wait_exclusive(sk->sk_sleep, &wait,
+ TASK_INTERRUPTIBLE);
+ release_sock(sk);
+ if (!tp->accept_queue)
+ timeo = schedule_timeout(timeo);
+ lock_sock(sk);
+ err = 0;
+ if (tp->accept_queue)
+ break;
+ err = -EINVAL;
+ if (sk->sk_state != TCP_LISTEN)
+ break;
+ err = sock_intr_errno(timeo);
+ if (signal_pending(current))
+ break;
+ err = -EAGAIN;
+ if (!timeo)
+ break;
+ }
+ finish_wait(sk->sk_sleep, &wait);
+ return err;
+}
+
+/*
+ * This will accept the next outstanding connection.
+ */
+
+struct sock *tcp_accept(struct sock *sk, int flags, int *err)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct open_request *req;
+ struct sock *newsk;
+ int error;
+
+ lock_sock(sk);
+
+ /* We need to make sure that this socket is listening,
+ * and that it has something pending.
+ */
+ error = -EINVAL;
+ if (sk->sk_state != TCP_LISTEN)
+ goto out;
+
+ /* Find already established connection */
+ if (!tp->accept_queue) {
+ long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+
+ /* If this is a non blocking socket don't sleep */
+ error = -EAGAIN;
+ if (!timeo)
+ goto out;
+
+ error = wait_for_connect(sk, timeo);
+ if (error)
+ goto out;
+ }
+
+ req = tp->accept_queue;
+ if ((tp->accept_queue = req->dl_next) == NULL)
+ tp->accept_queue_tail = NULL;
+
+ newsk = req->sk;
+ sk_acceptq_removed(sk);
+ tcp_openreq_fastfree(req);
+ BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
+ release_sock(sk);
+ return newsk;
+
+out:
+ release_sock(sk);
+ *err = error;
+ return NULL;
+}
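On a non-blocking listening socket the !timeo case above surfaces as EAGAIN from accept(2); a caller typically polls the listener and retries. A minimal sketch; try_accept is an illustrative name.

#include <errno.h>
#include <stddef.h>
#include <sys/socket.h>

static int try_accept(int listen_fd)
{
        int client = accept(listen_fd, NULL, NULL);

        if (client < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
                return -1;              /* nothing established yet; poll() and retry */
        return client;                  /* a new connected socket, or -1 on error */
}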
+
+/*
+ * Socket option code for TCP.
+ */
+int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
+ int optlen)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int val;
+ int err = 0;
+
+ if (level != SOL_TCP)
+ return tp->af_specific->setsockopt(sk, level, optname,
+ optval, optlen);
+
+ if (optlen < sizeof(int))
+ return -EINVAL;
+
+ if (get_user(val, (int __user *)optval))
+ return -EFAULT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case TCP_MAXSEG:
+ /* Values greater than the interface MTU won't take effect. However,
+ * at the point when this call is made we typically don't yet
+ * know which interface is going to be used. */
+ if (val < 8 || val > MAX_TCP_WINDOW) {
+ err = -EINVAL;
+ break;
+ }
+ tp->rx_opt.user_mss = val;
+ break;
+
+ case TCP_NODELAY:
+ if (val) {
+ /* TCP_NODELAY is weaker than TCP_CORK, so
+ * this option on a corked socket is remembered, but
+ * it is not activated until the cork is cleared.
+ *
+ * However, when TCP_NODELAY is set we make
+ * an explicit push, which overrides even TCP_CORK
+ * for currently queued segments.
+ */
+ tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
+ tcp_push_pending_frames(sk, tp);
+ } else {
+ tp->nonagle &= ~TCP_NAGLE_OFF;
+ }
+ break;
+
+ case TCP_CORK:
+ /* When set, indicates that non-full frames should always be queued.
+ * Later the user clears this option and we transmit
+ * any pending partial frames in the queue. This is
+ * meant to be used alongside sendfile() to get properly
+ * filled frames when the user (for example) must write
+ * out headers with a write() call first and then use
+ * sendfile to send out the data parts.
+ *
+ * TCP_CORK can be set together with TCP_NODELAY and it is
+ * stronger than TCP_NODELAY.
+ */
+ if (val) {
+ tp->nonagle |= TCP_NAGLE_CORK;
+ } else {
+ tp->nonagle &= ~TCP_NAGLE_CORK;
+ if (tp->nonagle&TCP_NAGLE_OFF)
+ tp->nonagle |= TCP_NAGLE_PUSH;
+ tcp_push_pending_frames(sk, tp);
+ }
+ break;
+
+ case TCP_KEEPIDLE:
+ if (val < 1 || val > MAX_TCP_KEEPIDLE)
+ err = -EINVAL;
+ else {
+ tp->keepalive_time = val * HZ;
+ if (sock_flag(sk, SOCK_KEEPOPEN) &&
+ !((1 << sk->sk_state) &
+ (TCPF_CLOSE | TCPF_LISTEN))) {
+ __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
+ if (tp->keepalive_time > elapsed)
+ elapsed = tp->keepalive_time - elapsed;
+ else
+ elapsed = 0;
+ tcp_reset_keepalive_timer(sk, elapsed);
+ }
+ }
+ break;
+ case TCP_KEEPINTVL:
+ if (val < 1 || val > MAX_TCP_KEEPINTVL)
+ err = -EINVAL;
+ else
+ tp->keepalive_intvl = val * HZ;
+ break;
+ case TCP_KEEPCNT:
+ if (val < 1 || val > MAX_TCP_KEEPCNT)
+ err = -EINVAL;
+ else
+ tp->keepalive_probes = val;
+ break;
+ case TCP_SYNCNT:
+ if (val < 1 || val > MAX_TCP_SYNCNT)
+ err = -EINVAL;
+ else
+ tp->syn_retries = val;
+ break;
+
+ case TCP_LINGER2:
+ if (val < 0)
+ tp->linger2 = -1;
+ else if (val > sysctl_tcp_fin_timeout / HZ)
+ tp->linger2 = 0;
+ else
+ tp->linger2 = val * HZ;
+ break;
+
+ case TCP_DEFER_ACCEPT:
+ tp->defer_accept = 0;
+ if (val > 0) {
+ /* Translate value in seconds to number of
+ * retransmits */
+ while (tp->defer_accept < 32 &&
+ val > ((TCP_TIMEOUT_INIT / HZ) <<
+ tp->defer_accept))
+ tp->defer_accept++;
+ tp->defer_accept++;
+ }
+ break;
+
+ case TCP_WINDOW_CLAMP:
+ if (!val) {
+ if (sk->sk_state != TCP_CLOSE) {
+ err = -EINVAL;
+ break;
+ }
+ tp->window_clamp = 0;
+ } else
+ tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
+ SOCK_MIN_RCVBUF / 2 : val;
+ break;
+
+ case TCP_QUICKACK:
+ if (!val) {
+ tp->ack.pingpong = 1;
+ } else {
+ tp->ack.pingpong = 0;
+ if ((1 << sk->sk_state) &
+ (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
+ tcp_ack_scheduled(tp)) {
+ tp->ack.pending |= TCP_ACK_PUSHED;
+ cleanup_rbuf(sk, 1);
+ if (!(val & 1))
+ tp->ack.pingpong = 1;
+ }
+ }
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ };
+ release_sock(sk);
+ return err;
+}
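The TCP_CORK comment above describes the intended write()-then-sendfile() pattern. A minimal user-space sketch; send_header_then_body is an illustrative name and error handling is omitted.

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/sendfile.h>
#include <sys/socket.h>
#include <unistd.h>

static void send_header_then_body(int tcp_fd, const char *hdr, size_t hdrlen,
                                  int file_fd, off_t filelen)
{
        int on = 1, off = 0;
        off_t pos = 0;

        setsockopt(tcp_fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));     /* queue partial frames */
        write(tcp_fd, hdr, hdrlen);
        sendfile(tcp_fd, file_fd, &pos, filelen);
        setsockopt(tcp_fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));   /* uncork: flush the tail */
        /* Clearing the cork runs tcp_push_pending_frames(), as in the case above. */
}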
+
+/* Return information about state of tcp endpoint in API format. */
+void tcp_get_info(struct sock *sk, struct tcp_info *info)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 now = tcp_time_stamp;
+
+ memset(info, 0, sizeof(*info));
+
+ info->tcpi_state = sk->sk_state;
+ info->tcpi_ca_state = tp->ca_state;
+ info->tcpi_retransmits = tp->retransmits;
+ info->tcpi_probes = tp->probes_out;
+ info->tcpi_backoff = tp->backoff;
+
+ if (tp->rx_opt.tstamp_ok)
+ info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
+ if (tp->rx_opt.sack_ok)
+ info->tcpi_options |= TCPI_OPT_SACK;
+ if (tp->rx_opt.wscale_ok) {
+ info->tcpi_options |= TCPI_OPT_WSCALE;
+ info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
+ info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
+ }
+
+ if (tp->ecn_flags&TCP_ECN_OK)
+ info->tcpi_options |= TCPI_OPT_ECN;
+
+ info->tcpi_rto = jiffies_to_usecs(tp->rto);
+ info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
+ info->tcpi_snd_mss = tp->mss_cache_std;
+ info->tcpi_rcv_mss = tp->ack.rcv_mss;
+
+ info->tcpi_unacked = tp->packets_out;
+ info->tcpi_sacked = tp->sacked_out;
+ info->tcpi_lost = tp->lost_out;
+ info->tcpi_retrans = tp->retrans_out;
+ info->tcpi_fackets = tp->fackets_out;
+
+ info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
+ info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
+ info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
+
+ info->tcpi_pmtu = tp->pmtu_cookie;
+ info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
+ info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
+ info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
+ info->tcpi_snd_ssthresh = tp->snd_ssthresh;
+ info->tcpi_snd_cwnd = tp->snd_cwnd;
+ info->tcpi_advmss = tp->advmss;
+ info->tcpi_reordering = tp->reordering;
+
+ info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
+ info->tcpi_rcv_space = tp->rcvq_space.space;
+
+ info->tcpi_total_retrans = tp->total_retrans;
+}
+
+EXPORT_SYMBOL_GPL(tcp_get_info);
+
+int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
+ int __user *optlen)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int val, len;
+
+ if (level != SOL_TCP)
+ return tp->af_specific->getsockopt(sk, level, optname,
+ optval, optlen);
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ len = min_t(unsigned int, len, sizeof(int));
+
+ if (len < 0)
+ return -EINVAL;
+
+ switch (optname) {
+ case TCP_MAXSEG:
+ val = tp->mss_cache_std;
+ if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+ val = tp->rx_opt.user_mss;
+ break;
+ case TCP_NODELAY:
+ val = !!(tp->nonagle&TCP_NAGLE_OFF);
+ break;
+ case TCP_CORK:
+ val = !!(tp->nonagle&TCP_NAGLE_CORK);
+ break;
+ case TCP_KEEPIDLE:
+ val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
+ break;
+ case TCP_KEEPINTVL:
+ val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
+ break;
+ case TCP_KEEPCNT:
+ val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
+ break;
+ case TCP_SYNCNT:
+ val = tp->syn_retries ? : sysctl_tcp_syn_retries;
+ break;
+ case TCP_LINGER2:
+ val = tp->linger2;
+ if (val >= 0)
+ val = (val ? : sysctl_tcp_fin_timeout) / HZ;
+ break;
+ case TCP_DEFER_ACCEPT:
+ val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
+ (tp->defer_accept - 1));
+ break;
+ case TCP_WINDOW_CLAMP:
+ val = tp->window_clamp;
+ break;
+ case TCP_INFO: {
+ struct tcp_info info;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ tcp_get_info(sk, &info);
+
+ len = min_t(unsigned int, len, sizeof(info));
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &info, len))
+ return -EFAULT;
+ return 0;
+ }
+ case TCP_QUICKACK:
+ val = !tp->ack.pingpong;
+ break;
+ default:
+ return -ENOPROTOOPT;
+ };
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &val, len))
+ return -EFAULT;
+ return 0;
+}
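The TCP_INFO branch above copies out the structure filled in by tcp_get_info(). A minimal user-space sketch; dump_tcp_info is an illustrative name, and struct tcp_info is assumed to be visible through the installed tcp headers.

#include <netinet/in.h>
#include <netinet/tcp.h>        /* TCP_INFO, struct tcp_info (assumed exported here) */
#include <stdio.h>
#include <sys/socket.h>

static void dump_tcp_info(int tcp_fd)
{
        struct tcp_info info;
        socklen_t len = sizeof(info);

        if (getsockopt(tcp_fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
                printf("rtt %uus rttvar %uus cwnd %u retrans %u\n",
                       info.tcpi_rtt, info.tcpi_rttvar,
                       info.tcpi_snd_cwnd, info.tcpi_total_retrans);
}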
+
+
+extern void __skb_cb_too_small_for_tcp(int, int);
+extern void tcpdiag_init(void);
+
+static __initdata unsigned long thash_entries;
+static int __init set_thash_entries(char *str)
+{
+ if (!str)
+ return 0;
+ thash_entries = simple_strtoul(str, &str, 0);
+ return 1;
+}
+__setup("thash_entries=", set_thash_entries);
+
+void __init tcp_init(void)
+{
+ struct sk_buff *skb = NULL;
+ int order, i;
+
+ if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
+ __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
+ sizeof(skb->cb));
+
+ tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
+ sizeof(struct open_request),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+ if (!tcp_openreq_cachep)
+ panic("tcp_init: Cannot alloc open_request cache.");
+
+ tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
+ sizeof(struct tcp_bind_bucket),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+ if (!tcp_bucket_cachep)
+ panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
+
+ tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
+ sizeof(struct tcp_tw_bucket),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+ if (!tcp_timewait_cachep)
+ panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
+
+ /* Size and allocate the main established and bind bucket
+ * hash tables.
+ *
+ * The methodology is similar to that of the buffer cache.
+ */
+ tcp_ehash = (struct tcp_ehash_bucket *)
+ alloc_large_system_hash("TCP established",
+ sizeof(struct tcp_ehash_bucket),
+ thash_entries,
+ (num_physpages >= 128 * 1024) ?
+ (25 - PAGE_SHIFT) :
+ (27 - PAGE_SHIFT),
+ HASH_HIGHMEM,
+ &tcp_ehash_size,
+ NULL,
+ 0);
+ tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
+ for (i = 0; i < (tcp_ehash_size << 1); i++) {
+ rwlock_init(&tcp_ehash[i].lock);
+ INIT_HLIST_HEAD(&tcp_ehash[i].chain);
+ }
+
+ tcp_bhash = (struct tcp_bind_hashbucket *)
+ alloc_large_system_hash("TCP bind",
+ sizeof(struct tcp_bind_hashbucket),
+ tcp_ehash_size,
+ (num_physpages >= 128 * 1024) ?
+ (25 - PAGE_SHIFT) :
+ (27 - PAGE_SHIFT),
+ HASH_HIGHMEM,
+ &tcp_bhash_size,
+ NULL,
+ 64 * 1024);
+ tcp_bhash_size = 1 << tcp_bhash_size;
+ for (i = 0; i < tcp_bhash_size; i++) {
+ spin_lock_init(&tcp_bhash[i].lock);
+ INIT_HLIST_HEAD(&tcp_bhash[i].chain);
+ }
+
+ /* Try to be a bit smarter and adjust defaults depending
+ * on available memory.
+ */
+ for (order = 0; ((1 << order) << PAGE_SHIFT) <
+ (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
+ order++)
+ ;
+ if (order > 4) {
+ sysctl_local_port_range[0] = 32768;
+ sysctl_local_port_range[1] = 61000;
+ sysctl_tcp_max_tw_buckets = 180000;
+ sysctl_tcp_max_orphans = 4096 << (order - 4);
+ sysctl_max_syn_backlog = 1024;
+ } else if (order < 3) {
+ sysctl_local_port_range[0] = 1024 * (3 - order);
+ sysctl_tcp_max_tw_buckets >>= (3 - order);
+ sysctl_tcp_max_orphans >>= (3 - order);
+ sysctl_max_syn_backlog = 128;
+ }
+ tcp_port_rover = sysctl_local_port_range[0] - 1;
+
+ sysctl_tcp_mem[0] = 768 << order;
+ sysctl_tcp_mem[1] = 1024 << order;
+ sysctl_tcp_mem[2] = 1536 << order;
+
+ if (order < 3) {
+ sysctl_tcp_wmem[2] = 64 * 1024;
+ sysctl_tcp_rmem[0] = PAGE_SIZE;
+ sysctl_tcp_rmem[1] = 43689;
+ sysctl_tcp_rmem[2] = 2 * 43689;
+ }
+
+ printk(KERN_INFO "TCP: Hash tables configured "
+ "(established %d bind %d)\n",
+ tcp_ehash_size << 1, tcp_bhash_size);
+}
+
+EXPORT_SYMBOL(tcp_accept);
+EXPORT_SYMBOL(tcp_close);
+EXPORT_SYMBOL(tcp_destroy_sock);
+EXPORT_SYMBOL(tcp_disconnect);
+EXPORT_SYMBOL(tcp_getsockopt);
+EXPORT_SYMBOL(tcp_ioctl);
+EXPORT_SYMBOL(tcp_openreq_cachep);
+EXPORT_SYMBOL(tcp_poll);
+EXPORT_SYMBOL(tcp_read_sock);
+EXPORT_SYMBOL(tcp_recvmsg);
+EXPORT_SYMBOL(tcp_sendmsg);
+EXPORT_SYMBOL(tcp_sendpage);
+EXPORT_SYMBOL(tcp_setsockopt);
+EXPORT_SYMBOL(tcp_shutdown);
+EXPORT_SYMBOL(tcp_statistics);
+EXPORT_SYMBOL(tcp_timewait_cachep);
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
new file mode 100644
index 00000000000..313c1408da3
--- /dev/null
+++ b/net/ipv4/tcp_diag.c
@@ -0,0 +1,802 @@
+/*
+ * tcp_diag.c Module for monitoring TCP sockets.
+ *
+ * Version: $Id: tcp_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/random.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/time.h>
+
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/ipv6.h>
+#include <net/inet_common.h>
+
+#include <linux/inet.h>
+#include <linux/stddef.h>
+
+#include <linux/tcp_diag.h>
+
+struct tcpdiag_entry
+{
+ u32 *saddr;
+ u32 *daddr;
+ u16 sport;
+ u16 dport;
+ u16 family;
+ u16 userlocks;
+};
+
+static struct sock *tcpnl;
+
+
+#define TCPDIAG_PUT(skb, attrtype, attrlen) \
+({ int rtalen = RTA_LENGTH(attrlen); \
+ struct rtattr *rta; \
+ if (skb_tailroom(skb) < RTA_ALIGN(rtalen)) goto nlmsg_failure; \
+ rta = (void*)__skb_put(skb, RTA_ALIGN(rtalen)); \
+ rta->rta_type = attrtype; \
+ rta->rta_len = rtalen; \
+ RTA_DATA(rta); })
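+
+/*
+ * TCPDIAG_PUT() reserves an RTA_ALIGN()ed rtattr of the given type and
+ * payload length at the tail of the reply skb and evaluates to a pointer
+ * to that payload; if the skb has no tailroom left, it jumps to the
+ * caller's nlmsg_failure label.
+ */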
+
+static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
+ int ext, u32 pid, u32 seq, u16 nlmsg_flags)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcpdiagmsg *r;
+ struct nlmsghdr *nlh;
+ struct tcp_info *info = NULL;
+ struct tcpdiag_meminfo *minfo = NULL;
+ struct tcpvegas_info *vinfo = NULL;
+ unsigned char *b = skb->tail;
+
+ nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r));
+ nlh->nlmsg_flags = nlmsg_flags;
+ r = NLMSG_DATA(nlh);
+ if (sk->sk_state != TCP_TIME_WAIT) {
+ if (ext & (1<<(TCPDIAG_MEMINFO-1)))
+ minfo = TCPDIAG_PUT(skb, TCPDIAG_MEMINFO, sizeof(*minfo));
+ if (ext & (1<<(TCPDIAG_INFO-1)))
+ info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info));
+
+ if ((tcp_is_westwood(tp) || tcp_is_vegas(tp))
+ && (ext & (1<<(TCPDIAG_VEGASINFO-1))))
+ vinfo = TCPDIAG_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*vinfo));
+ }
+ r->tcpdiag_family = sk->sk_family;
+ r->tcpdiag_state = sk->sk_state;
+ r->tcpdiag_timer = 0;
+ r->tcpdiag_retrans = 0;
+
+ r->id.tcpdiag_if = sk->sk_bound_dev_if;
+ r->id.tcpdiag_cookie[0] = (u32)(unsigned long)sk;
+ r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
+
+ if (r->tcpdiag_state == TCP_TIME_WAIT) {
+ struct tcp_tw_bucket *tw = (struct tcp_tw_bucket*)sk;
+ long tmo = tw->tw_ttd - jiffies;
+ if (tmo < 0)
+ tmo = 0;
+
+ r->id.tcpdiag_sport = tw->tw_sport;
+ r->id.tcpdiag_dport = tw->tw_dport;
+ r->id.tcpdiag_src[0] = tw->tw_rcv_saddr;
+ r->id.tcpdiag_dst[0] = tw->tw_daddr;
+ r->tcpdiag_state = tw->tw_substate;
+ r->tcpdiag_timer = 3;
+ r->tcpdiag_expires = (tmo*1000+HZ-1)/HZ;
+ r->tcpdiag_rqueue = 0;
+ r->tcpdiag_wqueue = 0;
+ r->tcpdiag_uid = 0;
+ r->tcpdiag_inode = 0;
+#ifdef CONFIG_IP_TCPDIAG_IPV6
+ if (r->tcpdiag_family == AF_INET6) {
+ ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
+ &tw->tw_v6_rcv_saddr);
+ ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
+ &tw->tw_v6_daddr);
+ }
+#endif
+ nlh->nlmsg_len = skb->tail - b;
+ return skb->len;
+ }
+
+ r->id.tcpdiag_sport = inet->sport;
+ r->id.tcpdiag_dport = inet->dport;
+ r->id.tcpdiag_src[0] = inet->rcv_saddr;
+ r->id.tcpdiag_dst[0] = inet->daddr;
+
+#ifdef CONFIG_IP_TCPDIAG_IPV6
+ if (r->tcpdiag_family == AF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
+ &np->rcv_saddr);
+ ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
+ &np->daddr);
+ }
+#endif
+
+#define EXPIRES_IN_MS(tmo) ((tmo-jiffies)*1000+HZ-1)/HZ
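+
+	/*
+	 * tcpdiag_timer codes used below: 1 - retransmit timer pending,
+	 * 2 - sk->sk_timer pending, 4 - zero window probe timer pending,
+	 * 0 - no timer pending (3 is set above for TIME-WAIT sockets).
+	 */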
+
+ if (tp->pending == TCP_TIME_RETRANS) {
+ r->tcpdiag_timer = 1;
+ r->tcpdiag_retrans = tp->retransmits;
+ r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout);
+ } else if (tp->pending == TCP_TIME_PROBE0) {
+ r->tcpdiag_timer = 4;
+ r->tcpdiag_retrans = tp->probes_out;
+ r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout);
+ } else if (timer_pending(&sk->sk_timer)) {
+ r->tcpdiag_timer = 2;
+ r->tcpdiag_retrans = tp->probes_out;
+ r->tcpdiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires);
+ } else {
+ r->tcpdiag_timer = 0;
+ r->tcpdiag_expires = 0;
+ }
+#undef EXPIRES_IN_MS
+
+ r->tcpdiag_rqueue = tp->rcv_nxt - tp->copied_seq;
+ r->tcpdiag_wqueue = tp->write_seq - tp->snd_una;
+ r->tcpdiag_uid = sock_i_uid(sk);
+ r->tcpdiag_inode = sock_i_ino(sk);
+
+ if (minfo) {
+ minfo->tcpdiag_rmem = atomic_read(&sk->sk_rmem_alloc);
+ minfo->tcpdiag_wmem = sk->sk_wmem_queued;
+ minfo->tcpdiag_fmem = sk->sk_forward_alloc;
+ minfo->tcpdiag_tmem = atomic_read(&sk->sk_wmem_alloc);
+ }
+
+ if (info)
+ tcp_get_info(sk, info);
+
+ if (vinfo) {
+ if (tcp_is_vegas(tp)) {
+ vinfo->tcpv_enabled = tp->vegas.doing_vegas_now;
+ vinfo->tcpv_rttcnt = tp->vegas.cntRTT;
+ vinfo->tcpv_rtt = jiffies_to_usecs(tp->vegas.baseRTT);
+ vinfo->tcpv_minrtt = jiffies_to_usecs(tp->vegas.minRTT);
+ } else {
+ vinfo->tcpv_enabled = 0;
+ vinfo->tcpv_rttcnt = 0;
+ vinfo->tcpv_rtt = jiffies_to_usecs(tp->westwood.rtt);
+ vinfo->tcpv_minrtt = jiffies_to_usecs(tp->westwood.rtt_min);
+ }
+ }
+
+ nlh->nlmsg_len = skb->tail - b;
+ return skb->len;
+
+nlmsg_failure:
+ skb_trim(skb, b - skb->data);
+ return -1;
+}
+
+extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport,
+ int dif);
+#ifdef CONFIG_IP_TCPDIAG_IPV6
+extern struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
+ struct in6_addr *daddr, u16 dport,
+ int dif);
+#else
+static inline struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
+ struct in6_addr *daddr, u16 dport,
+ int dif)
+{
+ return NULL;
+}
+#endif
+
+static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh)
+{
+ int err;
+ struct sock *sk;
+ struct tcpdiagreq *req = NLMSG_DATA(nlh);
+ struct sk_buff *rep;
+
+ if (req->tcpdiag_family == AF_INET) {
+ sk = tcp_v4_lookup(req->id.tcpdiag_dst[0], req->id.tcpdiag_dport,
+ req->id.tcpdiag_src[0], req->id.tcpdiag_sport,
+ req->id.tcpdiag_if);
+ }
+#ifdef CONFIG_IP_TCPDIAG_IPV6
+ else if (req->tcpdiag_family == AF_INET6) {
+ sk = tcp_v6_lookup((struct in6_addr*)req->id.tcpdiag_dst, req->id.tcpdiag_dport,
+ (struct in6_addr*)req->id.tcpdiag_src, req->id.tcpdiag_sport,
+ req->id.tcpdiag_if);
+ }
+#endif
+ else {
+ return -EINVAL;
+ }
+
+ if (sk == NULL)
+ return -ENOENT;
+
+ err = -ESTALE;
+ if ((req->id.tcpdiag_cookie[0] != TCPDIAG_NOCOOKIE ||
+ req->id.tcpdiag_cookie[1] != TCPDIAG_NOCOOKIE) &&
+ ((u32)(unsigned long)sk != req->id.tcpdiag_cookie[0] ||
+ (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.tcpdiag_cookie[1]))
+ goto out;
+
+ err = -ENOMEM;
+ rep = alloc_skb(NLMSG_SPACE(sizeof(struct tcpdiagmsg)+
+ sizeof(struct tcpdiag_meminfo)+
+ sizeof(struct tcp_info)+64), GFP_KERNEL);
+ if (!rep)
+ goto out;
+
+ if (tcpdiag_fill(rep, sk, req->tcpdiag_ext,
+ NETLINK_CB(in_skb).pid,
+ nlh->nlmsg_seq, 0) <= 0)
+ BUG();
+
+ err = netlink_unicast(tcpnl, rep, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
+ if (err > 0)
+ err = 0;
+
+out:
+ if (sk) {
+ if (sk->sk_state == TCP_TIME_WAIT)
+ tcp_tw_put((struct tcp_tw_bucket*)sk);
+ else
+ sock_put(sk);
+ }
+ return err;
+}
+
+static int bitstring_match(const u32 *a1, const u32 *a2, int bits)
+{
+ int words = bits >> 5;
+
+ bits &= 0x1f;
+
+ if (words) {
+ if (memcmp(a1, a2, words << 2))
+ return 0;
+ }
+ if (bits) {
+ __u32 w1, w2;
+ __u32 mask;
+
+ w1 = a1[words];
+ w2 = a2[words];
+
+ mask = htonl((0xffffffff) << (32 - bits));
+
+ if ((w1 ^ w2) & mask)
+ return 0;
+ }
+
+ return 1;
+}
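+
+/*
+ * Example (illustrative): with bits == 20, no full 32-bit words are
+ * memcmp()ed and the remaining 20 bits are checked against the
+ * htonl(0xffffffff << 12) mask, so a 10.1.0.0/20 condition matches
+ * 10.1.15.255 but not 10.1.16.0.
+ */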
+
+
+static int tcpdiag_bc_run(const void *bc, int len,
+ const struct tcpdiag_entry *entry)
+{
+ while (len > 0) {
+ int yes = 1;
+ const struct tcpdiag_bc_op *op = bc;
+
+ switch (op->code) {
+ case TCPDIAG_BC_NOP:
+ break;
+ case TCPDIAG_BC_JMP:
+ yes = 0;
+ break;
+ case TCPDIAG_BC_S_GE:
+ yes = entry->sport >= op[1].no;
+ break;
+ case TCPDIAG_BC_S_LE:
+ yes = entry->dport <= op[1].no;
+ break;
+ case TCPDIAG_BC_D_GE:
+ yes = entry->dport >= op[1].no;
+ break;
+ case TCPDIAG_BC_D_LE:
+ yes = entry->dport <= op[1].no;
+ break;
+ case TCPDIAG_BC_AUTO:
+ yes = !(entry->userlocks & SOCK_BINDPORT_LOCK);
+ break;
+ case TCPDIAG_BC_S_COND:
+ case TCPDIAG_BC_D_COND:
+ {
+ struct tcpdiag_hostcond *cond = (struct tcpdiag_hostcond*)(op+1);
+ u32 *addr;
+
+ if (cond->port != -1 &&
+ cond->port != (op->code == TCPDIAG_BC_S_COND ?
+ entry->sport : entry->dport)) {
+ yes = 0;
+ break;
+ }
+
+ if (cond->prefix_len == 0)
+ break;
+
+ if (op->code == TCPDIAG_BC_S_COND)
+ addr = entry->saddr;
+ else
+ addr = entry->daddr;
+
+ if (bitstring_match(addr, cond->addr, cond->prefix_len))
+ break;
+ if (entry->family == AF_INET6 &&
+ cond->family == AF_INET) {
+ if (addr[0] == 0 && addr[1] == 0 &&
+ addr[2] == htonl(0xffff) &&
+ bitstring_match(addr+3, cond->addr, cond->prefix_len))
+ break;
+ }
+ yes = 0;
+ break;
+ }
+ }
+
+ if (yes) {
+ len -= op->yes;
+ bc += op->yes;
+ } else {
+ len -= op->no;
+ bc += op->no;
+ }
+ }
+ return (len == 0);
+}
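+
+/*
+ * The filter is a tiny bytecode program: an array of tcpdiag_bc_op
+ * records walked via the 'yes'/'no' byte offsets above; walking exactly
+ * off the end accepts the socket, jumping past it rejects it.  A rough
+ * sketch of a filter meaning "source port >= 1024", assuming the usual
+ * 4-byte tcpdiag_bc_op layout (offsets illustrative only):
+ *
+ *	op[0] = { .code = TCPDIAG_BC_S_GE, .yes = 8, .no = 12 };
+ *	op[1] = { .no = 1024 };		/- operand: port to compare against
+ */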
+
+static int valid_cc(const void *bc, int len, int cc)
+{
+ while (len >= 0) {
+ const struct tcpdiag_bc_op *op = bc;
+
+ if (cc > len)
+ return 0;
+ if (cc == len)
+ return 1;
+ if (op->yes < 4)
+ return 0;
+ len -= op->yes;
+ bc += op->yes;
+ }
+ return 0;
+}
+
+static int tcpdiag_bc_audit(const void *bytecode, int bytecode_len)
+{
+ const unsigned char *bc = bytecode;
+ int len = bytecode_len;
+
+ while (len > 0) {
+ struct tcpdiag_bc_op *op = (struct tcpdiag_bc_op*)bc;
+
+//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len);
+ switch (op->code) {
+ case TCPDIAG_BC_AUTO:
+ case TCPDIAG_BC_S_COND:
+ case TCPDIAG_BC_D_COND:
+ case TCPDIAG_BC_S_GE:
+ case TCPDIAG_BC_S_LE:
+ case TCPDIAG_BC_D_GE:
+ case TCPDIAG_BC_D_LE:
+ if (op->yes < 4 || op->yes > len+4)
+ return -EINVAL;
+ case TCPDIAG_BC_JMP:
+ if (op->no < 4 || op->no > len+4)
+ return -EINVAL;
+ if (op->no < len &&
+ !valid_cc(bytecode, bytecode_len, len-op->no))
+ return -EINVAL;
+ break;
+ case TCPDIAG_BC_NOP:
+ if (op->yes < 4 || op->yes > len+4)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+ bc += op->yes;
+ len -= op->yes;
+ }
+ return len == 0 ? 0 : -EINVAL;
+}
+
+static int tcpdiag_dump_sock(struct sk_buff *skb, struct sock *sk,
+ struct netlink_callback *cb)
+{
+ struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
+
+ if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
+ struct tcpdiag_entry entry;
+ struct rtattr *bc = (struct rtattr *)(r + 1);
+ struct inet_sock *inet = inet_sk(sk);
+
+ entry.family = sk->sk_family;
+#ifdef CONFIG_IP_TCPDIAG_IPV6
+ if (entry.family == AF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ entry.saddr = np->rcv_saddr.s6_addr32;
+ entry.daddr = np->daddr.s6_addr32;
+ } else
+#endif
+ {
+ entry.saddr = &inet->rcv_saddr;
+ entry.daddr = &inet->daddr;
+ }
+ entry.sport = inet->num;
+ entry.dport = ntohs(inet->dport);
+ entry.userlocks = sk->sk_userlocks;
+
+ if (!tcpdiag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry))
+ return 0;
+ }
+
+ return tcpdiag_fill(skb, sk, r->tcpdiag_ext, NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI);
+}
+
+static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk,
+ struct open_request *req,
+ u32 pid, u32 seq)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ unsigned char *b = skb->tail;
+ struct tcpdiagmsg *r;
+ struct nlmsghdr *nlh;
+ long tmo;
+
+ nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r));
+ nlh->nlmsg_flags = NLM_F_MULTI;
+ r = NLMSG_DATA(nlh);
+
+ r->tcpdiag_family = sk->sk_family;
+ r->tcpdiag_state = TCP_SYN_RECV;
+ r->tcpdiag_timer = 1;
+ r->tcpdiag_retrans = req->retrans;
+
+ r->id.tcpdiag_if = sk->sk_bound_dev_if;
+ r->id.tcpdiag_cookie[0] = (u32)(unsigned long)req;
+ r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1);
+
+ tmo = req->expires - jiffies;
+ if (tmo < 0)
+ tmo = 0;
+
+ r->id.tcpdiag_sport = inet->sport;
+ r->id.tcpdiag_dport = req->rmt_port;
+ r->id.tcpdiag_src[0] = req->af.v4_req.loc_addr;
+ r->id.tcpdiag_dst[0] = req->af.v4_req.rmt_addr;
+ r->tcpdiag_expires = jiffies_to_msecs(tmo),
+ r->tcpdiag_rqueue = 0;
+ r->tcpdiag_wqueue = 0;
+ r->tcpdiag_uid = sock_i_uid(sk);
+ r->tcpdiag_inode = 0;
+#ifdef CONFIG_IP_TCPDIAG_IPV6
+ if (r->tcpdiag_family == AF_INET6) {
+ ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
+ &req->af.v6_req.loc_addr);
+ ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
+ &req->af.v6_req.rmt_addr);
+ }
+#endif
+ nlh->nlmsg_len = skb->tail - b;
+
+ return skb->len;
+
+nlmsg_failure:
+ skb_trim(skb, b - skb->data);
+ return -1;
+}
+
+static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
+ struct netlink_callback *cb)
+{
+ struct tcpdiag_entry entry;
+ struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_listen_opt *lopt;
+ struct rtattr *bc = NULL;
+ struct inet_sock *inet = inet_sk(sk);
+ int j, s_j;
+ int reqnum, s_reqnum;
+ int err = 0;
+
+ s_j = cb->args[3];
+ s_reqnum = cb->args[4];
+
+ if (s_j > 0)
+ s_j--;
+
+ entry.family = sk->sk_family;
+
+ read_lock_bh(&tp->syn_wait_lock);
+
+ lopt = tp->listen_opt;
+ if (!lopt || !lopt->qlen)
+ goto out;
+
+ if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
+ bc = (struct rtattr *)(r + 1);
+ entry.sport = inet->num;
+ entry.userlocks = sk->sk_userlocks;
+ }
+
+ for (j = s_j; j < TCP_SYNQ_HSIZE; j++) {
+ struct open_request *req, *head = lopt->syn_table[j];
+
+ reqnum = 0;
+ for (req = head; req; reqnum++, req = req->dl_next) {
+ if (reqnum < s_reqnum)
+ continue;
+ if (r->id.tcpdiag_dport != req->rmt_port &&
+ r->id.tcpdiag_dport)
+ continue;
+
+ if (bc) {
+ entry.saddr =
+#ifdef CONFIG_IP_TCPDIAG_IPV6
+ (entry.family == AF_INET6) ?
+ req->af.v6_req.loc_addr.s6_addr32 :
+#endif
+ &req->af.v4_req.loc_addr;
+ entry.daddr =
+#ifdef CONFIG_IP_TCPDIAG_IPV6
+ (entry.family == AF_INET6) ?
+ req->af.v6_req.rmt_addr.s6_addr32 :
+#endif
+ &req->af.v4_req.rmt_addr;
+ entry.dport = ntohs(req->rmt_port);
+
+ if (!tcpdiag_bc_run(RTA_DATA(bc),
+ RTA_PAYLOAD(bc), &entry))
+ continue;
+ }
+
+ err = tcpdiag_fill_req(skb, sk, req,
+ NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq);
+ if (err < 0) {
+ cb->args[3] = j + 1;
+ cb->args[4] = reqnum;
+ goto out;
+ }
+ }
+
+ s_reqnum = 0;
+ }
+
+out:
+ read_unlock_bh(&tp->syn_wait_lock);
+
+ return err;
+}
+
+static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int i, num;
+ int s_i, s_num;
+ struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
+
+ s_i = cb->args[1];
+ s_num = num = cb->args[2];
+
+ if (cb->args[0] == 0) {
+ if (!(r->tcpdiag_states&(TCPF_LISTEN|TCPF_SYN_RECV)))
+ goto skip_listen_ht;
+ tcp_listen_lock();
+ for (i = s_i; i < TCP_LHTABLE_SIZE; i++) {
+ struct sock *sk;
+ struct hlist_node *node;
+
+ num = 0;
+ sk_for_each(sk, node, &tcp_listening_hash[i]) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (num < s_num) {
+ num++;
+ continue;
+ }
+
+ if (r->id.tcpdiag_sport != inet->sport &&
+ r->id.tcpdiag_sport)
+ goto next_listen;
+
+ if (!(r->tcpdiag_states&TCPF_LISTEN) ||
+ r->id.tcpdiag_dport ||
+ cb->args[3] > 0)
+ goto syn_recv;
+
+ if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
+ tcp_listen_unlock();
+ goto done;
+ }
+
+syn_recv:
+ if (!(r->tcpdiag_states&TCPF_SYN_RECV))
+ goto next_listen;
+
+ if (tcpdiag_dump_reqs(skb, sk, cb) < 0) {
+ tcp_listen_unlock();
+ goto done;
+ }
+
+next_listen:
+ cb->args[3] = 0;
+ cb->args[4] = 0;
+ ++num;
+ }
+
+ s_num = 0;
+ cb->args[3] = 0;
+ cb->args[4] = 0;
+ }
+ tcp_listen_unlock();
+skip_listen_ht:
+ cb->args[0] = 1;
+ s_i = num = s_num = 0;
+ }
+
+ if (!(r->tcpdiag_states&~(TCPF_LISTEN|TCPF_SYN_RECV)))
+ return skb->len;
+
+ for (i = s_i; i < tcp_ehash_size; i++) {
+ struct tcp_ehash_bucket *head = &tcp_ehash[i];
+ struct sock *sk;
+ struct hlist_node *node;
+
+ if (i > s_i)
+ s_num = 0;
+
+ read_lock_bh(&head->lock);
+
+ num = 0;
+ sk_for_each(sk, node, &head->chain) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (num < s_num)
+ goto next_normal;
+ if (!(r->tcpdiag_states & (1 << sk->sk_state)))
+ goto next_normal;
+ if (r->id.tcpdiag_sport != inet->sport &&
+ r->id.tcpdiag_sport)
+ goto next_normal;
+ if (r->id.tcpdiag_dport != inet->dport && r->id.tcpdiag_dport)
+ goto next_normal;
+ if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
+ read_unlock_bh(&head->lock);
+ goto done;
+ }
+next_normal:
+ ++num;
+ }
+
+ if (r->tcpdiag_states&TCPF_TIME_WAIT) {
+ sk_for_each(sk, node,
+ &tcp_ehash[i + tcp_ehash_size].chain) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (num < s_num)
+ goto next_dying;
+ if (r->id.tcpdiag_sport != inet->sport &&
+ r->id.tcpdiag_sport)
+ goto next_dying;
+ if (r->id.tcpdiag_dport != inet->dport &&
+ r->id.tcpdiag_dport)
+ goto next_dying;
+ if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
+ read_unlock_bh(&head->lock);
+ goto done;
+ }
+next_dying:
+ ++num;
+ }
+ }
+ read_unlock_bh(&head->lock);
+ }
+
+done:
+ cb->args[1] = i;
+ cb->args[2] = num;
+ return skb->len;
+}
+
+static int tcpdiag_dump_done(struct netlink_callback *cb)
+{
+ return 0;
+}
+
+
+static __inline__ int
+tcpdiag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ if (!(nlh->nlmsg_flags&NLM_F_REQUEST))
+ return 0;
+
+ if (nlh->nlmsg_type != TCPDIAG_GETSOCK)
+ goto err_inval;
+
+ if (NLMSG_LENGTH(sizeof(struct tcpdiagreq)) > skb->len)
+ goto err_inval;
+
+ if (nlh->nlmsg_flags&NLM_F_DUMP) {
+ if (nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(struct tcpdiagreq))) {
+ struct rtattr *rta = (struct rtattr*)(NLMSG_DATA(nlh) + sizeof(struct tcpdiagreq));
+ if (rta->rta_type != TCPDIAG_REQ_BYTECODE ||
+ rta->rta_len < 8 ||
+ rta->rta_len > nlh->nlmsg_len - NLMSG_SPACE(sizeof(struct tcpdiagreq)))
+ goto err_inval;
+ if (tcpdiag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta)))
+ goto err_inval;
+ }
+ return netlink_dump_start(tcpnl, skb, nlh,
+ tcpdiag_dump,
+ tcpdiag_dump_done);
+ } else {
+ return tcpdiag_get_exact(skb, nlh);
+ }
+
+err_inval:
+ return -EINVAL;
+}
+
+
+static inline void tcpdiag_rcv_skb(struct sk_buff *skb)
+{
+ int err;
+ struct nlmsghdr * nlh;
+
+ if (skb->len >= NLMSG_SPACE(0)) {
+ nlh = (struct nlmsghdr *)skb->data;
+ if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
+ return;
+ err = tcpdiag_rcv_msg(skb, nlh);
+ if (err || nlh->nlmsg_flags & NLM_F_ACK)
+ netlink_ack(skb, nlh, err);
+ }
+}
+
+static void tcpdiag_rcv(struct sock *sk, int len)
+{
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+ tcpdiag_rcv_skb(skb);
+ kfree_skb(skb);
+ }
+}
+
+static int __init tcpdiag_init(void)
+{
+ tcpnl = netlink_kernel_create(NETLINK_TCPDIAG, tcpdiag_rcv);
+ if (tcpnl == NULL)
+ return -ENOMEM;
+ return 0;
+}
+
+static void __exit tcpdiag_exit(void)
+{
+ sock_release(tcpnl->sk_socket);
+}
+
+module_init(tcpdiag_init);
+module_exit(tcpdiag_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
new file mode 100644
index 00000000000..25049273590
--- /dev/null
+++ b/net/ipv4/tcp_input.c
@@ -0,0 +1,4959 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Version: $Id: tcp_input.c,v 1.243 2002/02/01 22:01:04 davem Exp $
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ * Corey Minyard <wf-rch!minyard@relay.EU.net>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ * Linus Torvalds, <torvalds@cs.helsinki.fi>
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Matthew Dillon, <dillon@apollo.west.oic.com>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+/*
+ * Changes:
+ * Pedro Roque : Fast Retransmit/Recovery.
+ * Two receive queues.
+ * Retransmit queue handled by TCP.
+ * Better retransmit timer handling.
+ * New congestion avoidance.
+ * Header prediction.
+ * Variable renaming.
+ *
+ * Eric : Fast Retransmit.
+ * Randy Scott : MSS option defines.
+ * Eric Schenk : Fixes to slow start algorithm.
+ * Eric Schenk : Yet another double ACK bug.
+ * Eric Schenk : Delayed ACK bug fixes.
+ * Eric Schenk : Floyd style fast retrans war avoidance.
+ * David S. Miller : Don't allow zero congestion window.
+ * Eric Schenk : Fix retransmitter so that it sends
+ * next packet on ack of previous packet.
+ * Andi Kleen : Moved open_request checking here
+ * and process RSTs for open_requests.
+ * Andi Kleen : Better prune_queue, and other fixes.
+ *		Andrey Savochkin:	Fix RTT measurements in the presence of
+ * timestamps.
+ * Andrey Savochkin: Check sequence numbers correctly when
+ * removing SACKs due to in sequence incoming
+ * data segments.
+ * Andi Kleen: Make sure we never ack data there is not
+ * enough room for. Also make this condition
+ * a fatal error if it might still happen.
+ * Andi Kleen: Add tcp_measure_rcv_mss to make
+ * connections with MSS<min(MTU,ann. MSS)
+ * work without delayed acks.
+ * Andi Kleen: Process packets with PSH set in the
+ * fast path.
+ * J Hadi Salim: ECN support
+ * Andrei Gurtov,
+ * Pasi Sarolahti,
+ * Panu Kuhlberg: Experimental audit of TCP (re)transmission
+ * engine. Lots of bugs are found.
+ * Pasi Sarolahti: F-RTO for dealing with spurious RTOs
+ * Angelo Dell'Aera: TCP Westwood+ support
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+#include <net/tcp.h>
+#include <net/inet_common.h>
+#include <linux/ipsec.h>
+#include <asm/unaligned.h>
+
+int sysctl_tcp_timestamps = 1;
+int sysctl_tcp_window_scaling = 1;
+int sysctl_tcp_sack = 1;
+int sysctl_tcp_fack = 1;
+int sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
+int sysctl_tcp_ecn;
+int sysctl_tcp_dsack = 1;
+int sysctl_tcp_app_win = 31;
+int sysctl_tcp_adv_win_scale = 2;
+
+int sysctl_tcp_stdurg;
+int sysctl_tcp_rfc1337;
+int sysctl_tcp_max_orphans = NR_FILE;
+int sysctl_tcp_frto;
+int sysctl_tcp_nometrics_save;
+int sysctl_tcp_westwood;
+int sysctl_tcp_vegas_cong_avoid;
+
+int sysctl_tcp_moderate_rcvbuf = 1;
+
+/* Default values of the Vegas variables, in fixed-point representation
+ * with V_PARAM_SHIFT bits to the right of the binary point.
+ */
+#define V_PARAM_SHIFT 1
+int sysctl_tcp_vegas_alpha = 1<<V_PARAM_SHIFT;
+int sysctl_tcp_vegas_beta = 3<<V_PARAM_SHIFT;
+int sysctl_tcp_vegas_gamma = 1<<V_PARAM_SHIFT;
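+/*
+ * Example: with V_PARAM_SHIFT == 1 a stored value of 6 (3<<V_PARAM_SHIFT)
+ * encodes 3.0, so the defaults above correspond to alpha = 1, beta = 3
+ * and gamma = 1 segments.
+ */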
+int sysctl_tcp_bic = 1;
+int sysctl_tcp_bic_fast_convergence = 1;
+int sysctl_tcp_bic_low_window = 14;
+int sysctl_tcp_bic_beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
+
+#define FLAG_DATA 0x01 /* Incoming frame contained data. */
+#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
+#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
+#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
+#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
+#define FLAG_DATA_SACKED 0x20 /* New SACK. */
+#define FLAG_ECE 0x40 /* ECE in this ACK */
+#define FLAG_DATA_LOST 0x80 /* SACK detected data lossage. */
+#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
+
+#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
+#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
+#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
+#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
+
+#define IsReno(tp) ((tp)->rx_opt.sack_ok == 0)
+#define IsFack(tp) ((tp)->rx_opt.sack_ok & 2)
+#define IsDSack(tp) ((tp)->rx_opt.sack_ok & 4)
+
+#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
+
+/* Adapt the MSS value used to make delayed ack decision to the
+ * real world.
+ */
+static inline void tcp_measure_rcv_mss(struct tcp_sock *tp,
+ struct sk_buff *skb)
+{
+ unsigned int len, lss;
+
+ lss = tp->ack.last_seg_size;
+ tp->ack.last_seg_size = 0;
+
+ /* skb->len may jitter because of SACKs, even if peer
+ * sends good full-sized frames.
+ */
+ len = skb->len;
+ if (len >= tp->ack.rcv_mss) {
+ tp->ack.rcv_mss = len;
+ } else {
+		/* Otherwise, we make a more careful check, taking into
+		 * account that the size of the SACK blocks is variable.
+ *
+ * "len" is invariant segment length, including TCP header.
+ */
+ len += skb->data - skb->h.raw;
+ if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) ||
+ /* If PSH is not set, packet should be
+ * full sized, provided peer TCP is not badly broken.
+		     * This observation (if it is correct 8)) allows
+		     * us to handle super-low mtu links fairly.
+ */
+ (len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
+ !(tcp_flag_word(skb->h.th)&TCP_REMNANT))) {
+ /* Subtract also invariant (if peer is RFC compliant),
+ * tcp header plus fixed timestamp option length.
+ * Resulting "len" is MSS free of SACK jitter.
+ */
+ len -= tp->tcp_header_len;
+ tp->ack.last_seg_size = len;
+ if (len == lss) {
+ tp->ack.rcv_mss = len;
+ return;
+ }
+ }
+ tp->ack.pending |= TCP_ACK_PUSHED;
+ }
+}
+
+static void tcp_incr_quickack(struct tcp_sock *tp)
+{
+ unsigned quickacks = tp->rcv_wnd/(2*tp->ack.rcv_mss);
+
+ if (quickacks==0)
+ quickacks=2;
+ if (quickacks > tp->ack.quick)
+ tp->ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
+}
+
+void tcp_enter_quickack_mode(struct tcp_sock *tp)
+{
+ tcp_incr_quickack(tp);
+ tp->ack.pingpong = 0;
+ tp->ack.ato = TCP_ATO_MIN;
+}
+
+/* Send ACKs quickly, if "quick" count is not exhausted
+ * and the session is not interactive.
+ */
+
+static __inline__ int tcp_in_quickack_mode(struct tcp_sock *tp)
+{
+ return (tp->ack.quick && !tp->ack.pingpong);
+}
+
+/* Buffer size and advertised window tuning.
+ *
+ * 1. Tuning sk->sk_sndbuf, when connection enters established state.
+ */
+
+static void tcp_fixup_sndbuf(struct sock *sk)
+{
+ int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 +
+ sizeof(struct sk_buff);
+
+ if (sk->sk_sndbuf < 3 * sndmem)
+ sk->sk_sndbuf = min(3 * sndmem, sysctl_tcp_wmem[2]);
+}
+
+/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
+ *
+ * All of tcp_full_space() is split into two parts: the "network" buffer,
+ * allocated forward and advertised in the receiver window (tp->rcv_wnd),
+ * and the "application buffer", required to isolate scheduling/application
+ * latencies from the network.
+ * window_clamp is the maximal advertised window. It can be less than
+ * tcp_full_space(); in this case tcp_full_space() - window_clamp
+ * is reserved for the "application" buffer. The smaller window_clamp is,
+ * the smoother our behaviour from the viewpoint of the network, but the
+ * lower the throughput and the higher the sensitivity of the connection
+ * to losses. 8)
+ *
+ * rcv_ssthresh is more strict window_clamp used at "slow start"
+ * phase to predict further behaviour of this connection.
+ * It is used for two goals:
+ * - to enforce header prediction at sender, even when application
+ * requires some significant "application buffer". It is check #1.
+ * - to prevent pruning of receive queue because of misprediction
+ * of receiver window. Check #2.
+ *
+ * The scheme does not work when the sender sends good segments opening
+ * the window and then starts to feed us spaghetti. But it should work
+ * in common situations. Otherwise, we have to rely on queue collapsing.
+ */
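+
+/*
+ * Illustration (hypothetical numbers): if tcp_full_space() is 87380 bytes
+ * and window_clamp is set to 65535, the remaining ~21845 bytes act as the
+ * "application buffer" and are never advertised to the peer.
+ */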
+
+/* Slow part of check#2. */
+static int __tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
+ struct sk_buff *skb)
+{
+ /* Optimize this! */
+ int truesize = tcp_win_from_space(skb->truesize)/2;
+ int window = tcp_full_space(sk)/2;
+
+ while (tp->rcv_ssthresh <= window) {
+ if (truesize <= skb->len)
+ return 2*tp->ack.rcv_mss;
+
+ truesize >>= 1;
+ window >>= 1;
+ }
+ return 0;
+}
+
+static inline void tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
+ struct sk_buff *skb)
+{
+ /* Check #1 */
+ if (tp->rcv_ssthresh < tp->window_clamp &&
+ (int)tp->rcv_ssthresh < tcp_space(sk) &&
+ !tcp_memory_pressure) {
+ int incr;
+
+ /* Check #2. Increase window, if skb with such overhead
+ * will fit to rcvbuf in future.
+ */
+ if (tcp_win_from_space(skb->truesize) <= skb->len)
+ incr = 2*tp->advmss;
+ else
+ incr = __tcp_grow_window(sk, tp, skb);
+
+ if (incr) {
+ tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp);
+ tp->ack.quick |= 1;
+ }
+ }
+}
+
+/* 3. Tuning rcvbuf, when connection enters established state. */
+
+static void tcp_fixup_rcvbuf(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff);
+
+ /* Try to select rcvbuf so that 4 mss-sized segments
+	 * will fit into the window and the corresponding skbs will fit into our rcvbuf.
+ * (was 3; 4 is minimum to allow fast retransmit to work.)
+ */
+ while (tcp_win_from_space(rcvmem) < tp->advmss)
+ rcvmem += 128;
+ if (sk->sk_rcvbuf < 4 * rcvmem)
+ sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]);
+}
+
+/* 4. Try to fix up everything. This is done immediately after the connection
+ *    enters the established state.
+ */
+static void tcp_init_buffer_space(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int maxwin;
+
+ if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
+ tcp_fixup_rcvbuf(sk);
+ if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
+ tcp_fixup_sndbuf(sk);
+
+ tp->rcvq_space.space = tp->rcv_wnd;
+
+ maxwin = tcp_full_space(sk);
+
+ if (tp->window_clamp >= maxwin) {
+ tp->window_clamp = maxwin;
+
+ if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss)
+ tp->window_clamp = max(maxwin -
+ (maxwin >> sysctl_tcp_app_win),
+ 4 * tp->advmss);
+ }
+
+ /* Force reservation of one segment. */
+ if (sysctl_tcp_app_win &&
+ tp->window_clamp > 2 * tp->advmss &&
+ tp->window_clamp + tp->advmss > maxwin)
+ tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);
+
+ tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+static void init_bictcp(struct tcp_sock *tp)
+{
+ tp->bictcp.cnt = 0;
+
+ tp->bictcp.last_max_cwnd = 0;
+ tp->bictcp.last_cwnd = 0;
+ tp->bictcp.last_stamp = 0;
+}
+
+/* 5. Recalculate window clamp after socket hit its memory bounds. */
+static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
+{
+ struct sk_buff *skb;
+ unsigned int app_win = tp->rcv_nxt - tp->copied_seq;
+ int ofo_win = 0;
+
+ tp->ack.quick = 0;
+
+ skb_queue_walk(&tp->out_of_order_queue, skb) {
+ ofo_win += skb->len;
+ }
+
+ /* If overcommit is due to out of order segments,
+ * do not clamp window. Try to expand rcvbuf instead.
+ */
+ if (ofo_win) {
+ if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
+ !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
+ !tcp_memory_pressure &&
+ atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
+ sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
+ sysctl_tcp_rmem[2]);
+ }
+ if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
+ app_win += ofo_win;
+ if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf)
+ app_win >>= 1;
+ if (app_win > tp->ack.rcv_mss)
+ app_win -= tp->ack.rcv_mss;
+ app_win = max(app_win, 2U*tp->advmss);
+
+ if (!ofo_win)
+ tp->window_clamp = min(tp->window_clamp, app_win);
+ tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss);
+ }
+}
+
+/* Receiver "autotuning" code.
+ *
+ * The algorithm for RTT estimation w/o timestamps is based on
+ * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
+ * <http://www.lanl.gov/radiant/website/pubs/drs/lacsi2001.ps>
+ *
+ * More detail on this code can be found at
+ * <http://www.psc.edu/~jheffner/senior_thesis.ps>,
+ * though this reference is out of date. A new paper
+ * is pending.
+ */
+static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
+{
+ u32 new_sample = tp->rcv_rtt_est.rtt;
+ long m = sample;
+
+ if (m == 0)
+ m = 1;
+
+ if (new_sample != 0) {
+ /* If we sample in larger samples in the non-timestamp
+ * case, we could grossly overestimate the RTT especially
+ * with chatty applications or bulk transfer apps which
+ * are stalled on filesystem I/O.
+ *
+		 * Also, since we are only going for a minimum in the
+		 * non-timestamp case, we do not smooth things out;
+		 * otherwise, with timestamps disabled, convergence
+		 * takes too long.
+ */
+ if (!win_dep) {
+ m -= (new_sample >> 3);
+ new_sample += m;
+ } else if (m < new_sample)
+ new_sample = m << 3;
+ } else {
+		/* No previous measure. */
+ new_sample = m << 3;
+ }
+
+ if (tp->rcv_rtt_est.rtt != new_sample)
+ tp->rcv_rtt_est.rtt = new_sample;
+}
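+
+/*
+ * In the !win_dep branch above rcv_rtt_est.rtt is kept scaled by 8 and
+ * updated as new = old + (sample - old/8), i.e. roughly 7/8 of the old
+ * estimate plus 1/8 of the new sample.  In the win_dep branch the raw
+ * sample replaces the (8x-scaled) estimate only when it is below the
+ * stored, scaled value, which behaves as a minimum-style filter.
+ */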
+
+static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
+{
+ if (tp->rcv_rtt_est.time == 0)
+ goto new_measure;
+ if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
+ return;
+ tcp_rcv_rtt_update(tp,
+ jiffies - tp->rcv_rtt_est.time,
+ 1);
+
+new_measure:
+ tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
+ tp->rcv_rtt_est.time = tcp_time_stamp;
+}
+
+static inline void tcp_rcv_rtt_measure_ts(struct tcp_sock *tp, struct sk_buff *skb)
+{
+ if (tp->rx_opt.rcv_tsecr &&
+ (TCP_SKB_CB(skb)->end_seq -
+ TCP_SKB_CB(skb)->seq >= tp->ack.rcv_mss))
+ tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0);
+}
+
+/*
+ * This function should be called every time data is copied to user space.
+ * It calculates the appropriate TCP receive buffer space.
+ */
+void tcp_rcv_space_adjust(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int time;
+ int space;
+
+ if (tp->rcvq_space.time == 0)
+ goto new_measure;
+
+ time = tcp_time_stamp - tp->rcvq_space.time;
+ if (time < (tp->rcv_rtt_est.rtt >> 3) ||
+ tp->rcv_rtt_est.rtt == 0)
+ return;
+
+ space = 2 * (tp->copied_seq - tp->rcvq_space.seq);
+
+ space = max(tp->rcvq_space.space, space);
+
+ if (tp->rcvq_space.space != space) {
+ int rcvmem;
+
+ tp->rcvq_space.space = space;
+
+ if (sysctl_tcp_moderate_rcvbuf) {
+ int new_clamp = space;
+
+ /* Receive space grows, normalize in order to
+ * take into account packet headers and sk_buff
+ * structure overhead.
+ */
+ space /= tp->advmss;
+ if (!space)
+ space = 1;
+ rcvmem = (tp->advmss + MAX_TCP_HEADER +
+ 16 + sizeof(struct sk_buff));
+ while (tcp_win_from_space(rcvmem) < tp->advmss)
+ rcvmem += 128;
+ space *= rcvmem;
+ space = min(space, sysctl_tcp_rmem[2]);
+ if (space > sk->sk_rcvbuf) {
+ sk->sk_rcvbuf = space;
+
+ /* Make the window clamp follow along. */
+ tp->window_clamp = new_clamp;
+ }
+ }
+ }
+
+new_measure:
+ tp->rcvq_space.seq = tp->copied_seq;
+ tp->rcvq_space.time = tcp_time_stamp;
+}
+
+/* There is something which you must keep in mind when you analyze the
+ * behavior of the tp->ato delayed ack timeout interval. When a
+ * connection starts up, we want to ack as quickly as possible. The
+ * problem is that "good" TCPs do slow start at the beginning of data
+ * transmission. This means that until we send the first few ACKs the
+ * sender will sit on his end and only queue most of his data, because
+ * he can only send snd_cwnd unacked packets at any given time. For
+ * each ACK we send, he increments snd_cwnd and transmits more of his
+ * queue. -DaveM
+ */
+static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
+{
+ u32 now;
+
+ tcp_schedule_ack(tp);
+
+ tcp_measure_rcv_mss(tp, skb);
+
+ tcp_rcv_rtt_measure(tp);
+
+ now = tcp_time_stamp;
+
+ if (!tp->ack.ato) {
+ /* The _first_ data packet received, initialize
+ * delayed ACK engine.
+ */
+ tcp_incr_quickack(tp);
+ tp->ack.ato = TCP_ATO_MIN;
+ } else {
+ int m = now - tp->ack.lrcvtime;
+
+ if (m <= TCP_ATO_MIN/2) {
+ /* The fastest case is the first. */
+ tp->ack.ato = (tp->ack.ato>>1) + TCP_ATO_MIN/2;
+ } else if (m < tp->ack.ato) {
+ tp->ack.ato = (tp->ack.ato>>1) + m;
+ if (tp->ack.ato > tp->rto)
+ tp->ack.ato = tp->rto;
+ } else if (m > tp->rto) {
+			/* Too long a gap. Apparently the sender failed to
+			 * restart the window, so send ACKs quickly.
+ */
+ tcp_incr_quickack(tp);
+ sk_stream_mem_reclaim(sk);
+ }
+ }
+ tp->ack.lrcvtime = now;
+
+ TCP_ECN_check_ce(tp, skb);
+
+ if (skb->len >= 128)
+ tcp_grow_window(sk, tp, skb);
+}
+
+/* When starting a new connection, pin down the current choice of
+ * congestion algorithm.
+ */
+void tcp_ca_init(struct tcp_sock *tp)
+{
+ if (sysctl_tcp_westwood)
+ tp->adv_cong = TCP_WESTWOOD;
+ else if (sysctl_tcp_bic)
+ tp->adv_cong = TCP_BIC;
+ else if (sysctl_tcp_vegas_cong_avoid) {
+ tp->adv_cong = TCP_VEGAS;
+ tp->vegas.baseRTT = 0x7fffffff;
+ tcp_vegas_enable(tp);
+ }
+}
+
+/* Do RTT sampling needed for Vegas.
+ * Basically we:
+ * o min-filter RTT samples from within an RTT to get the current
+ * propagation delay + queuing delay (we are min-filtering to try to
+ * avoid the effects of delayed ACKs)
+ * o min-filter RTT samples from a much longer window (forever for now)
+ * to find the propagation delay (baseRTT)
+ */
+static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt)
+{
+ __u32 vrtt = rtt + 1; /* Never allow zero rtt or baseRTT */
+
+ /* Filter to find propagation delay: */
+ if (vrtt < tp->vegas.baseRTT)
+ tp->vegas.baseRTT = vrtt;
+
+ /* Find the min RTT during the last RTT to find
+ * the current prop. delay + queuing delay:
+ */
+ tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt);
+ tp->vegas.cntRTT++;
+}
+
+/* Called to compute a smoothed rtt estimate. The data fed to this
+ * routine either comes from timestamps, or from segments that were
+ * known _not_ to have been retransmitted [see Karn/Partridge
+ * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
+ * piece by Van Jacobson.
+ * NOTE: the next three routines used to be one big routine.
+ * To save cycles in the RFC 1323 implementation it was better to break
+ * it up into three procedures. -- erics
+ */
+static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt)
+{
+ long m = mrtt; /* RTT */
+
+ if (tcp_vegas_enabled(tp))
+ vegas_rtt_calc(tp, mrtt);
+
+ /* The following amusing code comes from Jacobson's
+ * article in SIGCOMM '88. Note that rtt and mdev
+ * are scaled versions of rtt and mean deviation.
+ * This is designed to be as fast as possible
+ * m stands for "measurement".
+ *
+	 * In a 1990 paper the rto value is changed to:
+	 *	RTO = rtt + 4 * mdev
+	 *
+	 * Funny. This algorithm seems to be very broken.
+	 * These formulae increase RTO when it should be decreased, increase
+	 * it too slowly when it should be increased quickly, decrease it
+	 * too quickly, etc. I guess in BSD RTO takes ONE value, so it
+	 * absolutely does not matter how to _calculate_ it. Seems it was
+	 * a trap that VJ failed to avoid. 8)
+ */
+ if(m == 0)
+ m = 1;
+ if (tp->srtt != 0) {
+ m -= (tp->srtt >> 3); /* m is now error in rtt est */
+ tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */
+ if (m < 0) {
+ m = -m; /* m is now abs(error) */
+ m -= (tp->mdev >> 2); /* similar update on mdev */
+ /* This is similar to one of Eifel findings.
+ * Eifel blocks mdev updates when rtt decreases.
+ * This solution is a bit different: we use finer gain
+ * for mdev in this case (alpha*beta).
+ * Like Eifel it also prevents growth of rto,
+ * but also it limits too fast rto decreases,
+ * happening in pure Eifel.
+ */
+ if (m > 0)
+ m >>= 3;
+ } else {
+ m -= (tp->mdev >> 2); /* similar update on mdev */
+ }
+ tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
+ if (tp->mdev > tp->mdev_max) {
+ tp->mdev_max = tp->mdev;
+ if (tp->mdev_max > tp->rttvar)
+ tp->rttvar = tp->mdev_max;
+ }
+ if (after(tp->snd_una, tp->rtt_seq)) {
+ if (tp->mdev_max < tp->rttvar)
+ tp->rttvar -= (tp->rttvar-tp->mdev_max)>>2;
+ tp->rtt_seq = tp->snd_nxt;
+ tp->mdev_max = TCP_RTO_MIN;
+ }
+ } else {
+ /* no previous measure. */
+ tp->srtt = m<<3; /* take the measured time to be rtt */
+ tp->mdev = m<<1; /* make sure rto = 3*rtt */
+ tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
+ tp->rtt_seq = tp->snd_nxt;
+ }
+
+ tcp_westwood_update_rtt(tp, tp->srtt >> 3);
+}
+
+/* Calculate rto without backoff. This is the second half of Van Jacobson's
+ * routine referred to above.
+ */
+static inline void tcp_set_rto(struct tcp_sock *tp)
+{
+ /* Old crap is replaced with new one. 8)
+ *
+ * More seriously:
+	 * 1. If the rtt variance happened to be less than 50 msec, it is a
+	 *    hallucination. It cannot be less, due to the utterly erratic
+	 *    ACK generation made at least by solaris and freebsd.
+	 *    "Erratic ACKs" have _nothing_ to do with delayed acks, because
+	 *    at cwnd>2 the true delack timeout is invisible. Actually,
+	 *    Linux-2.4 also generates erratic ACKs in some circumstances.
+ */
+ tp->rto = (tp->srtt >> 3) + tp->rttvar;
+
+ /* 2. Fixups made earlier cannot be right.
+ * If we do not estimate RTO correctly without them,
+	 *    the whole algo is pure shit and should be replaced
+	 *    with a correct one. That is exactly what we pretend to do.
+ */
+}
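+
+/*
+ * Illustration (hypothetical numbers): with srtt stored 8x-scaled as 800
+ * (a smoothed RTT of 100 ticks) and rttvar at 50 ticks, the formula above
+ * gives rto = 800/8 + 50 = 150 ticks.
+ */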
+
+/* NOTE: clamping at TCP_RTO_MIN is not required, current algo
+ * guarantees that rto is higher.
+ */
+static inline void tcp_bound_rto(struct tcp_sock *tp)
+{
+ if (tp->rto > TCP_RTO_MAX)
+ tp->rto = TCP_RTO_MAX;
+}
+
+/* Save metrics learned by this TCP session.
+   This function is called only when TCP finishes successfully,
+   i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
+ */
+void tcp_update_metrics(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct dst_entry *dst = __sk_dst_get(sk);
+
+ if (sysctl_tcp_nometrics_save)
+ return;
+
+ dst_confirm(dst);
+
+ if (dst && (dst->flags&DST_HOST)) {
+ int m;
+
+ if (tp->backoff || !tp->srtt) {
+ /* This session failed to estimate rtt. Why?
+ * Probably, no packets returned in time.
+ * Reset our results.
+ */
+ if (!(dst_metric_locked(dst, RTAX_RTT)))
+ dst->metrics[RTAX_RTT-1] = 0;
+ return;
+ }
+
+ m = dst_metric(dst, RTAX_RTT) - tp->srtt;
+
+		/* If the newly calculated rtt is larger than the stored one,
+		 * store the new one. Otherwise, use EWMA. Remember,
+		 * rtt overestimation is always better than underestimation.
+ */
+ if (!(dst_metric_locked(dst, RTAX_RTT))) {
+ if (m <= 0)
+ dst->metrics[RTAX_RTT-1] = tp->srtt;
+ else
+ dst->metrics[RTAX_RTT-1] -= (m>>3);
+ }
+
+ if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
+ if (m < 0)
+ m = -m;
+
+ /* Scale deviation to rttvar fixed point */
+ m >>= 1;
+ if (m < tp->mdev)
+ m = tp->mdev;
+
+ if (m >= dst_metric(dst, RTAX_RTTVAR))
+ dst->metrics[RTAX_RTTVAR-1] = m;
+ else
+ dst->metrics[RTAX_RTTVAR-1] -=
+ (dst->metrics[RTAX_RTTVAR-1] - m)>>2;
+ }
+
+ if (tp->snd_ssthresh >= 0xFFFF) {
+ /* Slow start still did not finish. */
+ if (dst_metric(dst, RTAX_SSTHRESH) &&
+ !dst_metric_locked(dst, RTAX_SSTHRESH) &&
+ (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
+ dst->metrics[RTAX_SSTHRESH-1] = tp->snd_cwnd >> 1;
+ if (!dst_metric_locked(dst, RTAX_CWND) &&
+ tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
+ dst->metrics[RTAX_CWND-1] = tp->snd_cwnd;
+ } else if (tp->snd_cwnd > tp->snd_ssthresh &&
+ tp->ca_state == TCP_CA_Open) {
+ /* Cong. avoidance phase, cwnd is reliable. */
+ if (!dst_metric_locked(dst, RTAX_SSTHRESH))
+ dst->metrics[RTAX_SSTHRESH-1] =
+ max(tp->snd_cwnd >> 1, tp->snd_ssthresh);
+ if (!dst_metric_locked(dst, RTAX_CWND))
+ dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_cwnd) >> 1;
+ } else {
+			/* Else slow start did not finish, cwnd is nonsense,
+			   and ssthresh may also be invalid.
+ */
+ if (!dst_metric_locked(dst, RTAX_CWND))
+ dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_ssthresh) >> 1;
+ if (dst->metrics[RTAX_SSTHRESH-1] &&
+ !dst_metric_locked(dst, RTAX_SSTHRESH) &&
+ tp->snd_ssthresh > dst->metrics[RTAX_SSTHRESH-1])
+ dst->metrics[RTAX_SSTHRESH-1] = tp->snd_ssthresh;
+ }
+
+ if (!dst_metric_locked(dst, RTAX_REORDERING)) {
+ if (dst->metrics[RTAX_REORDERING-1] < tp->reordering &&
+ tp->reordering != sysctl_tcp_reordering)
+ dst->metrics[RTAX_REORDERING-1] = tp->reordering;
+ }
+ }
+}
+
+/* Numbers are taken from RFC2414. */
+__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
+{
+ __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
+
+ if (!cwnd) {
+ if (tp->mss_cache_std > 1460)
+ cwnd = 2;
+ else
+ cwnd = (tp->mss_cache_std > 1095) ? 3 : 4;
+ }
+ return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
+}
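+
+/*
+ * Example: with no initcwnd route metric and mss_cache_std == 1460 the
+ * RFC2414 table above yields an initial cwnd of 3 segments (1460 is not
+ * greater than 1460, but is greater than 1095), clamped to snd_cwnd_clamp.
+ */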
+
+/* Initialize metrics on socket. */
+
+static void tcp_init_metrics(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct dst_entry *dst = __sk_dst_get(sk);
+
+ if (dst == NULL)
+ goto reset;
+
+ dst_confirm(dst);
+
+ if (dst_metric_locked(dst, RTAX_CWND))
+ tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
+ if (dst_metric(dst, RTAX_SSTHRESH)) {
+ tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
+ if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
+ tp->snd_ssthresh = tp->snd_cwnd_clamp;
+ }
+ if (dst_metric(dst, RTAX_REORDERING) &&
+ tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
+ tp->rx_opt.sack_ok &= ~2;
+ tp->reordering = dst_metric(dst, RTAX_REORDERING);
+ }
+
+ if (dst_metric(dst, RTAX_RTT) == 0)
+ goto reset;
+
+ if (!tp->srtt && dst_metric(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3))
+ goto reset;
+
+ /* Initial rtt is determined from SYN,SYN-ACK.
+ * The segment is small and rtt may appear much
+ * less than real one. Use per-dst memory
+ * to make it more realistic.
+ *
+	 * A bit of theory. RTT is the time that passes after a "normal"
+	 * sized packet is sent until it is ACKed. In normal circumstances,
+	 * sending small packets forces the peer to delay ACKs and the
+	 * calculation is correct too. The algorithm is adaptive and,
+	 * provided we follow the specs, it NEVER underestimates RTT. BUT!
+	 * If the peer tries some clever tricks, sort of "quick acks" for
+	 * long enough to decrease RTT to a low value, and then abruptly
+	 * stops doing it and starts to delay ACKs, expect trouble.
+ */
+ if (dst_metric(dst, RTAX_RTT) > tp->srtt) {
+ tp->srtt = dst_metric(dst, RTAX_RTT);
+ tp->rtt_seq = tp->snd_nxt;
+ }
+ if (dst_metric(dst, RTAX_RTTVAR) > tp->mdev) {
+ tp->mdev = dst_metric(dst, RTAX_RTTVAR);
+ tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
+ }
+ tcp_set_rto(tp);
+ tcp_bound_rto(tp);
+ if (tp->rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp)
+ goto reset;
+ tp->snd_cwnd = tcp_init_cwnd(tp, dst);
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ return;
+
+reset:
+ /* Play conservative. If timestamps are not
+ * supported, TCP will fail to recalculate correct
+ * rtt, if initial rto is too small. FORGET ALL AND RESET!
+ */
+ if (!tp->rx_opt.saw_tstamp && tp->srtt) {
+ tp->srtt = 0;
+ tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
+ tp->rto = TCP_TIMEOUT_INIT;
+ }
+}
+
+static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts)
+{
+ if (metric > tp->reordering) {
+ tp->reordering = min(TCP_MAX_REORDERING, metric);
+
+		/* This exciting event is worth remembering. 8) */
+ if (ts)
+ NET_INC_STATS_BH(LINUX_MIB_TCPTSREORDER);
+ else if (IsReno(tp))
+ NET_INC_STATS_BH(LINUX_MIB_TCPRENOREORDER);
+ else if (IsFack(tp))
+ NET_INC_STATS_BH(LINUX_MIB_TCPFACKREORDER);
+ else
+ NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER);
+#if FASTRETRANS_DEBUG > 1
+ printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n",
+ tp->rx_opt.sack_ok, tp->ca_state,
+ tp->reordering,
+ tp->fackets_out,
+ tp->sacked_out,
+ tp->undo_marker ? tp->undo_retrans : 0);
+#endif
+ /* Disable FACK yet. */
+ tp->rx_opt.sack_ok &= ~2;
+ }
+}
+
+/* This procedure tags the retransmission queue when SACKs arrive.
+ *
+ * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
+ * Packets in queue with these bits set are counted in variables
+ * sacked_out, retrans_out and lost_out, correspondingly.
+ *
+ * Valid combinations are:
+ * Tag InFlight Description
+ * 0 1 - orig segment is in flight.
+ * S 0 - nothing flies, orig reached receiver.
+ * L 0 - nothing flies, orig lost by net.
+ * R 2 - both orig and retransmit are in flight.
+ * L|R 1 - orig is lost, retransmit is in flight.
+ * S|R 1 - orig reached receiver, retrans is still in flight.
+ * (L|S|R is logically valid, it could occur when L|R is sacked,
+ *  but it is equivalent to plain S and the code short-circuits it to S.
+ * L|S is logically invalid, it would mean -1 packet in flight 8))
+ *
+ * These 6 states form finite state machine, controlled by the following events:
+ * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
+ * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
+ * 3. Loss detection event of one of three flavors:
+ * A. Scoreboard estimator decided the packet is lost.
+ * A'. Reno "three dupacks" marks head of queue lost.
+ *	A''. Its FACK modification, head until snd.fack is lost.
+ * B. SACK arrives sacking data transmitted after never retransmitted
+ * hole was sent out.
+ * C. SACK arrives sacking SND.NXT at the moment, when the
+ * segment was retransmitted.
+ * 4. D-SACK added new rule: D-SACK changes any tag to S.
+ *
+ * It is pleasant to note that the state diagram turns out to be
+ * commutative, so that we are allowed not to be bothered by the order
+ * of our actions when multiple events arrive simultaneously (see the
+ * function below).
+ *
+ * Reordering detection.
+ * --------------------
+ * The reordering metric is the maximal distance by which a packet can be
+ * displaced in the packet stream. With SACKs we can estimate it:
+ *
+ * 1. SACK fills old hole and the corresponding segment was not
+ * ever retransmitted -> reordering. Alas, we cannot use it
+ * when segment was retransmitted.
+ * 2. The last flaw is solved with D-SACK. D-SACK arrives
+ * for retransmitted and already SACKed segment -> reordering..
+ * Both of these heuristics are not used in Loss state, when we cannot
+ * account for retransmits accurately.
+ */
+static int
+tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked;
+ struct tcp_sack_block *sp = (struct tcp_sack_block *)(ptr+2);
+ int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3;
+ int reord = tp->packets_out;
+ int prior_fackets;
+ u32 lost_retrans = 0;
+ int flag = 0;
+ int i;
+
+ /* So, SACKs for already sent large segments will be lost.
+	 * Not good, but the alternative is to resegment the queue. */
+ if (sk->sk_route_caps & NETIF_F_TSO) {
+ sk->sk_route_caps &= ~NETIF_F_TSO;
+ sock_set_flag(sk, SOCK_NO_LARGESEND);
+ tp->mss_cache = tp->mss_cache_std;
+ }
+
+ if (!tp->sacked_out)
+ tp->fackets_out = 0;
+ prior_fackets = tp->fackets_out;
+
+ for (i=0; i<num_sacks; i++, sp++) {
+ struct sk_buff *skb;
+ __u32 start_seq = ntohl(sp->start_seq);
+ __u32 end_seq = ntohl(sp->end_seq);
+ int fack_count = 0;
+ int dup_sack = 0;
+
+ /* Check for D-SACK. */
+ if (i == 0) {
+ u32 ack = TCP_SKB_CB(ack_skb)->ack_seq;
+
+ if (before(start_seq, ack)) {
+ dup_sack = 1;
+ tp->rx_opt.sack_ok |= 4;
+ NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV);
+ } else if (num_sacks > 1 &&
+ !after(end_seq, ntohl(sp[1].end_seq)) &&
+ !before(start_seq, ntohl(sp[1].start_seq))) {
+ dup_sack = 1;
+ tp->rx_opt.sack_ok |= 4;
+ NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV);
+ }
+
+ /* D-SACK for already forgotten data...
+ * Do dumb counting. */
+ if (dup_sack &&
+ !after(end_seq, prior_snd_una) &&
+ after(end_seq, tp->undo_marker))
+ tp->undo_retrans--;
+
+ /* Eliminate too old ACKs, but take into
+ * account more or less fresh ones, they can
+ * contain valid SACK info.
+ */
+ if (before(ack, prior_snd_una - tp->max_window))
+ return 0;
+ }
+
+ /* Event "B" in the comment above. */
+ if (after(end_seq, tp->high_seq))
+ flag |= FLAG_DATA_LOST;
+
+ sk_stream_for_retrans_queue(skb, sk) {
+ u8 sacked = TCP_SKB_CB(skb)->sacked;
+ int in_sack;
+
+ /* The retransmission queue is always in order, so
+ * we can short-circuit the walk early.
+ */
+ if(!before(TCP_SKB_CB(skb)->seq, end_seq))
+ break;
+
+ fack_count += tcp_skb_pcount(skb);
+
+ in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
+ !before(end_seq, TCP_SKB_CB(skb)->end_seq);
+
+ /* Account D-SACK for retransmitted packet. */
+ if ((dup_sack && in_sack) &&
+ (sacked & TCPCB_RETRANS) &&
+ after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))
+ tp->undo_retrans--;
+
+ /* The frame is ACKed. */
+ if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) {
+ if (sacked&TCPCB_RETRANS) {
+ if ((dup_sack && in_sack) &&
+ (sacked&TCPCB_SACKED_ACKED))
+ reord = min(fack_count, reord);
+ } else {
+ /* If it was in a hole, we detected reordering. */
+ if (fack_count < prior_fackets &&
+ !(sacked&TCPCB_SACKED_ACKED))
+ reord = min(fack_count, reord);
+ }
+
+ /* Nothing to do; acked frame is about to be dropped. */
+ continue;
+ }
+
+ if ((sacked&TCPCB_SACKED_RETRANS) &&
+ after(end_seq, TCP_SKB_CB(skb)->ack_seq) &&
+ (!lost_retrans || after(end_seq, lost_retrans)))
+ lost_retrans = end_seq;
+
+ if (!in_sack)
+ continue;
+
+ if (!(sacked&TCPCB_SACKED_ACKED)) {
+ if (sacked & TCPCB_SACKED_RETRANS) {
+ /* If the segment is not tagged as lost,
+ * we do not clear RETRANS, believing
+ * that retransmission is still in flight.
+ */
+ if (sacked & TCPCB_LOST) {
+ TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
+ tp->lost_out -= tcp_skb_pcount(skb);
+ tp->retrans_out -= tcp_skb_pcount(skb);
+ }
+ } else {
+ /* New sack for not retransmitted frame,
+ * which was in hole. It is reordering.
+ */
+ if (!(sacked & TCPCB_RETRANS) &&
+ fack_count < prior_fackets)
+ reord = min(fack_count, reord);
+
+ if (sacked & TCPCB_LOST) {
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
+ tp->lost_out -= tcp_skb_pcount(skb);
+ }
+ }
+
+ TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
+ flag |= FLAG_DATA_SACKED;
+ tp->sacked_out += tcp_skb_pcount(skb);
+
+ if (fack_count > tp->fackets_out)
+ tp->fackets_out = fack_count;
+ } else {
+ if (dup_sack && (sacked&TCPCB_RETRANS))
+ reord = min(fack_count, reord);
+ }
+
+ /* D-SACK. We can detect redundant retransmission
+ * in S|R and plain R frames and clear it.
+ * undo_retrans is decreased above, L|R frames
+ * are accounted above as well.
+ */
+ if (dup_sack &&
+ (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) {
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+ tp->retrans_out -= tcp_skb_pcount(skb);
+ }
+ }
+ }
+
+ /* Check for lost retransmit. This superb idea is
+ * borrowed from "ratehalving". Event "C".
+ * Later note: FACK people cheated me again 8),
+ * we have to account for reordering! Ugly,
+ * but should help.
+ */
+ if (lost_retrans && tp->ca_state == TCP_CA_Recovery) {
+ struct sk_buff *skb;
+
+ sk_stream_for_retrans_queue(skb, sk) {
+ if (after(TCP_SKB_CB(skb)->seq, lost_retrans))
+ break;
+ if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+ continue;
+ if ((TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) &&
+ after(lost_retrans, TCP_SKB_CB(skb)->ack_seq) &&
+ (IsFack(tp) ||
+ !before(lost_retrans,
+ TCP_SKB_CB(skb)->ack_seq + tp->reordering *
+ tp->mss_cache_std))) {
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+ tp->retrans_out -= tcp_skb_pcount(skb);
+
+ if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) {
+ tp->lost_out += tcp_skb_pcount(skb);
+ TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+ flag |= FLAG_DATA_SACKED;
+ NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT);
+ }
+ }
+ }
+ }
+
+ tp->left_out = tp->sacked_out + tp->lost_out;
+
+ if ((reord < tp->fackets_out) && tp->ca_state != TCP_CA_Loss)
+ tcp_update_reordering(tp, ((tp->fackets_out + 1) - reord), 0);
+
+#if FASTRETRANS_DEBUG > 0
+ BUG_TRAP((int)tp->sacked_out >= 0);
+ BUG_TRAP((int)tp->lost_out >= 0);
+ BUG_TRAP((int)tp->retrans_out >= 0);
+ BUG_TRAP((int)tcp_packets_in_flight(tp) >= 0);
+#endif
+ return flag;
+}
+
+/* RTO occurred, but do not yet enter loss state. Instead, transmit two new
+ * segments to see from the next ACKs whether any data was really missing.
+ * If the RTO was spurious, new ACKs should arrive.
+ */
+void tcp_enter_frto(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+
+ tp->frto_counter = 1;
+
+ if (tp->ca_state <= TCP_CA_Disorder ||
+ tp->snd_una == tp->high_seq ||
+ (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
+ tp->prior_ssthresh = tcp_current_ssthresh(tp);
+ if (!tcp_westwood_ssthresh(tp))
+ tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
+ }
+
+ /* Have to clear retransmission markers here to keep the bookkeeping
+ * in shape, even though we are not yet in Loss state.
+ * If something was really lost, it is eventually caught up
+ * in tcp_enter_frto_loss.
+ */
+ tp->retrans_out = 0;
+ tp->undo_marker = tp->snd_una;
+ tp->undo_retrans = 0;
+
+ sk_stream_for_retrans_queue(skb, sk) {
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_RETRANS;
+ }
+ tcp_sync_left_out(tp);
+
+ tcp_set_ca_state(tp, TCP_CA_Open);
+ tp->frto_highmark = tp->snd_nxt;
+}
+
+/* Enter Loss state after F-RTO was applied. Dupack arrived after RTO,
+ * which indicates that we should follow the traditional RTO recovery,
+ * i.e. mark everything lost and do go-back-N retransmission.
+ */
+static void tcp_enter_frto_loss(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ int cnt = 0;
+
+ tp->sacked_out = 0;
+ tp->lost_out = 0;
+ tp->fackets_out = 0;
+
+ sk_stream_for_retrans_queue(skb, sk) {
+ cnt += tcp_skb_pcount(skb);
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
+ if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
+
+ /* Do not mark those segments lost that were
+ * forward transmitted after RTO
+ */
+ if (!after(TCP_SKB_CB(skb)->end_seq,
+ tp->frto_highmark)) {
+ TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+ tp->lost_out += tcp_skb_pcount(skb);
+ }
+ } else {
+ tp->sacked_out += tcp_skb_pcount(skb);
+ tp->fackets_out = cnt;
+ }
+ }
+ tcp_sync_left_out(tp);
+
+ tp->snd_cwnd = tp->frto_counter + tcp_packets_in_flight(tp)+1;
+ tp->snd_cwnd_cnt = 0;
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ tp->undo_marker = 0;
+ tp->frto_counter = 0;
+
+ tp->reordering = min_t(unsigned int, tp->reordering,
+ sysctl_tcp_reordering);
+ tcp_set_ca_state(tp, TCP_CA_Loss);
+ tp->high_seq = tp->frto_highmark;
+ TCP_ECN_queue_cwr(tp);
+
+ init_bictcp(tp);
+}
+
+void tcp_clear_retrans(struct tcp_sock *tp)
+{
+ tp->left_out = 0;
+ tp->retrans_out = 0;
+
+ tp->fackets_out = 0;
+ tp->sacked_out = 0;
+ tp->lost_out = 0;
+
+ tp->undo_marker = 0;
+ tp->undo_retrans = 0;
+}
+
+/* Enter Loss state. If "how" is not zero, forget all SACK information
+ * and reset tags completely, otherwise preserve SACKs. If receiver
+ * dropped its ofo queue, we will know this due to reneging detection.
+ */
+void tcp_enter_loss(struct sock *sk, int how)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ int cnt = 0;
+
+ /* Reduce ssthresh if it has not yet been made inside this window. */
+ if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
+ (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
+ tp->prior_ssthresh = tcp_current_ssthresh(tp);
+ tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
+ }
+ tp->snd_cwnd = 1;
+ tp->snd_cwnd_cnt = 0;
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+
+ tcp_clear_retrans(tp);
+
+ /* Push undo marker, if it was plain RTO and nothing
+ * was retransmitted. */
+ if (!how)
+ tp->undo_marker = tp->snd_una;
+
+ sk_stream_for_retrans_queue(skb, sk) {
+ cnt += tcp_skb_pcount(skb);
+ if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS)
+ tp->undo_marker = 0;
+ TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
+ if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) {
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
+ TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+ tp->lost_out += tcp_skb_pcount(skb);
+ } else {
+ tp->sacked_out += tcp_skb_pcount(skb);
+ tp->fackets_out = cnt;
+ }
+ }
+ tcp_sync_left_out(tp);
+
+ tp->reordering = min_t(unsigned int, tp->reordering,
+ sysctl_tcp_reordering);
+ tcp_set_ca_state(tp, TCP_CA_Loss);
+ tp->high_seq = tp->snd_nxt;
+ TCP_ECN_queue_cwr(tp);
+}
+
+static int tcp_check_sack_reneging(struct sock *sk, struct tcp_sock *tp)
+{
+ struct sk_buff *skb;
+
+ /* If ACK arrived pointing to a remembered SACK,
+ * it means that our remembered SACKs do not reflect the
+ * real state of the receiver, i.e. the
+ * receiver _host_ is heavily congested (or buggy).
+ * Do processing similar to RTO timeout.
+ */
+ if ((skb = skb_peek(&sk->sk_write_queue)) != NULL &&
+ (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
+ NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING);
+
+ tcp_enter_loss(sk, 1);
+ tp->retransmits++;
+ tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ return 1;
+ }
+ return 0;
+}
+
+static inline int tcp_fackets_out(struct tcp_sock *tp)
+{
+ return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out;
+}
+
+static inline int tcp_skb_timedout(struct tcp_sock *tp, struct sk_buff *skb)
+{
+ return (tcp_time_stamp - TCP_SKB_CB(skb)->when > tp->rto);
+}
+
+static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp)
+{
+ return tp->packets_out &&
+ tcp_skb_timedout(tp, skb_peek(&sk->sk_write_queue));
+}
+
+/* Linux NewReno/SACK/FACK/ECN state machine.
+ * --------------------------------------
+ *
+ * "Open" Normal state, no dubious events, fast path.
+ * "Disorder" In all the respects it is "Open",
+ * but requires a bit more attention. It is entered when
+ * we see some SACKs or dupacks. It is split of "Open"
+ * mainly to move some processing from fast path to slow one.
+ * "CWR" CWND was reduced due to some Congestion Notification event.
+ * It can be ECN, ICMP source quench, local device congestion.
+ * "Recovery" CWND was reduced, we are fast-retransmitting.
+ * "Loss" CWND was reduced due to RTO timeout or SACK reneging.
+ *
+ * tcp_fastretrans_alert() is entered:
+ * - each incoming ACK, if state is not "Open"
+ * - when the arriving ACK is unusual, namely:
+ * * SACK
+ * * Duplicate ACK.
+ * * ECN ECE.
+ *
+ * Counting packets in flight is pretty simple.
+ *
+ * in_flight = packets_out - left_out + retrans_out
+ *
+ * packets_out is SND.NXT-SND.UNA counted in packets.
+ *
+ * retrans_out is number of retransmitted segments.
+ *
+ * left_out is the number of segments which left the network but are not ACKed yet.
+ *
+ * left_out = sacked_out + lost_out
+ *
+ * sacked_out: Packets which arrived at the receiver out of order
+ * and hence were not cumulatively ACKed. With SACKs this number is simply
+ * the amount of SACKed data. Even without SACKs
+ * it is easy to give a pretty reliable estimate of this number
+ * by counting duplicate ACKs.
+ *
+ * lost_out: Packets lost by the network. TCP has no explicit
+ * "loss notification" feedback from the network (for now).
+ * It means that this number can only be _guessed_.
+ * Actually, it is the heuristic used to predict loss that
+ * distinguishes the different algorithms.
+ *
+ * F.e. after RTO, when all the queue is considered as lost,
+ * lost_out = packets_out and in_flight = retrans_out.
+ *
+ * Essentially, we have now two algorithms counting
+ * lost packets.
+ *
+ * FACK: It is the simplest heuristic. As soon as we decide
+ * that something is lost, we decide that _all_ non-SACKed
+ * packets up to the most forward SACK are lost. I.e.
+ * lost_out = fackets_out - sacked_out and left_out = fackets_out.
+ * This estimate is exactly correct if the network does not reorder
+ * packets, and it loses any connection to reality when reordering
+ * takes place. We use FACK by default until reordering
+ * is suspected on the path to this destination.
+ *
+ * NewReno: when Recovery is entered, we assume that one segment
+ * is lost (classic Reno). While we are in Recovery and
+ * a partial ACK arrives, we assume that one more packet
+ * is lost (NewReno). These heuristics are the same in NewReno
+ * and SACK.
+ *
+ * Imagine, that's all! Forget all the shamanism about CWND inflation and
+ * deflation etc. CWND is the real congestion window, never inflated; it changes
+ * only according to classic VJ rules.
+ *
+ * The really tricky (and carefully tuned) part of the algorithm
+ * is hidden in the functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
+ * The first determines the moment _when_ we should reduce CWND and,
+ * hence, slow down forward transmission. In fact, it determines the moment
+ * when we decide that a hole is caused by loss rather than by reordering.
+ *
+ * tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill
+ * holes, caused by lost packets.
+ *
+ * And the most logically complicated part of algorithm is undo
+ * heuristics. We detect false retransmits due to both too early
+ * fast retransmit (reordering) and underestimated RTO, analyzing
+ * timestamps and D-SACKs. When we detect that some segments were
+ * retransmitted by mistake and CWND reduction was wrong, we undo
+ * window reduction and abort recovery phase. This logic is hidden
+ * inside several functions named tcp_try_undo_<something>.
+ */
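As a minimal stand-alone sketch of the packet-accounting identities described above (not part of this patch; the struct and numbers are invented, not real kernel state):

#include <stdio.h>

/* Mirrors:
 *   left_out  = sacked_out + lost_out
 *   in_flight = packets_out - left_out + retrans_out
 */
struct acct {
        unsigned packets_out, sacked_out, lost_out, retrans_out;
};

static unsigned in_flight(const struct acct *a)
{
        unsigned left_out = a->sacked_out + a->lost_out;

        return a->packets_out - left_out + a->retrans_out;
}

int main(void)
{
        /* e.g. right after an RTO the whole queue is considered lost */
        struct acct after_rto = { 10, 0, 10, 3 };

        printf("in_flight after RTO = %u\n", in_flight(&after_rto)); /* 3 */
        return 0;
}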
+
+/* This function decides when we should leave the Disordered state
+ * and enter the Recovery phase, reducing the congestion window.
+ *
+ * Main question: may we further continue forward transmission
+ * with the same cwnd?
+ */
+static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp)
+{
+ __u32 packets_out;
+
+ /* Trick#1: The loss is proven. */
+ if (tp->lost_out)
+ return 1;
+
+ /* Not-A-Trick#2 : Classic rule... */
+ if (tcp_fackets_out(tp) > tp->reordering)
+ return 1;
+
+ /* Trick#3 : when we use RFC2988 timer restart, fast
+ * retransmit can be triggered by timeout of queue head.
+ */
+ if (tcp_head_timedout(sk, tp))
+ return 1;
+
+ /* Trick#4: It is still not OK... But will it be useful to delay
+ * recovery more?
+ */
+ packets_out = tp->packets_out;
+ if (packets_out <= tp->reordering &&
+ tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) &&
+ !tcp_may_send_now(sk, tp)) {
+ /* We have nothing to send. This connection is limited
+ * either by receiver window or by application.
+ */
+ return 1;
+ }
+
+ return 0;
+}
+
+/* If we receive more dupacks than expected while counting segments
+ * under the assumption of no reordering, interpret this as reordering.
+ * The only other reason could be a bug in the receiver's TCP.
+ */
+static void tcp_check_reno_reordering(struct tcp_sock *tp, int addend)
+{
+ u32 holes;
+
+ holes = max(tp->lost_out, 1U);
+ holes = min(holes, tp->packets_out);
+
+ if ((tp->sacked_out + holes) > tp->packets_out) {
+ tp->sacked_out = tp->packets_out - holes;
+ tcp_update_reordering(tp, tp->packets_out+addend, 0);
+ }
+}
+
+/* Emulate SACKs for SACKless connection: account for a new dupack. */
+
+static void tcp_add_reno_sack(struct tcp_sock *tp)
+{
+ tp->sacked_out++;
+ tcp_check_reno_reordering(tp, 0);
+ tcp_sync_left_out(tp);
+}
+
+/* Account for ACK, ACKing some data in Reno Recovery phase. */
+
+static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_sock *tp, int acked)
+{
+ if (acked > 0) {
+ /* One ACK acked hole. The rest eat duplicate ACKs. */
+ if (acked-1 >= tp->sacked_out)
+ tp->sacked_out = 0;
+ else
+ tp->sacked_out -= acked-1;
+ }
+ tcp_check_reno_reordering(tp, acked);
+ tcp_sync_left_out(tp);
+}
+
+static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
+{
+ tp->sacked_out = 0;
+ tp->left_out = tp->lost_out;
+}
+
+/* Mark head of queue up as lost. */
+static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp,
+ int packets, u32 high_seq)
+{
+ struct sk_buff *skb;
+ int cnt = packets;
+
+ BUG_TRAP(cnt <= tp->packets_out);
+
+ sk_stream_for_retrans_queue(skb, sk) {
+ cnt -= tcp_skb_pcount(skb);
+ if (cnt < 0 || after(TCP_SKB_CB(skb)->end_seq, high_seq))
+ break;
+ if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
+ TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+ tp->lost_out += tcp_skb_pcount(skb);
+ }
+ }
+ tcp_sync_left_out(tp);
+}
+
+/* Account newly detected lost packet(s) */
+
+static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
+{
+ if (IsFack(tp)) {
+ int lost = tp->fackets_out - tp->reordering;
+ if (lost <= 0)
+ lost = 1;
+ tcp_mark_head_lost(sk, tp, lost, tp->high_seq);
+ } else {
+ tcp_mark_head_lost(sk, tp, 1, tp->high_seq);
+ }
+
+ /* New heuristic: it is possible only after we switched
+ * to restarting the timer each time something is ACKed.
+ * Hence, we can detect timed-out packets during fast
+ * retransmit without falling back to slow start.
+ */
+ if (tcp_head_timedout(sk, tp)) {
+ struct sk_buff *skb;
+
+ sk_stream_for_retrans_queue(skb, sk) {
+ if (tcp_skb_timedout(tp, skb) &&
+ !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
+ TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+ tp->lost_out += tcp_skb_pcount(skb);
+ }
+ }
+ tcp_sync_left_out(tp);
+ }
+}
+
+/* CWND moderation, preventing bursts due to too big ACKs
+ * in dubious situations.
+ */
+static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
+{
+ tp->snd_cwnd = min(tp->snd_cwnd,
+ tcp_packets_in_flight(tp)+tcp_max_burst(tp));
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+/* Decrease cwnd each second ack. */
+
+static void tcp_cwnd_down(struct tcp_sock *tp)
+{
+ int decr = tp->snd_cwnd_cnt + 1;
+ __u32 limit;
+
+ /*
+ * TCP Westwood
+ * Here limit is evaluated as BWestimation*RTTmin (for obtaining it
+ * in packets we use mss_cache). If sysctl_tcp_westwood is off
+ * tcp_westwood_bw_rttmin() returns 0. In such a case snd_ssthresh is
+ * still used as usual. It prevents other strange cases in which
+ * BWE*RTTmin could assume the value 0. It should not happen, but...
+ */
+
+ if (!(limit = tcp_westwood_bw_rttmin(tp)))
+ limit = tp->snd_ssthresh/2;
+
+ tp->snd_cwnd_cnt = decr&1;
+ decr >>= 1;
+
+ if (decr && tp->snd_cwnd > limit)
+ tp->snd_cwnd -= decr;
+
+ tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+}
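A small stand-alone sketch of the rate this decrement implements (one segment every second ACK), with an invented limit and the Westwood and in-flight clamps left out for brevity:

#include <stdio.h>

int main(void)
{
        unsigned cwnd = 20, cnt = 0, limit = 10;
        int ack;

        for (ack = 1; ack <= 8; ack++) {
                unsigned decr = cnt + 1;

                cnt = decr & 1;         /* remember the odd ACK for next time */
                decr >>= 1;             /* ...so decr is 1 only every second ACK */

                if (decr && cwnd > limit)
                        cwnd -= decr;
                printf("ack %d: cwnd=%u\n", ack, cwnd);
        }
        return 0;
}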
+
+/* Nothing was retransmitted, or the returned timestamp is less
+ * than the timestamp of the first retransmission.
+ */
+static inline int tcp_packet_delayed(struct tcp_sock *tp)
+{
+ return !tp->retrans_stamp ||
+ (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
+ (__s32)(tp->rx_opt.rcv_tsecr - tp->retrans_stamp) < 0);
+}
+
+/* Undo procedures. */
+
+#if FASTRETRANS_DEBUG > 1
+static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ printk(KERN_DEBUG "Undo %s %u.%u.%u.%u/%u c%u l%u ss%u/%u p%u\n",
+ msg,
+ NIPQUAD(inet->daddr), ntohs(inet->dport),
+ tp->snd_cwnd, tp->left_out,
+ tp->snd_ssthresh, tp->prior_ssthresh,
+ tp->packets_out);
+}
+#else
+#define DBGUNDO(x...) do { } while (0)
+#endif
+
+static void tcp_undo_cwr(struct tcp_sock *tp, int undo)
+{
+ if (tp->prior_ssthresh) {
+ if (tcp_is_bic(tp))
+ tp->snd_cwnd = max(tp->snd_cwnd, tp->bictcp.last_max_cwnd);
+ else
+ tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1);
+
+ if (undo && tp->prior_ssthresh > tp->snd_ssthresh) {
+ tp->snd_ssthresh = tp->prior_ssthresh;
+ TCP_ECN_withdraw_cwr(tp);
+ }
+ } else {
+ tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
+ }
+ tcp_moderate_cwnd(tp);
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+static inline int tcp_may_undo(struct tcp_sock *tp)
+{
+ return tp->undo_marker &&
+ (!tp->undo_retrans || tcp_packet_delayed(tp));
+}
+
+/* People celebrate: "We love our President!" */
+static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp)
+{
+ if (tcp_may_undo(tp)) {
+ /* Happy end! We did not retransmit anything
+ * or our original transmission succeeded.
+ */
+ DBGUNDO(sk, tp, tp->ca_state == TCP_CA_Loss ? "loss" : "retrans");
+ tcp_undo_cwr(tp, 1);
+ if (tp->ca_state == TCP_CA_Loss)
+ NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
+ else
+ NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO);
+ tp->undo_marker = 0;
+ }
+ if (tp->snd_una == tp->high_seq && IsReno(tp)) {
+ /* Hold old state until something *above* high_seq
+ * is ACKed. For Reno this is a MUST to prevent false
+ * fast retransmits (RFC2582). SACK TCP is safe. */
+ tcp_moderate_cwnd(tp);
+ return 1;
+ }
+ tcp_set_ca_state(tp, TCP_CA_Open);
+ return 0;
+}
+
+/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
+static void tcp_try_undo_dsack(struct sock *sk, struct tcp_sock *tp)
+{
+ if (tp->undo_marker && !tp->undo_retrans) {
+ DBGUNDO(sk, tp, "D-SACK");
+ tcp_undo_cwr(tp, 1);
+ tp->undo_marker = 0;
+ NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO);
+ }
+}
+
+/* Undo during fast recovery after partial ACK. */
+
+static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp,
+ int acked)
+{
+ /* Partial ACK arrived. Force Hoe's retransmit. */
+ int failed = IsReno(tp) || tp->fackets_out>tp->reordering;
+
+ if (tcp_may_undo(tp)) {
+ /* Plain luck! The hole was filled with a delayed
+ * packet, rather than with a retransmit.
+ */
+ if (tp->retrans_out == 0)
+ tp->retrans_stamp = 0;
+
+ tcp_update_reordering(tp, tcp_fackets_out(tp)+acked, 1);
+
+ DBGUNDO(sk, tp, "Hoe");
+ tcp_undo_cwr(tp, 0);
+ NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO);
+
+ /* So... Do not make Hoe's retransmit yet.
+ * If the first packet was delayed, the rest
+ * ones are most probably delayed as well.
+ */
+ failed = 0;
+ }
+ return failed;
+}
+
+/* Undo during loss recovery after partial ACK. */
+static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
+{
+ if (tcp_may_undo(tp)) {
+ struct sk_buff *skb;
+ sk_stream_for_retrans_queue(skb, sk) {
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
+ }
+ DBGUNDO(sk, tp, "partial loss");
+ tp->lost_out = 0;
+ tp->left_out = tp->sacked_out;
+ tcp_undo_cwr(tp, 1);
+ NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
+ tp->retransmits = 0;
+ tp->undo_marker = 0;
+ if (!IsReno(tp))
+ tcp_set_ca_state(tp, TCP_CA_Open);
+ return 1;
+ }
+ return 0;
+}
+
+static inline void tcp_complete_cwr(struct tcp_sock *tp)
+{
+ if (tcp_westwood_cwnd(tp))
+ tp->snd_ssthresh = tp->snd_cwnd;
+ else
+ tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
+{
+ tp->left_out = tp->sacked_out;
+
+ if (tp->retrans_out == 0)
+ tp->retrans_stamp = 0;
+
+ if (flag&FLAG_ECE)
+ tcp_enter_cwr(tp);
+
+ if (tp->ca_state != TCP_CA_CWR) {
+ int state = TCP_CA_Open;
+
+ if (tp->left_out || tp->retrans_out || tp->undo_marker)
+ state = TCP_CA_Disorder;
+
+ if (tp->ca_state != state) {
+ tcp_set_ca_state(tp, state);
+ tp->high_seq = tp->snd_nxt;
+ }
+ tcp_moderate_cwnd(tp);
+ } else {
+ tcp_cwnd_down(tp);
+ }
+}
+
+/* Process an event, which can update packets-in-flight not trivially.
+ * Main goal of this function is to calculate new estimate for left_out,
+ * taking into account both packets sitting in receiver's buffer and
+ * packets lost by network.
+ *
+ * Besides that it does CWND reduction, when packet loss is detected
+ * and changes state of machine.
+ *
+ * It does _not_ decide what to send, it is made in function
+ * tcp_xmit_retransmit_queue().
+ */
+static void
+tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
+ int prior_packets, int flag)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int is_dupack = (tp->snd_una == prior_snd_una && !(flag&FLAG_NOT_DUP));
+
+ /* Some technical things:
+ * 1. Reno does not count dupacks (sacked_out) automatically. */
+ if (!tp->packets_out)
+ tp->sacked_out = 0;
+ /* 2. SACK counts snd_fack in packets inaccurately. */
+ if (tp->sacked_out == 0)
+ tp->fackets_out = 0;
+
+ /* Now state machine starts.
+ * A. ECE, hence prohibit cwnd undoing, the reduction is required. */
+ if (flag&FLAG_ECE)
+ tp->prior_ssthresh = 0;
+
+ /* B. In all the states check for reneging SACKs. */
+ if (tp->sacked_out && tcp_check_sack_reneging(sk, tp))
+ return;
+
+ /* C. Process data loss notification, provided it is valid. */
+ if ((flag&FLAG_DATA_LOST) &&
+ before(tp->snd_una, tp->high_seq) &&
+ tp->ca_state != TCP_CA_Open &&
+ tp->fackets_out > tp->reordering) {
+ tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq);
+ NET_INC_STATS_BH(LINUX_MIB_TCPLOSS);
+ }
+
+ /* D. Synchronize left_out to current state. */
+ tcp_sync_left_out(tp);
+
+ /* E. Check state exit conditions. State can be terminated
+ * when high_seq is ACKed. */
+ if (tp->ca_state == TCP_CA_Open) {
+ if (!sysctl_tcp_frto)
+ BUG_TRAP(tp->retrans_out == 0);
+ tp->retrans_stamp = 0;
+ } else if (!before(tp->snd_una, tp->high_seq)) {
+ switch (tp->ca_state) {
+ case TCP_CA_Loss:
+ tp->retransmits = 0;
+ if (tcp_try_undo_recovery(sk, tp))
+ return;
+ break;
+
+ case TCP_CA_CWR:
+ /* CWR is to be held until something *above* high_seq
+ * is ACKed, so that the CWR bit reaches the receiver. */
+ if (tp->snd_una != tp->high_seq) {
+ tcp_complete_cwr(tp);
+ tcp_set_ca_state(tp, TCP_CA_Open);
+ }
+ break;
+
+ case TCP_CA_Disorder:
+ tcp_try_undo_dsack(sk, tp);
+ if (!tp->undo_marker ||
+ /* For the SACK case do not enter Open, so that undo
+ * can still catch all duplicate ACKs. */
+ IsReno(tp) || tp->snd_una != tp->high_seq) {
+ tp->undo_marker = 0;
+ tcp_set_ca_state(tp, TCP_CA_Open);
+ }
+ break;
+
+ case TCP_CA_Recovery:
+ if (IsReno(tp))
+ tcp_reset_reno_sack(tp);
+ if (tcp_try_undo_recovery(sk, tp))
+ return;
+ tcp_complete_cwr(tp);
+ break;
+ }
+ }
+
+ /* F. Process state. */
+ switch (tp->ca_state) {
+ case TCP_CA_Recovery:
+ if (prior_snd_una == tp->snd_una) {
+ if (IsReno(tp) && is_dupack)
+ tcp_add_reno_sack(tp);
+ } else {
+ int acked = prior_packets - tp->packets_out;
+ if (IsReno(tp))
+ tcp_remove_reno_sacks(sk, tp, acked);
+ is_dupack = tcp_try_undo_partial(sk, tp, acked);
+ }
+ break;
+ case TCP_CA_Loss:
+ if (flag&FLAG_DATA_ACKED)
+ tp->retransmits = 0;
+ if (!tcp_try_undo_loss(sk, tp)) {
+ tcp_moderate_cwnd(tp);
+ tcp_xmit_retransmit_queue(sk);
+ return;
+ }
+ if (tp->ca_state != TCP_CA_Open)
+ return;
+ /* Loss is undone; fall through to processing in Open state. */
+ default:
+ if (IsReno(tp)) {
+ if (tp->snd_una != prior_snd_una)
+ tcp_reset_reno_sack(tp);
+ if (is_dupack)
+ tcp_add_reno_sack(tp);
+ }
+
+ if (tp->ca_state == TCP_CA_Disorder)
+ tcp_try_undo_dsack(sk, tp);
+
+ if (!tcp_time_to_recover(sk, tp)) {
+ tcp_try_to_open(sk, tp, flag);
+ return;
+ }
+
+ /* Otherwise enter Recovery state */
+
+ if (IsReno(tp))
+ NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERY);
+ else
+ NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERY);
+
+ tp->high_seq = tp->snd_nxt;
+ tp->prior_ssthresh = 0;
+ tp->undo_marker = tp->snd_una;
+ tp->undo_retrans = tp->retrans_out;
+
+ if (tp->ca_state < TCP_CA_CWR) {
+ if (!(flag&FLAG_ECE))
+ tp->prior_ssthresh = tcp_current_ssthresh(tp);
+ tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
+ TCP_ECN_queue_cwr(tp);
+ }
+
+ tp->snd_cwnd_cnt = 0;
+ tcp_set_ca_state(tp, TCP_CA_Recovery);
+ }
+
+ if (is_dupack || tcp_head_timedout(sk, tp))
+ tcp_update_scoreboard(sk, tp);
+ tcp_cwnd_down(tp);
+ tcp_xmit_retransmit_queue(sk);
+}
+
+/* Read draft-ietf-tcplw-high-performance before mucking
+ * with this code. (Supersedes RFC1323)
+ */
+static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag)
+{
+ __u32 seq_rtt;
+
+ /* RTTM Rule: A TSecr value received in a segment is used to
+ * update the averaged RTT measurement only if the segment
+ * acknowledges some new data, i.e., only if it advances the
+ * left edge of the send window.
+ *
+ * See draft-ietf-tcplw-high-performance-00, section 3.3.
+ * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
+ *
+ * Changed: reset backoff as soon as we see the first valid sample.
+ * If we do not, we get a strongly overestimated rto. With timestamps,
+ * samples are accepted even from very old segments: f.e., when rtt=1
+ * increases to 8, we retransmit 5 times and after 8 seconds a delayed
+ * answer arrives; rto becomes 120 seconds! If at least one of the segments
+ * in the window is lost... Voila. --ANK (010210)
+ */
+ seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
+ tcp_rtt_estimator(tp, seq_rtt);
+ tcp_set_rto(tp);
+ tp->backoff = 0;
+ tcp_bound_rto(tp);
+}
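A brief illustrative sketch of the timestamp-based RTT sample: now minus the echoed TSecr, smoothed here with the classic srtt formula rather than the exact tcp_rtt_estimator() internals (all values invented):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint32_t now = 100200;          /* current clock, in ticks */
        uint32_t rcv_tsecr = 100040;    /* TSecr echoed by the receiver */
        uint32_t m = now - rcv_tsecr;   /* raw RTT sample: 160 ticks */
        uint32_t srtt = 120;            /* previous smoothed RTT */

        /* classic srtt += (m - srtt)/8 smoothing, for illustration only */
        srtt += (m - srtt) / 8;

        printf("sample=%u srtt=%u\n", (unsigned)m, (unsigned)srtt);
        return 0;
}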
+
+static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag)
+{
+ /* We don't have a timestamp. Can only use
+ * packets that are not retransmitted to determine
+ * rtt estimates. Also, we must not reset the
+ * backoff for rto until we get a non-retransmitted
+ * packet. This allows us to deal with a situation
+ * where the network delay has increased suddenly.
+ * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
+ */
+
+ if (flag & FLAG_RETRANS_DATA_ACKED)
+ return;
+
+ tcp_rtt_estimator(tp, seq_rtt);
+ tcp_set_rto(tp);
+ tp->backoff = 0;
+ tcp_bound_rto(tp);
+}
+
+static inline void tcp_ack_update_rtt(struct tcp_sock *tp,
+ int flag, s32 seq_rtt)
+{
+ /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
+ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
+ tcp_ack_saw_tstamp(tp, flag);
+ else if (seq_rtt >= 0)
+ tcp_ack_no_tstamp(tp, seq_rtt, flag);
+}
+
+/*
+ * Compute congestion window to use.
+ *
+ * This is from the implementation of BICTCP in
+ * Lisong Xu, Khaled Harfoush, and Injong Rhee,
+ * "Binary Increase Congestion Control for Fast, Long Distance
+ * Networks" in INFOCOM 2004
+ * Available from:
+ * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf
+ *
+ * Unless BIC is enabled and congestion window is large
+ * this behaves the same as the original Reno.
+ */
+static inline __u32 bictcp_cwnd(struct tcp_sock *tp)
+{
+ /* original Reno behaviour */
+ if (!tcp_is_bic(tp))
+ return tp->snd_cwnd;
+
+ if (tp->bictcp.last_cwnd == tp->snd_cwnd &&
+ (s32)(tcp_time_stamp - tp->bictcp.last_stamp) <= (HZ>>5))
+ return tp->bictcp.cnt;
+
+ tp->bictcp.last_cwnd = tp->snd_cwnd;
+ tp->bictcp.last_stamp = tcp_time_stamp;
+
+ /* start off normal */
+ if (tp->snd_cwnd <= sysctl_tcp_bic_low_window)
+ tp->bictcp.cnt = tp->snd_cwnd;
+
+ /* binary increase */
+ else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd) {
+ __u32 dist = (tp->bictcp.last_max_cwnd - tp->snd_cwnd)
+ / BICTCP_B;
+
+ if (dist > BICTCP_MAX_INCREMENT)
+ /* linear increase */
+ tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
+ else if (dist <= 1U)
+ /* binary search increase */
+ tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
+ / BICTCP_B;
+ else
+ /* binary search increase */
+ tp->bictcp.cnt = tp->snd_cwnd / dist;
+ } else {
+ /* slow start and linear increase */
+ if (tp->snd_cwnd < tp->bictcp.last_max_cwnd + BICTCP_B)
+ /* slow start */
+ tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
+ / BICTCP_B;
+ else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd
+ + BICTCP_MAX_INCREMENT*(BICTCP_B-1))
+ /* slow start */
+ tp->bictcp.cnt = tp->snd_cwnd * (BICTCP_B-1)
+ / (tp->snd_cwnd-tp->bictcp.last_max_cwnd);
+ else
+ /* linear increase */
+ tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
+ }
+ return tp->bictcp.cnt;
+}
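A hedged stand-alone sketch of the "binary increase" branch above, expressed as ACKs needed per one-segment cwnd increment; B, MAX_INCR and SMOOTH are illustrative stand-ins, not the kernel's BICTCP_* constants:

#include <stdio.h>

#define B               4       /* assumed stand-in for BICTCP_B */
#define MAX_INCR        32      /* assumed stand-in for BICTCP_MAX_INCREMENT */
#define SMOOTH          5       /* assumed stand-in for BICTCP_FUNC_OF_MIN_INCR */

/* ACKs needed per one-segment increase while cwnd is below the last maximum. */
static unsigned acks_per_increment(unsigned cwnd, unsigned last_max)
{
        unsigned dist = (last_max - cwnd) / B;

        if (dist > MAX_INCR)
                return cwnd / MAX_INCR;         /* far away: linear increase */
        if (dist <= 1)
                return cwnd * SMOOTH / B;       /* very close: smallest steps */
        return cwnd / dist;                     /* otherwise: binary search */
}

int main(void)
{
        unsigned cwnd;

        /* growth slows down as cwnd approaches the last maximum of 200 */
        for (cwnd = 40; cwnd < 200; cwnd += 40)
                printf("cwnd=%u -> %u ACKs per increment\n",
                       cwnd, acks_per_increment(cwnd, 200));
        return 0;
}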
+
+/* This is Jacobson's slow start and congestion avoidance.
+ * SIGCOMM '88, p. 328.
+ */
+static inline void reno_cong_avoid(struct tcp_sock *tp)
+{
+ if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ /* In "safe" area, increase. */
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
+ } else {
+ /* In dangerous area, increase slowly.
+ * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
+ */
+ if (tp->snd_cwnd_cnt >= bictcp_cwnd(tp)) {
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
+ tp->snd_cwnd_cnt=0;
+ } else
+ tp->snd_cwnd_cnt++;
+ }
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+}
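A minimal sketch of the counter trick the comment above describes (cwnd += 1/cwnd per ACK approximated with an integer counter); the values are hypothetical:

#include <stdio.h>

int main(void)
{
        unsigned cwnd = 10, cnt = 0, ssthresh = 8;
        int ack;

        for (ack = 1; ack <= 30; ack++) {
                if (cwnd <= ssthresh) {
                        cwnd++;                 /* slow start: +1 per ACK */
                } else if (++cnt >= cwnd) {     /* avoidance: +1 per ~cwnd ACKs */
                        cwnd++;
                        cnt = 0;
                }
        }
        printf("cwnd after 30 ACKs = %u\n", cwnd);
        return 0;
}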
+
+/* This is based on the congestion detection/avoidance scheme described in
+ * Lawrence S. Brakmo and Larry L. Peterson.
+ * "TCP Vegas: End to end congestion avoidance on a global internet."
+ * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
+ * October 1995. Available from:
+ * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
+ *
+ * See http://www.cs.arizona.edu/xkernel/ for their implementation.
+ * The main aspects that distinguish this implementation from the
+ * Arizona Vegas implementation are:
+ * o We do not change the loss detection or recovery mechanisms of
+ * Linux in any way. Linux already recovers from losses quite well,
+ * using fine-grained timers, NewReno, and FACK.
+ * o To avoid the performance penalty imposed by increasing cwnd
+ * only every-other RTT during slow start, we increase during
+ * every RTT during slow start, just like Reno.
+ * o Largely to allow continuous cwnd growth during slow start,
+ * we use the rate at which ACKs come back as the "actual"
+ * rate, rather than the rate at which data is sent.
+ * o To speed convergence to the right rate, we set the cwnd
+ * to achieve the right ("actual") rate when we exit slow start.
+ * o To filter out the noise caused by delayed ACKs, we use the
+ * minimum RTT sample observed during the last RTT to calculate
+ * the actual rate.
+ * o When the sender re-starts from idle, it waits until it has
+ * received ACKs for an entire flight of new data before making
+ * a cwnd adjustment decision. The original Vegas implementation
+ * assumed senders never went idle.
+ */
+static void vegas_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
+{
+ /* The key players are v_beg_snd_una and v_beg_snd_nxt.
+ *
+ * These are so named because they represent the approximate values
+ * of snd_una and snd_nxt at the beginning of the current RTT. More
+ * precisely, they represent the amount of data sent during the RTT.
+ * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
+ * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
+ * bytes of data have been ACKed during the course of the RTT, giving
+ * an "actual" rate of:
+ *
+ * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
+ *
+ * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
+ * because delayed ACKs can cover more than one segment, so they
+ * don't line up nicely with the boundaries of RTTs.
+ *
+ * Another unfortunate fact of life is that delayed ACKs delay the
+ * advance of the left edge of our send window, so that the number
+ * of bytes we send in an RTT is often less than our cwnd will allow.
+ * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
+ */
+
+ if (after(ack, tp->vegas.beg_snd_nxt)) {
+ /* Do the Vegas once-per-RTT cwnd adjustment. */
+ u32 old_wnd, old_snd_cwnd;
+
+
+ /* Here old_wnd is essentially the window of data that was
+ * sent during the previous RTT, and has all
+ * been acknowledged in the course of the RTT that ended
+ * with the ACK we just received. Likewise, old_snd_cwnd
+ * is the cwnd during the previous RTT.
+ */
+ old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) /
+ tp->mss_cache_std;
+ old_snd_cwnd = tp->vegas.beg_snd_cwnd;
+
+ /* Save the extent of the current window so we can use this
+ * at the end of the next RTT.
+ */
+ tp->vegas.beg_snd_una = tp->vegas.beg_snd_nxt;
+ tp->vegas.beg_snd_nxt = tp->snd_nxt;
+ tp->vegas.beg_snd_cwnd = tp->snd_cwnd;
+
+ /* Take into account the current RTT sample too, to
+ * decrease the impact of delayed acks. This double counts
+ * this sample since we count it for the next window as well,
+ * but that's not too awful, since we're taking the min,
+ * rather than averaging.
+ */
+ vegas_rtt_calc(tp, seq_rtt);
+
+ /* We do the Vegas calculations only if we got enough RTT
+ * samples that we can be reasonably sure that we got
+ * at least one RTT sample that wasn't from a delayed ACK.
+ * If we only had 2 samples total,
+ * then that means we're getting only 1 ACK per RTT, which
+ * means they're almost certainly delayed ACKs.
+ * If we have 3 samples, we should be OK.
+ */
+
+ if (tp->vegas.cntRTT <= 2) {
+ /* We don't have enough RTT samples to do the Vegas
+ * calculation, so we'll behave like Reno.
+ */
+ if (tp->snd_cwnd > tp->snd_ssthresh)
+ tp->snd_cwnd++;
+ } else {
+ u32 rtt, target_cwnd, diff;
+
+ /* We have enough RTT samples, so, using the Vegas
+ * algorithm, we determine if we should increase or
+ * decrease cwnd, and by how much.
+ */
+
+ /* Pluck out the RTT we are using for the Vegas
+ * calculations. This is the min RTT seen during the
+ * last RTT. Taking the min filters out the effects
+ * of delayed ACKs, at the cost of noticing congestion
+ * a bit later.
+ */
+ rtt = tp->vegas.minRTT;
+
+ /* Calculate the cwnd we should have, if we weren't
+ * going too fast.
+ *
+ * This is:
+ * (actual rate in segments) * baseRTT
+ * We keep it as a fixed point number with
+ * V_PARAM_SHIFT bits to the right of the binary point.
+ */
+ target_cwnd = ((old_wnd * tp->vegas.baseRTT)
+ << V_PARAM_SHIFT) / rtt;
+
+ /* Calculate the difference between the window we had,
+ * and the window we would like to have. This quantity
+ * is the "Diff" from the Arizona Vegas papers.
+ *
+ * Again, this is a fixed point number with
+ * V_PARAM_SHIFT bits to the right of the binary
+ * point.
+ */
+ diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
+
+ if (tp->snd_cwnd < tp->snd_ssthresh) {
+ /* Slow start. */
+ if (diff > sysctl_tcp_vegas_gamma) {
+ /* Going too fast. Time to slow down
+ * and switch to congestion avoidance.
+ */
+ tp->snd_ssthresh = 2;
+
+ /* Set cwnd to match the actual rate
+ * exactly:
+ * cwnd = (actual rate) * baseRTT
+ * Then we add 1 because the integer
+ * truncation robs us of full link
+ * utilization.
+ */
+ tp->snd_cwnd = min(tp->snd_cwnd,
+ (target_cwnd >>
+ V_PARAM_SHIFT)+1);
+
+ }
+ } else {
+ /* Congestion avoidance. */
+ u32 next_snd_cwnd;
+
+ /* Figure out where we would like cwnd
+ * to be.
+ */
+ if (diff > sysctl_tcp_vegas_beta) {
+ /* The old window was too fast, so
+ * we slow down.
+ */
+ next_snd_cwnd = old_snd_cwnd - 1;
+ } else if (diff < sysctl_tcp_vegas_alpha) {
+ /* We don't have enough extra packets
+ * in the network, so speed up.
+ */
+ next_snd_cwnd = old_snd_cwnd + 1;
+ } else {
+ /* Sending just as fast as we
+ * should be.
+ */
+ next_snd_cwnd = old_snd_cwnd;
+ }
+
+ /* Adjust cwnd upward or downward, toward the
+ * desired value.
+ */
+ if (next_snd_cwnd > tp->snd_cwnd)
+ tp->snd_cwnd++;
+ else if (next_snd_cwnd < tp->snd_cwnd)
+ tp->snd_cwnd--;
+ }
+ }
+
+ /* Wipe the slate clean for the next RTT. */
+ tp->vegas.cntRTT = 0;
+ tp->vegas.minRTT = 0x7fffffff;
+ }
+
+ /* The following code is executed for every ack we receive,
+ * except for conditions checked in should_advance_cwnd()
+ * before the call to tcp_cong_avoid(). Mainly this means that
+ * we only execute this code if the ack actually acked some
+ * data.
+ */
+
+ /* If we are in slow start, increase our cwnd in response to this ACK.
+ * (If we are not in slow start then we are in congestion avoidance,
+ * and adjust our congestion window only once per RTT. See the code
+ * above.)
+ */
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tp->snd_cwnd++;
+
+ /* to keep cwnd from growing without bound */
+ tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
+
+ /* Make sure that we are never so timid as to reduce our cwnd below
+ * 2 MSS.
+ *
+ * Going below 2 MSS would risk huge delayed ACKs from our receiver.
+ */
+ tp->snd_cwnd = max(tp->snd_cwnd, 2U);
+
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+}
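A compact, hedged sketch of the once-per-RTT Vegas decision above: derive the fixed-point target window from baseRTT and the minimum RTT seen this window, then compare the surplus ("Diff") against alpha/beta. The shift and thresholds are assumptions for illustration, not the sysctl defaults:

#include <stdio.h>

#define V_PARAM_SHIFT   1       /* assumed fixed-point shift */

int main(void)
{
        /* Hypothetical sample: 20 segments covered the last RTT,
         * baseRTT = 100 ticks, minimum RTT this window = 150 ticks.
         */
        unsigned old_wnd = 20, base_rtt = 100, rtt = 150;
        unsigned alpha = 2 << V_PARAM_SHIFT;    /* assumed thresholds */
        unsigned beta = 4 << V_PARAM_SHIFT;

        unsigned target_cwnd = ((old_wnd * base_rtt) << V_PARAM_SHIFT) / rtt;
        unsigned diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;

        printf("target_cwnd=%u diff=%u -> %s\n", target_cwnd, diff,
               diff > beta ? "decrease cwnd" :
               diff < alpha ? "increase cwnd" : "hold cwnd");
        return 0;
}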
+
+static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
+{
+ if (tcp_vegas_enabled(tp))
+ vegas_cong_avoid(tp, ack, seq_rtt);
+ else
+ reno_cong_avoid(tp);
+}
+
+/* Restart timer after forward progress on connection.
+ * RFC2988 recommends to restart timer to now+rto.
+ */
+
+static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
+{
+ if (!tp->packets_out) {
+ tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS);
+ } else {
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ }
+}
+
+/* There is one downside to this scheme. Although we keep the
+ * ACK clock ticking, adjusting packet counters and advancing
+ * congestion window, we do not liberate socket send buffer
+ * space.
+ *
+ * Mucking with skb->truesize and sk->sk_wmem_alloc et al.
+ * then making a write space wakeup callback is a possible
+ * future enhancement. WARNING: it is not trivial to make.
+ */
+static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
+ __u32 now, __s32 *seq_rtt)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+ __u32 seq = tp->snd_una;
+ __u32 packets_acked;
+ int acked = 0;
+
+ /* If we get here, the whole TSO packet has not been
+ * acked.
+ */
+ BUG_ON(!after(scb->end_seq, seq));
+
+ packets_acked = tcp_skb_pcount(skb);
+ if (tcp_trim_head(sk, skb, seq - scb->seq))
+ return 0;
+ packets_acked -= tcp_skb_pcount(skb);
+
+ if (packets_acked) {
+ __u8 sacked = scb->sacked;
+
+ acked |= FLAG_DATA_ACKED;
+ if (sacked) {
+ if (sacked & TCPCB_RETRANS) {
+ if (sacked & TCPCB_SACKED_RETRANS)
+ tp->retrans_out -= packets_acked;
+ acked |= FLAG_RETRANS_DATA_ACKED;
+ *seq_rtt = -1;
+ } else if (*seq_rtt < 0)
+ *seq_rtt = now - scb->when;
+ if (sacked & TCPCB_SACKED_ACKED)
+ tp->sacked_out -= packets_acked;
+ if (sacked & TCPCB_LOST)
+ tp->lost_out -= packets_acked;
+ if (sacked & TCPCB_URG) {
+ if (tp->urg_mode &&
+ !before(seq, tp->snd_up))
+ tp->urg_mode = 0;
+ }
+ } else if (*seq_rtt < 0)
+ *seq_rtt = now - scb->when;
+
+ if (tp->fackets_out) {
+ __u32 dval = min(tp->fackets_out, packets_acked);
+ tp->fackets_out -= dval;
+ }
+ tp->packets_out -= packets_acked;
+
+ BUG_ON(tcp_skb_pcount(skb) == 0);
+ BUG_ON(!before(scb->seq, scb->end_seq));
+ }
+
+ return acked;
+}
+
+
+/* Remove acknowledged frames from the retransmission queue. */
+static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ __u32 now = tcp_time_stamp;
+ int acked = 0;
+ __s32 seq_rtt = -1;
+
+ while ((skb = skb_peek(&sk->sk_write_queue)) &&
+ skb != sk->sk_send_head) {
+ struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+ __u8 sacked = scb->sacked;
+
+ /* If our packet is before the ack sequence we can
+ * discard it as it's confirmed to have arrived at
+ * the other end.
+ */
+ if (after(scb->end_seq, tp->snd_una)) {
+ if (tcp_skb_pcount(skb) > 1)
+ acked |= tcp_tso_acked(sk, skb,
+ now, &seq_rtt);
+ break;
+ }
+
+ /* Initial outgoing SYN's get put onto the write_queue
+ * just like anything else we transmit. It is not
+ * true data, and if we misinform our callers that
+ * this ACK acks real data, we will erroneously exit
+ * connection startup slow start one packet too
+ * quickly. This is severely frowned upon behavior.
+ */
+ if (!(scb->flags & TCPCB_FLAG_SYN)) {
+ acked |= FLAG_DATA_ACKED;
+ } else {
+ acked |= FLAG_SYN_ACKED;
+ tp->retrans_stamp = 0;
+ }
+
+ if (sacked) {
+ if (sacked & TCPCB_RETRANS) {
+ if(sacked & TCPCB_SACKED_RETRANS)
+ tp->retrans_out -= tcp_skb_pcount(skb);
+ acked |= FLAG_RETRANS_DATA_ACKED;
+ seq_rtt = -1;
+ } else if (seq_rtt < 0)
+ seq_rtt = now - scb->when;
+ if (sacked & TCPCB_SACKED_ACKED)
+ tp->sacked_out -= tcp_skb_pcount(skb);
+ if (sacked & TCPCB_LOST)
+ tp->lost_out -= tcp_skb_pcount(skb);
+ if (sacked & TCPCB_URG) {
+ if (tp->urg_mode &&
+ !before(scb->end_seq, tp->snd_up))
+ tp->urg_mode = 0;
+ }
+ } else if (seq_rtt < 0)
+ seq_rtt = now - scb->when;
+ tcp_dec_pcount_approx(&tp->fackets_out, skb);
+ tcp_packets_out_dec(tp, skb);
+ __skb_unlink(skb, skb->list);
+ sk_stream_free_skb(sk, skb);
+ }
+
+ if (acked&FLAG_ACKED) {
+ tcp_ack_update_rtt(tp, acked, seq_rtt);
+ tcp_ack_packets_out(sk, tp);
+ }
+
+#if FASTRETRANS_DEBUG > 0
+ BUG_TRAP((int)tp->sacked_out >= 0);
+ BUG_TRAP((int)tp->lost_out >= 0);
+ BUG_TRAP((int)tp->retrans_out >= 0);
+ if (!tp->packets_out && tp->rx_opt.sack_ok) {
+ if (tp->lost_out) {
+ printk(KERN_DEBUG "Leak l=%u %d\n",
+ tp->lost_out, tp->ca_state);
+ tp->lost_out = 0;
+ }
+ if (tp->sacked_out) {
+ printk(KERN_DEBUG "Leak s=%u %d\n",
+ tp->sacked_out, tp->ca_state);
+ tp->sacked_out = 0;
+ }
+ if (tp->retrans_out) {
+ printk(KERN_DEBUG "Leak r=%u %d\n",
+ tp->retrans_out, tp->ca_state);
+ tp->retrans_out = 0;
+ }
+ }
+#endif
+ *seq_rtt_p = seq_rtt;
+ return acked;
+}
+
+static void tcp_ack_probe(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Was a usable window opened? */
+
+ if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq,
+ tp->snd_una + tp->snd_wnd)) {
+ tp->backoff = 0;
+ tcp_clear_xmit_timer(sk, TCP_TIME_PROBE0);
+ /* The socket must be woken up by a subsequent tcp_data_snd_check().
+ * This function is not for random use!
+ */
+ } else {
+ tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0,
+ min(tp->rto << tp->backoff, TCP_RTO_MAX));
+ }
+}
+
+static inline int tcp_ack_is_dubious(struct tcp_sock *tp, int flag)
+{
+ return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
+ tp->ca_state != TCP_CA_Open);
+}
+
+static inline int tcp_may_raise_cwnd(struct tcp_sock *tp, int flag)
+{
+ return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
+ !((1<<tp->ca_state)&(TCPF_CA_Recovery|TCPF_CA_CWR));
+}
+
+/* Check that window update is acceptable.
+ * The function assumes that snd_una<=ack<=snd_next.
+ */
+static inline int tcp_may_update_window(struct tcp_sock *tp, u32 ack,
+ u32 ack_seq, u32 nwin)
+{
+ return (after(ack, tp->snd_una) ||
+ after(ack_seq, tp->snd_wl1) ||
+ (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd));
+}
+
+/* Update our send window.
+ *
+ * The window update algorithm described in RFC793/RFC1122 (used in linux-2.2
+ * and in FreeBSD; NetBSD's is even worse) is wrong.
+ */
+static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp,
+ struct sk_buff *skb, u32 ack, u32 ack_seq)
+{
+ int flag = 0;
+ u32 nwin = ntohs(skb->h.th->window);
+
+ if (likely(!skb->h.th->syn))
+ nwin <<= tp->rx_opt.snd_wscale;
+
+ if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
+ flag |= FLAG_WIN_UPDATE;
+ tcp_update_wl(tp, ack, ack_seq);
+
+ if (tp->snd_wnd != nwin) {
+ tp->snd_wnd = nwin;
+
+ /* Note: this is the only place where the
+ * fast path is recovered for the sending TCP.
+ */
+ tcp_fast_path_check(sk, tp);
+
+ if (nwin > tp->max_window) {
+ tp->max_window = nwin;
+ tcp_sync_mss(sk, tp->pmtu_cookie);
+ }
+ }
+ }
+
+ tp->snd_una = ack;
+
+ return flag;
+}
+
+static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tcp_sync_left_out(tp);
+
+ if (tp->snd_una == prior_snd_una ||
+ !before(tp->snd_una, tp->frto_highmark)) {
+ /* RTO was caused by loss, start retransmitting in
+ * go-back-N slow start
+ */
+ tcp_enter_frto_loss(sk);
+ return;
+ }
+
+ if (tp->frto_counter == 1) {
+ /* First ACK after RTO advances the window: allow two new
+ * segments out.
+ */
+ tp->snd_cwnd = tcp_packets_in_flight(tp) + 2;
+ } else {
+ /* Also the second ACK after RTO advances the window.
+ * The RTO was likely spurious. Reduce cwnd and continue
+ * in congestion avoidance
+ */
+ tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
+ tcp_moderate_cwnd(tp);
+ }
+
+ /* F-RTO affects the two new ACKs following the RTO.
+ * At latest on the third ACK, TCP behavior is back to normal.
+ */
+ tp->frto_counter = (tp->frto_counter + 1) % 3;
+}
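A small sketch of the counter schedule above: the first window-advancing ACK after the RTO lets two new segments out, the second drops back to ssthresh, and the counter then wraps to zero so F-RTO processing stops (values hypothetical):

#include <stdio.h>

int main(void)
{
        unsigned frto_counter = 1;      /* as set when the RTO fired */
        unsigned cwnd = 10, ssthresh = 5, in_flight = 6;
        int ack;

        for (ack = 1; ack <= 2 && frto_counter; ack++) {
                if (frto_counter == 1)
                        cwnd = in_flight + 2;   /* probe with two new segments */
                else
                        cwnd = cwnd < ssthresh ? cwnd : ssthresh; /* spurious: back off */
                frto_counter = (frto_counter + 1) % 3;
                printf("ack %d: cwnd=%u frto_counter=%u\n",
                       ack, cwnd, frto_counter);
        }
        return 0;
}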
+
+/*
+ * TCP Westwood+
+ */
+
+/*
+ * @init_westwood
+ * This function initializes the fields used in TCP Westwood+. We can't
+ * get any information about RTTmin at this time, so we simply set it to
+ * TCP_WESTWOOD_INIT_RTT. This value was deliberately chosen to be very
+ * conservative, since in this way we're sure it will be updated in a
+ * consistent way as soon as possible. This will reasonably happen within
+ * the first RTT period of the connection's lifetime.
+ */
+
+static void init_westwood(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tp->westwood.bw_ns_est = 0;
+ tp->westwood.bw_est = 0;
+ tp->westwood.accounted = 0;
+ tp->westwood.cumul_ack = 0;
+ tp->westwood.rtt_win_sx = tcp_time_stamp;
+ tp->westwood.rtt = TCP_WESTWOOD_INIT_RTT;
+ tp->westwood.rtt_min = TCP_WESTWOOD_INIT_RTT;
+ tp->westwood.snd_una = tp->snd_una;
+}
+
+/*
+ * @westwood_do_filter
+ * Low-pass filter. Implemented using constant coefficients.
+ */
+
+static inline __u32 westwood_do_filter(__u32 a, __u32 b)
+{
+ return (((7 * a) + b) >> 3);
+}
+
+static void westwood_filter(struct sock *sk, __u32 delta)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tp->westwood.bw_ns_est =
+ westwood_do_filter(tp->westwood.bw_ns_est,
+ tp->westwood.bk / delta);
+ tp->westwood.bw_est =
+ westwood_do_filter(tp->westwood.bw_est,
+ tp->westwood.bw_ns_est);
+}
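The filter above is new = (7*old + sample)/8; a tiny stand-alone illustration of how a constant bandwidth sample converges under it (numbers invented, not real bandwidth units):

#include <stdio.h>

static unsigned filt(unsigned old, unsigned sample)
{
        return (7 * old + sample) >> 3; /* new = (7*old + sample) / 8 */
}

int main(void)
{
        unsigned bw_est = 0;
        unsigned i;

        for (i = 1; i <= 8; i++) {
                bw_est = filt(bw_est, 80);      /* feed a constant "80" sample */
                printf("step %u: bw_est=%u\n", i, bw_est);
        }
        return 0;
}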
+
+/*
+ * @westwood_update_rttmin
+ * It is used to update RTTmin. In this case we MUST NOT use
+ * WESTWOOD_RTT_MIN minimum bound since we could be on a LAN!
+ */
+
+static inline __u32 westwood_update_rttmin(const struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ __u32 rttmin = tp->westwood.rtt_min;
+
+ if (tp->westwood.rtt != 0 &&
+ (tp->westwood.rtt < tp->westwood.rtt_min || !rttmin))
+ rttmin = tp->westwood.rtt;
+
+ return rttmin;
+}
+
+/*
+ * @westwood_acked
+ * Evaluate increases for dk.
+ */
+
+static inline __u32 westwood_acked(const struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ return tp->snd_una - tp->westwood.snd_una;
+}
+
+/*
+ * @westwood_new_window
+ * It evaluates whether we are receiving data inside the same RTT window
+ * as the one in which we started.
+ * Return value:
+ * It returns 0 if we are still evaluating samples in the same RTT
+ * window, 1 if the sample has to be considered in the next window.
+ */
+
+static int westwood_new_window(const struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ __u32 left_bound;
+ __u32 rtt;
+ int ret = 0;
+
+ left_bound = tp->westwood.rtt_win_sx;
+ rtt = max(tp->westwood.rtt, (u32) TCP_WESTWOOD_RTT_MIN);
+
+ /*
+ * An RTT window has passed. Be careful: if the RTT is less than
+ * 50ms we don't filter, but we continue 'building the sample'.
+ * This minimum limit was chosen because estimation over such small
+ * time intervals is better avoided.
+ * Obviously on a LAN we will reasonably always have
+ * right_bound = left_bound + WESTWOOD_RTT_MIN
+ */
+
+ if ((left_bound + rtt) < tcp_time_stamp)
+ ret = 1;
+
+ return ret;
+}
+
+/*
+ * @westwood_update_window
+ * It updates the RTT evaluation window if it is the right moment to do
+ * so. If so, it calls the filter for evaluating bandwidth.
+ */
+
+static void __westwood_update_window(struct sock *sk, __u32 now)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ __u32 delta = now - tp->westwood.rtt_win_sx;
+
+ if (delta) {
+ if (tp->westwood.rtt)
+ westwood_filter(sk, delta);
+
+ tp->westwood.bk = 0;
+ tp->westwood.rtt_win_sx = tcp_time_stamp;
+ }
+}
+
+
+static void westwood_update_window(struct sock *sk, __u32 now)
+{
+ if (westwood_new_window(sk))
+ __westwood_update_window(sk, now);
+}
+
+/*
+ * @__tcp_westwood_fast_bw
+ * It is called when we are in the fast path. In particular it is called when
+ * header prediction is successful. In that case the update is in fact
+ * straightforward and doesn't need any particular care.
+ */
+
+static void __tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ westwood_update_window(sk, tcp_time_stamp);
+
+ tp->westwood.bk += westwood_acked(sk);
+ tp->westwood.snd_una = tp->snd_una;
+ tp->westwood.rtt_min = westwood_update_rttmin(sk);
+}
+
+static inline void tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
+{
+ if (tcp_is_westwood(tcp_sk(sk)))
+ __tcp_westwood_fast_bw(sk, skb);
+}
+
+
+/*
+ * @westwood_dupack_update
+ * It updates accounted and cumul_ack when receiving a dupack.
+ */
+
+static void westwood_dupack_update(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tp->westwood.accounted += tp->mss_cache_std;
+ tp->westwood.cumul_ack = tp->mss_cache_std;
+}
+
+static inline int westwood_may_change_cumul(struct tcp_sock *tp)
+{
+ return (tp->westwood.cumul_ack > tp->mss_cache_std);
+}
+
+static inline void westwood_partial_update(struct tcp_sock *tp)
+{
+ tp->westwood.accounted -= tp->westwood.cumul_ack;
+ tp->westwood.cumul_ack = tp->mss_cache_std;
+}
+
+static inline void westwood_complete_update(struct tcp_sock *tp)
+{
+ tp->westwood.cumul_ack -= tp->westwood.accounted;
+ tp->westwood.accounted = 0;
+}
+
+/*
+ * @westwood_acked_count
+ * This function evaluates cumul_ack, used for evaluating dk in the case of
+ * delayed or partial acks.
+ */
+
+static inline __u32 westwood_acked_count(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tp->westwood.cumul_ack = westwood_acked(sk);
+
+ /* If cumul_ack is 0 this is a dupack since it's not moving
+ * tp->snd_una.
+ */
+ if (!(tp->westwood.cumul_ack))
+ westwood_dupack_update(sk);
+
+ if (westwood_may_change_cumul(tp)) {
+ /* Partial or delayed ack */
+ if (tp->westwood.accounted >= tp->westwood.cumul_ack)
+ westwood_partial_update(tp);
+ else
+ westwood_complete_update(tp);
+ }
+
+ tp->westwood.snd_una = tp->snd_una;
+
+ return tp->westwood.cumul_ack;
+}
+
+
+/*
+ * @__tcp_westwood_slow_bw
+ * It is called when something is going wrong, even if there may
+ * be no problem at all! In fact a simple delayed packet may trigger a
+ * dupack. But we need to be careful in such a case.
+ */
+
+static void __tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ westwood_update_window(sk, tcp_time_stamp);
+
+ tp->westwood.bk += westwood_acked_count(sk);
+ tp->westwood.rtt_min = westwood_update_rttmin(sk);
+}
+
+static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
+{
+ if (tcp_is_westwood(tcp_sk(sk)))
+ __tcp_westwood_slow_bw(sk, skb);
+}
+
+/* This routine deals with incoming acks, but not outgoing ones. */
+static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 prior_snd_una = tp->snd_una;
+ u32 ack_seq = TCP_SKB_CB(skb)->seq;
+ u32 ack = TCP_SKB_CB(skb)->ack_seq;
+ u32 prior_in_flight;
+ s32 seq_rtt;
+ int prior_packets;
+
+ /* If the ack is newer than sent or older than previous acks
+ * then we can probably ignore it.
+ */
+ if (after(ack, tp->snd_nxt))
+ goto uninteresting_ack;
+
+ if (before(ack, prior_snd_una))
+ goto old_ack;
+
+ if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
+ /* Window is constant, pure forward advance.
+ * No more checks are required.
+ * Note, we use the fact that SND.UNA>=SND.WL2.
+ */
+ tcp_update_wl(tp, ack, ack_seq);
+ tp->snd_una = ack;
+ tcp_westwood_fast_bw(sk, skb);
+ flag |= FLAG_WIN_UPDATE;
+
+ NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS);
+ } else {
+ if (ack_seq != TCP_SKB_CB(skb)->end_seq)
+ flag |= FLAG_DATA;
+ else
+ NET_INC_STATS_BH(LINUX_MIB_TCPPUREACKS);
+
+ flag |= tcp_ack_update_window(sk, tp, skb, ack, ack_seq);
+
+ if (TCP_SKB_CB(skb)->sacked)
+ flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
+
+ if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th))
+ flag |= FLAG_ECE;
+
+ tcp_westwood_slow_bw(sk,skb);
+ }
+
+ /* We passed data and got it acked, remove any soft error
+ * log. Something worked...
+ */
+ sk->sk_err_soft = 0;
+ tp->rcv_tstamp = tcp_time_stamp;
+ prior_packets = tp->packets_out;
+ if (!prior_packets)
+ goto no_queue;
+
+ prior_in_flight = tcp_packets_in_flight(tp);
+
+ /* See if we can take anything off of the retransmit queue. */
+ flag |= tcp_clean_rtx_queue(sk, &seq_rtt);
+
+ if (tp->frto_counter)
+ tcp_process_frto(sk, prior_snd_una);
+
+ if (tcp_ack_is_dubious(tp, flag)) {
+ /* Advance CWND, if the state allows this. */
+ if ((flag & FLAG_DATA_ACKED) &&
+ (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd) &&
+ tcp_may_raise_cwnd(tp, flag))
+ tcp_cong_avoid(tp, ack, seq_rtt);
+ tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
+ } else {
+ if ((flag & FLAG_DATA_ACKED) &&
+ (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd))
+ tcp_cong_avoid(tp, ack, seq_rtt);
+ }
+
+ if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
+ dst_confirm(sk->sk_dst_cache);
+
+ return 1;
+
+no_queue:
+ tp->probes_out = 0;
+
+ /* If this ack opens up a zero window, clear backoff. It was
+ * being used to time the probes, and is probably far higher than
+ * it needs to be for normal retransmission.
+ */
+ if (sk->sk_send_head)
+ tcp_ack_probe(sk);
+ return 1;
+
+old_ack:
+ if (TCP_SKB_CB(skb)->sacked)
+ tcp_sacktag_write_queue(sk, skb, prior_snd_una);
+
+uninteresting_ack:
+ SOCK_DEBUG(sk, "Ack %u out of %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
+ return 0;
+}
+
+
+/* Look for tcp options. Normally only called on SYN and SYNACK packets.
+ * But, this can also be called on packets in the established flow when
+ * the fast version below fails.
+ */
+void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab)
+{
+ unsigned char *ptr;
+ struct tcphdr *th = skb->h.th;
+ int length=(th->doff*4)-sizeof(struct tcphdr);
+
+ ptr = (unsigned char *)(th + 1);
+ opt_rx->saw_tstamp = 0;
+
+ while(length>0) {
+ int opcode=*ptr++;
+ int opsize;
+
+ switch (opcode) {
+ case TCPOPT_EOL:
+ return;
+ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
+ length--;
+ continue;
+ default:
+ opsize=*ptr++;
+ if (opsize < 2) /* "silly options" */
+ return;
+ if (opsize > length)
+ return; /* don't parse partial options */
+ switch(opcode) {
+ case TCPOPT_MSS:
+ if(opsize==TCPOLEN_MSS && th->syn && !estab) {
+ u16 in_mss = ntohs(get_unaligned((__u16 *)ptr));
+ if (in_mss) {
+ if (opt_rx->user_mss && opt_rx->user_mss < in_mss)
+ in_mss = opt_rx->user_mss;
+ opt_rx->mss_clamp = in_mss;
+ }
+ }
+ break;
+ case TCPOPT_WINDOW:
+ if(opsize==TCPOLEN_WINDOW && th->syn && !estab)
+ if (sysctl_tcp_window_scaling) {
+ __u8 snd_wscale = *(__u8 *) ptr;
+ opt_rx->wscale_ok = 1;
+ if (snd_wscale > 14) {
+ if(net_ratelimit())
+ printk(KERN_INFO "tcp_parse_options: Illegal window "
+ "scaling value %d >14 received.\n",
+ snd_wscale);
+ snd_wscale = 14;
+ }
+ opt_rx->snd_wscale = snd_wscale;
+ }
+ break;
+ case TCPOPT_TIMESTAMP:
+ if(opsize==TCPOLEN_TIMESTAMP) {
+ if ((estab && opt_rx->tstamp_ok) ||
+ (!estab && sysctl_tcp_timestamps)) {
+ opt_rx->saw_tstamp = 1;
+ opt_rx->rcv_tsval = ntohl(get_unaligned((__u32 *)ptr));
+ opt_rx->rcv_tsecr = ntohl(get_unaligned((__u32 *)(ptr+4)));
+ }
+ }
+ break;
+ case TCPOPT_SACK_PERM:
+ if(opsize==TCPOLEN_SACK_PERM && th->syn && !estab) {
+ if (sysctl_tcp_sack) {
+ opt_rx->sack_ok = 1;
+ tcp_sack_reset(opt_rx);
+ }
+ }
+ break;
+
+ case TCPOPT_SACK:
+ if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
+ !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
+ opt_rx->sack_ok) {
+ TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
+ }
+ };
+ ptr+=opsize-2;
+ length-=opsize;
+ };
+ }
+}
+
+/* Fast parse options. This hopes to only see timestamps.
+ * If it is wrong it falls back on tcp_parse_options().
+ */
+static inline int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
+ struct tcp_sock *tp)
+{
+ if (th->doff == sizeof(struct tcphdr)>>2) {
+ tp->rx_opt.saw_tstamp = 0;
+ return 0;
+ } else if (tp->rx_opt.tstamp_ok &&
+ th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
+ __u32 *ptr = (__u32 *)(th + 1);
+ if (*ptr == ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+ | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
+ tp->rx_opt.saw_tstamp = 1;
+ ++ptr;
+ tp->rx_opt.rcv_tsval = ntohl(*ptr);
+ ++ptr;
+ tp->rx_opt.rcv_tsecr = ntohl(*ptr);
+ return 1;
+ }
+ }
+ tcp_parse_options(skb, &tp->rx_opt, 1);
+ return 1;
+}
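As a hedged illustration of what the fast path above matches: the "all timestamps" layout packs two NOPs plus the timestamp kind/length into one 32-bit word. The option codes below are the standard RFC 793/1323 values, written out here rather than taken from kernel headers:

#include <stdio.h>
#include <stdint.h>

#define OPT_NOP          1      /* standard TCP option codes */
#define OPT_TIMESTAMP    8
#define OPTLEN_TIMESTAMP 10

int main(void)
{
        uint32_t word = (OPT_NOP << 24) | (OPT_NOP << 16) |
                        (OPT_TIMESTAMP << 8) | OPTLEN_TIMESTAMP;

        /* The fast path compares the first option word against this value
         * (0x0101080a here) and, on a match, reads TSval and TSecr from the
         * next two 32-bit words.
         */
        printf("expected option word: 0x%08x\n", (unsigned)word);
        return 0;
}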
+
+static inline void tcp_store_ts_recent(struct tcp_sock *tp)
+{
+ tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
+ tp->rx_opt.ts_recent_stamp = xtime.tv_sec;
+}
+
+static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
+{
+ if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
+ /* PAWS bug workaround wrt. ACK frames, the PAWS discard
+ * extra check below makes sure this can only happen
+ * for pure ACK frames. -DaveM
+ *
+ * Not only that, it also occurs for expired timestamps.
+ */
+
+ if((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 ||
+ xtime.tv_sec >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS)
+ tcp_store_ts_recent(tp);
+ }
+}
+
+/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
+ *
+ * It is not fatal. If this ACK does _not_ change critical state (seqs, window)
+ * it can pass through stack. So, the following predicate verifies that
+ * this segment is not used for anything but congestion avoidance or
+ * fast retransmit. Moreover, we even are able to eliminate most of such
+ * second order effects, if we apply some small "replay" window (~RTO)
+ * to timestamp space.
+ *
+ * All these measures still do not guarantee that we reject wrapped ACKs
+ * on networks with high bandwidth, when sequence space is recycled quickly,
+ * but it guarantees that such events will be very rare and will not affect
+ * the connection seriously. This doesn't look nice, but alas, PAWS is a
+ * really buggy extension.
+ *
+ * [ Later note. Even worse! It is buggy for segments _with_ data. The RFC
+ * states that events where a retransmit arrives after the original data are
+ * rare. That is a blatant lie. VJ forgot about fast retransmit! 8)8) It is
+ * the biggest problem on large power networks even with minor reordering.
+ * OK, let's give it a small replay window. If the peer clock is even 1 Hz,
+ * it is safe up to a bandwidth of 18 Gigabit/sec. 8) ]
+ */
+
+static int tcp_disordered_ack(struct tcp_sock *tp, struct sk_buff *skb)
+{
+ struct tcphdr *th = skb->h.th;
+ u32 seq = TCP_SKB_CB(skb)->seq;
+ u32 ack = TCP_SKB_CB(skb)->ack_seq;
+
+ return (/* 1. Pure ACK with correct sequence number. */
+ (th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
+
+ /* 2. ... and duplicate ACK. */
+ ack == tp->snd_una &&
+
+ /* 3. ... and does not update window. */
+ !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
+
+ /* 4. ... and sits in replay window. */
+ (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (tp->rto*1024)/HZ);
+}
+
+static inline int tcp_paws_discard(struct tcp_sock *tp, struct sk_buff *skb)
+{
+ return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW &&
+ xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS &&
+ !tcp_disordered_ack(tp, skb));
+}
+
+/* Check segment sequence number for validity.
+ *
+ * Segment controls are considered valid, if the segment
+ * fits to the window after truncation to the window. Acceptability
+ * of data (and SYN, FIN, of course) is checked separately.
+ * See tcp_data_queue(), for example.
+ *
+ * Also, controls (RST is the main one) are accepted using RCV.WUP instead
+ * of RCV.NXT. The peer may still not have advanced its SND.UNA when we
+ * delayed our ACK, so that his SND.UNA <= our RCV.WUP.
+ * (borrowed from freebsd)
+ */
+
+static inline int tcp_sequence(struct tcp_sock *tp, u32 seq, u32 end_seq)
+{
+ return !before(end_seq, tp->rcv_wup) &&
+ !after(seq, tp->rcv_nxt + tcp_receive_window(tp));
+}
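A stand-alone sketch of the acceptance test above, using wrap-safe signed comparisons in place of before()/after(); the window values are hypothetical:

#include <stdio.h>
#include <stdint.h>

static int seq_before(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
static int seq_after(uint32_t a, uint32_t b)  { return seq_before(b, a); }

/* Accept if the segment overlaps [rcv_wup, rcv_nxt + window]. */
static int acceptable(uint32_t seq, uint32_t end_seq,
                      uint32_t rcv_wup, uint32_t rcv_nxt, uint32_t window)
{
        return !seq_before(end_seq, rcv_wup) &&
               !seq_after(seq, rcv_nxt + window);
}

int main(void)
{
        uint32_t rcv_wup = 1000, rcv_nxt = 1000, window = 500;

        printf("old duplicate: %d\n", acceptable(900, 950, rcv_wup, rcv_nxt, window));
        printf("in window    : %d\n", acceptable(1100, 1200, rcv_wup, rcv_nxt, window));
        printf("beyond window: %d\n", acceptable(1600, 1700, rcv_wup, rcv_nxt, window));
        return 0;
}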
+
+/* When we get a reset we do this. */
+static void tcp_reset(struct sock *sk)
+{
+ /* We want the right error as BSD sees it (and indeed as we do). */
+ switch (sk->sk_state) {
+ case TCP_SYN_SENT:
+ sk->sk_err = ECONNREFUSED;
+ break;
+ case TCP_CLOSE_WAIT:
+ sk->sk_err = EPIPE;
+ break;
+ case TCP_CLOSE:
+ return;
+ default:
+ sk->sk_err = ECONNRESET;
+ }
+
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_error_report(sk);
+
+ tcp_done(sk);
+}
+
+/*
+ * Process the FIN bit. This now behaves as it is supposed to work
+ * and the FIN takes effect when it is validly part of sequence
+ * space. Not before when we get holes.
+ *
+ * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
+ * (and thence onto LAST-ACK and finally, CLOSE, we never enter
+ * TIME-WAIT)
+ *
+ * If we are in FINWAIT-1, a received FIN indicates simultaneous
+ * close and we go into CLOSING (and later onto TIME-WAIT)
+ *
+ * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
+ */
+static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tcp_schedule_ack(tp);
+
+ sk->sk_shutdown |= RCV_SHUTDOWN;
+ sock_set_flag(sk, SOCK_DONE);
+
+ switch (sk->sk_state) {
+ case TCP_SYN_RECV:
+ case TCP_ESTABLISHED:
+ /* Move to CLOSE_WAIT */
+ tcp_set_state(sk, TCP_CLOSE_WAIT);
+ tp->ack.pingpong = 1;
+ break;
+
+ case TCP_CLOSE_WAIT:
+ case TCP_CLOSING:
+ /* Received a retransmission of the FIN, do
+ * nothing.
+ */
+ break;
+ case TCP_LAST_ACK:
+ /* RFC793: Remain in the LAST-ACK state. */
+ break;
+
+ case TCP_FIN_WAIT1:
+ /* This case occurs on a simultaneous close:
+ * we must ACK the received FIN and
+ * enter the CLOSING state.
+ */
+ tcp_send_ack(sk);
+ tcp_set_state(sk, TCP_CLOSING);
+ break;
+ case TCP_FIN_WAIT2:
+ /* Received a FIN -- send ACK and enter TIME_WAIT. */
+ tcp_send_ack(sk);
+ tcp_time_wait(sk, TCP_TIME_WAIT, 0);
+ break;
+ default:
+ /* Only TCP_LISTEN and TCP_CLOSE are left; in these
+ * cases we should never reach this piece of code.
+ */
+ printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n",
+ __FUNCTION__, sk->sk_state);
+ break;
+ };
+
+ /* It _is_ possible that we have something out-of-order _after_ the FIN.
+ * Probably we should reset in this case. For now, drop it.
+ */
+ __skb_queue_purge(&tp->out_of_order_queue);
+ if (tp->rx_opt.sack_ok)
+ tcp_sack_reset(&tp->rx_opt);
+ sk_stream_mem_reclaim(sk);
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ sk->sk_state_change(sk);
+
+ /* Do not send POLL_HUP for half duplex close. */
+ if (sk->sk_shutdown == SHUTDOWN_MASK ||
+ sk->sk_state == TCP_CLOSE)
+ sk_wake_async(sk, 1, POLL_HUP);
+ else
+ sk_wake_async(sk, 1, POLL_IN);
+ }
+}
+
+static __inline__ int
+tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq)
+{
+ if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
+ if (before(seq, sp->start_seq))
+ sp->start_seq = seq;
+ if (after(end_seq, sp->end_seq))
+ sp->end_seq = end_seq;
+ return 1;
+ }
+ return 0;
+}
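+
+/* For example (illustrative sequence numbers): if sp covers [100, 200) and
+ * the new range is [150, 250), the block is extended to [100, 250) and 1 is
+ * returned; a disjoint range such as [300, 400) leaves sp untouched and
+ * returns 0.
+ */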
+
+static inline void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq)
+{
+ if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) {
+ if (before(seq, tp->rcv_nxt))
+ NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOLDSENT);
+ else
+ NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFOSENT);
+
+ tp->rx_opt.dsack = 1;
+ tp->duplicate_sack[0].start_seq = seq;
+ tp->duplicate_sack[0].end_seq = end_seq;
+ tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + 1, 4 - tp->rx_opt.tstamp_ok);
+ }
+}
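+
+/* Note on the min() above: the 40 bytes of TCP option space fit at most
+ * four 8-byte SACK blocks, and only three when the 12 bytes of the aligned
+ * timestamp option are also present, hence the "4 - tstamp_ok" limit on
+ * eff_sacks.
+ */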
+
+static inline void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq)
+{
+ if (!tp->rx_opt.dsack)
+ tcp_dsack_set(tp, seq, end_seq);
+ else
+ tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
+}
+
+static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
+ before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
+ NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST);
+ tcp_enter_quickack_mode(tp);
+
+ if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) {
+ u32 end_seq = TCP_SKB_CB(skb)->end_seq;
+
+ if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
+ end_seq = tp->rcv_nxt;
+ tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, end_seq);
+ }
+ }
+
+ tcp_send_ack(sk);
+}
+
+/* These routines update the SACK block as out-of-order packets arrive or
+ * in-order packets close up the sequence space.
+ */
+static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
+{
+ int this_sack;
+ struct tcp_sack_block *sp = &tp->selective_acks[0];
+ struct tcp_sack_block *swalk = sp+1;
+
+ /* See if the recent change to the first SACK eats into
+ * or hits the sequence space of other SACK blocks; if so, coalesce.
+ */
+ for (this_sack = 1; this_sack < tp->rx_opt.num_sacks; ) {
+ if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) {
+ int i;
+
+ /* Zap SWALK, by moving every further SACK up by one slot.
+ * Decrease num_sacks.
+ */
+ tp->rx_opt.num_sacks--;
+ tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok);
+ for(i=this_sack; i < tp->rx_opt.num_sacks; i++)
+ sp[i] = sp[i+1];
+ continue;
+ }
+ this_sack++, swalk++;
+ }
+}
+
+static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2)
+{
+ __u32 tmp;
+
+ tmp = sack1->start_seq;
+ sack1->start_seq = sack2->start_seq;
+ sack2->start_seq = tmp;
+
+ tmp = sack1->end_seq;
+ sack1->end_seq = sack2->end_seq;
+ sack2->end_seq = tmp;
+}
+
+static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_sack_block *sp = &tp->selective_acks[0];
+ int cur_sacks = tp->rx_opt.num_sacks;
+ int this_sack;
+
+ if (!cur_sacks)
+ goto new_sack;
+
+ for (this_sack=0; this_sack<cur_sacks; this_sack++, sp++) {
+ if (tcp_sack_extend(sp, seq, end_seq)) {
+ /* Rotate this_sack to the first one. */
+ for (; this_sack>0; this_sack--, sp--)
+ tcp_sack_swap(sp, sp-1);
+ if (cur_sacks > 1)
+ tcp_sack_maybe_coalesce(tp);
+ return;
+ }
+ }
+
+ /* Could not find an adjacent existing SACK; build a new one,
+ * put it at the front, and shift everyone else down. We
+ * always know there is at least one SACK present already here.
+ *
+ * If the SACK array is full, forget about the last one.
+ */
+ if (this_sack >= 4) {
+ this_sack--;
+ tp->rx_opt.num_sacks--;
+ sp--;
+ }
+ for(; this_sack > 0; this_sack--, sp--)
+ *sp = *(sp-1);
+
+new_sack:
+ /* Build the new head SACK, and we're done. */
+ sp->start_seq = seq;
+ sp->end_seq = end_seq;
+ tp->rx_opt.num_sacks++;
+ tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok);
+}
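+
+/* Both paths above keep the block that just changed in selective_acks[0],
+ * so the SACK option reports the most recent information first, as
+ * RFC 2018 asks for.
+ */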
+
+/* RCV.NXT advances, some SACKs should be eaten. */
+
+static void tcp_sack_remove(struct tcp_sock *tp)
+{
+ struct tcp_sack_block *sp = &tp->selective_acks[0];
+ int num_sacks = tp->rx_opt.num_sacks;
+ int this_sack;
+
+ /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
+ if (skb_queue_len(&tp->out_of_order_queue) == 0) {
+ tp->rx_opt.num_sacks = 0;
+ tp->rx_opt.eff_sacks = tp->rx_opt.dsack;
+ return;
+ }
+
+ for(this_sack = 0; this_sack < num_sacks; ) {
+ /* Check if the start of the sack is covered by RCV.NXT. */
+ if (!before(tp->rcv_nxt, sp->start_seq)) {
+ int i;
+
+ /* RCV.NXT must cover all the block! */
+ BUG_TRAP(!before(tp->rcv_nxt, sp->end_seq));
+
+ /* Zap this SACK, by moving forward any other SACKS. */
+ for (i=this_sack+1; i < num_sacks; i++)
+ tp->selective_acks[i-1] = tp->selective_acks[i];
+ num_sacks--;
+ continue;
+ }
+ this_sack++;
+ sp++;
+ }
+ if (num_sacks != tp->rx_opt.num_sacks) {
+ tp->rx_opt.num_sacks = num_sacks;
+ tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok);
+ }
+}
+
+/* This one checks to see if we can put data from the
+ * out_of_order queue into the receive_queue.
+ */
+static void tcp_ofo_queue(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ __u32 dsack_high = tp->rcv_nxt;
+ struct sk_buff *skb;
+
+ while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
+ if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
+ break;
+
+ if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
+ __u32 dsack = dsack_high;
+ if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
+ dsack_high = TCP_SKB_CB(skb)->end_seq;
+ tcp_dsack_extend(tp, TCP_SKB_CB(skb)->seq, dsack);
+ }
+
+ if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
+ SOCK_DEBUG(sk, "ofo packet was already received \n");
+ __skb_unlink(skb, skb->list);
+ __kfree_skb(skb);
+ continue;
+ }
+ SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
+ tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->end_seq);
+
+ __skb_unlink(skb, skb->list);
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ if(skb->h.th->fin)
+ tcp_fin(skb, sk, skb->h.th);
+ }
+}
+
+static int tcp_prune_queue(struct sock *sk);
+
+static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcphdr *th = skb->h.th;
+ struct tcp_sock *tp = tcp_sk(sk);
+ int eaten = -1;
+
+ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
+ goto drop;
+
+ th = skb->h.th;
+ __skb_pull(skb, th->doff*4);
+
+ TCP_ECN_accept_cwr(tp, skb);
+
+ if (tp->rx_opt.dsack) {
+ tp->rx_opt.dsack = 0;
+ tp->rx_opt.eff_sacks = min_t(unsigned int, tp->rx_opt.num_sacks,
+ 4 - tp->rx_opt.tstamp_ok);
+ }
+
+ /* Queue data for delivery to the user.
+ * Packets in sequence go to the receive queue.
+ * Out of sequence packets to the out_of_order_queue.
+ */
+ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
+ if (tcp_receive_window(tp) == 0)
+ goto out_of_window;
+
+ /* Ok. In sequence. In window. */
+ if (tp->ucopy.task == current &&
+ tp->copied_seq == tp->rcv_nxt && tp->ucopy.len &&
+ sock_owned_by_user(sk) && !tp->urg_data) {
+ int chunk = min_t(unsigned int, skb->len,
+ tp->ucopy.len);
+
+ __set_current_state(TASK_RUNNING);
+
+ local_bh_enable();
+ if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) {
+ tp->ucopy.len -= chunk;
+ tp->copied_seq += chunk;
+ eaten = (chunk == skb->len && !th->fin);
+ tcp_rcv_space_adjust(sk);
+ }
+ local_bh_disable();
+ }
+
+ if (eaten <= 0) {
+queue_and_out:
+ if (eaten < 0 &&
+ (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+ !sk_stream_rmem_schedule(sk, skb))) {
+ if (tcp_prune_queue(sk) < 0 ||
+ !sk_stream_rmem_schedule(sk, skb))
+ goto drop;
+ }
+ sk_stream_set_owner_r(skb, sk);
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ }
+ tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ if(skb->len)
+ tcp_event_data_recv(sk, tp, skb);
+ if(th->fin)
+ tcp_fin(skb, sk, th);
+
+ if (skb_queue_len(&tp->out_of_order_queue)) {
+ tcp_ofo_queue(sk);
+
+ /* RFC2581. 4.2. SHOULD send immediate ACK, when
+ * gap in queue is filled.
+ */
+ if (!skb_queue_len(&tp->out_of_order_queue))
+ tp->ack.pingpong = 0;
+ }
+
+ if (tp->rx_opt.num_sacks)
+ tcp_sack_remove(tp);
+
+ tcp_fast_path_check(sk, tp);
+
+ if (eaten > 0)
+ __kfree_skb(skb);
+ else if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk, 0);
+ return;
+ }
+
+ if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
+ /* A retransmit, 2nd most common case. Force an immediate ack. */
+ NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST);
+ tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+
+out_of_window:
+ tcp_enter_quickack_mode(tp);
+ tcp_schedule_ack(tp);
+drop:
+ __kfree_skb(skb);
+ return;
+ }
+
+ /* Out of window, e.g. a zero window probe. */
+ if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
+ goto out_of_window;
+
+ tcp_enter_quickack_mode(tp);
+
+ if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
+ /* Partial packet, seq < rcv_next < end_seq */
+ SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
+ tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->end_seq);
+
+ tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
+
+ /* If the window is closed, drop the tail of the packet, but only after
+ * remembering a D-SACK for its head, as done on the previous line.
+ */
+ if (!tcp_receive_window(tp))
+ goto out_of_window;
+ goto queue_and_out;
+ }
+
+ TCP_ECN_check_ce(tp, skb);
+
+ if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+ !sk_stream_rmem_schedule(sk, skb)) {
+ if (tcp_prune_queue(sk) < 0 ||
+ !sk_stream_rmem_schedule(sk, skb))
+ goto drop;
+ }
+
+ /* Disable header prediction. */
+ tp->pred_flags = 0;
+ tcp_schedule_ack(tp);
+
+ SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
+ tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+
+ sk_stream_set_owner_r(skb, sk);
+
+ if (!skb_peek(&tp->out_of_order_queue)) {
+ /* Initial out of order segment, build 1 SACK. */
+ if (tp->rx_opt.sack_ok) {
+ tp->rx_opt.num_sacks = 1;
+ tp->rx_opt.dsack = 0;
+ tp->rx_opt.eff_sacks = 1;
+ tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
+ tp->selective_acks[0].end_seq =
+ TCP_SKB_CB(skb)->end_seq;
+ }
+ __skb_queue_head(&tp->out_of_order_queue,skb);
+ } else {
+ struct sk_buff *skb1 = tp->out_of_order_queue.prev;
+ u32 seq = TCP_SKB_CB(skb)->seq;
+ u32 end_seq = TCP_SKB_CB(skb)->end_seq;
+
+ if (seq == TCP_SKB_CB(skb1)->end_seq) {
+ __skb_append(skb1, skb);
+
+ if (!tp->rx_opt.num_sacks ||
+ tp->selective_acks[0].end_seq != seq)
+ goto add_sack;
+
+ /* Common case: data arrives in order after the hole. */
+ tp->selective_acks[0].end_seq = end_seq;
+ return;
+ }
+
+ /* Find place to insert this segment. */
+ do {
+ if (!after(TCP_SKB_CB(skb1)->seq, seq))
+ break;
+ } while ((skb1 = skb1->prev) !=
+ (struct sk_buff*)&tp->out_of_order_queue);
+
+ /* Does this skb overlap the previous one? */
+ if (skb1 != (struct sk_buff*)&tp->out_of_order_queue &&
+ before(seq, TCP_SKB_CB(skb1)->end_seq)) {
+ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+ /* All the bits are present. Drop. */
+ __kfree_skb(skb);
+ tcp_dsack_set(tp, seq, end_seq);
+ goto add_sack;
+ }
+ if (after(seq, TCP_SKB_CB(skb1)->seq)) {
+ /* Partial overlap. */
+ tcp_dsack_set(tp, seq, TCP_SKB_CB(skb1)->end_seq);
+ } else {
+ skb1 = skb1->prev;
+ }
+ }
+ __skb_insert(skb, skb1, skb1->next, &tp->out_of_order_queue);
+
+ /* And remove segments that are wholly covered by the new one. */
+ while ((skb1 = skb->next) !=
+ (struct sk_buff*)&tp->out_of_order_queue &&
+ after(end_seq, TCP_SKB_CB(skb1)->seq)) {
+ if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+ tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, end_seq);
+ break;
+ }
+ __skb_unlink(skb1, skb1->list);
+ tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq);
+ __kfree_skb(skb1);
+ }
+
+add_sack:
+ if (tp->rx_opt.sack_ok)
+ tcp_sack_new_ofo_skb(sk, seq, end_seq);
+ }
+}
+
+/* Collapse contiguous sequence of skbs head..tail with
+ * sequence numbers start..end.
+ * Segments with FIN/SYN are not collapsed (only because this
+ * simplifies the code).
+ */
+static void
+tcp_collapse(struct sock *sk, struct sk_buff *head,
+ struct sk_buff *tail, u32 start, u32 end)
+{
+ struct sk_buff *skb;
+
+ /* First, check that the queue is collapsible and find
+ * the point where collapsing can be useful. */
+ for (skb = head; skb != tail; ) {
+ /* No new bits? It is possible on ofo queue. */
+ if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
+ struct sk_buff *next = skb->next;
+ __skb_unlink(skb, skb->list);
+ __kfree_skb(skb);
+ NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED);
+ skb = next;
+ continue;
+ }
+
+ /* The first skb to collapse is:
+ * - not SYN/FIN and
+ * - bloated or contains data before "start" or
+ * overlaps the next one.
+ */
+ if (!skb->h.th->syn && !skb->h.th->fin &&
+ (tcp_win_from_space(skb->truesize) > skb->len ||
+ before(TCP_SKB_CB(skb)->seq, start) ||
+ (skb->next != tail &&
+ TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb->next)->seq)))
+ break;
+
+ /* Decided to skip this, advance start seq. */
+ start = TCP_SKB_CB(skb)->end_seq;
+ skb = skb->next;
+ }
+ if (skb == tail || skb->h.th->syn || skb->h.th->fin)
+ return;
+
+ while (before(start, end)) {
+ struct sk_buff *nskb;
+ int header = skb_headroom(skb);
+ int copy = SKB_MAX_ORDER(header, 0);
+
+ /* Too big header? This can happen with IPv6. */
+ if (copy < 0)
+ return;
+ if (end-start < copy)
+ copy = end-start;
+ nskb = alloc_skb(copy+header, GFP_ATOMIC);
+ if (!nskb)
+ return;
+ skb_reserve(nskb, header);
+ memcpy(nskb->head, skb->head, header);
+ nskb->nh.raw = nskb->head + (skb->nh.raw-skb->head);
+ nskb->h.raw = nskb->head + (skb->h.raw-skb->head);
+ nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head);
+ memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
+ TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
+ __skb_insert(nskb, skb->prev, skb, skb->list);
+ sk_stream_set_owner_r(nskb, sk);
+
+ /* Copy data, releasing collapsed skbs. */
+ while (copy > 0) {
+ int offset = start - TCP_SKB_CB(skb)->seq;
+ int size = TCP_SKB_CB(skb)->end_seq - start;
+
+ if (offset < 0) BUG();
+ if (size > 0) {
+ size = min(copy, size);
+ if (skb_copy_bits(skb, offset, skb_put(nskb, size), size))
+ BUG();
+ TCP_SKB_CB(nskb)->end_seq += size;
+ copy -= size;
+ start += size;
+ }
+ if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
+ struct sk_buff *next = skb->next;
+ __skb_unlink(skb, skb->list);
+ __kfree_skb(skb);
+ NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED);
+ skb = next;
+ if (skb == tail || skb->h.th->syn || skb->h.th->fin)
+ return;
+ }
+ }
+ }
+}
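+
+/* Illustration of why collapsing helps: a burst of tiny segments (say one
+ * byte of payload each) can consume hundreds of bytes of truesize per byte
+ * of data. Copying them into a few right-sized linear skbs shrinks truesize
+ * until tcp_win_from_space(skb->truesize) no longer exceeds skb->len, which
+ * is exactly the "bloated" test used at the top of this function.
+ */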
+
+/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
+ * and tcp_collapse() them until all the queue is collapsed.
+ */
+static void tcp_collapse_ofo_queue(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb = skb_peek(&tp->out_of_order_queue);
+ struct sk_buff *head;
+ u32 start, end;
+
+ if (skb == NULL)
+ return;
+
+ start = TCP_SKB_CB(skb)->seq;
+ end = TCP_SKB_CB(skb)->end_seq;
+ head = skb;
+
+ for (;;) {
+ skb = skb->next;
+
+ /* The segment is terminated when we see a gap or when
+ * we reach the end of the queue. */
+ if (skb == (struct sk_buff *)&tp->out_of_order_queue ||
+ after(TCP_SKB_CB(skb)->seq, end) ||
+ before(TCP_SKB_CB(skb)->end_seq, start)) {
+ tcp_collapse(sk, head, skb, start, end);
+ head = skb;
+ if (skb == (struct sk_buff *)&tp->out_of_order_queue)
+ break;
+ /* Start new segment */
+ start = TCP_SKB_CB(skb)->seq;
+ end = TCP_SKB_CB(skb)->end_seq;
+ } else {
+ if (before(TCP_SKB_CB(skb)->seq, start))
+ start = TCP_SKB_CB(skb)->seq;
+ if (after(TCP_SKB_CB(skb)->end_seq, end))
+ end = TCP_SKB_CB(skb)->end_seq;
+ }
+ }
+}
+
+/* Reduce allocated memory if we can, trying to get
+ * the socket within its memory limits again.
+ *
+ * Return less than zero if we should start dropping frames
+ * until the socket owning process reads some of the data
+ * to stabilize the situation.
+ */
+static int tcp_prune_queue(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
+
+ NET_INC_STATS_BH(LINUX_MIB_PRUNECALLED);
+
+ if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+ tcp_clamp_window(sk, tp);
+ else if (tcp_memory_pressure)
+ tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
+
+ tcp_collapse_ofo_queue(sk);
+ tcp_collapse(sk, sk->sk_receive_queue.next,
+ (struct sk_buff*)&sk->sk_receive_queue,
+ tp->copied_seq, tp->rcv_nxt);
+ sk_stream_mem_reclaim(sk);
+
+ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+ return 0;
+
+ /* Collapsing did not help; destructive actions follow.
+ * This must never occur. */
+
+ /* First, purge the out_of_order queue. */
+ if (skb_queue_len(&tp->out_of_order_queue)) {
+ NET_ADD_STATS_BH(LINUX_MIB_OFOPRUNED,
+ skb_queue_len(&tp->out_of_order_queue));
+ __skb_queue_purge(&tp->out_of_order_queue);
+
+ /* Reset SACK state. A conforming SACK implementation will
+ * do the same at a timeout-based retransmit. When a connection
+ * is in a sad state like this, we care only about the integrity
+ * of the connection, not performance.
+ */
+ if (tp->rx_opt.sack_ok)
+ tcp_sack_reset(&tp->rx_opt);
+ sk_stream_mem_reclaim(sk);
+ }
+
+ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+ return 0;
+
+ /* If we are really being abused, tell the caller to silently
+ * drop receive data on the floor. It will get retransmitted
+ * and hopefully then we'll have sufficient space.
+ */
+ NET_INC_STATS_BH(LINUX_MIB_RCVPRUNED);
+
+ /* Massive buffer overcommit. */
+ tp->pred_flags = 0;
+ return -1;
+}
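+
+/* Summary of the escalation above: first clamp the advertised window and
+ * collapse the ofo and receive queues; if the socket is still over its
+ * rcvbuf, purge the ofo queue (resetting SACK state); only if even that is
+ * not enough do we return -1 and let the caller drop the frame.
+ */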
+
+
+/* RFC 2861, slow part. Adjust cwnd after it has not been full for one RTO.
+ * As additional protection, we do not touch cwnd during retransmission
+ * phases, or if the application hit its sndbuf limit recently.
+ */
+void tcp_cwnd_application_limited(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tp->ca_state == TCP_CA_Open &&
+ sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+ /* Limited by application or receiver window. */
+ u32 win_used = max(tp->snd_cwnd_used, 2U);
+ if (win_used < tp->snd_cwnd) {
+ tp->snd_ssthresh = tcp_current_ssthresh(tp);
+ tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
+ }
+ tp->snd_cwnd_used = 0;
+ }
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+
+/* When an incoming ACK has allowed us to free some skb from the write_queue,
+ * we remember this event in the SOCK_QUEUE_SHRUNK flag and wake up the socket
+ * on exit from the TCP input handler.
+ *
+ * PROBLEM: sndbuf expansion does not work well with largesend.
+ */
+static void tcp_new_space(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tp->packets_out < tp->snd_cwnd &&
+ !(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
+ !tcp_memory_pressure &&
+ atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
+ int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) +
+ MAX_TCP_HEADER + 16 + sizeof(struct sk_buff),
+ demanded = max_t(unsigned int, tp->snd_cwnd,
+ tp->reordering + 1);
+ sndmem *= 2*demanded;
+ if (sndmem > sk->sk_sndbuf)
+ sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ }
+
+ sk->sk_write_space(sk);
+}
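+
+/* Back-of-the-envelope example for the sizing above (illustrative numbers):
+ * with an MSS around 1460 bytes the per-packet cost (payload plus header
+ * and skb overhead) is roughly 2 KB, so with snd_cwnd = 10 the target is
+ * about 2 * 10 * 2 KB = 40 KB of sndbuf, capped by sysctl_tcp_wmem[2].
+ */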
+
+static inline void tcp_check_space(struct sock *sk)
+{
+ if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
+ sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
+ if (sk->sk_socket &&
+ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+ tcp_new_space(sk);
+ }
+}
+
+static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) ||
+ tcp_packets_in_flight(tp) >= tp->snd_cwnd ||
+ tcp_write_xmit(sk, tp->nonagle))
+ tcp_check_probe_timer(sk, tp);
+}
+
+static __inline__ void tcp_data_snd_check(struct sock *sk)
+{
+ struct sk_buff *skb = sk->sk_send_head;
+
+ if (skb != NULL)
+ __tcp_data_snd_check(sk, skb);
+ tcp_check_space(sk);
+}
+
+/*
+ * Check if sending an ack is needed.
+ */
+static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* More than one full frame received... */
+ if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss
+ /* ... and right edge of window advances far enough.
+ * (tcp_recvmsg() will send ACK otherwise). Or...
+ */
+ && __tcp_select_window(sk) >= tp->rcv_wnd) ||
+ /* We ACK each frame or... */
+ tcp_in_quickack_mode(tp) ||
+ /* We have out of order data. */
+ (ofo_possible &&
+ skb_peek(&tp->out_of_order_queue))) {
+ /* Then ack it now */
+ tcp_send_ack(sk);
+ } else {
+ /* Else, send delayed ack. */
+ tcp_send_delayed_ack(sk);
+ }
+}
+
+static __inline__ void tcp_ack_snd_check(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ if (!tcp_ack_scheduled(tp)) {
+ /* We sent a data segment already. */
+ return;
+ }
+ __tcp_ack_snd_check(sk, 1);
+}
+
+/*
+ * This routine is only called when we have urgent data
+ * signalled. It's the 'slow' part of tcp_urg. It could be
+ * moved inline now, as tcp_urg is only called from one
+ * place. We handle URGent data wrong; we have to, as
+ * BSD still doesn't use the correction from RFC 961.
+ * For 1003.1g we should support a new option TCP_STDURG to permit
+ * either form (or just set the sysctl tcp_stdurg).
+ */
+
+static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 ptr = ntohs(th->urg_ptr);
+
+ if (ptr && !sysctl_tcp_stdurg)
+ ptr--;
+ ptr += ntohl(th->seq);
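+
+ /* Interpretation note: with the default BSD-compatible behaviour the
+ * urgent pointer is taken to point one past the urgent byte, hence the
+ * decrement above; with sysctl_tcp_stdurg set the pointer is used as-is
+ * (see the RFC 961 / TCP_STDURG remarks before this function).
+ */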
+
+ /* Ignore urgent data that we've already seen and read. */
+ if (after(tp->copied_seq, ptr))
+ return;
+
+ /* Do not replay the urg ptr.
+ *
+ * NOTE: interesting situation not covered by the specs.
+ * A misbehaving sender may send an urg ptr pointing to a segment
+ * which we already have in the ofo queue. We are not able to fetch
+ * such data and will stay in TCP_URG_NOTYET until it is eaten
+ * by recvmsg(). It seems we are not obliged to handle such wicked
+ * situations, but it is worth thinking about the possibility of
+ * DoSes exploiting some hypothetical application-level deadlock.
+ */
+ if (before(ptr, tp->rcv_nxt))
+ return;
+
+ /* Do we already have a newer (or duplicate) urgent pointer? */
+ if (tp->urg_data && !after(ptr, tp->urg_seq))
+ return;
+
+ /* Tell the world about our new urgent pointer. */
+ sk_send_sigurg(sk);
+
+ /* We may be adding urgent data when the last byte read was
+ * urgent. To do this requires some care. We cannot just ignore
+ * tp->copied_seq since we would read the last urgent byte again
+ * as data, nor can we alter copied_seq until this data arrives,
+ * or we break the semantics of SIOCATMARK (and thus sockatmark()).
+ *
+ * NOTE. Double Dutch. Rendering to plain English: the author of the
+ * comment above did something like send("A", MSG_OOB); send("B", MSG_OOB);
+ * and expected both A and B to disappear from the stream. This is _wrong_.
+ * Though this happens in BSD with high probability, it is occasional.
+ * Any application relying on it is buggy. Note also that the fix "works"
+ * only in this artificial test. Insert some normal data between A and B
+ * and we will diverge from BSD again. Verdict: it is better to remove it,
+ * so buggy users are caught.
+ */
+ if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
+ !sock_flag(sk, SOCK_URGINLINE) &&
+ tp->copied_seq != tp->rcv_nxt) {
+ struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
+ tp->copied_seq++;
+ if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
+ __skb_unlink(skb, skb->list);
+ __kfree_skb(skb);
+ }
+ }
+
+ tp->urg_data = TCP_URG_NOTYET;
+ tp->urg_seq = ptr;
+
+ /* Disable header prediction. */
+ tp->pred_flags = 0;
+}
+
+/* This is the 'fast' part of urgent handling. */
+static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Check if we get a new urgent pointer - normally not. */
+ if (th->urg)
+ tcp_check_urg(sk,th);
+
+ /* Do we wait for any urgent data? - normally not... */
+ if (tp->urg_data == TCP_URG_NOTYET) {
+ u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -
+ th->syn;
+
+ /* Is the urgent pointer pointing into this packet? */
+ if (ptr < skb->len) {
+ u8 tmp;
+ if (skb_copy_bits(skb, ptr, &tmp, 1))
+ BUG();
+ tp->urg_data = TCP_URG_VALID | tmp;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk, 0);
+ }
+ }
+}
+
+static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int chunk = skb->len - hlen;
+ int err;
+
+ local_bh_enable();
+ if (skb->ip_summed==CHECKSUM_UNNECESSARY)
+ err = skb_copy_datagram_iovec(skb, hlen, tp->ucopy.iov, chunk);
+ else
+ err = skb_copy_and_csum_datagram_iovec(skb, hlen,
+ tp->ucopy.iov);
+
+ if (!err) {
+ tp->ucopy.len -= chunk;
+ tp->copied_seq += chunk;
+ tcp_rcv_space_adjust(sk);
+ }
+
+ local_bh_disable();
+ return err;
+}
+
+static int __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
+{
+ int result;
+
+ if (sock_owned_by_user(sk)) {
+ local_bh_enable();
+ result = __tcp_checksum_complete(skb);
+ local_bh_disable();
+ } else {
+ result = __tcp_checksum_complete(skb);
+ }
+ return result;
+}
+
+static __inline__ int
+tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
+{
+ return skb->ip_summed != CHECKSUM_UNNECESSARY &&
+ __tcp_checksum_complete_user(sk, skb);
+}
+
+/*
+ * TCP receive function for the ESTABLISHED state.
+ *
+ * It is split into a fast path and a slow path. The fast path is
+ * disabled when:
+ * - A zero window was announced from us - zero window probing
+ * is only handled properly in the slow path.
+ * - Out of order segments arrived.
+ * - Urgent data is expected.
+ * - There is no buffer space left
+ * - Unexpected TCP flags/window values/header lengths are received
+ * (detected by checking the TCP header against pred_flags)
+ * - Data is sent in both directions. Fast path only supports pure senders
+ * or pure receivers (this means either the sequence number or the ack
+ * value must stay constant)
+ * - Unexpected TCP option.
+ *
+ * When these conditions are not satisfied it drops into a standard
+ * receive procedure patterned after RFC793 to handle all cases.
+ * The first three cases are guaranteed by proper pred_flags setting,
+ * the rest is checked inline. Fast processing is turned on in
+ * tcp_data_queue when everything is OK.
+ */
+int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
+ struct tcphdr *th, unsigned len)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /*
+ * Header prediction.
+ * The code loosely follows the one in the famous
+ * "30 instruction TCP receive" Van Jacobson mail.
+ *
+ * Van's trick is to deposit buffers into socket queue
+ * on a device interrupt, to call tcp_recv function
+ * on the receive process context and checksum and copy
+ * the buffer to user space. smart...
+ *
+ * Our current scheme is not silly either but we take the
+ * extra cost of the net_bh soft interrupt processing...
+ * We do checksum and copy also but from device to kernel.
+ */
+
+ tp->rx_opt.saw_tstamp = 0;
+
+ /* pred_flags is 0xS?10 << 16 + snd_wnd
+ * if header prediction is to be made.
+ * 'S' will always be tp->tcp_header_len >> 2
+ * '?' will be 0 for the fast path, otherwise pred_flags is 0 to
+ * turn it off (when there are holes in the receive
+ * space, for instance).
+ * The PSH flag is ignored.
+ */
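+
+ /* Worked example (illustrative): with timestamps the header is 32 bytes,
+ * so 'S' is 8, and after masking with TCP_HP_BITS only the ACK bit (0x10)
+ * is expected in the flags byte. The fast path then requires the fourth
+ * 32-bit header word to match 0x8010 << 16 | <expected window field>.
+ */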
+
+ if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
+ TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
+ int tcp_header_len = tp->tcp_header_len;
+
+ /* Timestamp header prediction: tcp_header_len
+ * is automatically equal to th->doff*4 due to pred_flags
+ * match.
+ */
+
+ /* Check timestamp */
+ if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
+ __u32 *ptr = (__u32 *)(th + 1);
+
+ /* No? Slow path! */
+ if (*ptr != ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+ | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP))
+ goto slow_path;
+
+ tp->rx_opt.saw_tstamp = 1;
+ ++ptr;
+ tp->rx_opt.rcv_tsval = ntohl(*ptr);
+ ++ptr;
+ tp->rx_opt.rcv_tsecr = ntohl(*ptr);
+
+ /* If PAWS failed, check it more carefully in slow path */
+ if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
+ goto slow_path;
+
+ /* DO NOT update ts_recent here: if the checksum fails
+ * and the timestamp was the corrupted part, it would result
+ * in a hung connection, since we would drop all
+ * future packets due to the PAWS test.
+ */
+ }
+
+ if (len <= tcp_header_len) {
+ /* Bulk data transfer: sender */
+ if (len == tcp_header_len) {
+ /* Predicted packet is in window by definition.
+ * seq == rcv_nxt and rcv_wup <= rcv_nxt.
+ * Hence, check seq<=rcv_wup reduces to:
+ */
+ if (tcp_header_len ==
+ (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
+ tp->rcv_nxt == tp->rcv_wup)
+ tcp_store_ts_recent(tp);
+
+ tcp_rcv_rtt_measure_ts(tp, skb);
+
+ /* We know that such packets are checksummed
+ * on entry.
+ */
+ tcp_ack(sk, skb, 0);
+ __kfree_skb(skb);
+ tcp_data_snd_check(sk);
+ return 0;
+ } else { /* Header too small */
+ TCP_INC_STATS_BH(TCP_MIB_INERRS);
+ goto discard;
+ }
+ } else {
+ int eaten = 0;
+
+ if (tp->ucopy.task == current &&
+ tp->copied_seq == tp->rcv_nxt &&
+ len - tcp_header_len <= tp->ucopy.len &&
+ sock_owned_by_user(sk)) {
+ __set_current_state(TASK_RUNNING);
+
+ if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
+ /* Predicted packet is in window by definition.
+ * seq == rcv_nxt and rcv_wup <= rcv_nxt.
+ * Hence, check seq<=rcv_wup reduces to:
+ */
+ if (tcp_header_len ==
+ (sizeof(struct tcphdr) +
+ TCPOLEN_TSTAMP_ALIGNED) &&
+ tp->rcv_nxt == tp->rcv_wup)
+ tcp_store_ts_recent(tp);
+
+ tcp_rcv_rtt_measure_ts(tp, skb);
+
+ __skb_pull(skb, tcp_header_len);
+ tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ NET_INC_STATS_BH(LINUX_MIB_TCPHPHITSTOUSER);
+ eaten = 1;
+ }
+ }
+ if (!eaten) {
+ if (tcp_checksum_complete_user(sk, skb))
+ goto csum_error;
+
+ /* Predicted packet is in window by definition.
+ * seq == rcv_nxt and rcv_wup <= rcv_nxt.
+ * Hence, check seq<=rcv_wup reduces to:
+ */
+ if (tcp_header_len ==
+ (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
+ tp->rcv_nxt == tp->rcv_wup)
+ tcp_store_ts_recent(tp);
+
+ tcp_rcv_rtt_measure_ts(tp, skb);
+
+ if ((int)skb->truesize > sk->sk_forward_alloc)
+ goto step5;
+
+ NET_INC_STATS_BH(LINUX_MIB_TCPHPHITS);
+
+ /* Bulk data transfer: receiver */
+ __skb_pull(skb,tcp_header_len);
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ sk_stream_set_owner_r(skb, sk);
+ tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ }
+
+ tcp_event_data_recv(sk, tp, skb);
+
+ if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
+ /* Well, only one small jumplet in fast path... */
+ tcp_ack(sk, skb, FLAG_DATA);
+ tcp_data_snd_check(sk);
+ if (!tcp_ack_scheduled(tp))
+ goto no_ack;
+ }
+
+ if (eaten) {
+ if (tcp_in_quickack_mode(tp)) {
+ tcp_send_ack(sk);
+ } else {
+ tcp_send_delayed_ack(sk);
+ }
+ } else {
+ __tcp_ack_snd_check(sk, 0);
+ }
+
+no_ack:
+ if (eaten)
+ __kfree_skb(skb);
+ else
+ sk->sk_data_ready(sk, 0);
+ return 0;
+ }
+ }
+
+slow_path:
+ if (len < (th->doff<<2) || tcp_checksum_complete_user(sk, skb))
+ goto csum_error;
+
+ /*
+ * RFC1323: H1. Apply PAWS check first.
+ */
+ if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
+ tcp_paws_discard(tp, skb)) {
+ if (!th->rst) {
+ NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
+ tcp_send_dupack(sk, skb);
+ goto discard;
+ }
+ /* Resets are accepted even if PAWS failed.
+
+ ts_recent update must be made after we are sure
+ that the packet is in window.
+ */
+ }
+
+ /*
+ * Standard slow path.
+ */
+
+ if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
+ /* RFC793, page 37: "In all states except SYN-SENT, all reset
+ * (RST) segments are validated by checking their SEQ-fields."
+ * And page 69: "If an incoming segment is not acceptable,
+ * an acknowledgment should be sent in reply (unless the RST bit
+ * is set, if so drop the segment and return)".
+ */
+ if (!th->rst)
+ tcp_send_dupack(sk, skb);
+ goto discard;
+ }
+
+ if(th->rst) {
+ tcp_reset(sk);
+ goto discard;
+ }
+
+ tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
+
+ if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
+ TCP_INC_STATS_BH(TCP_MIB_INERRS);
+ NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);
+ tcp_reset(sk);
+ return 1;
+ }
+
+step5:
+ if(th->ack)
+ tcp_ack(sk, skb, FLAG_SLOWPATH);
+
+ tcp_rcv_rtt_measure_ts(tp, skb);
+
+ /* Process urgent data. */
+ tcp_urg(sk, skb, th);
+
+ /* step 7: process the segment text */
+ tcp_data_queue(sk, skb);
+
+ tcp_data_snd_check(sk);
+ tcp_ack_snd_check(sk);
+ return 0;
+
+csum_error:
+ TCP_INC_STATS_BH(TCP_MIB_INERRS);
+
+discard:
+ __kfree_skb(skb);
+ return 0;
+}
+
+static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
+ struct tcphdr *th, unsigned len)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int saved_clamp = tp->rx_opt.mss_clamp;
+
+ tcp_parse_options(skb, &tp->rx_opt, 0);
+
+ if (th->ack) {
+ /* rfc793:
+ * "If the state is SYN-SENT then
+ * first check the ACK bit
+ * If the ACK bit is set
+ * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
+ * a reset (unless the RST bit is set, if so drop
+ * the segment and return)"
+ *
+ * We do not send data with SYN, so that RFC-correct
+ * test reduces to:
+ */
+ if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
+ goto reset_and_undo;
+
+ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
+ !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
+ tcp_time_stamp)) {
+ NET_INC_STATS_BH(LINUX_MIB_PAWSACTIVEREJECTED);
+ goto reset_and_undo;
+ }
+
+ /* Now ACK is acceptable.
+ *
+ * "If the RST bit is set
+ * If the ACK was acceptable then signal the user "error:
+ * connection reset", drop the segment, enter CLOSED state,
+ * delete TCB, and return."
+ */
+
+ if (th->rst) {
+ tcp_reset(sk);
+ goto discard;
+ }
+
+ /* rfc793:
+ * "fifth, if neither of the SYN or RST bits is set then
+ * drop the segment and return."
+ *
+ * See note below!
+ * --ANK(990513)
+ */
+ if (!th->syn)
+ goto discard_and_undo;
+
+ /* rfc793:
+ * "If the SYN bit is on ...
+ * are acceptable then ...
+ * (our SYN has been ACKed), change the connection
+ * state to ESTABLISHED..."
+ */
+
+ TCP_ECN_rcv_synack(tp, th);
+ if (tp->ecn_flags&TCP_ECN_OK)
+ sock_set_flag(sk, SOCK_NO_LARGESEND);
+
+ tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
+ tcp_ack(sk, skb, FLAG_SLOWPATH);
+
+ /* Ok.. it's good. Set up sequence numbers and
+ * move to established.
+ */
+ tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+ tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
+
+ /* RFC1323: The window in SYN & SYN/ACK segments is
+ * never scaled.
+ */
+ tp->snd_wnd = ntohs(th->window);
+ tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(skb)->seq);
+
+ if (!tp->rx_opt.wscale_ok) {
+ tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
+ tp->window_clamp = min(tp->window_clamp, 65535U);
+ }
+
+ if (tp->rx_opt.saw_tstamp) {
+ tp->rx_opt.tstamp_ok = 1;
+ tp->tcp_header_len =
+ sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+ tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
+ tcp_store_ts_recent(tp);
+ } else {
+ tp->tcp_header_len = sizeof(struct tcphdr);
+ }
+
+ if (tp->rx_opt.sack_ok && sysctl_tcp_fack)
+ tp->rx_opt.sack_ok |= 2;
+
+ tcp_sync_mss(sk, tp->pmtu_cookie);
+ tcp_initialize_rcv_mss(sk);
+
+ /* Remember, tcp_poll() does not lock socket!
+ * Change state from SYN-SENT only after copied_seq
+ * is initialized. */
+ tp->copied_seq = tp->rcv_nxt;
+ mb();
+ tcp_set_state(sk, TCP_ESTABLISHED);
+
+ /* Make sure socket is routed, for correct metrics. */
+ tp->af_specific->rebuild_header(sk);
+
+ tcp_init_metrics(sk);
+
+ /* Prevent spurious tcp_cwnd_restart() on first data
+ * packet.
+ */
+ tp->lsndtime = tcp_time_stamp;
+
+ tcp_init_buffer_space(sk);
+
+ if (sock_flag(sk, SOCK_KEEPOPEN))
+ tcp_reset_keepalive_timer(sk, keepalive_time_when(tp));
+
+ if (!tp->rx_opt.snd_wscale)
+ __tcp_fast_path_on(tp, tp->snd_wnd);
+ else
+ tp->pred_flags = 0;
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ sk->sk_state_change(sk);
+ sk_wake_async(sk, 0, POLL_OUT);
+ }
+
+ if (sk->sk_write_pending || tp->defer_accept || tp->ack.pingpong) {
+ /* Save one ACK. Data will be ready after
+ * several ticks, if write_pending is set.
+ *
+ * This may be deleted, but with this feature tcpdumps
+ * look so _wonderfully_ clever that I was not able
+ * to resist the temptation 8) --ANK
+ */
+ tcp_schedule_ack(tp);
+ tp->ack.lrcvtime = tcp_time_stamp;
+ tp->ack.ato = TCP_ATO_MIN;
+ tcp_incr_quickack(tp);
+ tcp_enter_quickack_mode(tp);
+ tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
+
+discard:
+ __kfree_skb(skb);
+ return 0;
+ } else {
+ tcp_send_ack(sk);
+ }
+ return -1;
+ }
+
+ /* No ACK in the segment */
+
+ if (th->rst) {
+ /* rfc793:
+ * "If the RST bit is set
+ *
+ * Otherwise (no ACK) drop the segment and return."
+ */
+
+ goto discard_and_undo;
+ }
+
+ /* PAWS check. */
+ if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp && tcp_paws_check(&tp->rx_opt, 0))
+ goto discard_and_undo;
+
+ if (th->syn) {
+ /* We see a SYN without an ACK. It is an attempt at a
+ * simultaneous connect with crossed SYNs.
+ * In particular, it can be a connect to self.
+ */
+ tcp_set_state(sk, TCP_SYN_RECV);
+
+ if (tp->rx_opt.saw_tstamp) {
+ tp->rx_opt.tstamp_ok = 1;
+ tcp_store_ts_recent(tp);
+ tp->tcp_header_len =
+ sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+ } else {
+ tp->tcp_header_len = sizeof(struct tcphdr);
+ }
+
+ tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+ tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
+
+ /* RFC1323: The window in SYN & SYN/ACK segments is
+ * never scaled.
+ */
+ tp->snd_wnd = ntohs(th->window);
+ tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
+ tp->max_window = tp->snd_wnd;
+
+ TCP_ECN_rcv_syn(tp, th);
+ if (tp->ecn_flags&TCP_ECN_OK)
+ sock_set_flag(sk, SOCK_NO_LARGESEND);
+
+ tcp_sync_mss(sk, tp->pmtu_cookie);
+ tcp_initialize_rcv_mss(sk);
+
+
+ tcp_send_synack(sk);
+#if 0
+ /* Note, we could accept data and URG from this segment.
+ * There are no obstacles to doing so.
+ *
+ * However, if we sometimes ignore data in ACKless segments,
+ * we have no reason to accept it at other times.
+ * Also, it seems the code doing this in step6 of tcp_rcv_state_process
+ * is not flawless. So, discard the packet for sanity.
+ * Uncomment this return to process the data.
+ */
+ return -1;
+#else
+ goto discard;
+#endif
+ }
+ /* "fifth, if neither of the SYN or RST bits is set then
+ * drop the segment and return."
+ */
+
+discard_and_undo:
+ tcp_clear_options(&tp->rx_opt);
+ tp->rx_opt.mss_clamp = saved_clamp;
+ goto discard;
+
+reset_and_undo:
+ tcp_clear_options(&tp->rx_opt);
+ tp->rx_opt.mss_clamp = saved_clamp;
+ return 1;
+}
+
+
+/*
+ * This function implements the receiving procedure of RFC 793 for
+ * all states except ESTABLISHED and TIME_WAIT.
+ * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
+ * address independent.
+ */
+
+int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+ struct tcphdr *th, unsigned len)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int queued = 0;
+
+ tp->rx_opt.saw_tstamp = 0;
+
+ switch (sk->sk_state) {
+ case TCP_CLOSE:
+ goto discard;
+
+ case TCP_LISTEN:
+ if(th->ack)
+ return 1;
+
+ if(th->rst)
+ goto discard;
+
+ if(th->syn) {
+ if(tp->af_specific->conn_request(sk, skb) < 0)
+ return 1;
+
+ init_westwood(sk);
+ init_bictcp(tp);
+
+ /* Now we have several options: in theory there is
+ * nothing else in the frame. KA9Q has an option to
+ * send data with the SYN, BSD accepts data with the
+ * SYN up to the [to be] advertised window, and
+ * Solaris 2.1 gives you a protocol error. For now
+ * we just ignore it; that fits the spec precisely
+ * and avoids incompatibilities. It would be nice in
+ * the future to drop through and process the data.
+ *
+ * Now that TTCP is starting to be used we ought to
+ * queue this data.
+ * But this leaves one open to an easy denial-of-
+ * service attack, and SYN cookies can't defend
+ * against this problem. So, we drop the data
+ * in the interest of security over speed.
+ */
+ goto discard;
+ }
+ goto discard;
+
+ case TCP_SYN_SENT:
+ init_westwood(sk);
+ init_bictcp(tp);
+
+ queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
+ if (queued >= 0)
+ return queued;
+
+ /* Do step6 onward by hand. */
+ tcp_urg(sk, skb, th);
+ __kfree_skb(skb);
+ tcp_data_snd_check(sk);
+ return 0;
+ }
+
+ if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
+ tcp_paws_discard(tp, skb)) {
+ if (!th->rst) {
+ NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
+ tcp_send_dupack(sk, skb);
+ goto discard;
+ }
+ /* Reset is accepted even if it did not pass PAWS. */
+ }
+
+ /* step 1: check sequence number */
+ if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
+ if (!th->rst)
+ tcp_send_dupack(sk, skb);
+ goto discard;
+ }
+
+ /* step 2: check RST bit */
+ if(th->rst) {
+ tcp_reset(sk);
+ goto discard;
+ }
+
+ tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
+
+ /* step 3: check security and precedence [ignored] */
+
+ /* step 4:
+ *
+ * Check for a SYN in window.
+ */
+ if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
+ NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);
+ tcp_reset(sk);
+ return 1;
+ }
+
+ /* step 5: check the ACK field */
+ if (th->ack) {
+ int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH);
+
+ switch(sk->sk_state) {
+ case TCP_SYN_RECV:
+ if (acceptable) {
+ tp->copied_seq = tp->rcv_nxt;
+ mb();
+ tcp_set_state(sk, TCP_ESTABLISHED);
+ sk->sk_state_change(sk);
+
+ /* Note that this wakeup is only for the marginal
+ * crossed-SYN case. Passively opened sockets
+ * are not woken up, because sk->sk_sleep ==
+ * NULL and sk->sk_socket == NULL.
+ */
+ if (sk->sk_socket) {
+ sk_wake_async(sk,0,POLL_OUT);
+ }
+
+ tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
+ tp->snd_wnd = ntohs(th->window) <<
+ tp->rx_opt.snd_wscale;
+ tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq,
+ TCP_SKB_CB(skb)->seq);
+
+ /* tcp_ack considers this ACK as duplicate
+ * and does not calculate rtt.
+ * Fix it at least with timestamps.
+ */
+ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
+ !tp->srtt)
+ tcp_ack_saw_tstamp(tp, 0);
+
+ if (tp->rx_opt.tstamp_ok)
+ tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
+
+ /* Make sure socket is routed, for
+ * correct metrics.
+ */
+ tp->af_specific->rebuild_header(sk);
+
+ tcp_init_metrics(sk);
+
+ /* Prevent spurious tcp_cwnd_restart() on
+ * first data packet.
+ */
+ tp->lsndtime = tcp_time_stamp;
+
+ tcp_initialize_rcv_mss(sk);
+ tcp_init_buffer_space(sk);
+ tcp_fast_path_on(tp);
+ } else {
+ return 1;
+ }
+ break;
+
+ case TCP_FIN_WAIT1:
+ if (tp->snd_una == tp->write_seq) {
+ tcp_set_state(sk, TCP_FIN_WAIT2);
+ sk->sk_shutdown |= SEND_SHUTDOWN;
+ dst_confirm(sk->sk_dst_cache);
+
+ if (!sock_flag(sk, SOCK_DEAD))
+ /* Wake up lingering close() */
+ sk->sk_state_change(sk);
+ else {
+ int tmo;
+
+ if (tp->linger2 < 0 ||
+ (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
+ after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
+ tcp_done(sk);
+ NET_INC_STATS_BH(LINUX_MIB_TCPABORTONDATA);
+ return 1;
+ }
+
+ tmo = tcp_fin_time(tp);
+ if (tmo > TCP_TIMEWAIT_LEN) {
+ tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
+ } else if (th->fin || sock_owned_by_user(sk)) {
+ /* Bad case. We could lose such a FIN otherwise.
+ * It is not a big problem, but it looks confusing
+ * and is not such a rare event. We can still lose it now,
+ * if it spins in bh_lock_sock(), but that is a really
+ * marginal case.
+ */
+ tcp_reset_keepalive_timer(sk, tmo);
+ } else {
+ tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
+ goto discard;
+ }
+ }
+ }
+ break;
+
+ case TCP_CLOSING:
+ if (tp->snd_una == tp->write_seq) {
+ tcp_time_wait(sk, TCP_TIME_WAIT, 0);
+ goto discard;
+ }
+ break;
+
+ case TCP_LAST_ACK:
+ if (tp->snd_una == tp->write_seq) {
+ tcp_update_metrics(sk);
+ tcp_done(sk);
+ goto discard;
+ }
+ break;
+ }
+ } else
+ goto discard;
+
+ /* step 6: check the URG bit */
+ tcp_urg(sk, skb, th);
+
+ /* step 7: process the segment text */
+ switch (sk->sk_state) {
+ case TCP_CLOSE_WAIT:
+ case TCP_CLOSING:
+ case TCP_LAST_ACK:
+ if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
+ break;
+ case TCP_FIN_WAIT1:
+ case TCP_FIN_WAIT2:
+ /* RFC 793 says to queue data in these states,
+ * RFC 1122 says we MUST send a reset.
+ * BSD 4.4 also does reset.
+ */
+ if (sk->sk_shutdown & RCV_SHUTDOWN) {
+ if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
+ after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
+ NET_INC_STATS_BH(LINUX_MIB_TCPABORTONDATA);
+ tcp_reset(sk);
+ return 1;
+ }
+ }
+ /* Fall through */
+ case TCP_ESTABLISHED:
+ tcp_data_queue(sk, skb);
+ queued = 1;
+ break;
+ }
+
+ /* tcp_data could move socket to TIME-WAIT */
+ if (sk->sk_state != TCP_CLOSE) {
+ tcp_data_snd_check(sk);
+ tcp_ack_snd_check(sk);
+ }
+
+ if (!queued) {
+discard:
+ __kfree_skb(skb);
+ }
+ return 0;
+}
+
+EXPORT_SYMBOL(sysctl_tcp_ecn);
+EXPORT_SYMBOL(sysctl_tcp_reordering);
+EXPORT_SYMBOL(tcp_parse_options);
+EXPORT_SYMBOL(tcp_rcv_established);
+EXPORT_SYMBOL(tcp_rcv_state_process);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
new file mode 100644
index 00000000000..3ac6659869c
--- /dev/null
+++ b/net/ipv4/tcp_ipv4.c
@@ -0,0 +1,2663 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
+ *
+ * IPv4 specific functions
+ *
+ *
+ * code split from:
+ * linux/ipv4/tcp.c
+ * linux/ipv4/tcp_input.c
+ * linux/ipv4/tcp_output.c
+ *
+ * See tcp.c for author information
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * Changes:
+ * David S. Miller : New socket lookup architecture.
+ * This code is dedicated to John Dyson.
+ * David S. Miller : Change semantics of established hash,
+ * half is devoted to TIME_WAIT sockets
+ * and the rest go in the other half.
+ * Andi Kleen : Add support for syncookies and fixed
+ * some bugs: ip options weren't passed to
+ * the TCP layer, missed a check for an
+ * ACK bit.
+ * Andi Kleen : Implemented fast path mtu discovery.
+ * Fixed many serious bugs in the
+ * open_request handling and moved
+ * most of it into the af independent code.
+ * Added tail drop and some other bugfixes.
+ * Added new listen semantics.
+ * Mike McLagan : Routing by source
+ * Juan Jose Ciarlante: ip_dynaddr bits
+ * Andi Kleen: various fixes.
+ * Vitaly E. Lavrov : Transparent proxy revived after year
+ * coma.
+ * Andi Kleen : Fix new listen.
+ * Andi Kleen : Fix accept error reporting.
+ * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
+ * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
+ * a single port at the same time.
+ */
+
+#include <linux/config.h>
+
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/cache.h>
+#include <linux/jhash.h>
+#include <linux/init.h>
+#include <linux/times.h>
+
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/ipv6.h>
+#include <net/inet_common.h>
+#include <net/xfrm.h>
+
+#include <linux/inet.h>
+#include <linux/ipv6.h>
+#include <linux/stddef.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+extern int sysctl_ip_dynaddr;
+int sysctl_tcp_tw_reuse;
+int sysctl_tcp_low_latency;
+
+/* Check TCP sequence numbers in ICMP packets. */
+#define ICMP_MIN_LENGTH 8
+
+/* Socket used for sending RSTs */
+static struct socket *tcp_socket;
+
+void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
+ struct sk_buff *skb);
+
+struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
+ .__tcp_lhash_lock = RW_LOCK_UNLOCKED,
+ .__tcp_lhash_users = ATOMIC_INIT(0),
+ .__tcp_lhash_wait
+ = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
+ .__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED
+};
+
+/*
+ * This array holds the first and last local port number.
+ * For high-usage systems, use sysctl to change this to
+ * 32768-61000
+ */
+int sysctl_local_port_range[2] = { 1024, 4999 };
+int tcp_port_rover = 1024 - 1;
+
+static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
+ __u32 faddr, __u16 fport)
+{
+ int h = (laddr ^ lport) ^ (faddr ^ fport);
+ h ^= h >> 16;
+ h ^= h >> 8;
+ return h & (tcp_ehash_size - 1);
+}
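+
+/* The hash simply XORs the address/port pairs and then folds the result
+ * down: starting from h = (laddr ^ lport) ^ (faddr ^ fport), the two shifts
+ * mix the high bits into the low ones before masking with
+ * (tcp_ehash_size - 1), which assumes tcp_ehash_size is a power of two.
+ */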
+
+static __inline__ int tcp_sk_hashfn(struct sock *sk)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ __u32 laddr = inet->rcv_saddr;
+ __u16 lport = inet->num;
+ __u32 faddr = inet->daddr;
+ __u16 fport = inet->dport;
+
+ return tcp_hashfn(laddr, lport, faddr, fport);
+}
+
+/* Allocate and initialize a new TCP local port bind bucket.
+ * The bindhash mutex for snum's hash chain must be held here.
+ */
+struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
+ unsigned short snum)
+{
+ struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
+ SLAB_ATOMIC);
+ if (tb) {
+ tb->port = snum;
+ tb->fastreuse = 0;
+ INIT_HLIST_HEAD(&tb->owners);
+ hlist_add_head(&tb->node, &head->chain);
+ }
+ return tb;
+}
+
+/* Caller must hold hashbucket lock for this tb with local BH disabled */
+void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
+{
+ if (hlist_empty(&tb->owners)) {
+ __hlist_del(&tb->node);
+ kmem_cache_free(tcp_bucket_cachep, tb);
+ }
+}
+
+/* Caller must disable local BH processing. */
+static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
+{
+ struct tcp_bind_hashbucket *head =
+ &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
+ struct tcp_bind_bucket *tb;
+
+ spin_lock(&head->lock);
+ tb = tcp_sk(sk)->bind_hash;
+ sk_add_bind_node(child, &tb->owners);
+ tcp_sk(child)->bind_hash = tb;
+ spin_unlock(&head->lock);
+}
+
+inline void tcp_inherit_port(struct sock *sk, struct sock *child)
+{
+ local_bh_disable();
+ __tcp_inherit_port(sk, child);
+ local_bh_enable();
+}
+
+void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
+ unsigned short snum)
+{
+ inet_sk(sk)->num = snum;
+ sk_add_bind_node(sk, &tb->owners);
+ tcp_sk(sk)->bind_hash = tb;
+}
+
+static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
+{
+ const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
+ struct sock *sk2;
+ struct hlist_node *node;
+ int reuse = sk->sk_reuse;
+
+ sk_for_each_bound(sk2, node, &tb->owners) {
+ if (sk != sk2 &&
+ !tcp_v6_ipv6only(sk2) &&
+ (!sk->sk_bound_dev_if ||
+ !sk2->sk_bound_dev_if ||
+ sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
+ if (!reuse || !sk2->sk_reuse ||
+ sk2->sk_state == TCP_LISTEN) {
+ const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
+ if (!sk2_rcv_saddr || !sk_rcv_saddr ||
+ sk2_rcv_saddr == sk_rcv_saddr)
+ break;
+ }
+ }
+ }
+ return node != NULL;
+}
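+
+/* In words: a conflict exists when another socket in this bucket is bound
+ * to the same device (or either is unbound) and to the same local address
+ * (or either uses the wildcard), unless both sockets set SO_REUSEADDR and
+ * the existing one is not listening.
+ */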
+
+/* Obtain a reference to a local port for the given sock,
+ * if snum is zero it means select any available local port.
+ */
+static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
+{
+ struct tcp_bind_hashbucket *head;
+ struct hlist_node *node;
+ struct tcp_bind_bucket *tb;
+ int ret;
+
+ local_bh_disable();
+ if (!snum) {
+ int low = sysctl_local_port_range[0];
+ int high = sysctl_local_port_range[1];
+ int remaining = (high - low) + 1;
+ int rover;
+
+ spin_lock(&tcp_portalloc_lock);
+ rover = tcp_port_rover;
+ do {
+ rover++;
+ if (rover < low || rover > high)
+ rover = low;
+ head = &tcp_bhash[tcp_bhashfn(rover)];
+ spin_lock(&head->lock);
+ tb_for_each(tb, node, &head->chain)
+ if (tb->port == rover)
+ goto next;
+ break;
+ next:
+ spin_unlock(&head->lock);
+ } while (--remaining > 0);
+ tcp_port_rover = rover;
+ spin_unlock(&tcp_portalloc_lock);
+
+ /* Exhausted local port range during search? */
+ ret = 1;
+ if (remaining <= 0)
+ goto fail;
+
+ /* OK, here is the one we will use. HEAD is
+ * non-NULL and we hold its lock.
+ */
+ snum = rover;
+ } else {
+ head = &tcp_bhash[tcp_bhashfn(snum)];
+ spin_lock(&head->lock);
+ tb_for_each(tb, node, &head->chain)
+ if (tb->port == snum)
+ goto tb_found;
+ }
+ tb = NULL;
+ goto tb_not_found;
+tb_found:
+ if (!hlist_empty(&tb->owners)) {
+ if (sk->sk_reuse > 1)
+ goto success;
+ if (tb->fastreuse > 0 &&
+ sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
+ goto success;
+ } else {
+ ret = 1;
+ if (tcp_bind_conflict(sk, tb))
+ goto fail_unlock;
+ }
+ }
+tb_not_found:
+ ret = 1;
+ if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
+ goto fail_unlock;
+ if (hlist_empty(&tb->owners)) {
+ if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
+ tb->fastreuse = 1;
+ else
+ tb->fastreuse = 0;
+ } else if (tb->fastreuse &&
+ (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
+ tb->fastreuse = 0;
+success:
+ if (!tcp_sk(sk)->bind_hash)
+ tcp_bind_hash(sk, tb, snum);
+ BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
+ ret = 0;
+
+fail_unlock:
+ spin_unlock(&head->lock);
+fail:
+ local_bh_enable();
+ return ret;
+}
+
+/* Get rid of any references to a local port held by the
+ * given sock.
+ */
+static void __tcp_put_port(struct sock *sk)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
+ struct tcp_bind_bucket *tb;
+
+ spin_lock(&head->lock);
+ tb = tcp_sk(sk)->bind_hash;
+ __sk_del_bind_node(sk);
+ tcp_sk(sk)->bind_hash = NULL;
+ inet->num = 0;
+ tcp_bucket_destroy(tb);
+ spin_unlock(&head->lock);
+}
+
+void tcp_put_port(struct sock *sk)
+{
+ local_bh_disable();
+ __tcp_put_port(sk);
+ local_bh_enable();
+}
+
+/* This lock without WQ_FLAG_EXCLUSIVE is good on UP, but it can be very bad
+ * on SMP. Look: when several writers sleep and the reader wakes them up, all
+ * but one immediately hit the write lock and grab all the CPUs. Exclusive
+ * sleep solves this, _but_ remember that it adds useless work on UP machines
+ * (a wakeup on each exclusive lock release). It should really be ifdefed.
+ */
+
+void tcp_listen_wlock(void)
+{
+ write_lock(&tcp_lhash_lock);
+
+ if (atomic_read(&tcp_lhash_users)) {
+ DEFINE_WAIT(wait);
+
+ for (;;) {
+ prepare_to_wait_exclusive(&tcp_lhash_wait,
+ &wait, TASK_UNINTERRUPTIBLE);
+ if (!atomic_read(&tcp_lhash_users))
+ break;
+ write_unlock_bh(&tcp_lhash_lock);
+ schedule();
+ write_lock_bh(&tcp_lhash_lock);
+ }
+
+ finish_wait(&tcp_lhash_wait, &wait);
+ }
+}
+
+static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
+{
+ struct hlist_head *list;
+ rwlock_t *lock;
+
+ BUG_TRAP(sk_unhashed(sk));
+ if (listen_possible && sk->sk_state == TCP_LISTEN) {
+ list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
+ lock = &tcp_lhash_lock;
+ tcp_listen_wlock();
+ } else {
+ list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
+ lock = &tcp_ehash[sk->sk_hashent].lock;
+ write_lock(lock);
+ }
+ __sk_add_node(sk, list);
+ sock_prot_inc_use(sk->sk_prot);
+ write_unlock(lock);
+ if (listen_possible && sk->sk_state == TCP_LISTEN)
+ wake_up(&tcp_lhash_wait);
+}
+
+static void tcp_v4_hash(struct sock *sk)
+{
+ if (sk->sk_state != TCP_CLOSE) {
+ local_bh_disable();
+ __tcp_v4_hash(sk, 1);
+ local_bh_enable();
+ }
+}
+
+void tcp_unhash(struct sock *sk)
+{
+ rwlock_t *lock;
+
+ if (sk_unhashed(sk))
+ goto ende;
+
+ if (sk->sk_state == TCP_LISTEN) {
+ local_bh_disable();
+ tcp_listen_wlock();
+ lock = &tcp_lhash_lock;
+ } else {
+ struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
+ lock = &head->lock;
+ write_lock_bh(&head->lock);
+ }
+
+ if (__sk_del_node_init(sk))
+ sock_prot_dec_use(sk->sk_prot);
+ write_unlock_bh(lock);
+
+ ende:
+ if (sk->sk_state == TCP_LISTEN)
+ wake_up(&tcp_lhash_wait);
+}
+
+/* Don't inline this cruft. There are some nice properties to
+ * exploit here. The BSD API does not allow a listening TCP
+ * to specify the remote port or the remote address for the
+ * connection. So always assume those are both wildcarded
+ * during the search, since they can never be otherwise.
+ */
+static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
+ unsigned short hnum, int dif)
+{
+ struct sock *result = NULL, *sk;
+ struct hlist_node *node;
+ int score, hiscore;
+
+ hiscore = -1;
+ sk_for_each(sk, node, head) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (inet->num == hnum && !ipv6_only_sock(sk)) {
+ __u32 rcv_saddr = inet->rcv_saddr;
+
+ score = (sk->sk_family == PF_INET ? 1 : 0);
+ if (rcv_saddr) {
+ if (rcv_saddr != daddr)
+ continue;
+ score += 2;
+ }
+ if (sk->sk_bound_dev_if) {
+ if (sk->sk_bound_dev_if != dif)
+ continue;
+ score += 2;
+ }
+ if (score == 5)
+ return sk;
+ if (score > hiscore) {
+ hiscore = score;
+ result = sk;
+ }
+ }
+ }
+ return result;
+}
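+
+/* Purely illustrative sketch (not part of the kernel code above): the
+ * scoring rule used by __tcp_v4_lookup_listener(), restated with plain C
+ * types.  The struct and function names below are hypothetical.  A bound
+ * local address adds 2, a bound device adds 2, and a pure IPv4 listener
+ * adds 1, so a score of 5 is an exact match and the search can stop.
+ */
+struct listener_sketch {
+        unsigned int rcv_saddr;         /* 0 means "any local address" */
+        int bound_dev_if;               /* 0 means "any device" */
+        int is_af_inet;                 /* 1 for a pure IPv4 listener */
+};
+
+static int listener_score_sketch(const struct listener_sketch *l,
+                                 unsigned int daddr, int dif)
+{
+        int score = l->is_af_inet ? 1 : 0;
+
+        if (l->rcv_saddr) {
+                if (l->rcv_saddr != daddr)
+                        return -1;      /* bound to a different address */
+                score += 2;
+        }
+        if (l->bound_dev_if) {
+                if (l->bound_dev_if != dif)
+                        return -1;      /* bound to a different device */
+                score += 2;
+        }
+        return score;                   /* 5 == exact match */
+}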
+
+/* Optimize the common listener case. */
+static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
+ unsigned short hnum, int dif)
+{
+ struct sock *sk = NULL;
+ struct hlist_head *head;
+
+ read_lock(&tcp_lhash_lock);
+ head = &tcp_listening_hash[tcp_lhashfn(hnum)];
+ if (!hlist_empty(head)) {
+ struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
+
+ if (inet->num == hnum && !sk->sk_node.next &&
+ (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
+ (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
+ !sk->sk_bound_dev_if)
+ goto sherry_cache;
+ sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
+ }
+ if (sk) {
+sherry_cache:
+ sock_hold(sk);
+ }
+ read_unlock(&tcp_lhash_lock);
+ return sk;
+}
+
+/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
+ * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
+ *
+ * Local BH must be disabled here.
+ */
+
+static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
+ u32 daddr, u16 hnum,
+ int dif)
+{
+ struct tcp_ehash_bucket *head;
+ TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
+ __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
+ struct sock *sk;
+ struct hlist_node *node;
+ /* Optimize here for a direct hit; only listening connections can
+ * have wildcards anyway.
+ */
+ int hash = tcp_hashfn(daddr, hnum, saddr, sport);
+ head = &tcp_ehash[hash];
+ read_lock(&head->lock);
+ sk_for_each(sk, node, &head->chain) {
+ if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
+ goto hit; /* You sunk my battleship! */
+ }
+
+ /* Must check for a TIME_WAIT'er before going to listener hash. */
+ sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
+ if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
+ goto hit;
+ }
+ sk = NULL;
+out:
+ read_unlock(&head->lock);
+ return sk;
+hit:
+ sock_hold(sk);
+ goto out;
+}
+
+static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
+ u32 daddr, u16 hnum, int dif)
+{
+ struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
+ daddr, hnum, dif);
+
+ return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
+}
+
+inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
+ u16 dport, int dif)
+{
+ struct sock *sk;
+
+ local_bh_disable();
+ sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
+ local_bh_enable();
+
+ return sk;
+}
+
+EXPORT_SYMBOL_GPL(tcp_v4_lookup);
+
+static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
+{
+ return secure_tcp_sequence_number(skb->nh.iph->daddr,
+ skb->nh.iph->saddr,
+ skb->h.th->dest,
+ skb->h.th->source);
+}
+
+/* called with local bh disabled */
+static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
+ struct tcp_tw_bucket **twp)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ u32 daddr = inet->rcv_saddr;
+ u32 saddr = inet->daddr;
+ int dif = sk->sk_bound_dev_if;
+ TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
+ __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
+ int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
+ struct tcp_ehash_bucket *head = &tcp_ehash[hash];
+ struct sock *sk2;
+ struct hlist_node *node;
+ struct tcp_tw_bucket *tw;
+
+ write_lock(&head->lock);
+
+ /* Check TIME-WAIT sockets first. */
+ sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
+ tw = (struct tcp_tw_bucket *)sk2;
+
+ if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* With PAWS, it is safe from the viewpoint
+ of data integrity. Even without PAWS it
+ is safe provided sequence spaces do not
+ overlap, i.e. at data rates <= 80 Mbit/sec.
+
+ Actually, the idea is close to VJ's, only
+ the timestamp cache is held not per host
+ but per port pair, and the TW bucket is
+ used as the state holder.
+
+ If the TW bucket has already been destroyed,
+ we fall back to VJ's scheme and use the
+ initial timestamp retrieved from the peer table.
+ */
+ if (tw->tw_ts_recent_stamp &&
+ (!twp || (sysctl_tcp_tw_reuse &&
+ xtime.tv_sec -
+ tw->tw_ts_recent_stamp > 1))) {
+ if ((tp->write_seq =
+ tw->tw_snd_nxt + 65535 + 2) == 0)
+ tp->write_seq = 1;
+ tp->rx_opt.ts_recent = tw->tw_ts_recent;
+ tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
+ sock_hold(sk2);
+ goto unique;
+ } else
+ goto not_unique;
+ }
+ }
+ tw = NULL;
+
+ /* And established part... */
+ sk_for_each(sk2, node, &head->chain) {
+ if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
+ goto not_unique;
+ }
+
+unique:
+ /* Must record num and sport now. Otherwise we will see
+ * a socket with a funny identity in the hash table. */
+ inet->num = lport;
+ inet->sport = htons(lport);
+ sk->sk_hashent = hash;
+ BUG_TRAP(sk_unhashed(sk));
+ __sk_add_node(sk, &head->chain);
+ sock_prot_inc_use(sk->sk_prot);
+ write_unlock(&head->lock);
+
+ if (twp) {
+ *twp = tw;
+ NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
+ } else if (tw) {
+ /* Silly. Should hash-dance instead... */
+ tcp_tw_deschedule(tw);
+ NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
+
+ tcp_tw_put(tw);
+ }
+
+ return 0;
+
+not_unique:
+ write_unlock(&head->lock);
+ return -EADDRNOTAVAIL;
+}
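+
+/* A small sketch of the write_seq choice made in the TIME-WAIT reuse path
+ * above (illustrative only; the helper name is hypothetical): the new
+ * connection starts 65535 + 2 bytes past the old incarnation's snd_nxt, so
+ * sequence numbers from the old connection's last window cannot be confused
+ * with new data, and the reserved value 0 ("no cached ISN") is avoided.
+ */
+static unsigned int recycled_isn_sketch(unsigned int tw_snd_nxt)
+{
+        unsigned int isn = tw_snd_nxt + 65535 + 2;
+
+        return isn ? isn : 1;           /* 0 means "pick a fresh secure ISN" */
+}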
+
+static inline u32 connect_port_offset(const struct sock *sk)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+
+ return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
+ inet->dport);
+}
+
+/*
+ * Bind a port for a connect operation and hash it.
+ */
+static inline int tcp_v4_hash_connect(struct sock *sk)
+{
+ unsigned short snum = inet_sk(sk)->num;
+ struct tcp_bind_hashbucket *head;
+ struct tcp_bind_bucket *tb;
+ int ret;
+
+ if (!snum) {
+ int low = sysctl_local_port_range[0];
+ int high = sysctl_local_port_range[1];
+ int range = high - low;
+ int i;
+ int port;
+ static u32 hint;
+ u32 offset = hint + connect_port_offset(sk);
+ struct hlist_node *node;
+ struct tcp_tw_bucket *tw = NULL;
+
+ local_bh_disable();
+ for (i = 1; i <= range; i++) {
+ port = low + (i + offset) % range;
+ head = &tcp_bhash[tcp_bhashfn(port)];
+ spin_lock(&head->lock);
+
+ /* Does not bother with rcv_saddr checks,
+ * because the established check is already
+ * unique enough.
+ */
+ tb_for_each(tb, node, &head->chain) {
+ if (tb->port == port) {
+ BUG_TRAP(!hlist_empty(&tb->owners));
+ if (tb->fastreuse >= 0)
+ goto next_port;
+ if (!__tcp_v4_check_established(sk,
+ port,
+ &tw))
+ goto ok;
+ goto next_port;
+ }
+ }
+
+ tb = tcp_bucket_create(head, port);
+ if (!tb) {
+ spin_unlock(&head->lock);
+ break;
+ }
+ tb->fastreuse = -1;
+ goto ok;
+
+ next_port:
+ spin_unlock(&head->lock);
+ }
+ local_bh_enable();
+
+ return -EADDRNOTAVAIL;
+
+ok:
+ hint += i;
+
+ /* Head lock still held and bh's disabled */
+ tcp_bind_hash(sk, tb, port);
+ if (sk_unhashed(sk)) {
+ inet_sk(sk)->sport = htons(port);
+ __tcp_v4_hash(sk, 0);
+ }
+ spin_unlock(&head->lock);
+
+ if (tw) {
+ tcp_tw_deschedule(tw);
+ tcp_tw_put(tw);
+ }
+
+ ret = 0;
+ goto out;
+ }
+
+ head = &tcp_bhash[tcp_bhashfn(snum)];
+ tb = tcp_sk(sk)->bind_hash;
+ spin_lock_bh(&head->lock);
+ if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
+ __tcp_v4_hash(sk, 0);
+ spin_unlock_bh(&head->lock);
+ return 0;
+ } else {
+ spin_unlock(&head->lock);
+ /* No definite answer... Walk to established hash table */
+ ret = __tcp_v4_check_established(sk, snum, NULL);
+out:
+ local_bh_enable();
+ return ret;
+ }
+}
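+
+/* Illustrative sketch of the ephemeral port walk above, with the kernel
+ * data structures stripped out; pick_port_sketch() and its callback are
+ * hypothetical names.  Starting from a per-destination offset, every port
+ * in [low, low + range) is probed exactly once, and the static hint keeps
+ * consecutive connects from restarting the search at the same place.
+ */
+static int pick_port_sketch(int low, int high, unsigned int per_dest_offset,
+                            int (*port_is_usable)(int port))
+{
+        static unsigned int hint;
+        unsigned int offset = hint + per_dest_offset;
+        int range = high - low;
+        int i;
+
+        for (i = 1; i <= range; i++) {
+                int port = low + (i + offset) % range;
+
+                if (port_is_usable(port)) {
+                        hint += i;      /* the next search resumes past this hit */
+                        return port;
+                }
+        }
+        return -1;                      /* maps to -EADDRNOTAVAIL above */
+}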
+
+/* This will initiate an outgoing connection. */
+int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
+ struct rtable *rt;
+ u32 daddr, nexthop;
+ int tmp;
+ int err;
+
+ if (addr_len < sizeof(struct sockaddr_in))
+ return -EINVAL;
+
+ if (usin->sin_family != AF_INET)
+ return -EAFNOSUPPORT;
+
+ nexthop = daddr = usin->sin_addr.s_addr;
+ if (inet->opt && inet->opt->srr) {
+ if (!daddr)
+ return -EINVAL;
+ nexthop = inet->opt->faddr;
+ }
+
+ tmp = ip_route_connect(&rt, nexthop, inet->saddr,
+ RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
+ IPPROTO_TCP,
+ inet->sport, usin->sin_port, sk);
+ if (tmp < 0)
+ return tmp;
+
+ if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
+ ip_rt_put(rt);
+ return -ENETUNREACH;
+ }
+
+ if (!inet->opt || !inet->opt->srr)
+ daddr = rt->rt_dst;
+
+ if (!inet->saddr)
+ inet->saddr = rt->rt_src;
+ inet->rcv_saddr = inet->saddr;
+
+ if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
+ /* Reset inherited state */
+ tp->rx_opt.ts_recent = 0;
+ tp->rx_opt.ts_recent_stamp = 0;
+ tp->write_seq = 0;
+ }
+
+ if (sysctl_tcp_tw_recycle &&
+ !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
+ struct inet_peer *peer = rt_get_peer(rt);
+
+ /* VJ's idea. We save the last timestamp seen from
+ * the destination in the peer table when entering TIME-WAIT state,
+ * and initialize rx_opt.ts_recent from it when trying a new connection.
+ */
+
+ if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
+ tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
+ tp->rx_opt.ts_recent = peer->tcp_ts;
+ }
+ }
+
+ inet->dport = usin->sin_port;
+ inet->daddr = daddr;
+
+ tp->ext_header_len = 0;
+ if (inet->opt)
+ tp->ext_header_len = inet->opt->optlen;
+
+ tp->rx_opt.mss_clamp = 536;
+
+ /* Socket identity is still unknown (sport may be zero).
+ * However, we set the state to SYN-SENT and, without releasing the socket
+ * lock, select a source port, enter ourselves into the hash tables and
+ * complete initialization after this.
+ */
+ tcp_set_state(sk, TCP_SYN_SENT);
+ err = tcp_v4_hash_connect(sk);
+ if (err)
+ goto failure;
+
+ err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
+ if (err)
+ goto failure;
+
+ /* OK, now commit destination to socket. */
+ __sk_dst_set(sk, &rt->u.dst);
+ tcp_v4_setup_caps(sk, &rt->u.dst);
+
+ if (!tp->write_seq)
+ tp->write_seq = secure_tcp_sequence_number(inet->saddr,
+ inet->daddr,
+ inet->sport,
+ usin->sin_port);
+
+ inet->id = tp->write_seq ^ jiffies;
+
+ err = tcp_connect(sk);
+ rt = NULL;
+ if (err)
+ goto failure;
+
+ return 0;
+
+failure:
+ /* This unhashes the socket and releases the local port, if necessary. */
+ tcp_set_state(sk, TCP_CLOSE);
+ ip_rt_put(rt);
+ sk->sk_route_caps = 0;
+ inet->dport = 0;
+ return err;
+}
+
+static __inline__ int tcp_v4_iif(struct sk_buff *skb)
+{
+ return ((struct rtable *)skb->dst)->rt_iif;
+}
+
+static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
+{
+ return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
+}
+
+static struct open_request *tcp_v4_search_req(struct tcp_sock *tp,
+ struct open_request ***prevp,
+ __u16 rport,
+ __u32 raddr, __u32 laddr)
+{
+ struct tcp_listen_opt *lopt = tp->listen_opt;
+ struct open_request *req, **prev;
+
+ for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
+ (req = *prev) != NULL;
+ prev = &req->dl_next) {
+ if (req->rmt_port == rport &&
+ req->af.v4_req.rmt_addr == raddr &&
+ req->af.v4_req.loc_addr == laddr &&
+ TCP_INET_FAMILY(req->class->family)) {
+ BUG_TRAP(!req->sk);
+ *prevp = prev;
+ break;
+ }
+ }
+
+ return req;
+}
+
+static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_listen_opt *lopt = tp->listen_opt;
+ u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);
+
+ req->expires = jiffies + TCP_TIMEOUT_INIT;
+ req->retrans = 0;
+ req->sk = NULL;
+ req->dl_next = lopt->syn_table[h];
+
+ write_lock(&tp->syn_wait_lock);
+ lopt->syn_table[h] = req;
+ write_unlock(&tp->syn_wait_lock);
+
+ tcp_synq_added(sk);
+}
+
+
+/*
+ * This routine does path mtu discovery as defined in RFC1191.
+ */
+static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
+ u32 mtu)
+{
+ struct dst_entry *dst;
+ struct inet_sock *inet = inet_sk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
+ * sent out by Linux are always < 576 bytes, so they should go through
+ * unfragmented).
+ */
+ if (sk->sk_state == TCP_LISTEN)
+ return;
+
+ /* We don't check in the dst entry whether PMTU discovery is forbidden
+ * on this route. We just assume that no packet-too-big packets
+ * are sent back when PMTU discovery is not active.
+ * There is a small race when the user changes this flag in the
+ * route, but I think that's acceptable.
+ */
+ if ((dst = __sk_dst_check(sk, 0)) == NULL)
+ return;
+
+ dst->ops->update_pmtu(dst, mtu);
+
+ /* Something is about to go wrong... Remember the soft error
+ * in case this connection is not able to recover.
+ */
+ if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
+ sk->sk_err_soft = EMSGSIZE;
+
+ mtu = dst_mtu(dst);
+
+ if (inet->pmtudisc != IP_PMTUDISC_DONT &&
+ tp->pmtu_cookie > mtu) {
+ tcp_sync_mss(sk, mtu);
+
+ /* Resend the TCP packet because it's
+ * clear that the old packet has been
+ * dropped. This is the new "fast" path mtu
+ * discovery.
+ */
+ tcp_simple_retransmit(sk);
+ } /* else let the usual retransmit timer handle it */
+}
+
+/*
+ * This routine is called by the ICMP module when it gets some
+ * sort of error condition. If err < 0 then the socket should
+ * be closed and the error returned to the user. If err > 0
+ * it's just the icmp type << 8 | icmp code. After adjustment,
+ * the header points to the first 8 bytes of the tcp header. We need
+ * to find the appropriate port.
+ *
+ * The locking strategy used here is very "optimistic". When
+ * someone else accesses the socket the ICMP is just dropped
+ * and for some paths there is no check at all.
+ * A more general error queue to queue errors for later handling
+ * is probably better.
+ *
+ */
+
+void tcp_v4_err(struct sk_buff *skb, u32 info)
+{
+ struct iphdr *iph = (struct iphdr *)skb->data;
+ struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
+ struct tcp_sock *tp;
+ struct inet_sock *inet;
+ int type = skb->h.icmph->type;
+ int code = skb->h.icmph->code;
+ struct sock *sk;
+ __u32 seq;
+ int err;
+
+ if (skb->len < (iph->ihl << 2) + 8) {
+ ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+ return;
+ }
+
+ sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
+ th->source, tcp_v4_iif(skb));
+ if (!sk) {
+ ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+ return;
+ }
+ if (sk->sk_state == TCP_TIME_WAIT) {
+ tcp_tw_put((struct tcp_tw_bucket *)sk);
+ return;
+ }
+
+ bh_lock_sock(sk);
+ /* If too many ICMPs get dropped on busy
+ * servers this needs to be solved differently.
+ */
+ if (sock_owned_by_user(sk))
+ NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
+
+ if (sk->sk_state == TCP_CLOSE)
+ goto out;
+
+ tp = tcp_sk(sk);
+ seq = ntohl(th->seq);
+ if (sk->sk_state != TCP_LISTEN &&
+ !between(seq, tp->snd_una, tp->snd_nxt)) {
+ NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
+ goto out;
+ }
+
+ switch (type) {
+ case ICMP_SOURCE_QUENCH:
+ /* Just silently ignore these. */
+ goto out;
+ case ICMP_PARAMETERPROB:
+ err = EPROTO;
+ break;
+ case ICMP_DEST_UNREACH:
+ if (code > NR_ICMP_UNREACH)
+ goto out;
+
+ if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
+ if (!sock_owned_by_user(sk))
+ do_pmtu_discovery(sk, iph, info);
+ goto out;
+ }
+
+ err = icmp_err_convert[code].errno;
+ break;
+ case ICMP_TIME_EXCEEDED:
+ err = EHOSTUNREACH;
+ break;
+ default:
+ goto out;
+ }
+
+ switch (sk->sk_state) {
+ struct open_request *req, **prev;
+ case TCP_LISTEN:
+ if (sock_owned_by_user(sk))
+ goto out;
+
+ req = tcp_v4_search_req(tp, &prev, th->dest,
+ iph->daddr, iph->saddr);
+ if (!req)
+ goto out;
+
+ /* ICMPs are not backlogged, hence we cannot get
+ an established socket here.
+ */
+ BUG_TRAP(!req->sk);
+
+ if (seq != req->snt_isn) {
+ NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
+ goto out;
+ }
+
+ /*
+ * Still in SYN_RECV, just remove it silently.
+ * There is no good way to pass the error to the newly
+ * created socket, and POSIX does not want network
+ * errors returned from accept().
+ */
+ tcp_synq_drop(sk, req, prev);
+ goto out;
+
+ case TCP_SYN_SENT:
+ case TCP_SYN_RECV: /* Cannot happen normally.
+ It can, e.g., if SYNs crossed.
+ */
+ if (!sock_owned_by_user(sk)) {
+ TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
+ sk->sk_err = err;
+
+ sk->sk_error_report(sk);
+
+ tcp_done(sk);
+ } else {
+ sk->sk_err_soft = err;
+ }
+ goto out;
+ }
+
+ /* If we've already connected we will keep trying
+ * until we time out, or the user gives up.
+ *
+ * RFC 1122 4.2.3.9 allows us to consider as hard errors
+ * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
+ * but it is obsoleted by PMTU discovery).
+ *
+ * Note that in the modern internet, where routing is unreliable
+ * and broken firewalls sit in every dark corner sending random
+ * errors ordered by their masters, even these two messages finally lose
+ * their original sense (even Linux sends invalid PORT_UNREACHs).
+ *
+ * Now we are in compliance with RFCs.
+ * --ANK (980905)
+ */
+
+ inet = inet_sk(sk);
+ if (!sock_owned_by_user(sk) && inet->recverr) {
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+ } else { /* Only an error on timeout */
+ sk->sk_err_soft = err;
+ }
+
+out:
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+/* This routine computes an IPv4 TCP checksum. */
+void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
+ struct sk_buff *skb)
+{
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (skb->ip_summed == CHECKSUM_HW) {
+ th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
+ skb->csum = offsetof(struct tcphdr, check);
+ } else {
+ th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
+ csum_partial((char *)th,
+ th->doff << 2,
+ skb->csum));
+ }
+}
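+
+/* For reference, a portable sketch of what tcp_v4_check() computes: the
+ * 16-bit one's-complement checksum over the IPv4 pseudo-header (addresses,
+ * protocol, TCP length) followed by the TCP header and payload.  The kernel
+ * uses architecture-specific helpers; the hypothetical function below is an
+ * unoptimized equivalent, with saddr/daddr and the result in host byte
+ * order and the checksum field in seg[] assumed to be zero.
+ */
+static unsigned short tcp_csum_sketch(unsigned int saddr, unsigned int daddr,
+                                      const unsigned char *seg, int len)
+{
+        unsigned long sum = 0;
+        int i;
+
+        /* Pseudo-header: addresses, zero + protocol (6), TCP length. */
+        sum += (saddr >> 16) + (saddr & 0xffff);
+        sum += (daddr >> 16) + (daddr & 0xffff);
+        sum += 6;
+        sum += len;
+
+        /* TCP header + data as big-endian 16-bit words. */
+        for (i = 0; i + 1 < len; i += 2)
+                sum += (seg[i] << 8) | seg[i + 1];
+        if (len & 1)
+                sum += seg[len - 1] << 8;       /* odd trailing byte */
+
+        while (sum >> 16)                       /* fold the carries */
+                sum = (sum & 0xffff) + (sum >> 16);
+        return (unsigned short)~sum;
+}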
+
+/*
+ * This routine will send an RST to the other tcp.
+ *
+ * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
+ * for a reset?
+ * Answer: if a packet caused a RST, it is not for a socket
+ * existing in our system; if it is matched to a socket,
+ * it is just a duplicate segment or a bug in the other side's TCP.
+ * So we build the reply based only on the parameters
+ * that arrived with the segment.
+ * Exception: precedence violation. We do not implement it in any case.
+ */
+
+static void tcp_v4_send_reset(struct sk_buff *skb)
+{
+ struct tcphdr *th = skb->h.th;
+ struct tcphdr rth;
+ struct ip_reply_arg arg;
+
+ /* Never send a reset in response to a reset. */
+ if (th->rst)
+ return;
+
+ if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
+ return;
+
+ /* Swap the send and the receive. */
+ memset(&rth, 0, sizeof(struct tcphdr));
+ rth.dest = th->source;
+ rth.source = th->dest;
+ rth.doff = sizeof(struct tcphdr) / 4;
+ rth.rst = 1;
+
+ if (th->ack) {
+ rth.seq = th->ack_seq;
+ } else {
+ rth.ack = 1;
+ rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
+ skb->len - (th->doff << 2));
+ }
+
+ memset(&arg, 0, sizeof arg);
+ arg.iov[0].iov_base = (unsigned char *)&rth;
+ arg.iov[0].iov_len = sizeof rth;
+ arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
+ skb->nh.iph->saddr, /*XXX*/
+ sizeof(struct tcphdr), IPPROTO_TCP, 0);
+ arg.csumoffset = offsetof(struct tcphdr, check) / 2;
+
+ ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
+
+ TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
+ TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
+}
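+
+/* The sequence numbers chosen above follow RFC 793's rules for a reset
+ * sent on behalf of a non-existent connection.  A compact sketch with
+ * plain integers instead of struct tcphdr (the struct and function names
+ * are hypothetical):
+ */
+struct rst_sketch {
+        unsigned int seq;               /* sequence number of the RST */
+        unsigned int ack_seq;           /* its acknowledgment number */
+        int ack;                        /* is the ACK flag set? */
+};
+
+static struct rst_sketch make_rst_sketch(int in_ack, unsigned int in_seq,
+                                         unsigned int in_ack_seq,
+                                         int in_syn, int in_fin,
+                                         unsigned int payload_len)
+{
+        struct rst_sketch r = { 0, 0, 0 };
+
+        if (in_ack) {
+                /* The peer told us what it expects next: echo it back. */
+                r.seq = in_ack_seq;
+        } else {
+                /* No ACK: use seq 0 and acknowledge everything received,
+                 * counting SYN and FIN as one sequence number each.
+                 */
+                r.ack = 1;
+                r.ack_seq = in_seq + in_syn + in_fin + payload_len;
+        }
+        return r;
+}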
+
+/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
+ outside socket context, is certainly ugly. What can I do?
+ */
+
+static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
+ u32 win, u32 ts)
+{
+ struct tcphdr *th = skb->h.th;
+ struct {
+ struct tcphdr th;
+ u32 tsopt[3];
+ } rep;
+ struct ip_reply_arg arg;
+
+ memset(&rep.th, 0, sizeof(struct tcphdr));
+ memset(&arg, 0, sizeof arg);
+
+ arg.iov[0].iov_base = (unsigned char *)&rep;
+ arg.iov[0].iov_len = sizeof(rep.th);
+ if (ts) {
+ rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+ (TCPOPT_TIMESTAMP << 8) |
+ TCPOLEN_TIMESTAMP);
+ rep.tsopt[1] = htonl(tcp_time_stamp);
+ rep.tsopt[2] = htonl(ts);
+ arg.iov[0].iov_len = sizeof(rep);
+ }
+
+ /* Swap the send and the receive. */
+ rep.th.dest = th->source;
+ rep.th.source = th->dest;
+ rep.th.doff = arg.iov[0].iov_len / 4;
+ rep.th.seq = htonl(seq);
+ rep.th.ack_seq = htonl(ack);
+ rep.th.ack = 1;
+ rep.th.window = htons(win);
+
+ arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
+ skb->nh.iph->saddr, /*XXX*/
+ arg.iov[0].iov_len, IPPROTO_TCP, 0);
+ arg.csumoffset = offsetof(struct tcphdr, check) / 2;
+
+ ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
+
+ TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
+}
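+
+/* The first of the three words filled into rep.tsopt[] above packs the
+ * standard 12-byte timestamp option: two NOP pads, kind 8, length 10,
+ * followed by TSval and TSecr.  A sketch of the same packing with the
+ * constants written out (TCPOPT_NOP = 1, TCPOPT_TIMESTAMP = 8,
+ * TCPOLEN_TIMESTAMP = 10); the real code additionally converts each word
+ * with htonl().  The helper name is hypothetical.
+ */
+static void pack_tsopt_sketch(unsigned int tsopt[3], unsigned int tsval,
+                              unsigned int tsecr)
+{
+        /* NOP, NOP, kind = 8, len = 10 packed into one word: 0x0101080a. */
+        tsopt[0] = (1u << 24) | (1u << 16) | (8u << 8) | 10u;
+        tsopt[1] = tsval;               /* TSval: our current timestamp */
+        tsopt[2] = tsecr;               /* TSecr: echo of the peer's timestamp */
+}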
+
+static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
+
+ tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
+ tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
+
+ tcp_tw_put(tw);
+}
+
+static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
+{
+ tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd,
+ req->ts_recent);
+}
+
+static struct dst_entry* tcp_v4_route_req(struct sock *sk,
+ struct open_request *req)
+{
+ struct rtable *rt;
+ struct ip_options *opt = req->af.v4_req.opt;
+ struct flowi fl = { .oif = sk->sk_bound_dev_if,
+ .nl_u = { .ip4_u =
+ { .daddr = ((opt && opt->srr) ?
+ opt->faddr :
+ req->af.v4_req.rmt_addr),
+ .saddr = req->af.v4_req.loc_addr,
+ .tos = RT_CONN_FLAGS(sk) } },
+ .proto = IPPROTO_TCP,
+ .uli_u = { .ports =
+ { .sport = inet_sk(sk)->sport,
+ .dport = req->rmt_port } } };
+
+ if (ip_route_output_flow(&rt, &fl, sk, 0)) {
+ IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
+ return NULL;
+ }
+ if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
+ ip_rt_put(rt);
+ IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
+ return NULL;
+ }
+ return &rt->u.dst;
+}
+
+/*
+ * Send a SYN-ACK after having received an ACK.
+ * This still operates on an open_request only, not on a big
+ * socket.
+ */
+static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
+ struct dst_entry *dst)
+{
+ int err = -1;
+ struct sk_buff * skb;
+
+ /* First, grab a route. */
+ if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
+ goto out;
+
+ skb = tcp_make_synack(sk, dst, req);
+
+ if (skb) {
+ struct tcphdr *th = skb->h.th;
+
+ th->check = tcp_v4_check(th, skb->len,
+ req->af.v4_req.loc_addr,
+ req->af.v4_req.rmt_addr,
+ csum_partial((char *)th, skb->len,
+ skb->csum));
+
+ err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
+ req->af.v4_req.rmt_addr,
+ req->af.v4_req.opt);
+ if (err == NET_XMIT_CN)
+ err = 0;
+ }
+
+out:
+ dst_release(dst);
+ return err;
+}
+
+/*
+ * IPv4 open_request destructor.
+ */
+static void tcp_v4_or_free(struct open_request *req)
+{
+ if (req->af.v4_req.opt)
+ kfree(req->af.v4_req.opt);
+}
+
+static inline void syn_flood_warning(struct sk_buff *skb)
+{
+ static unsigned long warntime;
+
+ if (time_after(jiffies, (warntime + HZ * 60))) {
+ warntime = jiffies;
+ printk(KERN_INFO
+ "possible SYN flooding on port %d. Sending cookies.\n",
+ ntohs(skb->h.th->dest));
+ }
+}
+
+/*
+ * Save and compile IPv4 options into the open_request if needed.
+ */
+static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct ip_options *opt = &(IPCB(skb)->opt);
+ struct ip_options *dopt = NULL;
+
+ if (opt && opt->optlen) {
+ int opt_size = optlength(opt);
+ dopt = kmalloc(opt_size, GFP_ATOMIC);
+ if (dopt) {
+ if (ip_options_echo(dopt, skb)) {
+ kfree(dopt);
+ dopt = NULL;
+ }
+ }
+ }
+ return dopt;
+}
+
+/*
+ * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
+ * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
+ * It would be better to replace it with a global counter for all sockets
+ * but then some measure against one socket starving all other sockets
+ * would be needed.
+ *
+ * It was 128 by default. Experiments with real servers show that
+ * it is absolutely not enough even at 100 conn/sec. 256 cures most
+ * of the problems. This value is adjusted to 128 for very small machines
+ * (<= 32MB of memory) and to 1024 on normal or better ones (>= 256MB).
+ * Increasing it further requires changing the hash table size.
+ */
+int sysctl_max_syn_backlog = 256;
+
+struct or_calltable or_ipv4 = {
+ .family = PF_INET,
+ .rtx_syn_ack = tcp_v4_send_synack,
+ .send_ack = tcp_v4_or_send_ack,
+ .destructor = tcp_v4_or_free,
+ .send_reset = tcp_v4_send_reset,
+};
+
+int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_options_received tmp_opt;
+ struct open_request *req;
+ __u32 saddr = skb->nh.iph->saddr;
+ __u32 daddr = skb->nh.iph->daddr;
+ __u32 isn = TCP_SKB_CB(skb)->when;
+ struct dst_entry *dst = NULL;
+#ifdef CONFIG_SYN_COOKIES
+ int want_cookie = 0;
+#else
+#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
+#endif
+
+ /* Never answer SYNs sent to broadcast or multicast addresses. */
+ if (((struct rtable *)skb->dst)->rt_flags &
+ (RTCF_BROADCAST | RTCF_MULTICAST))
+ goto drop;
+
+ /* TW buckets are converted to open requests without
+ * limitation: they conserve resources and the peer is
+ * evidently a real one.
+ */
+ if (tcp_synq_is_full(sk) && !isn) {
+#ifdef CONFIG_SYN_COOKIES
+ if (sysctl_tcp_syncookies) {
+ want_cookie = 1;
+ } else
+#endif
+ goto drop;
+ }
+
+ /* The accept backlog is full. If we have already queued enough
+ * warm entries in the syn queue, drop this request. That is better than
+ * clogging the syn queue with openreqs with exponentially increasing
+ * timeouts.
+ */
+ if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
+ goto drop;
+
+ req = tcp_openreq_alloc();
+ if (!req)
+ goto drop;
+
+ tcp_clear_options(&tmp_opt);
+ tmp_opt.mss_clamp = 536;
+ tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
+
+ tcp_parse_options(skb, &tmp_opt, 0);
+
+ if (want_cookie) {
+ tcp_clear_options(&tmp_opt);
+ tmp_opt.saw_tstamp = 0;
+ }
+
+ if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
+ /* Some OSes (unknown ones, but I see them on a web server that
+ * contains information interesting only to Windows
+ * users) do not send their timestamp in the SYN. It is an easy case:
+ * we simply do not advertise TS support.
+ */
+ tmp_opt.saw_tstamp = 0;
+ tmp_opt.tstamp_ok = 0;
+ }
+ tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
+
+ tcp_openreq_init(req, &tmp_opt, skb);
+
+ req->af.v4_req.loc_addr = daddr;
+ req->af.v4_req.rmt_addr = saddr;
+ req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
+ req->class = &or_ipv4;
+ if (!want_cookie)
+ TCP_ECN_create_request(req, skb->h.th);
+
+ if (want_cookie) {
+#ifdef CONFIG_SYN_COOKIES
+ syn_flood_warning(skb);
+#endif
+ isn = cookie_v4_init_sequence(sk, skb, &req->mss);
+ } else if (!isn) {
+ struct inet_peer *peer = NULL;
+
+ /* VJ's idea. We save the last timestamp seen
+ * from the destination in the peer table when entering
+ * TIME-WAIT state, and check against it before
+ * accepting a new connection request.
+ *
+ * If "isn" is not zero, this request hit an alive
+ * timewait bucket, so all the necessary checks
+ * are made in the function processing the timewait state.
+ */
+ if (tmp_opt.saw_tstamp &&
+ sysctl_tcp_tw_recycle &&
+ (dst = tcp_v4_route_req(sk, req)) != NULL &&
+ (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
+ peer->v4daddr == saddr) {
+ if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
+ (s32)(peer->tcp_ts - req->ts_recent) >
+ TCP_PAWS_WINDOW) {
+ NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
+ dst_release(dst);
+ goto drop_and_free;
+ }
+ }
+ /* Kill the following clause if you dislike this approach. */
+ else if (!sysctl_tcp_syncookies &&
+ (sysctl_max_syn_backlog - tcp_synq_len(sk) <
+ (sysctl_max_syn_backlog >> 2)) &&
+ (!peer || !peer->tcp_ts_stamp) &&
+ (!dst || !dst_metric(dst, RTAX_RTT))) {
+ /* Without syncookies, the last quarter of the
+ * backlog is filled only with destinations
+ * proven to be alive.
+ * It means that we continue to communicate
+ * with destinations already remembered
+ * at the moment of the synflood.
+ */
+ NETDEBUG(if (net_ratelimit()) \
+ printk(KERN_DEBUG "TCP: drop open "
+ "request from %u.%u."
+ "%u.%u/%u\n", \
+ NIPQUAD(saddr),
+ ntohs(skb->h.th->source)));
+ dst_release(dst);
+ goto drop_and_free;
+ }
+
+ isn = tcp_v4_init_sequence(sk, skb);
+ }
+ req->snt_isn = isn;
+
+ if (tcp_v4_send_synack(sk, req, dst))
+ goto drop_and_free;
+
+ if (want_cookie) {
+ tcp_openreq_free(req);
+ } else {
+ tcp_v4_synq_add(sk, req);
+ }
+ return 0;
+
+drop_and_free:
+ tcp_openreq_free(req);
+drop:
+ TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
+ return 0;
+}
+
+
+/*
+ * The three way handshake has completed - we got a valid synack -
+ * now create the new socket.
+ */
+struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
+ struct open_request *req,
+ struct dst_entry *dst)
+{
+ struct inet_sock *newinet;
+ struct tcp_sock *newtp;
+ struct sock *newsk;
+
+ if (sk_acceptq_is_full(sk))
+ goto exit_overflow;
+
+ if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
+ goto exit;
+
+ newsk = tcp_create_openreq_child(sk, req, skb);
+ if (!newsk)
+ goto exit;
+
+ newsk->sk_dst_cache = dst;
+ tcp_v4_setup_caps(newsk, dst);
+
+ newtp = tcp_sk(newsk);
+ newinet = inet_sk(newsk);
+ newinet->daddr = req->af.v4_req.rmt_addr;
+ newinet->rcv_saddr = req->af.v4_req.loc_addr;
+ newinet->saddr = req->af.v4_req.loc_addr;
+ newinet->opt = req->af.v4_req.opt;
+ req->af.v4_req.opt = NULL;
+ newinet->mc_index = tcp_v4_iif(skb);
+ newinet->mc_ttl = skb->nh.iph->ttl;
+ newtp->ext_header_len = 0;
+ if (newinet->opt)
+ newtp->ext_header_len = newinet->opt->optlen;
+ newinet->id = newtp->write_seq ^ jiffies;
+
+ tcp_sync_mss(newsk, dst_mtu(dst));
+ newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
+ tcp_initialize_rcv_mss(newsk);
+
+ __tcp_v4_hash(newsk, 0);
+ __tcp_inherit_port(sk, newsk);
+
+ return newsk;
+
+exit_overflow:
+ NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
+exit:
+ NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
+ dst_release(dst);
+ return NULL;
+}
+
+static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcphdr *th = skb->h.th;
+ struct iphdr *iph = skb->nh.iph;
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sock *nsk;
+ struct open_request **prev;
+ /* Find possible connection requests. */
+ struct open_request *req = tcp_v4_search_req(tp, &prev, th->source,
+ iph->saddr, iph->daddr);
+ if (req)
+ return tcp_check_req(sk, skb, req, prev);
+
+ nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
+ th->source,
+ skb->nh.iph->daddr,
+ ntohs(th->dest),
+ tcp_v4_iif(skb));
+
+ if (nsk) {
+ if (nsk->sk_state != TCP_TIME_WAIT) {
+ bh_lock_sock(nsk);
+ return nsk;
+ }
+ tcp_tw_put((struct tcp_tw_bucket *)nsk);
+ return NULL;
+ }
+
+#ifdef CONFIG_SYN_COOKIES
+ if (!th->rst && !th->syn && th->ack)
+ sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
+#endif
+ return sk;
+}
+
+static int tcp_v4_checksum_init(struct sk_buff *skb)
+{
+ if (skb->ip_summed == CHECKSUM_HW) {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
+ skb->nh.iph->daddr, skb->csum))
+ return 0;
+
+ NETDEBUG(if (net_ratelimit())
+ printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
+ skb->ip_summed = CHECKSUM_NONE;
+ }
+ if (skb->len <= 76) {
+ if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
+ skb->nh.iph->daddr,
+ skb_checksum(skb, 0, skb->len, 0)))
+ return -1;
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ } else {
+ skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
+ skb->nh.iph->saddr,
+ skb->nh.iph->daddr, 0);
+ }
+ return 0;
+}
+
+
+/* The socket must have its spinlock held when we get
+ * here.
+ *
+ * We have a potential double-lock case here, so even when
+ * doing backlog processing we use the BH locking scheme.
+ * This is because we cannot sleep with the original spinlock
+ * held.
+ */
+int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
+ TCP_CHECK_TIMER(sk);
+ if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
+ goto reset;
+ TCP_CHECK_TIMER(sk);
+ return 0;
+ }
+
+ if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
+ goto csum_err;
+
+ if (sk->sk_state == TCP_LISTEN) {
+ struct sock *nsk = tcp_v4_hnd_req(sk, skb);
+ if (!nsk)
+ goto discard;
+
+ if (nsk != sk) {
+ if (tcp_child_process(sk, nsk, skb))
+ goto reset;
+ return 0;
+ }
+ }
+
+ TCP_CHECK_TIMER(sk);
+ if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
+ goto reset;
+ TCP_CHECK_TIMER(sk);
+ return 0;
+
+reset:
+ tcp_v4_send_reset(skb);
+discard:
+ kfree_skb(skb);
+ /* Be careful here. If this function gets more complicated and
+ * gcc suffers from register pressure on the x86, sk (in %ebx)
+ * might be destroyed here. This current version compiles correctly,
+ * but you have been warned.
+ */
+ return 0;
+
+csum_err:
+ TCP_INC_STATS_BH(TCP_MIB_INERRS);
+ goto discard;
+}
+
+/*
+ * From tcp_input.c
+ */
+
+int tcp_v4_rcv(struct sk_buff *skb)
+{
+ struct tcphdr *th;
+ struct sock *sk;
+ int ret;
+
+ if (skb->pkt_type != PACKET_HOST)
+ goto discard_it;
+
+ /* Count it even if it's bad */
+ TCP_INC_STATS_BH(TCP_MIB_INSEGS);
+
+ if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
+ goto discard_it;
+
+ th = skb->h.th;
+
+ if (th->doff < sizeof(struct tcphdr) / 4)
+ goto bad_packet;
+ if (!pskb_may_pull(skb, th->doff * 4))
+ goto discard_it;
+
+ /* An explanation is required here, I think.
+ * Packet length and doff are validated by header prediction,
+ * provided the case of th->doff == 0 is eliminated.
+ * So, we defer the checks. */
+ if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
+ tcp_v4_checksum_init(skb) < 0))
+ goto bad_packet;
+
+ th = skb->h.th;
+ TCP_SKB_CB(skb)->seq = ntohl(th->seq);
+ TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
+ skb->len - th->doff * 4);
+ TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
+ TCP_SKB_CB(skb)->when = 0;
+ TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
+ TCP_SKB_CB(skb)->sacked = 0;
+
+ sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
+ skb->nh.iph->daddr, ntohs(th->dest),
+ tcp_v4_iif(skb));
+
+ if (!sk)
+ goto no_tcp_socket;
+
+process:
+ if (sk->sk_state == TCP_TIME_WAIT)
+ goto do_time_wait;
+
+ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
+ goto discard_and_relse;
+
+ if (sk_filter(sk, skb, 0))
+ goto discard_and_relse;
+
+ skb->dev = NULL;
+
+ bh_lock_sock(sk);
+ ret = 0;
+ if (!sock_owned_by_user(sk)) {
+ if (!tcp_prequeue(sk, skb))
+ ret = tcp_v4_do_rcv(sk, skb);
+ } else
+ sk_add_backlog(sk, skb);
+ bh_unlock_sock(sk);
+
+ sock_put(sk);
+
+ return ret;
+
+no_tcp_socket:
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+ goto discard_it;
+
+ if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
+bad_packet:
+ TCP_INC_STATS_BH(TCP_MIB_INERRS);
+ } else {
+ tcp_v4_send_reset(skb);
+ }
+
+discard_it:
+ /* Discard frame. */
+ kfree_skb(skb);
+ return 0;
+
+discard_and_relse:
+ sock_put(sk);
+ goto discard_it;
+
+do_time_wait:
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ tcp_tw_put((struct tcp_tw_bucket *) sk);
+ goto discard_it;
+ }
+
+ if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
+ TCP_INC_STATS_BH(TCP_MIB_INERRS);
+ tcp_tw_put((struct tcp_tw_bucket *) sk);
+ goto discard_it;
+ }
+ switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
+ skb, th, skb->len)) {
+ case TCP_TW_SYN: {
+ struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
+ ntohs(th->dest),
+ tcp_v4_iif(skb));
+ if (sk2) {
+ tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
+ tcp_tw_put((struct tcp_tw_bucket *)sk);
+ sk = sk2;
+ goto process;
+ }
+ /* Fall through to ACK */
+ }
+ case TCP_TW_ACK:
+ tcp_v4_timewait_ack(sk, skb);
+ break;
+ case TCP_TW_RST:
+ goto no_tcp_socket;
+ case TCP_TW_SUCCESS:;
+ }
+ goto discard_it;
+}
+
+/* With per-bucket locks this operation is not atomic, so
+ * this version is no worse.
+ */
+static void __tcp_v4_rehash(struct sock *sk)
+{
+ sk->sk_prot->unhash(sk);
+ sk->sk_prot->hash(sk);
+}
+
+static int tcp_v4_reselect_saddr(struct sock *sk)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ int err;
+ struct rtable *rt;
+ __u32 old_saddr = inet->saddr;
+ __u32 new_saddr;
+ __u32 daddr = inet->daddr;
+
+ if (inet->opt && inet->opt->srr)
+ daddr = inet->opt->faddr;
+
+ /* Query new route. */
+ err = ip_route_connect(&rt, daddr, 0,
+ RT_CONN_FLAGS(sk),
+ sk->sk_bound_dev_if,
+ IPPROTO_TCP,
+ inet->sport, inet->dport, sk);
+ if (err)
+ return err;
+
+ __sk_dst_set(sk, &rt->u.dst);
+ tcp_v4_setup_caps(sk, &rt->u.dst);
+
+ new_saddr = rt->rt_src;
+
+ if (new_saddr == old_saddr)
+ return 0;
+
+ if (sysctl_ip_dynaddr > 1) {
+ printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
+ "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
+ NIPQUAD(old_saddr),
+ NIPQUAD(new_saddr));
+ }
+
+ inet->saddr = new_saddr;
+ inet->rcv_saddr = new_saddr;
+
+ /* XXX The one ugly spot where we really need to
+ * XXX change the socket's identity after
+ * XXX it has entered the hashes. -DaveM
+ *
+ * Besides that, it does not check for connection
+ * uniqueness. Wait for trouble.
+ */
+ __tcp_v4_rehash(sk);
+ return 0;
+}
+
+int tcp_v4_rebuild_header(struct sock *sk)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
+ u32 daddr;
+ int err;
+
+ /* Route is OK, nothing to do. */
+ if (rt)
+ return 0;
+
+ /* Reroute. */
+ daddr = inet->daddr;
+ if (inet->opt && inet->opt->srr)
+ daddr = inet->opt->faddr;
+
+ {
+ struct flowi fl = { .oif = sk->sk_bound_dev_if,
+ .nl_u = { .ip4_u =
+ { .daddr = daddr,
+ .saddr = inet->saddr,
+ .tos = RT_CONN_FLAGS(sk) } },
+ .proto = IPPROTO_TCP,
+ .uli_u = { .ports =
+ { .sport = inet->sport,
+ .dport = inet->dport } } };
+
+ err = ip_route_output_flow(&rt, &fl, sk, 0);
+ }
+ if (!err) {
+ __sk_dst_set(sk, &rt->u.dst);
+ tcp_v4_setup_caps(sk, &rt->u.dst);
+ return 0;
+ }
+
+ /* Routing failed... */
+ sk->sk_route_caps = 0;
+
+ if (!sysctl_ip_dynaddr ||
+ sk->sk_state != TCP_SYN_SENT ||
+ (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
+ (err = tcp_v4_reselect_saddr(sk)) != 0)
+ sk->sk_err_soft = -err;
+
+ return err;
+}
+
+static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
+{
+ struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
+ struct inet_sock *inet = inet_sk(sk);
+
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = inet->daddr;
+ sin->sin_port = inet->dport;
+}
+
+/* VJ's idea. Save the last timestamp seen from this destination
+ * and hold it at least for the normal timewait interval, to use for duplicate
+ * segment detection in subsequent connections before they enter the
+ * synchronized state.
+ */
+
+int tcp_v4_remember_stamp(struct sock *sk)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
+ struct inet_peer *peer = NULL;
+ int release_it = 0;
+
+ if (!rt || rt->rt_dst != inet->daddr) {
+ peer = inet_getpeer(inet->daddr, 1);
+ release_it = 1;
+ } else {
+ if (!rt->peer)
+ rt_bind_peer(rt, 1);
+ peer = rt->peer;
+ }
+
+ if (peer) {
+ if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
+ (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
+ peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
+ peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
+ peer->tcp_ts = tp->rx_opt.ts_recent;
+ }
+ if (release_it)
+ inet_putpeer(peer);
+ return 1;
+ }
+
+ return 0;
+}
+
+int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
+{
+ struct inet_peer *peer = NULL;
+
+ peer = inet_getpeer(tw->tw_daddr, 1);
+
+ if (peer) {
+ if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
+ (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
+ peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
+ peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
+ peer->tcp_ts = tw->tw_ts_recent;
+ }
+ inet_putpeer(peer);
+ return 1;
+ }
+
+ return 0;
+}
+
+struct tcp_func ipv4_specific = {
+ .queue_xmit = ip_queue_xmit,
+ .send_check = tcp_v4_send_check,
+ .rebuild_header = tcp_v4_rebuild_header,
+ .conn_request = tcp_v4_conn_request,
+ .syn_recv_sock = tcp_v4_syn_recv_sock,
+ .remember_stamp = tcp_v4_remember_stamp,
+ .net_header_len = sizeof(struct iphdr),
+ .setsockopt = ip_setsockopt,
+ .getsockopt = ip_getsockopt,
+ .addr2sockaddr = v4_addr2sockaddr,
+ .sockaddr_len = sizeof(struct sockaddr_in),
+};
+
+/* NOTE: A lot of things are set to zero explicitly by the call to
+ * sk_alloc(), so they need not be done here.
+ */
+static int tcp_v4_init_sock(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ skb_queue_head_init(&tp->out_of_order_queue);
+ tcp_init_xmit_timers(sk);
+ tcp_prequeue_init(tp);
+
+ tp->rto = TCP_TIMEOUT_INIT;
+ tp->mdev = TCP_TIMEOUT_INIT;
+
+ /* So many TCP implementations out there (incorrectly) count the
+ * initial SYN frame in their delayed-ACK and congestion control
+ * algorithms that we must have the following bandaid to talk
+ * efficiently to them. -DaveM
+ */
+ tp->snd_cwnd = 2;
+
+ /* See draft-stevens-tcpca-spec-01 for discussion of the
+ * initialization of these values.
+ */
+ tp->snd_ssthresh = 0x7fffffff; /* Infinity */
+ tp->snd_cwnd_clamp = ~0;
+ tp->mss_cache_std = tp->mss_cache = 536;
+
+ tp->reordering = sysctl_tcp_reordering;
+
+ sk->sk_state = TCP_CLOSE;
+
+ sk->sk_write_space = sk_stream_write_space;
+ sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+
+ tp->af_specific = &ipv4_specific;
+
+ sk->sk_sndbuf = sysctl_tcp_wmem[1];
+ sk->sk_rcvbuf = sysctl_tcp_rmem[1];
+
+ atomic_inc(&tcp_sockets_allocated);
+
+ return 0;
+}
+
+int tcp_v4_destroy_sock(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tcp_clear_xmit_timers(sk);
+
+ /* Clean up the write buffer. */
+ sk_stream_writequeue_purge(sk);
+
+ /* Cleans up our, hopefully empty, out_of_order_queue. */
+ __skb_queue_purge(&tp->out_of_order_queue);
+
+ /* Clean the prequeue; it really must be empty. */
+ __skb_queue_purge(&tp->ucopy.prequeue);
+
+ /* Clean up a referenced TCP bind bucket. */
+ if (tp->bind_hash)
+ tcp_put_port(sk);
+
+ /*
+ * If sendmsg cached page exists, toss it.
+ */
+ if (sk->sk_sndmsg_page) {
+ __free_page(sk->sk_sndmsg_page);
+ sk->sk_sndmsg_page = NULL;
+ }
+
+ atomic_dec(&tcp_sockets_allocated);
+
+ return 0;
+}
+
+EXPORT_SYMBOL(tcp_v4_destroy_sock);
+
+#ifdef CONFIG_PROC_FS
+/* Proc filesystem TCP sock list dumping. */
+
+static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
+{
+ return hlist_empty(head) ? NULL :
+ list_entry(head->first, struct tcp_tw_bucket, tw_node);
+}
+
+static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
+{
+ return tw->tw_node.next ?
+ hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
+}
+
+static void *listening_get_next(struct seq_file *seq, void *cur)
+{
+ struct tcp_sock *tp;
+ struct hlist_node *node;
+ struct sock *sk = cur;
+ struct tcp_iter_state* st = seq->private;
+
+ if (!sk) {
+ st->bucket = 0;
+ sk = sk_head(&tcp_listening_hash[0]);
+ goto get_sk;
+ }
+
+ ++st->num;
+
+ if (st->state == TCP_SEQ_STATE_OPENREQ) {
+ struct open_request *req = cur;
+
+ tp = tcp_sk(st->syn_wait_sk);
+ req = req->dl_next;
+ while (1) {
+ while (req) {
+ if (req->class->family == st->family) {
+ cur = req;
+ goto out;
+ }
+ req = req->dl_next;
+ }
+ if (++st->sbucket >= TCP_SYNQ_HSIZE)
+ break;
+get_req:
+ req = tp->listen_opt->syn_table[st->sbucket];
+ }
+ sk = sk_next(st->syn_wait_sk);
+ st->state = TCP_SEQ_STATE_LISTENING;
+ read_unlock_bh(&tp->syn_wait_lock);
+ } else {
+ tp = tcp_sk(sk);
+ read_lock_bh(&tp->syn_wait_lock);
+ if (tp->listen_opt && tp->listen_opt->qlen)
+ goto start_req;
+ read_unlock_bh(&tp->syn_wait_lock);
+ sk = sk_next(sk);
+ }
+get_sk:
+ sk_for_each_from(sk, node) {
+ if (sk->sk_family == st->family) {
+ cur = sk;
+ goto out;
+ }
+ tp = tcp_sk(sk);
+ read_lock_bh(&tp->syn_wait_lock);
+ if (tp->listen_opt && tp->listen_opt->qlen) {
+start_req:
+ st->uid = sock_i_uid(sk);
+ st->syn_wait_sk = sk;
+ st->state = TCP_SEQ_STATE_OPENREQ;
+ st->sbucket = 0;
+ goto get_req;
+ }
+ read_unlock_bh(&tp->syn_wait_lock);
+ }
+ if (++st->bucket < TCP_LHTABLE_SIZE) {
+ sk = sk_head(&tcp_listening_hash[st->bucket]);
+ goto get_sk;
+ }
+ cur = NULL;
+out:
+ return cur;
+}
+
+static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
+{
+ void *rc = listening_get_next(seq, NULL);
+
+ while (rc && *pos) {
+ rc = listening_get_next(seq, rc);
+ --*pos;
+ }
+ return rc;
+}
+
+static void *established_get_first(struct seq_file *seq)
+{
+ struct tcp_iter_state* st = seq->private;
+ void *rc = NULL;
+
+ for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
+ struct sock *sk;
+ struct hlist_node *node;
+ struct tcp_tw_bucket *tw;
+
+ /* We can reschedule _before_ having picked the target: */
+ cond_resched_softirq();
+
+ read_lock(&tcp_ehash[st->bucket].lock);
+ sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
+ if (sk->sk_family != st->family) {
+ continue;
+ }
+ rc = sk;
+ goto out;
+ }
+ st->state = TCP_SEQ_STATE_TIME_WAIT;
+ tw_for_each(tw, node,
+ &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
+ if (tw->tw_family != st->family) {
+ continue;
+ }
+ rc = tw;
+ goto out;
+ }
+ read_unlock(&tcp_ehash[st->bucket].lock);
+ st->state = TCP_SEQ_STATE_ESTABLISHED;
+ }
+out:
+ return rc;
+}
+
+static void *established_get_next(struct seq_file *seq, void *cur)
+{
+ struct sock *sk = cur;
+ struct tcp_tw_bucket *tw;
+ struct hlist_node *node;
+ struct tcp_iter_state* st = seq->private;
+
+ ++st->num;
+
+ if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
+ tw = cur;
+ tw = tw_next(tw);
+get_tw:
+ while (tw && tw->tw_family != st->family) {
+ tw = tw_next(tw);
+ }
+ if (tw) {
+ cur = tw;
+ goto out;
+ }
+ read_unlock(&tcp_ehash[st->bucket].lock);
+ st->state = TCP_SEQ_STATE_ESTABLISHED;
+
+ /* We can reschedule between buckets: */
+ cond_resched_softirq();
+
+ if (++st->bucket < tcp_ehash_size) {
+ read_lock(&tcp_ehash[st->bucket].lock);
+ sk = sk_head(&tcp_ehash[st->bucket].chain);
+ } else {
+ cur = NULL;
+ goto out;
+ }
+ } else
+ sk = sk_next(sk);
+
+ sk_for_each_from(sk, node) {
+ if (sk->sk_family == st->family)
+ goto found;
+ }
+
+ st->state = TCP_SEQ_STATE_TIME_WAIT;
+ tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
+ goto get_tw;
+found:
+ cur = sk;
+out:
+ return cur;
+}
+
+static void *established_get_idx(struct seq_file *seq, loff_t pos)
+{
+ void *rc = established_get_first(seq);
+
+ while (rc && pos) {
+ rc = established_get_next(seq, rc);
+ --pos;
+ }
+ return rc;
+}
+
+static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
+{
+ void *rc;
+ struct tcp_iter_state* st = seq->private;
+
+ tcp_listen_lock();
+ st->state = TCP_SEQ_STATE_LISTENING;
+ rc = listening_get_idx(seq, &pos);
+
+ if (!rc) {
+ tcp_listen_unlock();
+ local_bh_disable();
+ st->state = TCP_SEQ_STATE_ESTABLISHED;
+ rc = established_get_idx(seq, pos);
+ }
+
+ return rc;
+}
+
+static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct tcp_iter_state* st = seq->private;
+ st->state = TCP_SEQ_STATE_LISTENING;
+ st->num = 0;
+ return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ void *rc = NULL;
+ struct tcp_iter_state* st;
+
+ if (v == SEQ_START_TOKEN) {
+ rc = tcp_get_idx(seq, 0);
+ goto out;
+ }
+ st = seq->private;
+
+ switch (st->state) {
+ case TCP_SEQ_STATE_OPENREQ:
+ case TCP_SEQ_STATE_LISTENING:
+ rc = listening_get_next(seq, v);
+ if (!rc) {
+ tcp_listen_unlock();
+ local_bh_disable();
+ st->state = TCP_SEQ_STATE_ESTABLISHED;
+ rc = established_get_first(seq);
+ }
+ break;
+ case TCP_SEQ_STATE_ESTABLISHED:
+ case TCP_SEQ_STATE_TIME_WAIT:
+ rc = established_get_next(seq, v);
+ break;
+ }
+out:
+ ++*pos;
+ return rc;
+}
+
+static void tcp_seq_stop(struct seq_file *seq, void *v)
+{
+ struct tcp_iter_state* st = seq->private;
+
+ switch (st->state) {
+ case TCP_SEQ_STATE_OPENREQ:
+ if (v) {
+ struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
+ read_unlock_bh(&tp->syn_wait_lock);
+ }
+ case TCP_SEQ_STATE_LISTENING:
+ if (v != SEQ_START_TOKEN)
+ tcp_listen_unlock();
+ break;
+ case TCP_SEQ_STATE_TIME_WAIT:
+ case TCP_SEQ_STATE_ESTABLISHED:
+ if (v)
+ read_unlock(&tcp_ehash[st->bucket].lock);
+ local_bh_enable();
+ break;
+ }
+}
+
+static int tcp_seq_open(struct inode *inode, struct file *file)
+{
+ struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
+ struct seq_file *seq;
+ struct tcp_iter_state *s;
+ int rc;
+
+ if (unlikely(afinfo == NULL))
+ return -EINVAL;
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+ memset(s, 0, sizeof(*s));
+ s->family = afinfo->family;
+ s->seq_ops.start = tcp_seq_start;
+ s->seq_ops.next = tcp_seq_next;
+ s->seq_ops.show = afinfo->seq_show;
+ s->seq_ops.stop = tcp_seq_stop;
+
+ rc = seq_open(file, &s->seq_ops);
+ if (rc)
+ goto out_kfree;
+ seq = file->private_data;
+ seq->private = s;
+out:
+ return rc;
+out_kfree:
+ kfree(s);
+ goto out;
+}
+
+int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
+{
+ int rc = 0;
+ struct proc_dir_entry *p;
+
+ if (!afinfo)
+ return -EINVAL;
+ afinfo->seq_fops->owner = afinfo->owner;
+ afinfo->seq_fops->open = tcp_seq_open;
+ afinfo->seq_fops->read = seq_read;
+ afinfo->seq_fops->llseek = seq_lseek;
+ afinfo->seq_fops->release = seq_release_private;
+
+ p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
+ if (p)
+ p->data = afinfo;
+ else
+ rc = -ENOMEM;
+ return rc;
+}
+
+void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
+{
+ if (!afinfo)
+ return;
+ proc_net_remove(afinfo->name);
+ memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
+}
+
+static void get_openreq4(struct sock *sk, struct open_request *req,
+ char *tmpbuf, int i, int uid)
+{
+ int ttd = req->expires - jiffies;
+
+ sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
+ " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
+ i,
+ req->af.v4_req.loc_addr,
+ ntohs(inet_sk(sk)->sport),
+ req->af.v4_req.rmt_addr,
+ ntohs(req->rmt_port),
+ TCP_SYN_RECV,
+ 0, 0, /* could print option size, but that is af dependent. */
+ 1, /* timers active (only the expire timer) */
+ jiffies_to_clock_t(ttd),
+ req->retrans,
+ uid,
+ 0, /* non standard timer */
+ 0, /* open_requests have no inode */
+ atomic_read(&sk->sk_refcnt),
+ req);
+}
+
+static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
+{
+ int timer_active;
+ unsigned long timer_expires;
+ struct tcp_sock *tp = tcp_sk(sp);
+ struct inet_sock *inet = inet_sk(sp);
+ unsigned int dest = inet->daddr;
+ unsigned int src = inet->rcv_saddr;
+ __u16 destp = ntohs(inet->dport);
+ __u16 srcp = ntohs(inet->sport);
+
+ if (tp->pending == TCP_TIME_RETRANS) {
+ timer_active = 1;
+ timer_expires = tp->timeout;
+ } else if (tp->pending == TCP_TIME_PROBE0) {
+ timer_active = 4;
+ timer_expires = tp->timeout;
+ } else if (timer_pending(&sp->sk_timer)) {
+ timer_active = 2;
+ timer_expires = sp->sk_timer.expires;
+ } else {
+ timer_active = 0;
+ timer_expires = jiffies;
+ }
+
+ sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
+ "%08X %5d %8d %lu %d %p %u %u %u %u %d",
+ i, src, srcp, dest, destp, sp->sk_state,
+ tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
+ timer_active,
+ jiffies_to_clock_t(timer_expires - jiffies),
+ tp->retransmits,
+ sock_i_uid(sp),
+ tp->probes_out,
+ sock_i_ino(sp),
+ atomic_read(&sp->sk_refcnt), sp,
+ tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
+ tp->snd_cwnd,
+ tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
+}
+
+static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
+{
+ unsigned int dest, src;
+ __u16 destp, srcp;
+ int ttd = tw->tw_ttd - jiffies;
+
+ if (ttd < 0)
+ ttd = 0;
+
+ dest = tw->tw_daddr;
+ src = tw->tw_rcv_saddr;
+ destp = ntohs(tw->tw_dport);
+ srcp = ntohs(tw->tw_sport);
+
+ sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
+ " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
+ i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
+ 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
+ atomic_read(&tw->tw_refcnt), tw);
+}
+
+#define TMPSZ 150
+
+static int tcp4_seq_show(struct seq_file *seq, void *v)
+{
+ struct tcp_iter_state* st;
+ char tmpbuf[TMPSZ + 1];
+
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq, "%-*s\n", TMPSZ - 1,
+ " sl local_address rem_address st tx_queue "
+ "rx_queue tr tm->when retrnsmt uid timeout "
+ "inode");
+ goto out;
+ }
+ st = seq->private;
+
+ switch (st->state) {
+ case TCP_SEQ_STATE_LISTENING:
+ case TCP_SEQ_STATE_ESTABLISHED:
+ get_tcp4_sock(v, tmpbuf, st->num);
+ break;
+ case TCP_SEQ_STATE_OPENREQ:
+ get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
+ break;
+ case TCP_SEQ_STATE_TIME_WAIT:
+ get_timewait4_sock(v, tmpbuf, st->num);
+ break;
+ }
+ seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
+out:
+ return 0;
+}
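+
+/* The helpers above all emit the same leading columns, which is what makes
+ * /proc/net/tcp easy to parse.  A userspace sketch of reading one entry
+ * line (illustrative only: assumes <stdio.h> for sscanf, and the struct
+ * and function names are hypothetical):
+ */
+struct procnet_tcp_sketch {
+        unsigned int local_addr, remote_addr;   /* hex, network byte order */
+        unsigned int local_port, remote_port;
+        unsigned int state;                     /* e.g. 0x0A for LISTEN */
+        unsigned long inode;
+};
+
+static int parse_procnet_tcp_sketch(const char *line,
+                                    struct procnet_tcp_sketch *e)
+{
+        unsigned int txq, rxq, timer, retrnsmt;
+        unsigned long when;
+        int sl, uid, timeout;
+
+        /* " sl local_address rem_address st tx_queue rx_queue tr tm->when
+         *   retrnsmt uid timeout inode ..."
+         */
+        return sscanf(line, "%d: %x:%x %x:%x %x %x:%x %x:%lx %x %d %d %lu",
+                      &sl, &e->local_addr, &e->local_port,
+                      &e->remote_addr, &e->remote_port, &e->state,
+                      &txq, &rxq, &timer, &when, &retrnsmt,
+                      &uid, &timeout, &e->inode) == 14 ? 0 : -1;
+}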
+
+static struct file_operations tcp4_seq_fops;
+static struct tcp_seq_afinfo tcp4_seq_afinfo = {
+ .owner = THIS_MODULE,
+ .name = "tcp",
+ .family = AF_INET,
+ .seq_show = tcp4_seq_show,
+ .seq_fops = &tcp4_seq_fops,
+};
+
+int __init tcp4_proc_init(void)
+{
+ return tcp_proc_register(&tcp4_seq_afinfo);
+}
+
+void tcp4_proc_exit(void)
+{
+ tcp_proc_unregister(&tcp4_seq_afinfo);
+}
+#endif /* CONFIG_PROC_FS */
+
+struct proto tcp_prot = {
+ .name = "TCP",
+ .owner = THIS_MODULE,
+ .close = tcp_close,
+ .connect = tcp_v4_connect,
+ .disconnect = tcp_disconnect,
+ .accept = tcp_accept,
+ .ioctl = tcp_ioctl,
+ .init = tcp_v4_init_sock,
+ .destroy = tcp_v4_destroy_sock,
+ .shutdown = tcp_shutdown,
+ .setsockopt = tcp_setsockopt,
+ .getsockopt = tcp_getsockopt,
+ .sendmsg = tcp_sendmsg,
+ .recvmsg = tcp_recvmsg,
+ .backlog_rcv = tcp_v4_do_rcv,
+ .hash = tcp_v4_hash,
+ .unhash = tcp_unhash,
+ .get_port = tcp_v4_get_port,
+ .enter_memory_pressure = tcp_enter_memory_pressure,
+ .sockets_allocated = &tcp_sockets_allocated,
+ .memory_allocated = &tcp_memory_allocated,
+ .memory_pressure = &tcp_memory_pressure,
+ .sysctl_mem = sysctl_tcp_mem,
+ .sysctl_wmem = sysctl_tcp_wmem,
+ .sysctl_rmem = sysctl_tcp_rmem,
+ .max_header = MAX_TCP_HEADER,
+ .obj_size = sizeof(struct tcp_sock),
+};
+
+
+
+void __init tcp_v4_init(struct net_proto_family *ops)
+{
+ int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
+ if (err < 0)
+ panic("Failed to create the TCP control socket.\n");
+ tcp_socket->sk->sk_allocation = GFP_ATOMIC;
+ inet_sk(tcp_socket->sk)->uc_ttl = -1;
+
+ /* Unhash it so that IP input processing does not even
+ * see it; we do not wish this socket to see incoming
+ * packets.
+ */
+ tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
+}
+
+EXPORT_SYMBOL(ipv4_specific);
+EXPORT_SYMBOL(tcp_bind_hash);
+EXPORT_SYMBOL(tcp_bucket_create);
+EXPORT_SYMBOL(tcp_hashinfo);
+EXPORT_SYMBOL(tcp_inherit_port);
+EXPORT_SYMBOL(tcp_listen_wlock);
+EXPORT_SYMBOL(tcp_port_rover);
+EXPORT_SYMBOL(tcp_prot);
+EXPORT_SYMBOL(tcp_put_port);
+EXPORT_SYMBOL(tcp_unhash);
+EXPORT_SYMBOL(tcp_v4_conn_request);
+EXPORT_SYMBOL(tcp_v4_connect);
+EXPORT_SYMBOL(tcp_v4_do_rcv);
+EXPORT_SYMBOL(tcp_v4_rebuild_header);
+EXPORT_SYMBOL(tcp_v4_remember_stamp);
+EXPORT_SYMBOL(tcp_v4_send_check);
+EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
+
+#ifdef CONFIG_PROC_FS
+EXPORT_SYMBOL(tcp_proc_register);
+EXPORT_SYMBOL(tcp_proc_unregister);
+#endif
+EXPORT_SYMBOL(sysctl_local_port_range);
+EXPORT_SYMBOL(sysctl_max_syn_backlog);
+EXPORT_SYMBOL(sysctl_tcp_low_latency);
+EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
+
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
new file mode 100644
index 00000000000..fd70509f0d5
--- /dev/null
+++ b/net/ipv4/tcp_minisocks.c
@@ -0,0 +1,1077 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Implementation of the Transmission Control Protocol (TCP).
+ *
+ * Version: $Id: tcp_minisocks.c,v 1.15 2002/02/01 22:01:04 davem Exp $
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ * Corey Minyard <wf-rch!minyard@relay.EU.net>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ * Linus Torvalds, <torvalds@cs.helsinki.fi>
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Matthew Dillon, <dillon@apollo.west.oic.com>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+#include <linux/workqueue.h>
+#include <net/tcp.h>
+#include <net/inet_common.h>
+#include <net/xfrm.h>
+
+#ifdef CONFIG_SYSCTL
+#define SYNC_INIT 0 /* let the user enable it */
+#else
+#define SYNC_INIT 1
+#endif
+
+int sysctl_tcp_tw_recycle;
+int sysctl_tcp_max_tw_buckets = NR_FILE*2;
+
+int sysctl_tcp_syncookies = SYNC_INIT;
+int sysctl_tcp_abort_on_overflow;
+
+static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo);
+
+static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
+{
+ if (seq == s_win)
+ return 1;
+ if (after(end_seq, s_win) && before(seq, e_win))
+ return 1;
+ return (seq == e_win && seq == end_seq);
+}
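+
+/* A rough worked example of the check above, with assumed values
+ * rcv_nxt = 1000 and rcv_wnd = 500 (so s_win = 1000, e_win = 1500):
+ *
+ *   seq = 1000, end_seq = 1000  ->  accepted (seq == s_win)
+ *   seq =  900, end_seq = 1100  ->  accepted (overlaps the window)
+ *   seq =  900, end_seq =  950  ->  rejected (entirely left of the window)
+ *   seq = 1600, end_seq = 1700  ->  rejected (entirely right of the window)
+ */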
+
+/* New-style handling of TIME_WAIT sockets. */
+
+int tcp_tw_count;
+
+
+/* Must be called with locally disabled BHs. */
+static void tcp_timewait_kill(struct tcp_tw_bucket *tw)
+{
+ struct tcp_ehash_bucket *ehead;
+ struct tcp_bind_hashbucket *bhead;
+ struct tcp_bind_bucket *tb;
+
+ /* Unlink from established hashes. */
+ ehead = &tcp_ehash[tw->tw_hashent];
+ write_lock(&ehead->lock);
+ if (hlist_unhashed(&tw->tw_node)) {
+ write_unlock(&ehead->lock);
+ return;
+ }
+ __hlist_del(&tw->tw_node);
+ sk_node_init(&tw->tw_node);
+ write_unlock(&ehead->lock);
+
+ /* Disassociate with bind bucket. */
+ bhead = &tcp_bhash[tcp_bhashfn(tw->tw_num)];
+ spin_lock(&bhead->lock);
+ tb = tw->tw_tb;
+ __hlist_del(&tw->tw_bind_node);
+ tw->tw_tb = NULL;
+ tcp_bucket_destroy(tb);
+ spin_unlock(&bhead->lock);
+
+#ifdef INET_REFCNT_DEBUG
+ if (atomic_read(&tw->tw_refcnt) != 1) {
+ printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw,
+ atomic_read(&tw->tw_refcnt));
+ }
+#endif
+ tcp_tw_put(tw);
+}
+
+/*
+ * * The main purpose of TIME-WAIT state is to close the connection
+ *   gracefully, when one of the ends sits in LAST-ACK or CLOSING
+ *   retransmitting FIN (and, probably, a tail of data) and one or more
+ *   of our ACKs are lost.
+ * * What is the TIME-WAIT timeout? It is associated with maximal packet
+ *   lifetime in the internet, which leads to the wrong conclusion that
+ *   it is set to catch "old duplicate segments" wandering out of their path.
+ *   It is not quite correct. This timeout is calculated so that it exceeds
+ *   the maximal retransmission timeout by enough to allow for the loss of
+ *   one (or more) segments sent by the peer and of our ACKs. This time may
+ *   be calculated from the RTO.
+ * * When a TIME-WAIT socket receives an RST, it means that the other end
+ *   finally closed and we are allowed to kill TIME-WAIT too.
+ * * The second purpose of TIME-WAIT is catching old duplicate segments.
+ *   Well, certainly it is pure paranoia, but if we load TIME-WAIT
+ *   with these semantics, we MUST NOT kill TIME-WAIT state with RSTs.
+ * * If we invented some more clever way to catch duplicates
+ *   (e.g. based on PAWS), we could truncate TIME-WAIT to several RTOs.
+ *
+ * The algorithm below is based on FORMAL INTERPRETATION of the RFCs.
+ * When you compare it to the RFCs, please read section SEGMENT ARRIVES
+ * from the very beginning.
+ *
+ * NOTE. With recycling (and later with fin-wait-2) the TW bucket
+ * is _not_ stateless. That means that, strictly speaking, we would have
+ * to spinlock it. I do not want to! Well, the probability of misbehaviour
+ * is ridiculously low and, it seems, we could use some mb() tricks
+ * to avoid misreading sequence numbers, states etc.  --ANK
+ */
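+/* How the caller is expected to act on the result (summarized from the
+ * code below):
+ *   TCP_TW_SUCCESS - segment consumed, nothing to transmit;
+ *   TCP_TW_ACK     - re-ACK the peer (duplicate or out-of-window segment);
+ *   TCP_TW_RST     - answer with a reset (new data after half-duplex close);
+ *   TCP_TW_SYN     - acceptable new SYN, hand it to a listening socket.
+ */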
+enum tcp_tw_status
+tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
+ struct tcphdr *th, unsigned len)
+{
+ struct tcp_options_received tmp_opt;
+ int paws_reject = 0;
+
+ tmp_opt.saw_tstamp = 0;
+ if (th->doff > (sizeof(struct tcphdr) >> 2) && tw->tw_ts_recent_stamp) {
+ tcp_parse_options(skb, &tmp_opt, 0);
+
+ if (tmp_opt.saw_tstamp) {
+ tmp_opt.ts_recent = tw->tw_ts_recent;
+ tmp_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
+ paws_reject = tcp_paws_check(&tmp_opt, th->rst);
+ }
+ }
+
+ if (tw->tw_substate == TCP_FIN_WAIT2) {
+ /* Just repeat all the checks of tcp_rcv_state_process() */
+
+ /* Out of window, send ACK */
+ if (paws_reject ||
+ !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
+ tw->tw_rcv_nxt,
+ tw->tw_rcv_nxt + tw->tw_rcv_wnd))
+ return TCP_TW_ACK;
+
+ if (th->rst)
+ goto kill;
+
+ if (th->syn && !before(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt))
+ goto kill_with_rst;
+
+ /* Dup ACK? */
+ if (!after(TCP_SKB_CB(skb)->end_seq, tw->tw_rcv_nxt) ||
+ TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
+ tcp_tw_put(tw);
+ return TCP_TW_SUCCESS;
+ }
+
+ /* New data or FIN. If new data arrive after half-duplex close,
+ * reset.
+ */
+ if (!th->fin ||
+ TCP_SKB_CB(skb)->end_seq != tw->tw_rcv_nxt + 1) {
+kill_with_rst:
+ tcp_tw_deschedule(tw);
+ tcp_tw_put(tw);
+ return TCP_TW_RST;
+ }
+
+ /* FIN arrived, enter true time-wait state. */
+ tw->tw_substate = TCP_TIME_WAIT;
+ tw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ if (tmp_opt.saw_tstamp) {
+ tw->tw_ts_recent_stamp = xtime.tv_sec;
+ tw->tw_ts_recent = tmp_opt.rcv_tsval;
+ }
+
+		/* I am ashamed, but I failed to make it more elegant.
+		 * Yes, it is a direct reference to IP, which is impossible
+		 * to generalize to IPv6. Taking into account that IPv6
+		 * does not understand recycling in any case, it is not
+		 * a big problem in practice. --ANK */
+ if (tw->tw_family == AF_INET &&
+ sysctl_tcp_tw_recycle && tw->tw_ts_recent_stamp &&
+ tcp_v4_tw_remember_stamp(tw))
+ tcp_tw_schedule(tw, tw->tw_timeout);
+ else
+ tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
+ return TCP_TW_ACK;
+ }
+
+ /*
+ * Now real TIME-WAIT state.
+ *
+ * RFC 1122:
+ * "When a connection is [...] on TIME-WAIT state [...]
+ * [a TCP] MAY accept a new SYN from the remote TCP to
+ * reopen the connection directly, if it:
+ *
+ * (1) assigns its initial sequence number for the new
+ * connection to be larger than the largest sequence
+ * number it used on the previous connection incarnation,
+ * and
+ *
+ * (2) returns to TIME-WAIT state if the SYN turns out
+ * to be an old duplicate".
+ */
+
+ if (!paws_reject &&
+ (TCP_SKB_CB(skb)->seq == tw->tw_rcv_nxt &&
+ (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
+ /* In window segment, it may be only reset or bare ack. */
+
+ if (th->rst) {
+			/* This is TIME_WAIT assassination, in two flavors.
+ * Oh well... nobody has a sufficient solution to this
+ * protocol bug yet.
+ */
+ if (sysctl_tcp_rfc1337 == 0) {
+kill:
+ tcp_tw_deschedule(tw);
+ tcp_tw_put(tw);
+ return TCP_TW_SUCCESS;
+ }
+ }
+ tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
+
+ if (tmp_opt.saw_tstamp) {
+ tw->tw_ts_recent = tmp_opt.rcv_tsval;
+ tw->tw_ts_recent_stamp = xtime.tv_sec;
+ }
+
+ tcp_tw_put(tw);
+ return TCP_TW_SUCCESS;
+ }
+
+	/* Out of window segment.
+
+	   All such segments are ACKed immediately.
+
+	   The only exception is a new SYN. We accept it, if it is
+	   not an old duplicate and we are not in danger of being killed
+	   by delayed old duplicates. The RFC check, that the SYN carries a
+	   newer sequence number, works at rates <40Mbit/sec.
+	   However, if PAWS works, it is reliable, and moreover
+	   we may even relax the silly seq space cutoff.
+
+	   RED-PEN: we violate the main RFC requirement: if this SYN turns
+	   out to be an old duplicate (i.e. we receive an RST in reply to
+	   our SYN-ACK), we must return the socket to time-wait state.
+	   That is not good, but not fatal yet.
+	 */
+
+ if (th->syn && !th->rst && !th->ack && !paws_reject &&
+ (after(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt) ||
+ (tmp_opt.saw_tstamp && (s32)(tw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
+ u32 isn = tw->tw_snd_nxt + 65535 + 2;
+ if (isn == 0)
+ isn++;
+ TCP_SKB_CB(skb)->when = isn;
+ return TCP_TW_SYN;
+ }
+
+ if (paws_reject)
+ NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
+
+ if(!th->rst) {
+ /* In this case we must reset the TIMEWAIT timer.
+ *
+ * If it is ACKless SYN it may be both old duplicate
+ * and new good SYN with random sequence number <rcv_nxt.
+ * Do not reschedule in the last case.
+ */
+ if (paws_reject || th->ack)
+ tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
+
+ /* Send ACK. Note, we do not put the bucket,
+ * it will be released by caller.
+ */
+ return TCP_TW_ACK;
+ }
+ tcp_tw_put(tw);
+ return TCP_TW_SUCCESS;
+}
+
+/* Enter the time wait state. This is called with locally disabled BH.
+ * Essentially we whip up a timewait bucket, copy the
+ * relevant info into it from the SK, and mess with hash chains
+ * and list linkage.
+ */
+static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
+{
+ struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent];
+ struct tcp_bind_hashbucket *bhead;
+
+ /* Step 1: Put TW into bind hash. Original socket stays there too.
+ Note, that any socket with inet_sk(sk)->num != 0 MUST be bound in
+ binding cache, even if it is closed.
+ */
+ bhead = &tcp_bhash[tcp_bhashfn(inet_sk(sk)->num)];
+ spin_lock(&bhead->lock);
+ tw->tw_tb = tcp_sk(sk)->bind_hash;
+ BUG_TRAP(tcp_sk(sk)->bind_hash);
+ tw_add_bind_node(tw, &tw->tw_tb->owners);
+ spin_unlock(&bhead->lock);
+
+ write_lock(&ehead->lock);
+
+ /* Step 2: Remove SK from established hash. */
+ if (__sk_del_node_init(sk))
+ sock_prot_dec_use(sk->sk_prot);
+
+ /* Step 3: Hash TW into TIMEWAIT half of established hash table. */
+ tw_add_node(tw, &(ehead + tcp_ehash_size)->chain);
+ atomic_inc(&tw->tw_refcnt);
+
+ write_unlock(&ehead->lock);
+}
+
+/*
+ * Move a socket to time-wait or dead fin-wait-2 state.
+ */
+void tcp_time_wait(struct sock *sk, int state, int timeo)
+{
+ struct tcp_tw_bucket *tw = NULL;
+ struct tcp_sock *tp = tcp_sk(sk);
+ int recycle_ok = 0;
+
+ if (sysctl_tcp_tw_recycle && tp->rx_opt.ts_recent_stamp)
+ recycle_ok = tp->af_specific->remember_stamp(sk);
+
+ if (tcp_tw_count < sysctl_tcp_max_tw_buckets)
+ tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);
+
+ if(tw != NULL) {
+ struct inet_sock *inet = inet_sk(sk);
+ int rto = (tp->rto<<2) - (tp->rto>>1);
+
+ /* Give us an identity. */
+ tw->tw_daddr = inet->daddr;
+ tw->tw_rcv_saddr = inet->rcv_saddr;
+ tw->tw_bound_dev_if = sk->sk_bound_dev_if;
+ tw->tw_num = inet->num;
+ tw->tw_state = TCP_TIME_WAIT;
+ tw->tw_substate = state;
+ tw->tw_sport = inet->sport;
+ tw->tw_dport = inet->dport;
+ tw->tw_family = sk->sk_family;
+ tw->tw_reuse = sk->sk_reuse;
+ tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
+ atomic_set(&tw->tw_refcnt, 1);
+
+ tw->tw_hashent = sk->sk_hashent;
+ tw->tw_rcv_nxt = tp->rcv_nxt;
+ tw->tw_snd_nxt = tp->snd_nxt;
+ tw->tw_rcv_wnd = tcp_receive_window(tp);
+ tw->tw_ts_recent = tp->rx_opt.ts_recent;
+ tw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
+ tw_dead_node_init(tw);
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ if (tw->tw_family == PF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ ipv6_addr_copy(&tw->tw_v6_daddr, &np->daddr);
+ ipv6_addr_copy(&tw->tw_v6_rcv_saddr, &np->rcv_saddr);
+ tw->tw_v6_ipv6only = np->ipv6only;
+ } else {
+ memset(&tw->tw_v6_daddr, 0, sizeof(tw->tw_v6_daddr));
+ memset(&tw->tw_v6_rcv_saddr, 0, sizeof(tw->tw_v6_rcv_saddr));
+ tw->tw_v6_ipv6only = 0;
+ }
+#endif
+ /* Linkage updates. */
+ __tcp_tw_hashdance(sk, tw);
+
+ /* Get the TIME_WAIT timeout firing. */
+ if (timeo < rto)
+ timeo = rto;
+
+ if (recycle_ok) {
+ tw->tw_timeout = rto;
+ } else {
+ tw->tw_timeout = TCP_TIMEWAIT_LEN;
+ if (state == TCP_TIME_WAIT)
+ timeo = TCP_TIMEWAIT_LEN;
+ }
+
+ tcp_tw_schedule(tw, timeo);
+ tcp_tw_put(tw);
+ } else {
+ /* Sorry, if we're out of memory, just CLOSE this
+ * socket up. We've got bigger problems than
+ * non-graceful socket closings.
+ */
+ if (net_ratelimit())
+ printk(KERN_INFO "TCP: time wait bucket table overflow\n");
+ }
+
+ tcp_update_metrics(sk);
+ tcp_done(sk);
+}
+
+/* Kill off TIME_WAIT sockets once their lifetime has expired. */
+static int tcp_tw_death_row_slot;
+
+static void tcp_twkill(unsigned long);
+
+/* TIME_WAIT reaping mechanism. */
+#define TCP_TWKILL_SLOTS 8 /* Please keep this a power of 2. */
+#define TCP_TWKILL_PERIOD (TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS)
+
+#define TCP_TWKILL_QUOTA 100
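+
+/* Rough arithmetic, assuming TCP_TIMEWAIT_LEN is 60 seconds worth of
+ * jiffies: with 8 slots the wheel advances every TCP_TIMEWAIT_LEN/8 =
+ * 7.5 seconds, and each tcp_twkill() run below reaps at most
+ * TCP_TWKILL_QUOTA buckets before handing the rest of the slot to the
+ * work queue.
+ */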
+
+static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS];
+static DEFINE_SPINLOCK(tw_death_lock);
+static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0);
+static void twkill_work(void *);
+static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL);
+static u32 twkill_thread_slots;
+
+/* Returns non-zero if quota exceeded. */
+static int tcp_do_twkill_work(int slot, unsigned int quota)
+{
+ struct tcp_tw_bucket *tw;
+ struct hlist_node *node;
+ unsigned int killed;
+ int ret;
+
+	/* NOTE: compare this to the previous version, where the lock
+	 * was released after detaching the chain. It was racy,
+	 * because tw buckets are scheduled in a non-serialized context
+	 * in 2.3 (with netfilter), and with softnet this is common, because
+	 * soft irqs are not sequenced.
+	 */
+ killed = 0;
+ ret = 0;
+rescan:
+ tw_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) {
+ __tw_del_dead_node(tw);
+ spin_unlock(&tw_death_lock);
+ tcp_timewait_kill(tw);
+ tcp_tw_put(tw);
+ killed++;
+ spin_lock(&tw_death_lock);
+ if (killed > quota) {
+ ret = 1;
+ break;
+ }
+
+ /* While we dropped tw_death_lock, another cpu may have
+ * killed off the next TW bucket in the list, therefore
+ * do a fresh re-read of the hlist head node with the
+ * lock reacquired. We still use the hlist traversal
+ * macro in order to get the prefetches.
+ */
+ goto rescan;
+ }
+
+ tcp_tw_count -= killed;
+ NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed);
+
+ return ret;
+}
+
+static void tcp_twkill(unsigned long dummy)
+{
+ int need_timer, ret;
+
+ spin_lock(&tw_death_lock);
+
+ if (tcp_tw_count == 0)
+ goto out;
+
+ need_timer = 0;
+ ret = tcp_do_twkill_work(tcp_tw_death_row_slot, TCP_TWKILL_QUOTA);
+ if (ret) {
+ twkill_thread_slots |= (1 << tcp_tw_death_row_slot);
+ mb();
+ schedule_work(&tcp_twkill_work);
+ need_timer = 1;
+ } else {
+ /* We purged the entire slot, anything left? */
+ if (tcp_tw_count)
+ need_timer = 1;
+ }
+ tcp_tw_death_row_slot =
+ ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
+ if (need_timer)
+ mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD);
+out:
+ spin_unlock(&tw_death_lock);
+}
+
+extern void twkill_slots_invalid(void);
+
+static void twkill_work(void *dummy)
+{
+ int i;
+
+ if ((TCP_TWKILL_SLOTS - 1) > (sizeof(twkill_thread_slots) * 8))
+ twkill_slots_invalid();
+
+ while (twkill_thread_slots) {
+ spin_lock_bh(&tw_death_lock);
+ for (i = 0; i < TCP_TWKILL_SLOTS; i++) {
+ if (!(twkill_thread_slots & (1 << i)))
+ continue;
+
+ while (tcp_do_twkill_work(i, TCP_TWKILL_QUOTA) != 0) {
+ if (need_resched()) {
+ spin_unlock_bh(&tw_death_lock);
+ schedule();
+ spin_lock_bh(&tw_death_lock);
+ }
+ }
+
+ twkill_thread_slots &= ~(1 << i);
+ }
+ spin_unlock_bh(&tw_death_lock);
+ }
+}
+
+/* These are always called from BH context. See callers in
+ * tcp_input.c to verify this.
+ */
+
+/* This is for handling early-kills of TIME_WAIT sockets. */
+void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
+{
+ spin_lock(&tw_death_lock);
+ if (tw_del_dead_node(tw)) {
+ tcp_tw_put(tw);
+ if (--tcp_tw_count == 0)
+ del_timer(&tcp_tw_timer);
+ }
+ spin_unlock(&tw_death_lock);
+ tcp_timewait_kill(tw);
+}
+
+/* Short-time timewait calendar */
+
+static int tcp_twcal_hand = -1;
+static int tcp_twcal_jiffie;
+static void tcp_twcal_tick(unsigned long);
+static struct timer_list tcp_twcal_timer =
+ TIMER_INITIALIZER(tcp_twcal_tick, 0, 0);
+static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS];
+
+static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo)
+{
+ struct hlist_head *list;
+ int slot;
+
+	/* timeout := RTO * 3.5
+	 *
+	 * 3.5 = 1+2+0.5 to wait for two retransmits.
+	 *
+	 * RATIONALE: if a FIN arrived and we entered TIME-WAIT state,
+	 * our ACK acking that FIN can be lost. If N subsequent retransmitted
+	 * FINs (or previous segments) are lost, the probability of such an
+	 * event is p^(N+1), where p is the probability of losing a single
+	 * packet, and the time to detect the loss is about RTO*(2^N - 1)
+	 * with exponential backoff. The normal timewait length is calculated
+	 * so that we wait at least for one retransmitted FIN (the maximal
+	 * RTO is 120sec).
+	 * [ BTW Linux, following BSD, violates this requirement by waiting
+	 *   only 60sec; we should wait at least 240 secs.
+	 *   Well, 240 consumes too many resources 8)
+	 * ]
+	 * This interval is not reduced to catch old duplicates and
+	 * responses to our wandering segments living for two MSLs.
+	 * However, if we use PAWS to detect
+	 * old duplicates, we can reduce the interval to the bounds required
+	 * by the RTO, rather than by the MSL. So, if the peer understands
+	 * PAWS, we kill the tw bucket after 3.5*RTO (it is important that
+	 * this number is greater than the TS tick!) and detect old
+	 * duplicates with the help of PAWS.
+	 */
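+	/* A rough numeric sketch with an assumed RTO of 200ms: the caller
+	 * in tcp_time_wait() passes timeo = (rto<<2) - (rto>>1), i.e.
+	 * 3.5 * 200ms = 700ms, enough for one RTO, one doubled RTO of
+	 * backoff and half an RTO of slack before the bucket is reaped.
+	 */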
+ slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK;
+
+ spin_lock(&tw_death_lock);
+
+ /* Unlink it, if it was scheduled */
+ if (tw_del_dead_node(tw))
+ tcp_tw_count--;
+ else
+ atomic_inc(&tw->tw_refcnt);
+
+ if (slot >= TCP_TW_RECYCLE_SLOTS) {
+ /* Schedule to slow timer */
+ if (timeo >= TCP_TIMEWAIT_LEN) {
+ slot = TCP_TWKILL_SLOTS-1;
+ } else {
+ slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD;
+ if (slot >= TCP_TWKILL_SLOTS)
+ slot = TCP_TWKILL_SLOTS-1;
+ }
+ tw->tw_ttd = jiffies + timeo;
+ slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1);
+ list = &tcp_tw_death_row[slot];
+ } else {
+ tw->tw_ttd = jiffies + (slot << TCP_TW_RECYCLE_TICK);
+
+ if (tcp_twcal_hand < 0) {
+ tcp_twcal_hand = 0;
+ tcp_twcal_jiffie = jiffies;
+ tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK);
+ add_timer(&tcp_twcal_timer);
+ } else {
+ if (time_after(tcp_twcal_timer.expires, jiffies + (slot<<TCP_TW_RECYCLE_TICK)))
+ mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK));
+ slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1);
+ }
+ list = &tcp_twcal_row[slot];
+ }
+
+ hlist_add_head(&tw->tw_death_node, list);
+
+ if (tcp_tw_count++ == 0)
+ mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD);
+ spin_unlock(&tw_death_lock);
+}
+
+void tcp_twcal_tick(unsigned long dummy)
+{
+ int n, slot;
+ unsigned long j;
+ unsigned long now = jiffies;
+ int killed = 0;
+ int adv = 0;
+
+ spin_lock(&tw_death_lock);
+ if (tcp_twcal_hand < 0)
+ goto out;
+
+ slot = tcp_twcal_hand;
+ j = tcp_twcal_jiffie;
+
+ for (n=0; n<TCP_TW_RECYCLE_SLOTS; n++) {
+ if (time_before_eq(j, now)) {
+ struct hlist_node *node, *safe;
+ struct tcp_tw_bucket *tw;
+
+ tw_for_each_inmate_safe(tw, node, safe,
+ &tcp_twcal_row[slot]) {
+ __tw_del_dead_node(tw);
+ tcp_timewait_kill(tw);
+ tcp_tw_put(tw);
+ killed++;
+ }
+ } else {
+ if (!adv) {
+ adv = 1;
+ tcp_twcal_jiffie = j;
+ tcp_twcal_hand = slot;
+ }
+
+ if (!hlist_empty(&tcp_twcal_row[slot])) {
+ mod_timer(&tcp_twcal_timer, j);
+ goto out;
+ }
+ }
+ j += (1<<TCP_TW_RECYCLE_TICK);
+ slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1);
+ }
+ tcp_twcal_hand = -1;
+
+out:
+ if ((tcp_tw_count -= killed) == 0)
+ del_timer(&tcp_tw_timer);
+ NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed);
+ spin_unlock(&tw_death_lock);
+}
+
+/* This is not only more efficient than what we used to do, it eliminates
+ * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
+ *
+ * Actually, we could save lots of memory writes here. The tp of the
+ * listening socket contains all the necessary default parameters.
+ */
+struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
+{
+	/* allocate the newsk from the same slab as the master sock; if not,
+	 * at sk_free time we'd try to free it from the wrong slabcache
+	 * (i.e. is it TCPv4 or v6?). This is handled through sk->sk_prot. -acme */
+ struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, sk->sk_prot, 0);
+
+ if(newsk != NULL) {
+ struct tcp_sock *newtp;
+ struct sk_filter *filter;
+
+ memcpy(newsk, sk, sizeof(struct tcp_sock));
+ newsk->sk_state = TCP_SYN_RECV;
+
+ /* SANITY */
+ sk_node_init(&newsk->sk_node);
+ tcp_sk(newsk)->bind_hash = NULL;
+
+ /* Clone the TCP header template */
+ inet_sk(newsk)->dport = req->rmt_port;
+
+ sock_lock_init(newsk);
+ bh_lock_sock(newsk);
+
+ rwlock_init(&newsk->sk_dst_lock);
+ atomic_set(&newsk->sk_rmem_alloc, 0);
+ skb_queue_head_init(&newsk->sk_receive_queue);
+ atomic_set(&newsk->sk_wmem_alloc, 0);
+ skb_queue_head_init(&newsk->sk_write_queue);
+ atomic_set(&newsk->sk_omem_alloc, 0);
+ newsk->sk_wmem_queued = 0;
+ newsk->sk_forward_alloc = 0;
+
+ sock_reset_flag(newsk, SOCK_DONE);
+ newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
+ newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
+ newsk->sk_send_head = NULL;
+ rwlock_init(&newsk->sk_callback_lock);
+ skb_queue_head_init(&newsk->sk_error_queue);
+ newsk->sk_write_space = sk_stream_write_space;
+
+ if ((filter = newsk->sk_filter) != NULL)
+ sk_filter_charge(newsk, filter);
+
+ if (unlikely(xfrm_sk_clone_policy(newsk))) {
+			/* It is still a raw copy of the parent, so invalidate
+			 * the destructor and do a plain sk_free() */
+ newsk->sk_destruct = NULL;
+ sk_free(newsk);
+ return NULL;
+ }
+
+ /* Now setup tcp_sock */
+ newtp = tcp_sk(newsk);
+ newtp->pred_flags = 0;
+ newtp->rcv_nxt = req->rcv_isn + 1;
+ newtp->snd_nxt = req->snt_isn + 1;
+ newtp->snd_una = req->snt_isn + 1;
+ newtp->snd_sml = req->snt_isn + 1;
+
+ tcp_prequeue_init(newtp);
+
+ tcp_init_wl(newtp, req->snt_isn, req->rcv_isn);
+
+ newtp->retransmits = 0;
+ newtp->backoff = 0;
+ newtp->srtt = 0;
+ newtp->mdev = TCP_TIMEOUT_INIT;
+ newtp->rto = TCP_TIMEOUT_INIT;
+
+ newtp->packets_out = 0;
+ newtp->left_out = 0;
+ newtp->retrans_out = 0;
+ newtp->sacked_out = 0;
+ newtp->fackets_out = 0;
+ newtp->snd_ssthresh = 0x7fffffff;
+
+ /* So many TCP implementations out there (incorrectly) count the
+ * initial SYN frame in their delayed-ACK and congestion control
+ * algorithms that we must have the following bandaid to talk
+ * efficiently to them. -DaveM
+ */
+ newtp->snd_cwnd = 2;
+ newtp->snd_cwnd_cnt = 0;
+
+ newtp->frto_counter = 0;
+ newtp->frto_highmark = 0;
+
+ tcp_set_ca_state(newtp, TCP_CA_Open);
+ tcp_init_xmit_timers(newsk);
+ skb_queue_head_init(&newtp->out_of_order_queue);
+ newtp->rcv_wup = req->rcv_isn + 1;
+ newtp->write_seq = req->snt_isn + 1;
+ newtp->pushed_seq = newtp->write_seq;
+ newtp->copied_seq = req->rcv_isn + 1;
+
+ newtp->rx_opt.saw_tstamp = 0;
+
+ newtp->rx_opt.dsack = 0;
+ newtp->rx_opt.eff_sacks = 0;
+
+ newtp->probes_out = 0;
+ newtp->rx_opt.num_sacks = 0;
+ newtp->urg_data = 0;
+ newtp->listen_opt = NULL;
+ newtp->accept_queue = newtp->accept_queue_tail = NULL;
+ /* Deinitialize syn_wait_lock to trap illegal accesses. */
+ memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock));
+
+ /* Back to base struct sock members. */
+ newsk->sk_err = 0;
+ newsk->sk_priority = 0;
+ atomic_set(&newsk->sk_refcnt, 2);
+#ifdef INET_REFCNT_DEBUG
+ atomic_inc(&inet_sock_nr);
+#endif
+ atomic_inc(&tcp_sockets_allocated);
+
+ if (sock_flag(newsk, SOCK_KEEPOPEN))
+ tcp_reset_keepalive_timer(newsk,
+ keepalive_time_when(newtp));
+ newsk->sk_socket = NULL;
+ newsk->sk_sleep = NULL;
+
+ newtp->rx_opt.tstamp_ok = req->tstamp_ok;
+ if((newtp->rx_opt.sack_ok = req->sack_ok) != 0) {
+ if (sysctl_tcp_fack)
+ newtp->rx_opt.sack_ok |= 2;
+ }
+ newtp->window_clamp = req->window_clamp;
+ newtp->rcv_ssthresh = req->rcv_wnd;
+ newtp->rcv_wnd = req->rcv_wnd;
+ newtp->rx_opt.wscale_ok = req->wscale_ok;
+ if (newtp->rx_opt.wscale_ok) {
+ newtp->rx_opt.snd_wscale = req->snd_wscale;
+ newtp->rx_opt.rcv_wscale = req->rcv_wscale;
+ } else {
+ newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
+ newtp->window_clamp = min(newtp->window_clamp, 65535U);
+ }
+ newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->rx_opt.snd_wscale;
+ newtp->max_window = newtp->snd_wnd;
+
+ if (newtp->rx_opt.tstamp_ok) {
+ newtp->rx_opt.ts_recent = req->ts_recent;
+ newtp->rx_opt.ts_recent_stamp = xtime.tv_sec;
+ newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+ } else {
+ newtp->rx_opt.ts_recent_stamp = 0;
+ newtp->tcp_header_len = sizeof(struct tcphdr);
+ }
+ if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len)
+ newtp->ack.last_seg_size = skb->len-newtp->tcp_header_len;
+ newtp->rx_opt.mss_clamp = req->mss;
+ TCP_ECN_openreq_child(newtp, req);
+ if (newtp->ecn_flags&TCP_ECN_OK)
+ sock_set_flag(newsk, SOCK_NO_LARGESEND);
+
+ tcp_ca_init(newtp);
+
+ TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS);
+ }
+ return newsk;
+}
+
+/*
+ * Process an incoming packet for SYN_RECV sockets represented
+ * as an open_request.
+ */
+
+struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
+ struct open_request *req,
+ struct open_request **prev)
+{
+ struct tcphdr *th = skb->h.th;
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
+ int paws_reject = 0;
+ struct tcp_options_received tmp_opt;
+ struct sock *child;
+
+ tmp_opt.saw_tstamp = 0;
+ if (th->doff > (sizeof(struct tcphdr)>>2)) {
+ tcp_parse_options(skb, &tmp_opt, 0);
+
+ if (tmp_opt.saw_tstamp) {
+ tmp_opt.ts_recent = req->ts_recent;
+ /* We do not store true stamp, but it is not required,
+ * it can be estimated (approximately)
+			 * from other data.
+ */
+ tmp_opt.ts_recent_stamp = xtime.tv_sec - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
+ paws_reject = tcp_paws_check(&tmp_opt, th->rst);
+ }
+ }
+
+ /* Check for pure retransmitted SYN. */
+ if (TCP_SKB_CB(skb)->seq == req->rcv_isn &&
+ flg == TCP_FLAG_SYN &&
+ !paws_reject) {
+ /*
+ * RFC793 draws (Incorrectly! It was fixed in RFC1122)
+ * this case on figure 6 and figure 8, but formal
+ * protocol description says NOTHING.
+		 * To be more exact, it says that we should send an ACK,
+ * because this segment (at least, if it has no data)
+ * is out of window.
+ *
+		 *  CONCLUSION: RFC793 (even with RFC1122) DOES NOT
+		 *  describe the SYN-RECV state. All the description
+		 *  is wrong; we cannot trust it and should
+ * rely only on common sense and implementation
+ * experience.
+ *
+ * Enforce "SYN-ACK" according to figure 8, figure 6
+ * of RFC793, fixed by RFC1122.
+ */
+ req->class->rtx_syn_ack(sk, req, NULL);
+ return NULL;
+ }
+
+ /* Further reproduces section "SEGMENT ARRIVES"
+ for state SYN-RECEIVED of RFC793.
+	   It is broken, however: it only fails to work
+	   when SYNs are crossed.
+
+ You would think that SYN crossing is impossible here, since
+ we should have a SYN_SENT socket (from connect()) on our end,
+ but this is not true if the crossed SYNs were sent to both
+ ends by a malicious third party. We must defend against this,
+ and to do that we first verify the ACK (as per RFC793, page
+ 36) and reset if it is invalid. Is this a true full defense?
+ To convince ourselves, let us consider a way in which the ACK
+ test can still pass in this 'malicious crossed SYNs' case.
+ Malicious sender sends identical SYNs (and thus identical sequence
+ numbers) to both A and B:
+
+ A: gets SYN, seq=7
+ B: gets SYN, seq=7
+
+ By our good fortune, both A and B select the same initial
+ send sequence number of seven :-)
+
+ A: sends SYN|ACK, seq=7, ack_seq=8
+ B: sends SYN|ACK, seq=7, ack_seq=8
+
+ So we are now A eating this SYN|ACK, ACK test passes. So
+ does sequence test, SYN is truncated, and thus we consider
+ it a bare ACK.
+
+ If tp->defer_accept, we silently drop this bare ACK. Otherwise,
+ we create an established connection. Both ends (listening sockets)
+ accept the new incoming connection and try to talk to each other. 8-)
+
+	   Note: this case is both harmless and rare. The possibility is about
+	   the same as us discovering intelligent life on another planet tomorrow.
+
+	   But generally, we should (the RFC lies!) accept an ACK
+	   of a SYNACK both here and in tcp_rcv_state_process().
+	   tcp_rcv_state_process() does not, hence we do not either.
+
+ Note that the case is absolutely generic:
+ we cannot optimize anything here without
+ violating protocol. All the checks must be made
+ before attempt to create socket.
+ */
+
+ /* RFC793 page 36: "If the connection is in any non-synchronized state ...
+ * and the incoming segment acknowledges something not yet
+	 * sent (the segment carries an unacceptable ACK) ...
+ * a reset is sent."
+ *
+ * Invalid ACK: reset will be sent by listening socket
+ */
+ if ((flg & TCP_FLAG_ACK) &&
+ (TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1))
+ return sk;
+
+	/* Also, it would not be a bad idea to check rcv_tsecr, which
+	 * is essentially an ACK extension; too early or too late values
+	 * should cause a reset in unsynchronized states.
+ */
+
+ /* RFC793: "first check sequence number". */
+
+ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
+ req->rcv_isn+1, req->rcv_isn+1+req->rcv_wnd)) {
+ /* Out of window: send ACK and drop. */
+ if (!(flg & TCP_FLAG_RST))
+ req->class->send_ack(skb, req);
+ if (paws_reject)
+ NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
+ return NULL;
+ }
+
+ /* In sequence, PAWS is OK. */
+
+ if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1))
+ req->ts_recent = tmp_opt.rcv_tsval;
+
+ if (TCP_SKB_CB(skb)->seq == req->rcv_isn) {
+ /* Truncate SYN, it is out of window starting
+ at req->rcv_isn+1. */
+ flg &= ~TCP_FLAG_SYN;
+ }
+
+ /* RFC793: "second check the RST bit" and
+ * "fourth, check the SYN bit"
+ */
+ if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN))
+ goto embryonic_reset;
+
+ /* ACK sequence verified above, just make sure ACK is
+ * set. If ACK not set, just silently drop the packet.
+ */
+ if (!(flg & TCP_FLAG_ACK))
+ return NULL;
+
+ /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
+ if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == req->rcv_isn+1) {
+ req->acked = 1;
+ return NULL;
+ }
+
+ /* OK, ACK is valid, create big socket and
+ * feed this segment to it. It will repeat all
+ * the tests. THIS SEGMENT MUST MOVE SOCKET TO
+	 * ESTABLISHED STATE. If it gets dropped after the
+	 * socket is created, expect trouble.
+ */
+ child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
+ if (child == NULL)
+ goto listen_overflow;
+
+ tcp_synq_unlink(tp, req, prev);
+ tcp_synq_removed(sk, req);
+
+ tcp_acceptq_queue(sk, req, child);
+ return child;
+
+ listen_overflow:
+ if (!sysctl_tcp_abort_on_overflow) {
+ req->acked = 1;
+ return NULL;
+ }
+
+ embryonic_reset:
+ NET_INC_STATS_BH(LINUX_MIB_EMBRYONICRSTS);
+ if (!(flg & TCP_FLAG_RST))
+ req->class->send_reset(skb);
+
+ tcp_synq_drop(sk, req, prev);
+ return NULL;
+}
+
+/*
+ * Queue segment on the new socket if the new socket is active,
+ * otherwise we just short-circuit this and continue with
+ * the new socket.
+ */
+
+int tcp_child_process(struct sock *parent, struct sock *child,
+ struct sk_buff *skb)
+{
+ int ret = 0;
+ int state = child->sk_state;
+
+ if (!sock_owned_by_user(child)) {
+ ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len);
+
+ /* Wakeup parent, send SIGIO */
+ if (state == TCP_SYN_RECV && child->sk_state != state)
+ parent->sk_data_ready(parent, 0);
+ } else {
+		/* Alas, it is possible again, because we do the lookup
+		 * in the main socket hash table and the lock on the listening
+		 * socket no longer protects us.
+		 */
+ sk_add_backlog(child, skb);
+ }
+
+ bh_unlock_sock(child);
+ sock_put(child);
+ return ret;
+}
+
+EXPORT_SYMBOL(tcp_check_req);
+EXPORT_SYMBOL(tcp_child_process);
+EXPORT_SYMBOL(tcp_create_openreq_child);
+EXPORT_SYMBOL(tcp_timewait_state_process);
+EXPORT_SYMBOL(tcp_tw_deschedule);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
new file mode 100644
index 00000000000..13c14cb6dee
--- /dev/null
+++ b/net/ipv4/tcp_output.c
@@ -0,0 +1,1739 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Version: $Id: tcp_output.c,v 1.146 2002/02/01 22:01:04 davem Exp $
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ * Corey Minyard <wf-rch!minyard@relay.EU.net>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ * Linus Torvalds, <torvalds@cs.helsinki.fi>
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Matthew Dillon, <dillon@apollo.west.oic.com>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+/*
+ * Changes: Pedro Roque : Retransmit queue handled by TCP.
+ * : Fragmentation on mtu decrease
+ * : Segment collapse on retransmit
+ * : AF independence
+ *
+ * Linus Torvalds : send_delayed_ack
+ * David S. Miller : Charge memory using the right skb
+ * during syn/ack processing.
+ * David S. Miller : Output engine completely rewritten.
+ * Andrea Arcangeli: SYNACK carry ts_recent in tsecr.
+ * Cacophonix Gaul : draft-minshall-nagle-01
+ * J Hadi Salim : ECN support
+ *
+ */
+
+#include <net/tcp.h>
+
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/smp_lock.h>
+
+/* People can turn this off for buggy TCP's found in printers etc. */
+int sysctl_tcp_retrans_collapse = 1;
+
+/* This limits the percentage of the congestion window which we
+ * will allow a single TSO frame to consume. Building TSO frames
+ * which are too large can cause TCP streams to be bursty.
+ */
+int sysctl_tcp_tso_win_divisor = 8;
+
+static inline void update_send_head(struct sock *sk, struct tcp_sock *tp,
+ struct sk_buff *skb)
+{
+ sk->sk_send_head = skb->next;
+ if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue)
+ sk->sk_send_head = NULL;
+ tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+ tcp_packets_out_inc(sk, tp, skb);
+}
+
+/* SND.NXT, if the window was not shrunk.
+ * If the window has been shrunk, what should we do? It is not clear at all.
+ * Using SND.UNA we will fail to open the window, and SND.NXT is out of
+ * window. :-( Anything in between SND.UNA...SND.UNA+SND.WND can also
+ * already be invalid. OK, let's do this for now:
+ */
+static inline __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_sock *tp)
+{
+ if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt))
+ return tp->snd_nxt;
+ else
+ return tp->snd_una+tp->snd_wnd;
+}
+
+/* Calculate mss to advertise in SYN segment.
+ * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
+ *
+ * 1. It is independent of path mtu.
+ * 2. Ideally, it is the maximal possible segment size, i.e. 65535-40.
+ * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
+ * attached devices, because some buggy hosts are confused by
+ * large MSS.
+ * 4. We do not do 3; we advertise the MSS calculated from the first
+ *    hop device mtu, but allow it to be raised to ip_rt_min_advmss.
+ *    This may be overridden via information stored in the routing table.
+ * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
+ * probably even Jumbo".
+ */
+static __u16 tcp_advertise_mss(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct dst_entry *dst = __sk_dst_get(sk);
+ int mss = tp->advmss;
+
+ if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) {
+ mss = dst_metric(dst, RTAX_ADVMSS);
+ tp->advmss = mss;
+ }
+
+ return (__u16)mss;
+}
+
+/* RFC2861. Reset CWND after an idle period longer than RTO to the
+ * "restart window". This is the first part of the cwnd validation
+ * mechanism. */
+static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
+{
+ s32 delta = tcp_time_stamp - tp->lsndtime;
+ u32 restart_cwnd = tcp_init_cwnd(tp, dst);
+ u32 cwnd = tp->snd_cwnd;
+
+ if (tcp_is_vegas(tp))
+ tcp_vegas_enable(tp);
+
+ tp->snd_ssthresh = tcp_current_ssthresh(tp);
+ restart_cwnd = min(restart_cwnd, cwnd);
+
+ while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd)
+ cwnd >>= 1;
+ tp->snd_cwnd = max(cwnd, restart_cwnd);
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ tp->snd_cwnd_used = 0;
+}
+
+static inline void tcp_event_data_sent(struct tcp_sock *tp,
+ struct sk_buff *skb, struct sock *sk)
+{
+ u32 now = tcp_time_stamp;
+
+ if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto)
+ tcp_cwnd_restart(tp, __sk_dst_get(sk));
+
+ tp->lsndtime = now;
+
+	/* If this is a reply sent within ATO of the last received
+	 * packet, enter pingpong mode.
+	 */
+ if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato)
+ tp->ack.pingpong = 1;
+}
+
+static __inline__ void tcp_event_ack_sent(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tcp_dec_quickack_mode(tp);
+ tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
+}
+
+/* Determine a window scaling and initial window to offer.
+ * Based on the assumption that the given amount of space
+ * will be offered. Store the results in the tp structure.
+ * NOTE: for smooth operation initial space offering should
+ * be a multiple of mss if possible. We assume here that mss >= 1.
+ * This MUST be enforced by all callers.
+ */
+void tcp_select_initial_window(int __space, __u32 mss,
+ __u32 *rcv_wnd, __u32 *window_clamp,
+ int wscale_ok, __u8 *rcv_wscale)
+{
+ unsigned int space = (__space < 0 ? 0 : __space);
+
+ /* If no clamp set the clamp to the max possible scaled window */
+ if (*window_clamp == 0)
+ (*window_clamp) = (65535 << 14);
+ space = min(*window_clamp, space);
+
+ /* Quantize space offering to a multiple of mss if possible. */
+ if (space > mss)
+ space = (space / mss) * mss;
+
+ /* NOTE: offering an initial window larger than 32767
+ * will break some buggy TCP stacks. We try to be nice.
+ * If we are not window scaling, then this truncates
+ * our initial window offering to 32k. There should also
+ * be a sysctl option to stop being nice.
+ */
+ (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
+ (*rcv_wscale) = 0;
+ if (wscale_ok) {
+ /* Set window scaling on max possible window
+ * See RFC1323 for an explanation of the limit to 14
+ */
+ space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
+ while (space > 65535 && (*rcv_wscale) < 14) {
+ space >>= 1;
+ (*rcv_wscale)++;
+ }
+ }
+
+	/* Set the initial window to a value enough for senders
+	 * following RFC2414. Senders not following this RFC
+	 * will be satisfied with 2.
+	 */
+ if (mss > (1<<*rcv_wscale)) {
+ int init_cwnd = 4;
+ if (mss > 1460*3)
+ init_cwnd = 2;
+ else if (mss > 1460)
+ init_cwnd = 3;
+ if (*rcv_wnd > init_cwnd*mss)
+ *rcv_wnd = init_cwnd*mss;
+ }
+
+ /* Set the clamp no higher than max representable value */
+ (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
+}
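+
+/* A rough worked example of the function above, with assumed inputs
+ * mss = 1460, 256KB of socket space, window scaling enabled and
+ * sysctl_tcp_rmem[2] = 4MB: rcv_wnd is first clamped to MAX_TCP_WINDOW
+ * (32767), rcv_wscale grows to 7 so that 4MB >> 7 fits in 16 bits, and
+ * the initial-window rule then caps rcv_wnd at 4*mss = 5840.
+ */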
+
+/* Choose a new window to advertise, update state in tcp_sock for the
+ * socket, and return result with RFC1323 scaling applied. The return
+ * value can be stuffed directly into th->window for an outgoing
+ * frame.
+ */
+static __inline__ u16 tcp_select_window(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 cur_win = tcp_receive_window(tp);
+ u32 new_win = __tcp_select_window(sk);
+
+ /* Never shrink the offered window */
+ if(new_win < cur_win) {
+ /* Danger Will Robinson!
+ * Don't update rcv_wup/rcv_wnd here or else
+ * we will not be able to advertise a zero
+ * window in time. --DaveM
+ *
+ * Relax Will Robinson.
+ */
+ new_win = cur_win;
+ }
+ tp->rcv_wnd = new_win;
+ tp->rcv_wup = tp->rcv_nxt;
+
+ /* Make sure we do not exceed the maximum possible
+ * scaled window.
+ */
+ if (!tp->rx_opt.rcv_wscale)
+ new_win = min(new_win, MAX_TCP_WINDOW);
+ else
+ new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
+
+ /* RFC1323 scaling applied */
+ new_win >>= tp->rx_opt.rcv_wscale;
+
+ /* If we advertise zero window, disable fast path. */
+ if (new_win == 0)
+ tp->pred_flags = 0;
+
+ return new_win;
+}
+
+
+/* This routine actually transmits TCP packets queued in by
+ * tcp_do_sendmsg(). This is used by both the initial
+ * transmission and possible later retransmissions.
+ * All SKB's seen here are completely headerless. It is our
+ * job to build the TCP header, and pass the packet down to
+ * IP so it can do the same plus pass the packet off to the
+ * device.
+ *
+ * We are working here with either a clone of the original
+ * SKB, or a fresh unique copy made by the retransmit engine.
+ */
+static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
+{
+ if (skb != NULL) {
+ struct inet_sock *inet = inet_sk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+ int tcp_header_size = tp->tcp_header_len;
+ struct tcphdr *th;
+ int sysctl_flags;
+ int err;
+
+ BUG_ON(!tcp_skb_pcount(skb));
+
+#define SYSCTL_FLAG_TSTAMPS 0x1
+#define SYSCTL_FLAG_WSCALE 0x2
+#define SYSCTL_FLAG_SACK 0x4
+
+ sysctl_flags = 0;
+ if (tcb->flags & TCPCB_FLAG_SYN) {
+ tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
+ if(sysctl_tcp_timestamps) {
+ tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
+ sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
+ }
+ if(sysctl_tcp_window_scaling) {
+ tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
+ sysctl_flags |= SYSCTL_FLAG_WSCALE;
+ }
+ if(sysctl_tcp_sack) {
+ sysctl_flags |= SYSCTL_FLAG_SACK;
+ if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
+ tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
+ }
+ } else if (tp->rx_opt.eff_sacks) {
+ /* A SACK is 2 pad bytes, a 2 byte header, plus
+ * 2 32-bit sequence numbers for each SACK block.
+ */
+ tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
+ (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
+ }
+
+ /*
+ * If the connection is idle and we are restarting,
+ * then we don't want to do any Vegas calculations
+ * until we get fresh RTT samples. So when we
+ * restart, we reset our Vegas state to a clean
+ * slate. After we get acks for this flight of
+ * packets, _then_ we can make Vegas calculations
+ * again.
+ */
+ if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0)
+ tcp_vegas_enable(tp);
+
+ th = (struct tcphdr *) skb_push(skb, tcp_header_size);
+ skb->h.th = th;
+ skb_set_owner_w(skb, sk);
+
+ /* Build TCP header and checksum it. */
+ th->source = inet->sport;
+ th->dest = inet->dport;
+ th->seq = htonl(tcb->seq);
+ th->ack_seq = htonl(tp->rcv_nxt);
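+		/* The store below fills the 16-bit word at byte offset 12 of
+		 * the TCP header (data offset, reserved and flag bits) in one
+		 * go: tcp_header_size>>2 becomes the 4-bit doff field and
+		 * tcb->flags supplies the control bits.
+		 */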
+ *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags);
+ if (tcb->flags & TCPCB_FLAG_SYN) {
+ /* RFC1323: The window in SYN & SYN/ACK segments
+ * is never scaled.
+ */
+ th->window = htons(tp->rcv_wnd);
+ } else {
+ th->window = htons(tcp_select_window(sk));
+ }
+ th->check = 0;
+ th->urg_ptr = 0;
+
+ if (tp->urg_mode &&
+ between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) {
+ th->urg_ptr = htons(tp->snd_up-tcb->seq);
+ th->urg = 1;
+ }
+
+ if (tcb->flags & TCPCB_FLAG_SYN) {
+ tcp_syn_build_options((__u32 *)(th + 1),
+ tcp_advertise_mss(sk),
+ (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
+ (sysctl_flags & SYSCTL_FLAG_SACK),
+ (sysctl_flags & SYSCTL_FLAG_WSCALE),
+ tp->rx_opt.rcv_wscale,
+ tcb->when,
+ tp->rx_opt.ts_recent);
+ } else {
+ tcp_build_and_update_options((__u32 *)(th + 1),
+ tp, tcb->when);
+
+ TCP_ECN_send(sk, tp, skb, tcp_header_size);
+ }
+ tp->af_specific->send_check(sk, th, skb->len, skb);
+
+ if (tcb->flags & TCPCB_FLAG_ACK)
+ tcp_event_ack_sent(sk);
+
+ if (skb->len != tcp_header_size)
+ tcp_event_data_sent(tp, skb, sk);
+
+ TCP_INC_STATS(TCP_MIB_OUTSEGS);
+
+ err = tp->af_specific->queue_xmit(skb, 0);
+ if (err <= 0)
+ return err;
+
+ tcp_enter_cwr(tp);
+
+		/* NET_XMIT_CN is special. It does not guarantee
+		 * that this packet is lost. It tells us that the device
+		 * is about to start dropping packets, or already drops
+		 * some packets of the same priority, and invites us
+		 * to send less aggressively.
+		 */
+ return err == NET_XMIT_CN ? 0 : err;
+ }
+ return -ENOBUFS;
+#undef SYSCTL_FLAG_TSTAMPS
+#undef SYSCTL_FLAG_WSCALE
+#undef SYSCTL_FLAG_SACK
+}
+
+
+/* This routine just queues the buffer.
+ *
+ * NOTE: the probe0 timer is not checked; do not forget
+ * tcp_push_pending_frames, otherwise the socket can stall.
+ */
+static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Advance write_seq and place onto the write_queue. */
+ tp->write_seq = TCP_SKB_CB(skb)->end_seq;
+ skb_header_release(skb);
+ __skb_queue_tail(&sk->sk_write_queue, skb);
+ sk_charge_skb(sk, skb);
+
+ /* Queue it, remembering where we must start sending. */
+ if (sk->sk_send_head == NULL)
+ sk->sk_send_head = skb;
+}
+
+static inline void tcp_tso_set_push(struct sk_buff *skb)
+{
+	/* Force push to be on for any TSO frames to work around
+ * problems with busted implementations like Mac OS-X that
+ * hold off socket receive wakeups until push is seen.
+ */
+ if (tcp_skb_pcount(skb) > 1)
+ TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
+}
+
+/* Send _single_ skb sitting at the send head. This function requires
+ * true push pending frames to setup probe timer etc.
+ */
+void tcp_push_one(struct sock *sk, unsigned cur_mss)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb = sk->sk_send_head;
+
+ if (tcp_snd_test(tp, skb, cur_mss, TCP_NAGLE_PUSH)) {
+ /* Send it out now. */
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
+ tcp_tso_set_push(skb);
+ if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) {
+ sk->sk_send_head = NULL;
+ tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+ tcp_packets_out_inc(sk, tp, skb);
+ return;
+ }
+ }
+}
+
+void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_std)
+{
+ if (skb->len <= mss_std) {
+ /* Avoid the costly divide in the normal
+ * non-TSO case.
+ */
+ skb_shinfo(skb)->tso_segs = 1;
+ skb_shinfo(skb)->tso_size = 0;
+ } else {
+ unsigned int factor;
+
+ factor = skb->len + (mss_std - 1);
+ factor /= mss_std;
+ skb_shinfo(skb)->tso_segs = factor;
+ skb_shinfo(skb)->tso_size = mss_std;
+ }
+}
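+
+/* Worked example with an assumed mss_std of 1460: a 4096 byte skb gets
+ * factor = (4096 + 1459) / 1460 = 3, so tso_segs = 3 and tso_size = 1460,
+ * while a 1000 byte skb takes the cheap path with tso_segs = 1 and
+ * tso_size = 0.
+ */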
+
+/* Function to create two new TCP segments. Shrinks the given segment
+ * to the specified size and appends a new segment with the rest of the
+ * packet to the list. This won't be called frequently, I hope.
+ * Remember, these are still headerless SKBs at this point.
+ */
+static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *buff;
+ int nsize;
+ u16 flags;
+
+ nsize = skb_headlen(skb) - len;
+ if (nsize < 0)
+ nsize = 0;
+
+ if (skb_cloned(skb) &&
+ skb_is_nonlinear(skb) &&
+ pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ return -ENOMEM;
+
+ /* Get a new skb... force flag on. */
+ buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
+ if (buff == NULL)
+ return -ENOMEM; /* We'll just try again later. */
+ sk_charge_skb(sk, buff);
+
+ /* Correct the sequence numbers. */
+ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
+ TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
+
+ /* PSH and FIN should only be set in the second packet. */
+ flags = TCP_SKB_CB(skb)->flags;
+ TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
+ TCP_SKB_CB(buff)->flags = flags;
+ TCP_SKB_CB(buff)->sacked =
+ (TCP_SKB_CB(skb)->sacked &
+ (TCPCB_LOST | TCPCB_EVER_RETRANS | TCPCB_AT_TAIL));
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;
+
+ if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) {
+ /* Copy and checksum data tail into the new buffer. */
+ buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
+ nsize, 0);
+
+ skb_trim(skb, len);
+
+ skb->csum = csum_block_sub(skb->csum, buff->csum, len);
+ } else {
+ skb->ip_summed = CHECKSUM_HW;
+ skb_split(skb, buff, len);
+ }
+
+ buff->ip_summed = skb->ip_summed;
+
+	/* Looks stupid, but our code really uses the 'when' field of
+	 * skbs which it has never sent before. --ANK
+	 */
+ TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
+
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
+ tp->lost_out -= tcp_skb_pcount(skb);
+ tp->left_out -= tcp_skb_pcount(skb);
+ }
+
+ /* Fix up tso_factor for both original and new SKB. */
+ tcp_set_skb_tso_segs(skb, tp->mss_cache_std);
+ tcp_set_skb_tso_segs(buff, tp->mss_cache_std);
+
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
+ tp->lost_out += tcp_skb_pcount(skb);
+ tp->left_out += tcp_skb_pcount(skb);
+ }
+
+ if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) {
+ tp->lost_out += tcp_skb_pcount(buff);
+ tp->left_out += tcp_skb_pcount(buff);
+ }
+
+ /* Link BUFF into the send queue. */
+ __skb_append(skb, buff);
+
+ return 0;
+}
+
+/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
+ * eventually). The difference is that the pulled data is not copied,
+ * but immediately discarded.
+ */
+static unsigned char *__pskb_trim_head(struct sk_buff *skb, int len)
+{
+ int i, k, eat;
+
+ eat = len;
+ k = 0;
+ for (i=0; i<skb_shinfo(skb)->nr_frags; i++) {
+ if (skb_shinfo(skb)->frags[i].size <= eat) {
+ put_page(skb_shinfo(skb)->frags[i].page);
+ eat -= skb_shinfo(skb)->frags[i].size;
+ } else {
+ skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
+ if (eat) {
+ skb_shinfo(skb)->frags[k].page_offset += eat;
+ skb_shinfo(skb)->frags[k].size -= eat;
+ eat = 0;
+ }
+ k++;
+ }
+ }
+ skb_shinfo(skb)->nr_frags = k;
+
+ skb->tail = skb->data;
+ skb->data_len -= len;
+ skb->len = skb->data_len;
+ return skb->tail;
+}
+
+int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
+{
+ if (skb_cloned(skb) &&
+ pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ return -ENOMEM;
+
+ if (len <= skb_headlen(skb)) {
+ __skb_pull(skb, len);
+ } else {
+ if (__pskb_trim_head(skb, len-skb_headlen(skb)) == NULL)
+ return -ENOMEM;
+ }
+
+ TCP_SKB_CB(skb)->seq += len;
+ skb->ip_summed = CHECKSUM_HW;
+
+ skb->truesize -= len;
+ sk->sk_wmem_queued -= len;
+ sk->sk_forward_alloc += len;
+ sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
+
+ /* Any change of skb->len requires recalculation of tso
+ * factor and mss.
+ */
+ if (tcp_skb_pcount(skb) > 1)
+ tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));
+
+ return 0;
+}
+
+/* This function synchronizes snd mss to the current pmtu/exthdr set.
+
+   tp->rx_opt.user_mss is the mss set by the user via TCP_MAXSEG. It does
+   NOT account for TCP options, but includes only the bare TCP header.
+
+   tp->rx_opt.mss_clamp is the mss negotiated at connection setup.
+   It is the minimum of user_mss and the mss received with the SYN.
+   It also does not include TCP options.
+
+   tp->pmtu_cookie is the last pmtu seen by this function.
+
+   tp->mss_cache is the current effective sending mss, including
+   all tcp options except for SACKs. It is evaluated,
+   taking into account the current pmtu, but never exceeds
+   tp->rx_opt.mss_clamp.
+
+   NOTE1. rfc1122 clearly states that the advertised MSS
+   DOES NOT include either tcp or ip options.
+
+   NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
+   this function. --ANK (980731)
+ */
+
+unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int mss_now;
+
+ /* Calculate base mss without TCP options:
+ It is MMS_S - sizeof(tcphdr) of rfc1122
+ */
+ mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);
+
+ /* Clamp it (mss_clamp does not include tcp options) */
+ if (mss_now > tp->rx_opt.mss_clamp)
+ mss_now = tp->rx_opt.mss_clamp;
+
+ /* Now subtract optional transport overhead */
+ mss_now -= tp->ext_header_len;
+
+ /* Then reserve room for full set of TCP options and 8 bytes of data */
+ if (mss_now < 48)
+ mss_now = 48;
+
+ /* Now subtract TCP options size, not including SACKs */
+ mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
+
+ /* Bound mss with half of window */
+ if (tp->max_window && mss_now > (tp->max_window>>1))
+ mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);
+
+ /* And store cached results */
+ tp->pmtu_cookie = pmtu;
+ tp->mss_cache = tp->mss_cache_std = mss_now;
+
+ return mss_now;
+}
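+
+/* A rough worked example, assuming an IPv4 path MTU of 1500, no IP or
+ * extension headers beyond the basics, and timestamps enabled: mss_now
+ * starts at 1500 - 20 - 20 = 1460, nothing is subtracted for
+ * ext_header_len, and the final options term removes the 12 aligned
+ * timestamp bytes from tcp_header_len, leaving mss_cache_std = 1448.
+ */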
+
+/* Compute the current effective MSS, taking SACKs and IP options,
+ * and even PMTU discovery events into account.
+ *
+ * LARGESEND note: !urg_mode is overkill, only frames up to snd_up
+ * cannot be large. However, taking into account rare use of URG, this
+ * is not a big flaw.
+ */
+
+unsigned int tcp_current_mss(struct sock *sk, int large)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct dst_entry *dst = __sk_dst_get(sk);
+ unsigned int do_large, mss_now;
+
+ mss_now = tp->mss_cache_std;
+ if (dst) {
+ u32 mtu = dst_mtu(dst);
+ if (mtu != tp->pmtu_cookie)
+ mss_now = tcp_sync_mss(sk, mtu);
+ }
+
+ do_large = (large &&
+ (sk->sk_route_caps & NETIF_F_TSO) &&
+ !tp->urg_mode);
+
+ if (do_large) {
+ unsigned int large_mss, factor, limit;
+
+ large_mss = 65535 - tp->af_specific->net_header_len -
+ tp->ext_header_len - tp->tcp_header_len;
+
+ if (tp->max_window && large_mss > (tp->max_window>>1))
+ large_mss = max((tp->max_window>>1),
+ 68U - tp->tcp_header_len);
+
+ factor = large_mss / mss_now;
+
+ /* Always keep large mss multiple of real mss, but
+ * do not exceed 1/tso_win_divisor of the congestion window
+ * so we can keep the ACK clock ticking and minimize
+ * bursting.
+ */
+ limit = tp->snd_cwnd;
+ if (sysctl_tcp_tso_win_divisor)
+ limit /= sysctl_tcp_tso_win_divisor;
+ limit = max(1U, limit);
+ if (factor > limit)
+ factor = limit;
+
+ tp->mss_cache = mss_now * factor;
+
+ mss_now = tp->mss_cache;
+ }
+
+ if (tp->rx_opt.eff_sacks)
+ mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
+ (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
+ return mss_now;
+}
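+
+/* Sketch of the TSO branch above under assumed conditions: with
+ * mss_now = 1448, snd_cwnd = 40, the default tso_win_divisor of 8 and a
+ * peer window large enough that the max_window clamp is idle, large_mss
+ * allows a factor of about 65535/1448 = 45, but the limit snd_cwnd/8 = 5
+ * wins, so mss_cache becomes 5 * 1448 = 7240 bytes per TSO frame.
+ */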
+
+/* This routine writes packets to the network. It advances the
+ * send_head. This happens as incoming acks open up the remote
+ * window for us.
+ *
+ * Returns 1, if no segments are in flight and we have queued segments, but
+ * cannot send anything now because of SWS or another problem.
+ */
+int tcp_write_xmit(struct sock *sk, int nonagle)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned int mss_now;
+
+ /* If we are closed, the bytes will have to remain here.
+ * In time closedown will finish, we empty the write queue and all
+ * will be happy.
+ */
+ if (sk->sk_state != TCP_CLOSE) {
+ struct sk_buff *skb;
+ int sent_pkts = 0;
+
+ /* Account for SACKS, we may need to fragment due to this.
+ * It is just like the real MSS changing on us midstream.
+ * We also handle things correctly when the user adds some
+ * IP options mid-stream. Silly to do, but cover it.
+ */
+ mss_now = tcp_current_mss(sk, 1);
+
+ while ((skb = sk->sk_send_head) &&
+ tcp_snd_test(tp, skb, mss_now,
+ tcp_skb_is_last(sk, skb) ? nonagle :
+ TCP_NAGLE_PUSH)) {
+ if (skb->len > mss_now) {
+ if (tcp_fragment(sk, skb, mss_now))
+ break;
+ }
+
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
+ tcp_tso_set_push(skb);
+ if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
+ break;
+
+ /* Advance the send_head. This one is sent out.
+ * This call will increment packets_out.
+ */
+ update_send_head(sk, tp, skb);
+
+ tcp_minshall_update(tp, mss_now, skb);
+ sent_pkts = 1;
+ }
+
+ if (sent_pkts) {
+ tcp_cwnd_validate(sk, tp);
+ return 0;
+ }
+
+ return !tp->packets_out && sk->sk_send_head;
+ }
+ return 0;
+}
+
+/* This function returns the amount that we can raise the
+ * usable window based on the following constraints
+ *
+ * 1. The window can never be shrunk once it is offered (RFC 793)
+ * 2. We limit memory per socket
+ *
+ * RFC 1122:
+ * "the suggested [SWS] avoidance algorithm for the receiver is to keep
+ * RECV.NEXT + RCV.WIN fixed until:
+ * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
+ *
+ * i.e. don't raise the right edge of the window until you can raise
+ * it at least MSS bytes.
+ *
+ * Unfortunately, the recommended algorithm breaks header prediction,
+ * since header prediction assumes th->window stays fixed.
+ *
+ * Strictly speaking, keeping th->window fixed violates the receiver
+ * side SWS prevention criteria. The problem is that under this rule
+ * a stream of single byte packets will cause the right side of the
+ * window to always advance by a single byte.
+ *
+ * Of course, if the sender implements sender side SWS prevention
+ * then this will not be a problem.
+ *
+ * BSD seems to make the following compromise:
+ *
+ * If the free space is less than the 1/4 of the maximum
+ * space available and the free space is less than 1/2 mss,
+ * then set the window to 0.
+ * [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
+ * Otherwise, just prevent the window from shrinking
+ * and from being larger than the largest representable value.
+ *
+ * This prevents incremental opening of the window in the regime
+ * where TCP is limited by the speed of the reader side taking
+ * data out of the TCP receive queue. It does nothing about
+ * those cases where the window is constrained on the sender side
+ * because the pipeline is full.
+ *
+ * BSD also seems to "accidentally" limit itself to windows that are a
+ * multiple of MSS, at least until the free space gets quite small.
+ * This would appear to be a side effect of the mbuf implementation.
+ * Combining these two algorithms results in the observed behavior
+ * of having a fixed window size at almost all times.
+ *
+ * Below we obtain similar behavior by forcing the offered window to
+ * a multiple of the mss when it is feasible to do so.
+ *
+ * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
+ * Regular options like TIMESTAMP are taken into account.
+ */
+u32 __tcp_select_window(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+	/* MSS for the peer's data. Previous versions used mss_clamp
+	 * here. I don't know if the value based on our guesses
+	 * of the peer's MSS is better for performance. It's more correct
+	 * but may be worse for performance because of rcv_mss
+	 * fluctuations. --SAW 1998/11/1
+ */
+ int mss = tp->ack.rcv_mss;
+ int free_space = tcp_space(sk);
+ int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
+ int window;
+
+ if (mss > full_space)
+ mss = full_space;
+
+ if (free_space < full_space/2) {
+ tp->ack.quick = 0;
+
+ if (tcp_memory_pressure)
+ tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);
+
+ if (free_space < mss)
+ return 0;
+ }
+
+ if (free_space > tp->rcv_ssthresh)
+ free_space = tp->rcv_ssthresh;
+
+ /* Don't do rounding if we are using window scaling, since the
+ * scaled window will not line up with the MSS boundary anyway.
+ */
+ window = tp->rcv_wnd;
+ if (tp->rx_opt.rcv_wscale) {
+ window = free_space;
+
+ /* Advertise enough space so that it won't get scaled away.
+		 * Important case: prevent zero window announcement if
+ * 1<<rcv_wscale > mss.
+ */
+ if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window)
+ window = (((window >> tp->rx_opt.rcv_wscale) + 1)
+ << tp->rx_opt.rcv_wscale);
+ } else {
+ /* Get the largest window that is a nice multiple of mss.
+ * Window clamp already applied above.
+ * If our current window offering is within 1 mss of the
+ * free space we just keep it. This prevents the divide
+ * and multiply from happening most of the time.
+ * We also don't do any window rounding when the free space
+ * is too small.
+ */
+ if (window <= free_space - mss || window > free_space)
+ window = (free_space/mss)*mss;
+ }
+
+ return window;
+}
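+
+/* Worked example of the rounding above, with hypothetical values: without
+ * window scaling, free_space = 10000 and mss = 1460 give
+ * window = (10000 / 1460) * 1460 = 8760, i.e. the offered window is
+ * trimmed to six full segments.
+ */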
+
+/* Attempt to collapse two adjacent SKB's during retransmission. */
+static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *next_skb = skb->next;
+
+ /* The first test we must make is that neither of these two
+ * SKB's are still referenced by someone else.
+ */
+ if (!skb_cloned(skb) && !skb_cloned(next_skb)) {
+ int skb_size = skb->len, next_skb_size = next_skb->len;
+ u16 flags = TCP_SKB_CB(skb)->flags;
+
+ /* Also punt if next skb has been SACK'd. */
+ if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
+ return;
+
+ /* Next skb is out of window. */
+ if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd))
+ return;
+
+ /* Punt if not enough space exists in the first SKB for
+ * the data in the second, or the total combined payload
+ * would exceed the MSS.
+ */
+ if ((next_skb_size > skb_tailroom(skb)) ||
+ ((skb_size + next_skb_size) > mss_now))
+ return;
+
+ BUG_ON(tcp_skb_pcount(skb) != 1 ||
+ tcp_skb_pcount(next_skb) != 1);
+
+ /* Ok. We will be able to collapse the packet. */
+ __skb_unlink(next_skb, next_skb->list);
+
+ memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
+
+ if (next_skb->ip_summed == CHECKSUM_HW)
+ skb->ip_summed = CHECKSUM_HW;
+
+ if (skb->ip_summed != CHECKSUM_HW)
+ skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);
+
+ /* Update sequence range on original skb. */
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
+
+ /* Merge over control information. */
+ flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
+ TCP_SKB_CB(skb)->flags = flags;
+
+ /* All done, get rid of second SKB and account for it so
+ * packet counting does not break.
+ */
+ TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
+ if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS)
+ tp->retrans_out -= tcp_skb_pcount(next_skb);
+ if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) {
+ tp->lost_out -= tcp_skb_pcount(next_skb);
+ tp->left_out -= tcp_skb_pcount(next_skb);
+ }
+ /* Reno case is special. Sigh... */
+ if (!tp->rx_opt.sack_ok && tp->sacked_out) {
+ tcp_dec_pcount_approx(&tp->sacked_out, next_skb);
+ tp->left_out -= tcp_skb_pcount(next_skb);
+ }
+
+ /* Not quite right: it can be > snd.fack, but
+ * it is better to underestimate fackets.
+ */
+ tcp_dec_pcount_approx(&tp->fackets_out, next_skb);
+ tcp_packets_out_dec(tp, next_skb);
+ sk_stream_free_skb(sk, next_skb);
+ }
+}
+
+/* Do a simple retransmit without using the backoff mechanisms in
+ * tcp_timer. This is used for path mtu discovery.
+ * The socket is already locked here.
+ */
+void tcp_simple_retransmit(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ unsigned int mss = tcp_current_mss(sk, 0);
+ int lost = 0;
+
+ sk_stream_for_retrans_queue(skb, sk) {
+ if (skb->len > mss &&
+ !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
+ if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+ tp->retrans_out -= tcp_skb_pcount(skb);
+ }
+ if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) {
+ TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+ tp->lost_out += tcp_skb_pcount(skb);
+ lost = 1;
+ }
+ }
+ }
+
+ if (!lost)
+ return;
+
+ tcp_sync_left_out(tp);
+
+ /* Don't muck with the congestion window here.
+ * Reason is that we do not increase amount of _data_
+ * in network, but units changed and effective
+ * cwnd/ssthresh really reduced now.
+ */
+ if (tp->ca_state != TCP_CA_Loss) {
+ tp->high_seq = tp->snd_nxt;
+ tp->snd_ssthresh = tcp_current_ssthresh(tp);
+ tp->prior_ssthresh = 0;
+ tp->undo_marker = 0;
+ tcp_set_ca_state(tp, TCP_CA_Loss);
+ }
+ tcp_xmit_retransmit_queue(sk);
+}
+
+/* This retransmits one SKB. Policy decisions and retransmit queue
+ * state updates are done by the caller. Returns non-zero if an
+ * error occurred which prevented the send.
+ */
+int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned int cur_mss = tcp_current_mss(sk, 0);
+ int err;
+
+	/* Do not send more than we queued. 1/4 is reserved for possible
+	 * copying overhead: fragmentation, tunneling, mangling etc.
+ */
+ if (atomic_read(&sk->sk_wmem_alloc) >
+ min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
+ return -EAGAIN;
+
+ if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
+ if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+ BUG();
+
+ if (sk->sk_route_caps & NETIF_F_TSO) {
+ sk->sk_route_caps &= ~NETIF_F_TSO;
+ sock_set_flag(sk, SOCK_NO_LARGESEND);
+ tp->mss_cache = tp->mss_cache_std;
+ }
+
+ if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
+ return -ENOMEM;
+ }
+
+ /* If receiver has shrunk his window, and skb is out of
+ * new window, do not retransmit it. The exception is the
+ * case, when window is shrunk to zero. In this case
+ * our retransmit serves as a zero window probe.
+ */
+ if (!before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)
+ && TCP_SKB_CB(skb)->seq != tp->snd_una)
+ return -EAGAIN;
+
+ if (skb->len > cur_mss) {
+ int old_factor = tcp_skb_pcount(skb);
+ int new_factor;
+
+ if (tcp_fragment(sk, skb, cur_mss))
+ return -ENOMEM; /* We'll try again later. */
+
+ /* New SKB created, account for it. */
+ new_factor = tcp_skb_pcount(skb);
+ tp->packets_out -= old_factor - new_factor;
+ tp->packets_out += tcp_skb_pcount(skb->next);
+ }
+
+ /* Collapse two adjacent packets if worthwhile and we can. */
+ if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
+ (skb->len < (cur_mss >> 1)) &&
+ (skb->next != sk->sk_send_head) &&
+ (skb->next != (struct sk_buff *)&sk->sk_write_queue) &&
+ (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(skb->next)->nr_frags == 0) &&
+ (tcp_skb_pcount(skb) == 1 && tcp_skb_pcount(skb->next) == 1) &&
+ (sysctl_tcp_retrans_collapse != 0))
+ tcp_retrans_try_collapse(sk, skb, cur_mss);
+
+ if(tp->af_specific->rebuild_header(sk))
+ return -EHOSTUNREACH; /* Routing failure or similar. */
+
+ /* Some Solaris stacks overoptimize and ignore the FIN on a
+ * retransmit when old data is attached. So strip it off
+ * since it is cheap to do so and saves bytes on the network.
+ */
+ if(skb->len > 0 &&
+ (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
+ tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
+ if (!pskb_trim(skb, 0)) {
+ TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
+ skb_shinfo(skb)->tso_segs = 1;
+ skb_shinfo(skb)->tso_size = 0;
+ skb->ip_summed = CHECKSUM_NONE;
+ skb->csum = 0;
+ }
+ }
+
+ /* Make a copy, if the first transmission SKB clone we made
+ * is still in somebody's hands, else make a clone.
+ */
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
+ tcp_tso_set_push(skb);
+
+ err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
+ pskb_copy(skb, GFP_ATOMIC):
+ skb_clone(skb, GFP_ATOMIC)));
+
+ if (err == 0) {
+ /* Update global TCP statistics. */
+ TCP_INC_STATS(TCP_MIB_RETRANSSEGS);
+
+ tp->total_retrans++;
+
+#if FASTRETRANS_DEBUG > 0
+ if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
+ if (net_ratelimit())
+ printk(KERN_DEBUG "retrans_out leaked.\n");
+ }
+#endif
+ TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
+ tp->retrans_out += tcp_skb_pcount(skb);
+
+ /* Save stamp of the first retransmit. */
+ if (!tp->retrans_stamp)
+ tp->retrans_stamp = TCP_SKB_CB(skb)->when;
+
+ tp->undo_retrans++;
+
+ /* snd_nxt is stored to detect loss of retransmitted segment,
+ * see tcp_input.c tcp_sacktag_write_queue().
+ */
+ TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
+ }
+ return err;
+}
+
+/* This gets called after a retransmit timeout, and the initially
+ * retransmitted data is acknowledged. It tries to continue
+ * resending the rest of the retransmit queue, until either
+ * we've sent it all or the congestion window limit is reached.
+ * If doing SACK, the first ACK which comes back for a timeout
+ * based retransmit packet might feed us FACK information again.
+ * If so, we use it to avoid unnecessary retransmissions.
+ */
+void tcp_xmit_retransmit_queue(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ int packet_cnt = tp->lost_out;
+
+ /* First pass: retransmit lost packets. */
+ if (packet_cnt) {
+ sk_stream_for_retrans_queue(skb, sk) {
+ __u8 sacked = TCP_SKB_CB(skb)->sacked;
+
+ /* Assume this retransmit will generate
+ * only one packet for congestion window
+ * calculation purposes. This works because
+ * tcp_retransmit_skb() will chop up the
+ * packet to be MSS sized and all the
+ * packet counting works out.
+ */
+ if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
+ return;
+
+ if (sacked&TCPCB_LOST) {
+ if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
+ if (tcp_retransmit_skb(sk, skb))
+ return;
+ if (tp->ca_state != TCP_CA_Loss)
+ NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
+ else
+ NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS);
+
+ if (skb ==
+ skb_peek(&sk->sk_write_queue))
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ }
+
+ packet_cnt -= tcp_skb_pcount(skb);
+ if (packet_cnt <= 0)
+ break;
+ }
+ }
+ }
+
+ /* OK, demanded retransmission is finished. */
+
+ /* Forward retransmissions are possible only during Recovery. */
+ if (tp->ca_state != TCP_CA_Recovery)
+ return;
+
+ /* No forward retransmissions in Reno are possible. */
+ if (!tp->rx_opt.sack_ok)
+ return;
+
+	/* Yeah, we have to make a difficult choice between forward transmission
+ * and retransmission... Both ways have their merits...
+ *
+ * For now we do not retransmit anything, while we have some new
+ * segments to send.
+ */
+
+ if (tcp_may_send_now(sk, tp))
+ return;
+
+ packet_cnt = 0;
+
+ sk_stream_for_retrans_queue(skb, sk) {
+ /* Similar to the retransmit loop above we
+ * can pretend that the retransmitted SKB
+ * we send out here will be composed of one
+ * real MSS sized packet because tcp_retransmit_skb()
+ * will fragment it if necessary.
+ */
+ if (++packet_cnt > tp->fackets_out)
+ break;
+
+ if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
+ break;
+
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
+ continue;
+
+ /* Ok, retransmit it. */
+ if (tcp_retransmit_skb(sk, skb))
+ break;
+
+ if (skb == skb_peek(&sk->sk_write_queue))
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+
+ NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS);
+ }
+}
+
+
+/* Send a fin. The caller locks the socket for us. This cannot be
+ * allowed to fail queueing a FIN frame under any circumstances.
+ */
+void tcp_send_fin(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb = skb_peek_tail(&sk->sk_write_queue);
+ int mss_now;
+
+ /* Optimization, tack on the FIN if we have a queue of
+ * unsent frames. But be careful about outgoing SACKS
+ * and IP options.
+ */
+ mss_now = tcp_current_mss(sk, 1);
+
+ if (sk->sk_send_head != NULL) {
+ TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
+ TCP_SKB_CB(skb)->end_seq++;
+ tp->write_seq++;
+ } else {
+ /* Socket is locked, keep trying until memory is available. */
+ for (;;) {
+ skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
+ if (skb)
+ break;
+ yield();
+ }
+
+ /* Reserve space for headers and prepare control bits. */
+ skb_reserve(skb, MAX_TCP_HEADER);
+ skb->csum = 0;
+ TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
+ TCP_SKB_CB(skb)->sacked = 0;
+ skb_shinfo(skb)->tso_segs = 1;
+ skb_shinfo(skb)->tso_size = 0;
+
+ /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
+ TCP_SKB_CB(skb)->seq = tp->write_seq;
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
+ tcp_queue_skb(sk, skb);
+ }
+ __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_OFF);
+}
+
+/* We get here when a process closes a file descriptor (either due to
+ * an explicit close() or as a byproduct of exit()'ing) and there
+ * was unread data in the receive queue. This behavior is recommended
+ * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM
+ */
+void tcp_send_active_reset(struct sock *sk, int priority)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+
+ /* NOTE: No TCP options attached and we never retransmit this. */
+ skb = alloc_skb(MAX_TCP_HEADER, priority);
+ if (!skb) {
+ NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
+ return;
+ }
+
+ /* Reserve space for headers and prepare control bits. */
+ skb_reserve(skb, MAX_TCP_HEADER);
+ skb->csum = 0;
+ TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
+ TCP_SKB_CB(skb)->sacked = 0;
+ skb_shinfo(skb)->tso_segs = 1;
+ skb_shinfo(skb)->tso_size = 0;
+
+ /* Send it off. */
+ TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
+ if (tcp_transmit_skb(sk, skb))
+ NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
+}
+
+/* WARNING: This routine must only be called when we have already sent
+ * a SYN packet that crossed the incoming SYN that caused this routine
+ * to get called. If this assumption fails then the initial rcv_wnd
+ * and rcv_wscale values will not be correct.
+ */
+int tcp_send_synack(struct sock *sk)
+{
+ struct sk_buff* skb;
+
+ skb = skb_peek(&sk->sk_write_queue);
+ if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) {
+ printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
+ return -EFAULT;
+ }
+ if (!(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_ACK)) {
+ if (skb_cloned(skb)) {
+ struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
+ if (nskb == NULL)
+ return -ENOMEM;
+ __skb_unlink(skb, &sk->sk_write_queue);
+ skb_header_release(nskb);
+ __skb_queue_head(&sk->sk_write_queue, nskb);
+ sk_stream_free_skb(sk, skb);
+ sk_charge_skb(sk, nskb);
+ skb = nskb;
+ }
+
+ TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK;
+ TCP_ECN_send_synack(tcp_sk(sk), skb);
+ }
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
+ return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
+}
+
+/*
+ * Prepare a SYN-ACK.
+ */
+struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
+ struct open_request *req)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcphdr *th;
+ int tcp_header_size;
+ struct sk_buff *skb;
+
+ skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
+ if (skb == NULL)
+ return NULL;
+
+ /* Reserve space for headers. */
+ skb_reserve(skb, MAX_TCP_HEADER);
+
+ skb->dst = dst_clone(dst);
+
+ tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
+ (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
+ (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
+ /* SACK_PERM is in the place of NOP NOP of TS */
+ ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
+ skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);
+
+ memset(th, 0, sizeof(struct tcphdr));
+ th->syn = 1;
+ th->ack = 1;
+ if (dst->dev->features&NETIF_F_TSO)
+ req->ecn_ok = 0;
+ TCP_ECN_make_synack(req, th);
+ th->source = inet_sk(sk)->sport;
+ th->dest = req->rmt_port;
+ TCP_SKB_CB(skb)->seq = req->snt_isn;
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
+ TCP_SKB_CB(skb)->sacked = 0;
+ skb_shinfo(skb)->tso_segs = 1;
+ skb_shinfo(skb)->tso_size = 0;
+ th->seq = htonl(TCP_SKB_CB(skb)->seq);
+ th->ack_seq = htonl(req->rcv_isn + 1);
+ if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
+ __u8 rcv_wscale;
+ /* Set this up on the first call only */
+ req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
+ /* tcp_full_space because it is guaranteed to be the first packet */
+ tcp_select_initial_window(tcp_full_space(sk),
+ dst_metric(dst, RTAX_ADVMSS) - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
+ &req->rcv_wnd,
+ &req->window_clamp,
+ req->wscale_ok,
+ &rcv_wscale);
+ req->rcv_wscale = rcv_wscale;
+ }
+
+ /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
+ th->window = htons(req->rcv_wnd);
+
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
+ tcp_syn_build_options((__u32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), req->tstamp_ok,
+ req->sack_ok, req->wscale_ok, req->rcv_wscale,
+ TCP_SKB_CB(skb)->when,
+ req->ts_recent);
+
+ skb->csum = 0;
+ th->doff = (tcp_header_size >> 2);
+ TCP_INC_STATS(TCP_MIB_OUTSEGS);
+ return skb;
+}
+
+/*
+ * Do all connect socket setups that can be done AF independent.
+ */
+static inline void tcp_connect_init(struct sock *sk)
+{
+ struct dst_entry *dst = __sk_dst_get(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ __u8 rcv_wscale;
+
+ /* We'll fix this up when we get a response from the other end.
+ * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
+ */
+ tp->tcp_header_len = sizeof(struct tcphdr) +
+ (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
+
+ /* If user gave his TCP_MAXSEG, record it to clamp */
+ if (tp->rx_opt.user_mss)
+ tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
+ tp->max_window = 0;
+ tcp_sync_mss(sk, dst_mtu(dst));
+
+ if (!tp->window_clamp)
+ tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
+ tp->advmss = dst_metric(dst, RTAX_ADVMSS);
+ tcp_initialize_rcv_mss(sk);
+ tcp_ca_init(tp);
+
+ tcp_select_initial_window(tcp_full_space(sk),
+ tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
+ &tp->rcv_wnd,
+ &tp->window_clamp,
+ sysctl_tcp_window_scaling,
+ &rcv_wscale);
+
+ tp->rx_opt.rcv_wscale = rcv_wscale;
+ tp->rcv_ssthresh = tp->rcv_wnd;
+
+ sk->sk_err = 0;
+ sock_reset_flag(sk, SOCK_DONE);
+ tp->snd_wnd = 0;
+ tcp_init_wl(tp, tp->write_seq, 0);
+ tp->snd_una = tp->write_seq;
+ tp->snd_sml = tp->write_seq;
+ tp->rcv_nxt = 0;
+ tp->rcv_wup = 0;
+ tp->copied_seq = 0;
+
+ tp->rto = TCP_TIMEOUT_INIT;
+ tp->retransmits = 0;
+ tcp_clear_retrans(tp);
+}
+
+/*
+ * Build a SYN and send it off.
+ */
+int tcp_connect(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *buff;
+
+ tcp_connect_init(sk);
+
+ buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation);
+ if (unlikely(buff == NULL))
+ return -ENOBUFS;
+
+ /* Reserve space for headers. */
+ skb_reserve(buff, MAX_TCP_HEADER);
+
+ TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
+ TCP_ECN_send_syn(sk, tp, buff);
+ TCP_SKB_CB(buff)->sacked = 0;
+ skb_shinfo(buff)->tso_segs = 1;
+ skb_shinfo(buff)->tso_size = 0;
+ buff->csum = 0;
+ TCP_SKB_CB(buff)->seq = tp->write_seq++;
+ TCP_SKB_CB(buff)->end_seq = tp->write_seq;
+ tp->snd_nxt = tp->write_seq;
+ tp->pushed_seq = tp->write_seq;
+ tcp_ca_init(tp);
+
+ /* Send it off. */
+ TCP_SKB_CB(buff)->when = tcp_time_stamp;
+ tp->retrans_stamp = TCP_SKB_CB(buff)->when;
+ skb_header_release(buff);
+ __skb_queue_tail(&sk->sk_write_queue, buff);
+ sk_charge_skb(sk, buff);
+ tp->packets_out += tcp_skb_pcount(buff);
+ tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
+ TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
+
+ /* Timer for repeating the SYN until an answer. */
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ return 0;
+}
+
+/* Send out a delayed ack, the caller does the policy checking
+ * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
+ * for details.
+ */
+void tcp_send_delayed_ack(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int ato = tp->ack.ato;
+ unsigned long timeout;
+
+ if (ato > TCP_DELACK_MIN) {
+ int max_ato = HZ/2;
+
+ if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED))
+ max_ato = TCP_DELACK_MAX;
+
+ /* Slow path, intersegment interval is "high". */
+
+ /* If some rtt estimate is known, use it to bound delayed ack.
+ * Do not use tp->rto here, use results of rtt measurements
+ * directly.
+ */
+ if (tp->srtt) {
+ int rtt = max(tp->srtt>>3, TCP_DELACK_MIN);
+
+ if (rtt < max_ato)
+ max_ato = rtt;
+ }
+
+ ato = min(ato, max_ato);
+ }
+
+ /* Stay within the limit we were given */
+ timeout = jiffies + ato;
+
+	/* Use the new timeout only if there wasn't an older one earlier. */
+ if (tp->ack.pending&TCP_ACK_TIMER) {
+ /* If delack timer was blocked or is about to expire,
+ * send ACK now.
+ */
+ if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) {
+ tcp_send_ack(sk);
+ return;
+ }
+
+ if (!time_before(timeout, tp->ack.timeout))
+ timeout = tp->ack.timeout;
+ }
+ tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER;
+ tp->ack.timeout = timeout;
+ sk_reset_timer(sk, &tp->delack_timer, timeout);
+}
+
+/* This routine sends an ack and also updates the window. */
+void tcp_send_ack(struct sock *sk)
+{
+ /* If we have been reset, we may not send again. */
+ if (sk->sk_state != TCP_CLOSE) {
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *buff;
+
+ /* We are not putting this on the write queue, so
+ * tcp_transmit_skb() will set the ownership to this
+ * sock.
+ */
+ buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
+ if (buff == NULL) {
+ tcp_schedule_ack(tp);
+ tp->ack.ato = TCP_ATO_MIN;
+ tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
+ return;
+ }
+
+ /* Reserve space for headers and prepare control bits. */
+ skb_reserve(buff, MAX_TCP_HEADER);
+ buff->csum = 0;
+ TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
+ TCP_SKB_CB(buff)->sacked = 0;
+ skb_shinfo(buff)->tso_segs = 1;
+ skb_shinfo(buff)->tso_size = 0;
+
+ /* Send it off, this clears delayed acks for us. */
+ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
+ TCP_SKB_CB(buff)->when = tcp_time_stamp;
+ tcp_transmit_skb(sk, buff);
+ }
+}
+
+/* This routine sends a packet with an out of date sequence
+ * number. It assumes the other end will try to ack it.
+ *
+ * Question: what should we do in urgent mode?
+ * 4.4BSD forces sending a single byte of data. We cannot send
+ * out-of-window data, because we have SND.NXT==SND.MAX...
+ *
+ * Current solution: send TWO zero-length segments in urgent mode:
+ * one with SEG.SEQ=SND.UNA to deliver the urgent pointer, the other
+ * out-of-date with SND.UNA-1 to probe the window.
+ */
+static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+
+ /* We don't queue it, tcp_transmit_skb() sets ownership. */
+ skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
+ if (skb == NULL)
+ return -1;
+
+ /* Reserve space for headers and set control bits. */
+ skb_reserve(skb, MAX_TCP_HEADER);
+ skb->csum = 0;
+ TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
+ TCP_SKB_CB(skb)->sacked = urgent;
+ skb_shinfo(skb)->tso_segs = 1;
+ skb_shinfo(skb)->tso_size = 0;
+
+ /* Use a previous sequence. This should cause the other
+ * end to send an ack. Don't queue or clone SKB, just
+ * send it.
+ */
+ TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
+ return tcp_transmit_skb(sk, skb);
+}
+
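+/* Push pending data or a probe when the peer's window may have opened:
+ * if the next queued segment starts inside the send window, fragment it
+ * to fit if necessary, mark it PSH and transmit it; otherwise send a
+ * zero-length probe with an old sequence number (tcp_xmit_probe_skb) to
+ * provoke an ACK.
+ */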
+int tcp_write_wakeup(struct sock *sk)
+{
+ if (sk->sk_state != TCP_CLOSE) {
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+
+ if ((skb = sk->sk_send_head) != NULL &&
+ before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) {
+ int err;
+ unsigned int mss = tcp_current_mss(sk, 0);
+ unsigned int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq;
+
+ if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
+ tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
+
+ /* We are probing the opening of a window
+			 * but the window size is != 0; this
+			 * must have been a result of SWS avoidance (sender side).
+ */
+ if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
+ skb->len > mss) {
+ seg_size = min(seg_size, mss);
+ TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
+ if (tcp_fragment(sk, skb, seg_size))
+ return -1;
+ /* SWS override triggered forced fragmentation.
+ * Disable TSO, the connection is too sick. */
+ if (sk->sk_route_caps & NETIF_F_TSO) {
+ sock_set_flag(sk, SOCK_NO_LARGESEND);
+ sk->sk_route_caps &= ~NETIF_F_TSO;
+ tp->mss_cache = tp->mss_cache_std;
+ }
+ } else if (!tcp_skb_pcount(skb))
+ tcp_set_skb_tso_segs(skb, tp->mss_cache_std);
+
+ TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
+ tcp_tso_set_push(skb);
+ err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
+ if (!err) {
+ update_send_head(sk, tp, skb);
+ }
+ return err;
+ } else {
+ if (tp->urg_mode &&
+ between(tp->snd_up, tp->snd_una+1, tp->snd_una+0xFFFF))
+ tcp_xmit_probe_skb(sk, TCPCB_URG);
+ return tcp_xmit_probe_skb(sk, 0);
+ }
+ }
+ return -1;
+}
+
+/* A window probe timeout has occurred. If the window is not closed, send
+ * a partial packet, else a zero window probe.
+ */
+void tcp_send_probe0(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int err;
+
+ err = tcp_write_wakeup(sk);
+
+ if (tp->packets_out || !sk->sk_send_head) {
+ /* Cancel probe timer, if it is not required. */
+ tp->probes_out = 0;
+ tp->backoff = 0;
+ return;
+ }
+
+ if (err <= 0) {
+ if (tp->backoff < sysctl_tcp_retries2)
+ tp->backoff++;
+ tp->probes_out++;
+ tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
+ min(tp->rto << tp->backoff, TCP_RTO_MAX));
+ } else {
+ /* If packet was not sent due to local congestion,
+ * do not backoff and do not remember probes_out.
+		 * Let local senders fight for local resources.
+		 *
+		 * Still use the accumulated backoff, though.
+ */
+ if (!tp->probes_out)
+ tp->probes_out=1;
+ tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
+ min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
+ }
+}
+
+EXPORT_SYMBOL(tcp_connect);
+EXPORT_SYMBOL(tcp_make_synack);
+EXPORT_SYMBOL(tcp_simple_retransmit);
+EXPORT_SYMBOL(tcp_sync_mss);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
new file mode 100644
index 00000000000..85b279f1e93
--- /dev/null
+++ b/net/ipv4/tcp_timer.c
@@ -0,0 +1,656 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Version: $Id: tcp_timer.c,v 1.88 2002/02/01 22:01:04 davem Exp $
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ * Corey Minyard <wf-rch!minyard@relay.EU.net>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ * Linus Torvalds, <torvalds@cs.helsinki.fi>
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Matthew Dillon, <dillon@apollo.west.oic.com>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+#include <linux/module.h>
+#include <net/tcp.h>
+
+int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
+int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
+int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
+int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
+int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
+int sysctl_tcp_retries1 = TCP_RETR1;
+int sysctl_tcp_retries2 = TCP_RETR2;
+int sysctl_tcp_orphan_retries;
+
+static void tcp_write_timer(unsigned long);
+static void tcp_delack_timer(unsigned long);
+static void tcp_keepalive_timer (unsigned long data);
+
+#ifdef TCP_DEBUG
+const char tcp_timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";
+EXPORT_SYMBOL(tcp_timer_bug_msg);
+#endif
+
+/*
+ * Using different timers for retransmit, delayed acks and probes
+ * We may wish to use just one timer maintaining a list of expiry jiffies
+ * to optimize.
+ */
+
+void tcp_init_xmit_timers(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ init_timer(&tp->retransmit_timer);
+ tp->retransmit_timer.function=&tcp_write_timer;
+ tp->retransmit_timer.data = (unsigned long) sk;
+ tp->pending = 0;
+
+ init_timer(&tp->delack_timer);
+ tp->delack_timer.function=&tcp_delack_timer;
+ tp->delack_timer.data = (unsigned long) sk;
+ tp->ack.pending = 0;
+
+ init_timer(&sk->sk_timer);
+ sk->sk_timer.function = &tcp_keepalive_timer;
+ sk->sk_timer.data = (unsigned long)sk;
+}
+
+void tcp_clear_xmit_timers(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tp->pending = 0;
+ sk_stop_timer(sk, &tp->retransmit_timer);
+
+ tp->ack.pending = 0;
+ tp->ack.blocked = 0;
+ sk_stop_timer(sk, &tp->delack_timer);
+
+ sk_stop_timer(sk, &sk->sk_timer);
+}
+
+static void tcp_write_err(struct sock *sk)
+{
+ sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
+ sk->sk_error_report(sk);
+
+ tcp_done(sk);
+ NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
+}
+
+/* Do not allow orphaned sockets to eat all our resources.
+ * This is a direct violation of the TCP specs, but it is required
+ * to prevent DoS attacks. It is called when a retransmission timeout
+ * or zero window probe timeout occurs on an orphaned socket.
+ *
+ * The criterion is still not confirmed experimentally and may change.
+ * We kill the socket if:
+ * 1. The number of orphaned sockets exceeds an administratively configured
+ *    limit.
+ * 2. We are under strong memory pressure.
+ */
+static int tcp_out_of_resources(struct sock *sk, int do_reset)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int orphans = atomic_read(&tcp_orphan_count);
+
+	/* If the peer does not open its window for a long time, or did not
+	 * transmit anything for a long time, penalize it. */
+ if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
+ orphans <<= 1;
+
+ /* If some dubious ICMP arrived, penalize even more. */
+ if (sk->sk_err_soft)
+ orphans <<= 1;
+
+ if (orphans >= sysctl_tcp_max_orphans ||
+ (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
+ atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
+ if (net_ratelimit())
+ printk(KERN_INFO "Out of socket memory\n");
+
+ /* Catch exceptional cases, when connection requires reset.
+ * 1. Last segment was sent recently. */
+ if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
+ /* 2. Window is closed. */
+ (!tp->snd_wnd && !tp->packets_out))
+ do_reset = 1;
+ if (do_reset)
+ tcp_send_active_reset(sk, GFP_ATOMIC);
+ tcp_done(sk);
+ NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
+ return 1;
+ }
+ return 0;
+}
+
+/* Calculate the maximal number of retries on an orphaned socket. */
+static int tcp_orphan_retries(struct sock *sk, int alive)
+{
+ int retries = sysctl_tcp_orphan_retries; /* May be zero. */
+
+ /* We know from an ICMP that something is wrong. */
+ if (sk->sk_err_soft && !alive)
+ retries = 0;
+
+ /* However, if socket sent something recently, select some safe
+ * number of retries. 8 corresponds to >100 seconds with minimal
+ * RTO of 200msec. */
+ if (retries == 0 && alive)
+ retries = 8;
+ return retries;
+}
+
+/* A write timeout has occurred. Process the after effects. */
+static int tcp_write_timeout(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int retry_until;
+
+ if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+ if (tp->retransmits)
+ dst_negative_advice(&sk->sk_dst_cache);
+ retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries;
+ } else {
+ if (tp->retransmits >= sysctl_tcp_retries1) {
+ /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
+ hole detection. :-(
+
+			   This would be the place to implement it, but it is not
+			   implemented; I do not want to implement it. It is
+			   disgusting. It does not work in any case. Let me cite
+			   the same draft, which requires us to implement this:
+
+ "The one security concern raised by this memo is that ICMP black holes
+ are often caused by over-zealous security administrators who block
+ all ICMP messages. It is vitally important that those who design and
+ deploy security systems understand the impact of strict filtering on
+ upper-layer protocols. The safest web site in the world is worthless
+ if most TCP implementations cannot transfer data from it. It would
+ be far nicer to have all of the black holes fixed rather than fixing
+ all of the TCP implementations."
+
+ Golden words :-).
+ */
+
+ dst_negative_advice(&sk->sk_dst_cache);
+ }
+
+ retry_until = sysctl_tcp_retries2;
+ if (sock_flag(sk, SOCK_DEAD)) {
+ int alive = (tp->rto < TCP_RTO_MAX);
+
+ retry_until = tcp_orphan_retries(sk, alive);
+
+ if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until))
+ return 1;
+ }
+ }
+
+ if (tp->retransmits >= retry_until) {
+ /* Has it gone just too far? */
+ tcp_write_err(sk);
+ return 1;
+ }
+ return 0;
+}
+
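+/* Delayed ACK timer handler. If the socket is owned by the user, mark the
+ * timer as blocked and retry shortly; otherwise drain the prequeue and, if
+ * an ACK is still scheduled, send it, inflating the ATO (or dropping out of
+ * pingpong mode) so that repeatedly missed delayed ACKs back off.
+ */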
+static void tcp_delack_timer(unsigned long data)
+{
+ struct sock *sk = (struct sock*)data;
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk)) {
+ /* Try again later. */
+ tp->ack.blocked = 1;
+ NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
+ sk_reset_timer(sk, &tp->delack_timer, jiffies + TCP_DELACK_MIN);
+ goto out_unlock;
+ }
+
+ sk_stream_mem_reclaim(sk);
+
+ if (sk->sk_state == TCP_CLOSE || !(tp->ack.pending & TCP_ACK_TIMER))
+ goto out;
+
+ if (time_after(tp->ack.timeout, jiffies)) {
+ sk_reset_timer(sk, &tp->delack_timer, tp->ack.timeout);
+ goto out;
+ }
+ tp->ack.pending &= ~TCP_ACK_TIMER;
+
+ if (skb_queue_len(&tp->ucopy.prequeue)) {
+ struct sk_buff *skb;
+
+ NET_ADD_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED,
+ skb_queue_len(&tp->ucopy.prequeue));
+
+ while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
+ sk->sk_backlog_rcv(sk, skb);
+
+ tp->ucopy.memory = 0;
+ }
+
+ if (tcp_ack_scheduled(tp)) {
+ if (!tp->ack.pingpong) {
+ /* Delayed ACK missed: inflate ATO. */
+ tp->ack.ato = min(tp->ack.ato << 1, tp->rto);
+ } else {
+ /* Delayed ACK missed: leave pingpong mode and
+ * deflate ATO.
+ */
+ tp->ack.pingpong = 0;
+ tp->ack.ato = TCP_ATO_MIN;
+ }
+ tcp_send_ack(sk);
+ NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
+ }
+ TCP_CHECK_TIMER(sk);
+
+out:
+ if (tcp_memory_pressure)
+ sk_stream_mem_reclaim(sk);
+out_unlock:
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+static void tcp_probe_timer(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int max_probes;
+
+ if (tp->packets_out || !sk->sk_send_head) {
+ tp->probes_out = 0;
+ return;
+ }
+
+ /* *WARNING* RFC 1122 forbids this
+ *
+ * It doesn't AFAIK, because we kill the retransmit timer -AK
+ *
+ * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
+ * this behaviour in Solaris down as a bug fix. [AC]
+ *
+	 * Let me explain. probes_out is zeroed by incoming ACKs
+	 * even if they advertise a zero window. Hence, the connection is killed
+	 * only if we received no ACKs for the normal connection timeout. It is
+	 * not killed merely because the window stays zero for some time; the
+	 * window may stay zero until armageddon and even later. We are in full
+	 * accordance with the RFCs; only the probe timer combines both the
+	 * retransmission timeout and the probe timeout in one bottle. --ANK
+ */
+ max_probes = sysctl_tcp_retries2;
+
+ if (sock_flag(sk, SOCK_DEAD)) {
+ int alive = ((tp->rto<<tp->backoff) < TCP_RTO_MAX);
+
+ max_probes = tcp_orphan_retries(sk, alive);
+
+ if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes))
+ return;
+ }
+
+ if (tp->probes_out > max_probes) {
+ tcp_write_err(sk);
+ } else {
+ /* Only send another probe if we didn't close things up. */
+ tcp_send_probe0(sk);
+ }
+}
+
+/*
+ * The TCP retransmit timer.
+ */
+
+static void tcp_retransmit_timer(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!tp->packets_out)
+ goto out;
+
+ BUG_TRAP(!skb_queue_empty(&sk->sk_write_queue));
+
+ if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
+ !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
+ /* Receiver dastardly shrinks window. Our retransmits
+		 * become zero probes, but we should not time out this
+		 * connection. If the socket is an orphan, time it out; we
+		 * cannot allow such beasts to hang infinitely.
+ */
+#ifdef TCP_DEBUG
+ if (net_ratelimit()) {
+ struct inet_sock *inet = inet_sk(sk);
+ printk(KERN_DEBUG "TCP: Treason uncloaked! Peer %u.%u.%u.%u:%u/%u shrinks window %u:%u. Repaired.\n",
+ NIPQUAD(inet->daddr), htons(inet->dport),
+ inet->num, tp->snd_una, tp->snd_nxt);
+ }
+#endif
+ if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {
+ tcp_write_err(sk);
+ goto out;
+ }
+ tcp_enter_loss(sk, 0);
+ tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
+ __sk_dst_reset(sk);
+ goto out_reset_timer;
+ }
+
+ if (tcp_write_timeout(sk))
+ goto out;
+
+ if (tp->retransmits == 0) {
+ if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) {
+ if (tp->rx_opt.sack_ok) {
+ if (tp->ca_state == TCP_CA_Recovery)
+ NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL);
+ else
+ NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES);
+ } else {
+ if (tp->ca_state == TCP_CA_Recovery)
+ NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL);
+ else
+ NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES);
+ }
+ } else if (tp->ca_state == TCP_CA_Loss) {
+ NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES);
+ } else {
+ NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS);
+ }
+ }
+
+ if (tcp_use_frto(sk)) {
+ tcp_enter_frto(sk);
+ } else {
+ tcp_enter_loss(sk, 0);
+ }
+
+ if (tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)) > 0) {
+ /* Retransmission failed because of local congestion,
+ * do not backoff.
+ */
+ if (!tp->retransmits)
+ tp->retransmits=1;
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS,
+ min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL));
+ goto out;
+ }
+
+ /* Increase the timeout each time we retransmit. Note that
+ * we do not increase the rtt estimate. rto is initialized
+ * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
+ * that doubling rto each time is the least we can get away with.
+ * In KA9Q, Karn uses this for the first few times, and then
+ * goes to quadratic. netBSD doubles, but only goes up to *64,
+ * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
+ * defined in the protocol as the maximum possible RTT. I guess
+ * we'll have to use something other than TCP to talk to the
+ * University of Mars.
+ *
+ * PAWS allows us longer timeouts and large windows, so once
+ * implemented ftp to mars will work nicely. We will have to fix
+ * the 120 second clamps though!
+ */
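+	/* For illustration (assuming the usual TCP_RTO_MAX of 120 seconds and
+	 * an initial RTO of 3 seconds), the doubling below gives timeouts of
+	 * roughly 3, 6, 12, 24, 48 and 96 seconds, then clamps at 120.
+	 */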
+ tp->backoff++;
+ tp->retransmits++;
+
+out_reset_timer:
+ tp->rto = min(tp->rto << 1, TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ if (tp->retransmits > sysctl_tcp_retries1)
+ __sk_dst_reset(sk);
+
+out:;
+}
+
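+/* Common retransmit/probe timer handler: defer if the socket is locked by
+ * the user, otherwise dispatch the pending event to tcp_retransmit_timer()
+ * or tcp_probe_timer().
+ */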
+static void tcp_write_timer(unsigned long data)
+{
+ struct sock *sk = (struct sock*)data;
+ struct tcp_sock *tp = tcp_sk(sk);
+ int event;
+
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk)) {
+ /* Try again later */
+ sk_reset_timer(sk, &tp->retransmit_timer, jiffies + (HZ / 20));
+ goto out_unlock;
+ }
+
+ if (sk->sk_state == TCP_CLOSE || !tp->pending)
+ goto out;
+
+ if (time_after(tp->timeout, jiffies)) {
+ sk_reset_timer(sk, &tp->retransmit_timer, tp->timeout);
+ goto out;
+ }
+
+ event = tp->pending;
+ tp->pending = 0;
+
+ switch (event) {
+ case TCP_TIME_RETRANS:
+ tcp_retransmit_timer(sk);
+ break;
+ case TCP_TIME_PROBE0:
+ tcp_probe_timer(sk);
+ break;
+ }
+ TCP_CHECK_TIMER(sk);
+
+out:
+ sk_stream_mem_reclaim(sk);
+out_unlock:
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+/*
+ * Timer for listening sockets
+ */
+
+static void tcp_synack_timer(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_listen_opt *lopt = tp->listen_opt;
+ int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
+ int thresh = max_retries;
+ unsigned long now = jiffies;
+ struct open_request **reqp, *req;
+ int i, budget;
+
+ if (lopt == NULL || lopt->qlen == 0)
+ return;
+
+	/* Normally all the openreqs are young and become mature
+	 * (i.e. converted to an established socket) within the first timeout.
+	 * If a synack was not acknowledged for 3 seconds, it means
+	 * one of the following things: the synack was lost, the ack was lost,
+	 * the rtt is high, or nobody planned to ack (i.e. a synflood).
+	 * When the server is a bit loaded, the queue is populated with old
+	 * open requests, reducing the effective size of the queue.
+	 * When the server is well loaded, the queue size reduces to zero
+	 * after several minutes of work. That is not a synflood,
+	 * it is normal operation. The solution is pruning
+	 * entries that are too old, overriding the normal timeout, when
+	 * the situation becomes dangerous.
+	 *
+	 * Essentially, we reserve half of the room for young
+	 * embryos, and abort old ones without pity if they
+	 * are about to clog our table.
+ */
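+	/* Concretely: once the queue is more than half full, thresh is lowered
+	 * (never below 2) roughly in proportion to log2(qlen / (2 * qlen_young)),
+	 * so old entries get fewer retries as they start to dominate the table.
+	 */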
+ if (lopt->qlen>>(lopt->max_qlen_log-1)) {
+ int young = (lopt->qlen_young<<1);
+
+ while (thresh > 2) {
+ if (lopt->qlen < young)
+ break;
+ thresh--;
+ young <<= 1;
+ }
+ }
+
+ if (tp->defer_accept)
+ max_retries = tp->defer_accept;
+
+ budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL));
+ i = lopt->clock_hand;
+
+ do {
+ reqp=&lopt->syn_table[i];
+ while ((req = *reqp) != NULL) {
+ if (time_after_eq(now, req->expires)) {
+ if ((req->retrans < thresh ||
+ (req->acked && req->retrans < max_retries))
+ && !req->class->rtx_syn_ack(sk, req, NULL)) {
+ unsigned long timeo;
+
+ if (req->retrans++ == 0)
+ lopt->qlen_young--;
+ timeo = min((TCP_TIMEOUT_INIT << req->retrans),
+ TCP_RTO_MAX);
+ req->expires = now + timeo;
+ reqp = &req->dl_next;
+ continue;
+ }
+
+ /* Drop this request */
+ write_lock(&tp->syn_wait_lock);
+ *reqp = req->dl_next;
+ write_unlock(&tp->syn_wait_lock);
+ lopt->qlen--;
+ if (req->retrans == 0)
+ lopt->qlen_young--;
+ tcp_openreq_free(req);
+ continue;
+ }
+ reqp = &req->dl_next;
+ }
+
+ i = (i+1)&(TCP_SYNQ_HSIZE-1);
+
+ } while (--budget > 0);
+
+ lopt->clock_hand = i;
+
+ if (lopt->qlen)
+ tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL);
+}
+
+void tcp_delete_keepalive_timer (struct sock *sk)
+{
+ sk_stop_timer(sk, &sk->sk_timer);
+}
+
+void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len)
+{
+ sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
+}
+
+void tcp_set_keepalive(struct sock *sk, int val)
+{
+ if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
+ return;
+
+ if (val && !sock_flag(sk, SOCK_KEEPOPEN))
+ tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
+ else if (!val)
+ tcp_delete_keepalive_timer(sk);
+}
+
+
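+/* sk->sk_timer handler. For listening sockets this drives the SYN-ACK
+ * retransmit machinery (tcp_synack_timer); for dead FIN_WAIT2 sockets it
+ * enforces the linger2 timeout; otherwise it sends keepalive probes and
+ * resets the connection once too many probes have gone unanswered.
+ */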
+static void tcp_keepalive_timer (unsigned long data)
+{
+ struct sock *sk = (struct sock *) data;
+ struct tcp_sock *tp = tcp_sk(sk);
+ __u32 elapsed;
+
+ /* Only process if socket is not in use. */
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk)) {
+ /* Try again later. */
+ tcp_reset_keepalive_timer (sk, HZ/20);
+ goto out;
+ }
+
+ if (sk->sk_state == TCP_LISTEN) {
+ tcp_synack_timer(sk);
+ goto out;
+ }
+
+ if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
+ if (tp->linger2 >= 0) {
+ int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN;
+
+ if (tmo > 0) {
+ tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
+ goto out;
+ }
+ }
+ tcp_send_active_reset(sk, GFP_ATOMIC);
+ goto death;
+ }
+
+ if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)
+ goto out;
+
+ elapsed = keepalive_time_when(tp);
+
+ /* It is alive without keepalive 8) */
+ if (tp->packets_out || sk->sk_send_head)
+ goto resched;
+
+ elapsed = tcp_time_stamp - tp->rcv_tstamp;
+
+ if (elapsed >= keepalive_time_when(tp)) {
+ if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) ||
+ (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) {
+ tcp_send_active_reset(sk, GFP_ATOMIC);
+ tcp_write_err(sk);
+ goto out;
+ }
+ if (tcp_write_wakeup(sk) <= 0) {
+ tp->probes_out++;
+ elapsed = keepalive_intvl_when(tp);
+ } else {
+ /* If keepalive was lost due to local congestion,
+ * try harder.
+ */
+ elapsed = TCP_RESOURCE_PROBE_INTERVAL;
+ }
+ } else {
+ /* It is tp->rcv_tstamp + keepalive_time_when(tp) */
+ elapsed = keepalive_time_when(tp) - elapsed;
+ }
+
+ TCP_CHECK_TIMER(sk);
+ sk_stream_mem_reclaim(sk);
+
+resched:
+ tcp_reset_keepalive_timer (sk, elapsed);
+ goto out;
+
+death:
+ tcp_done(sk);
+
+out:
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+EXPORT_SYMBOL(tcp_clear_xmit_timers);
+EXPORT_SYMBOL(tcp_delete_keepalive_timer);
+EXPORT_SYMBOL(tcp_init_xmit_timers);
+EXPORT_SYMBOL(tcp_reset_keepalive_timer);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
new file mode 100644
index 00000000000..6baddfbedca
--- /dev/null
+++ b/net/ipv4/udp.c
@@ -0,0 +1,1575 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The User Datagram Protocol (UDP).
+ *
+ * Version: $Id: udp.c,v 1.102 2002/02/01 22:01:04 davem Exp $
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Alan Cox, <Alan.Cox@linux.org>
+ * Hirokazu Takahashi, <taka@valinux.co.jp>
+ *
+ * Fixes:
+ * Alan Cox : verify_area() calls
+ * Alan Cox : stopped close while in use off icmp
+ * messages. Not a fix but a botch that
+ * for udp at least is 'valid'.
+ * Alan Cox : Fixed icmp handling properly
+ * Alan Cox : Correct error for oversized datagrams
+ * Alan Cox : Tidied select() semantics.
+ * Alan Cox : udp_err() fixed properly, also now
+ * select and read wake correctly on errors
+ * Alan Cox : udp_send verify_area moved to avoid mem leak
+ * Alan Cox : UDP can count its memory
+ * Alan Cox : send to an unknown connection causes
+ * an ECONNREFUSED off the icmp, but
+ * does NOT close.
+ * Alan Cox : Switched to new sk_buff handlers. No more backlog!
+ * Alan Cox : Using generic datagram code. Even smaller and the PEEK
+ * bug no longer crashes it.
+ * Fred Van Kempen : Net2e support for sk->broadcast.
+ * Alan Cox : Uses skb_free_datagram
+ * Alan Cox : Added get/set sockopt support.
+ * Alan Cox : Broadcasting without option set returns EACCES.
+ * Alan Cox : No wakeup calls. Instead we now use the callbacks.
+ * Alan Cox : Use ip_tos and ip_ttl
+ * Alan Cox : SNMP Mibs
+ * Alan Cox : MSG_DONTROUTE, and 0.0.0.0 support.
+ * Matt Dillon : UDP length checks.
+ * Alan Cox : Smarter af_inet used properly.
+ * Alan Cox : Use new kernel side addressing.
+ * Alan Cox : Incorrect return on truncated datagram receive.
+ * Arnt Gulbrandsen : New udp_send and stuff
+ * Alan Cox : Cache last socket
+ * Alan Cox : Route cache
+ * Jon Peatfield : Minor efficiency fix to sendto().
+ * Mike Shaver : RFC1122 checks.
+ * Alan Cox : Nonblocking error fix.
+ * Willy Konynenberg : Transparent proxying support.
+ * Mike McLagan : Routing by source
+ * David S. Miller : New socket lookup architecture.
+ * Last socket cache retained as it
+ * does have a high hit rate.
+ * Olaf Kirch : Don't linearise iovec on sendmsg.
+ * Andi Kleen : Some cleanups, cache destination entry
+ * for connect.
+ * Vitaly E. Lavrov : Transparent proxy revived after year coma.
+ * Melvin Smith : Check msg_name not msg_namelen in sendto(),
+ * return ENOTCONN for unconnected sockets (POSIX)
+ * Janos Farkas : don't deliver multi/broadcasts to a different
+ * bound-to-device socket
+ * Hirokazu Takahashi : HW checksumming for outgoing UDP
+ * datagrams.
+ * Hirokazu Takahashi : sendfile() on UDP works now.
+ * Arnaldo C. Melo : convert /proc/net/udp to seq_file
+ * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
+ * Alexey Kuznetsov: allow both IPv4 and IPv6 sockets to bind
+ * a single port at the same time.
+ *	Derek Atkins <derek@ihtfp.com>: Add Encapsulation Support
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/ioctls.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/config.h>
+#include <linux/inet.h>
+#include <linux/ipv6.h>
+#include <linux/netdevice.h>
+#include <net/snmp.h>
+#include <net/tcp.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <net/sock.h>
+#include <net/udp.h>
+#include <net/icmp.h>
+#include <net/route.h>
+#include <net/inet_common.h>
+#include <net/checksum.h>
+#include <net/xfrm.h>
+
+/*
+ * Snmp MIB for the UDP layer
+ */
+
+DEFINE_SNMP_STAT(struct udp_mib, udp_statistics);
+
+struct hlist_head udp_hash[UDP_HTABLE_SIZE];
+DEFINE_RWLOCK(udp_hash_lock);
+
+/* Shared by v4/v6 udp. */
+int udp_port_rover;
+
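+/* Bind the socket to a local UDP port. With snum == 0 a free port is picked
+ * by scanning the hash table from udp_port_rover, preferring short chains;
+ * otherwise the requested port is checked for conflicts with existing
+ * sockets (address, bound device and reuse rules). Returns 0 on success,
+ * 1 if the port cannot be used.
+ */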
+static int udp_v4_get_port(struct sock *sk, unsigned short snum)
+{
+ struct hlist_node *node;
+ struct sock *sk2;
+ struct inet_sock *inet = inet_sk(sk);
+
+ write_lock_bh(&udp_hash_lock);
+ if (snum == 0) {
+ int best_size_so_far, best, result, i;
+
+ if (udp_port_rover > sysctl_local_port_range[1] ||
+ udp_port_rover < sysctl_local_port_range[0])
+ udp_port_rover = sysctl_local_port_range[0];
+ best_size_so_far = 32767;
+ best = result = udp_port_rover;
+ for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) {
+ struct hlist_head *list;
+ int size;
+
+ list = &udp_hash[result & (UDP_HTABLE_SIZE - 1)];
+ if (hlist_empty(list)) {
+ if (result > sysctl_local_port_range[1])
+ result = sysctl_local_port_range[0] +
+ ((result - sysctl_local_port_range[0]) &
+ (UDP_HTABLE_SIZE - 1));
+ goto gotit;
+ }
+ size = 0;
+ sk_for_each(sk2, node, list)
+ if (++size >= best_size_so_far)
+ goto next;
+ best_size_so_far = size;
+ best = result;
+ next:;
+ }
+ result = best;
+ for(i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++, result += UDP_HTABLE_SIZE) {
+ if (result > sysctl_local_port_range[1])
+ result = sysctl_local_port_range[0]
+ + ((result - sysctl_local_port_range[0]) &
+ (UDP_HTABLE_SIZE - 1));
+ if (!udp_lport_inuse(result))
+ break;
+ }
+ if (i >= (1 << 16) / UDP_HTABLE_SIZE)
+ goto fail;
+gotit:
+ udp_port_rover = snum = result;
+ } else {
+ sk_for_each(sk2, node,
+ &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]) {
+ struct inet_sock *inet2 = inet_sk(sk2);
+
+ if (inet2->num == snum &&
+ sk2 != sk &&
+ !ipv6_only_sock(sk2) &&
+ (!sk2->sk_bound_dev_if ||
+ !sk->sk_bound_dev_if ||
+ sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+ (!inet2->rcv_saddr ||
+ !inet->rcv_saddr ||
+ inet2->rcv_saddr == inet->rcv_saddr) &&
+ (!sk2->sk_reuse || !sk->sk_reuse))
+ goto fail;
+ }
+ }
+ inet->num = snum;
+ if (sk_unhashed(sk)) {
+ struct hlist_head *h = &udp_hash[snum & (UDP_HTABLE_SIZE - 1)];
+
+ sk_add_node(sk, h);
+ sock_prot_inc_use(sk->sk_prot);
+ }
+ write_unlock_bh(&udp_hash_lock);
+ return 0;
+
+fail:
+ write_unlock_bh(&udp_hash_lock);
+ return 1;
+}
+
+static void udp_v4_hash(struct sock *sk)
+{
+ BUG();
+}
+
+static void udp_v4_unhash(struct sock *sk)
+{
+ write_lock_bh(&udp_hash_lock);
+ if (sk_del_node_init(sk)) {
+ inet_sk(sk)->num = 0;
+ sock_prot_dec_use(sk->sk_prot);
+ }
+ write_unlock_bh(&udp_hash_lock);
+}
+
+/* UDP nearly always uses wildcards out the wazoo, so it makes no sense
+ * to try harder than this. -DaveM
+ */
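+/* Scoring: each attribute that is bound on a candidate socket and matches
+ * the packet (local address, remote address, remote port, bound device)
+ * adds 2 to its score, plus 1 for an AF_INET socket; a perfect score of 9
+ * ends the search early, otherwise the best-scoring socket wins.
+ */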
+static struct sock *udp_v4_lookup_longway(u32 saddr, u16 sport,
+ u32 daddr, u16 dport, int dif)
+{
+ struct sock *sk, *result = NULL;
+ struct hlist_node *node;
+ unsigned short hnum = ntohs(dport);
+ int badness = -1;
+
+ sk_for_each(sk, node, &udp_hash[hnum & (UDP_HTABLE_SIZE - 1)]) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (inet->num == hnum && !ipv6_only_sock(sk)) {
+ int score = (sk->sk_family == PF_INET ? 1 : 0);
+ if (inet->rcv_saddr) {
+ if (inet->rcv_saddr != daddr)
+ continue;
+ score+=2;
+ }
+ if (inet->daddr) {
+ if (inet->daddr != saddr)
+ continue;
+ score+=2;
+ }
+ if (inet->dport) {
+ if (inet->dport != sport)
+ continue;
+ score+=2;
+ }
+ if (sk->sk_bound_dev_if) {
+ if (sk->sk_bound_dev_if != dif)
+ continue;
+ score+=2;
+ }
+ if(score == 9) {
+ result = sk;
+ break;
+ } else if(score > badness) {
+ result = sk;
+ badness = score;
+ }
+ }
+ }
+ return result;
+}
+
+static __inline__ struct sock *udp_v4_lookup(u32 saddr, u16 sport,
+ u32 daddr, u16 dport, int dif)
+{
+ struct sock *sk;
+
+ read_lock(&udp_hash_lock);
+ sk = udp_v4_lookup_longway(saddr, sport, daddr, dport, dif);
+ if (sk)
+ sock_hold(sk);
+ read_unlock(&udp_hash_lock);
+ return sk;
+}
+
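+/* Starting at sk, walk the rest of the hash chain and return the next
+ * socket willing to accept a datagram with the given local/remote address
+ * and port pair on device dif (honouring multicast source filters), or
+ * NULL if none remains.
+ */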
+static inline struct sock *udp_v4_mcast_next(struct sock *sk,
+ u16 loc_port, u32 loc_addr,
+ u16 rmt_port, u32 rmt_addr,
+ int dif)
+{
+ struct hlist_node *node;
+ struct sock *s = sk;
+ unsigned short hnum = ntohs(loc_port);
+
+ sk_for_each_from(s, node) {
+ struct inet_sock *inet = inet_sk(s);
+
+ if (inet->num != hnum ||
+ (inet->daddr && inet->daddr != rmt_addr) ||
+ (inet->dport != rmt_port && inet->dport) ||
+ (inet->rcv_saddr && inet->rcv_saddr != loc_addr) ||
+ ipv6_only_sock(s) ||
+ (s->sk_bound_dev_if && s->sk_bound_dev_if != dif))
+ continue;
+ if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif))
+ continue;
+ goto found;
+ }
+ s = NULL;
+found:
+ return s;
+}
+
+/*
+ * This routine is called by the ICMP module when it gets some
+ * sort of error condition. If err < 0 then the socket should
+ * be closed and the error returned to the user. If err > 0
+ * it's just the icmp type << 8 | icmp code.
+ * Header points to the ip header of the error packet. We move
+ * on past this. Then (as it used to claim before adjustment)
+ * header points to the first 8 bytes of the udp header. We need
+ * to find the appropriate port.
+ */
+
+void udp_err(struct sk_buff *skb, u32 info)
+{
+ struct inet_sock *inet;
+ struct iphdr *iph = (struct iphdr*)skb->data;
+ struct udphdr *uh = (struct udphdr*)(skb->data+(iph->ihl<<2));
+ int type = skb->h.icmph->type;
+ int code = skb->h.icmph->code;
+ struct sock *sk;
+ int harderr;
+ int err;
+
+ sk = udp_v4_lookup(iph->daddr, uh->dest, iph->saddr, uh->source, skb->dev->ifindex);
+ if (sk == NULL) {
+ ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+ return; /* No socket for error */
+ }
+
+ err = 0;
+ harderr = 0;
+ inet = inet_sk(sk);
+
+ switch (type) {
+ default:
+ case ICMP_TIME_EXCEEDED:
+ err = EHOSTUNREACH;
+ break;
+ case ICMP_SOURCE_QUENCH:
+ goto out;
+ case ICMP_PARAMETERPROB:
+ err = EPROTO;
+ harderr = 1;
+ break;
+ case ICMP_DEST_UNREACH:
+ if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
+ if (inet->pmtudisc != IP_PMTUDISC_DONT) {
+ err = EMSGSIZE;
+ harderr = 1;
+ break;
+ }
+ goto out;
+ }
+ err = EHOSTUNREACH;
+ if (code <= NR_ICMP_UNREACH) {
+ harderr = icmp_err_convert[code].fatal;
+ err = icmp_err_convert[code].errno;
+ }
+ break;
+ }
+
+ /*
+ * RFC1122: OK. Passes ICMP errors back to application, as per
+ * 4.1.3.3.
+ */
+ if (!inet->recverr) {
+ if (!harderr || sk->sk_state != TCP_ESTABLISHED)
+ goto out;
+ } else {
+ ip_icmp_error(sk, skb, err, uh->dest, info, (u8*)(uh+1));
+ }
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+out:
+ sock_put(sk);
+}
+
+/*
+ * Throw away all pending data and cancel the corking. Socket is locked.
+ */
+static void udp_flush_pending_frames(struct sock *sk)
+{
+ struct udp_sock *up = udp_sk(sk);
+
+ if (up->pending) {
+ up->len = 0;
+ up->pending = 0;
+ ip_flush_pending_frames(sk);
+ }
+}
+
+/*
+ * Push out all pending data as one UDP datagram. Socket is locked.
+ */
+static int udp_push_pending_frames(struct sock *sk, struct udp_sock *up)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct flowi *fl = &inet->cork.fl;
+ struct sk_buff *skb;
+ struct udphdr *uh;
+ int err = 0;
+
+ /* Grab the skbuff where UDP header space exists. */
+ if ((skb = skb_peek(&sk->sk_write_queue)) == NULL)
+ goto out;
+
+ /*
+ * Create a UDP header
+ */
+ uh = skb->h.uh;
+ uh->source = fl->fl_ip_sport;
+ uh->dest = fl->fl_ip_dport;
+ uh->len = htons(up->len);
+ uh->check = 0;
+
+ if (sk->sk_no_check == UDP_CSUM_NOXMIT) {
+ skb->ip_summed = CHECKSUM_NONE;
+ goto send;
+ }
+
+ if (skb_queue_len(&sk->sk_write_queue) == 1) {
+ /*
+ * Only one fragment on the socket.
+ */
+ if (skb->ip_summed == CHECKSUM_HW) {
+ skb->csum = offsetof(struct udphdr, check);
+ uh->check = ~csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst,
+ up->len, IPPROTO_UDP, 0);
+ } else {
+ skb->csum = csum_partial((char *)uh,
+ sizeof(struct udphdr), skb->csum);
+ uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst,
+ up->len, IPPROTO_UDP, skb->csum);
+ if (uh->check == 0)
+ uh->check = -1;
+ }
+ } else {
+ unsigned int csum = 0;
+ /*
+		 * HW checksumming won't work, as there are two or more
+		 * fragments on the socket, so all the sk_buffs' csums
+		 * have to be combined.
+ */
+ if (skb->ip_summed == CHECKSUM_HW) {
+ int offset = (unsigned char *)uh - skb->data;
+ skb->csum = skb_checksum(skb, offset, skb->len - offset, 0);
+
+ skb->ip_summed = CHECKSUM_NONE;
+ } else {
+ skb->csum = csum_partial((char *)uh,
+ sizeof(struct udphdr), skb->csum);
+ }
+
+ skb_queue_walk(&sk->sk_write_queue, skb) {
+ csum = csum_add(csum, skb->csum);
+ }
+ uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst,
+ up->len, IPPROTO_UDP, csum);
+ if (uh->check == 0)
+ uh->check = -1;
+ }
+send:
+ err = ip_push_pending_frames(sk);
+out:
+ up->len = 0;
+ up->pending = 0;
+ return err;
+}
+
+
+static unsigned short udp_check(struct udphdr *uh, int len, unsigned long saddr, unsigned long daddr, unsigned long base)
+{
+ return(csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base));
+}
+
+int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+ size_t len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct udp_sock *up = udp_sk(sk);
+ int ulen = len;
+ struct ipcm_cookie ipc;
+ struct rtable *rt = NULL;
+ int free = 0;
+ int connected = 0;
+ u32 daddr, faddr, saddr;
+ u16 dport;
+ u8 tos;
+ int err;
+ int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
+
+ if (len > 0xFFFF)
+ return -EMSGSIZE;
+
+ /*
+ * Check the flags.
+ */
+
+ if (msg->msg_flags&MSG_OOB) /* Mirror BSD error message compatibility */
+ return -EOPNOTSUPP;
+
+ ipc.opt = NULL;
+
+ if (up->pending) {
+ /*
+ * There are pending frames.
+ * The socket lock must be held while it's corked.
+ */
+ lock_sock(sk);
+ if (likely(up->pending)) {
+ if (unlikely(up->pending != AF_INET)) {
+ release_sock(sk);
+ return -EINVAL;
+ }
+ goto do_append_data;
+ }
+ release_sock(sk);
+ }
+ ulen += sizeof(struct udphdr);
+
+ /*
+ * Get and verify the address.
+ */
+ if (msg->msg_name) {
+ struct sockaddr_in * usin = (struct sockaddr_in*)msg->msg_name;
+ if (msg->msg_namelen < sizeof(*usin))
+ return -EINVAL;
+ if (usin->sin_family != AF_INET) {
+ if (usin->sin_family != AF_UNSPEC)
+ return -EAFNOSUPPORT;
+ }
+
+ daddr = usin->sin_addr.s_addr;
+ dport = usin->sin_port;
+ if (dport == 0)
+ return -EINVAL;
+ } else {
+ if (sk->sk_state != TCP_ESTABLISHED)
+ return -EDESTADDRREQ;
+ daddr = inet->daddr;
+ dport = inet->dport;
+ /* Open fast path for connected socket.
+	   The route will not be used if at least one option is set.
+ */
+ connected = 1;
+ }
+ ipc.addr = inet->saddr;
+
+ ipc.oif = sk->sk_bound_dev_if;
+ if (msg->msg_controllen) {
+ err = ip_cmsg_send(msg, &ipc);
+ if (err)
+ return err;
+ if (ipc.opt)
+ free = 1;
+ connected = 0;
+ }
+ if (!ipc.opt)
+ ipc.opt = inet->opt;
+
+ saddr = ipc.addr;
+ ipc.addr = faddr = daddr;
+
+ if (ipc.opt && ipc.opt->srr) {
+ if (!daddr)
+ return -EINVAL;
+ faddr = ipc.opt->faddr;
+ connected = 0;
+ }
+ tos = RT_TOS(inet->tos);
+ if (sock_flag(sk, SOCK_LOCALROUTE) ||
+ (msg->msg_flags & MSG_DONTROUTE) ||
+ (ipc.opt && ipc.opt->is_strictroute)) {
+ tos |= RTO_ONLINK;
+ connected = 0;
+ }
+
+ if (MULTICAST(daddr)) {
+ if (!ipc.oif)
+ ipc.oif = inet->mc_index;
+ if (!saddr)
+ saddr = inet->mc_addr;
+ connected = 0;
+ }
+
+ if (connected)
+ rt = (struct rtable*)sk_dst_check(sk, 0);
+
+ if (rt == NULL) {
+ struct flowi fl = { .oif = ipc.oif,
+ .nl_u = { .ip4_u =
+ { .daddr = faddr,
+ .saddr = saddr,
+ .tos = tos } },
+ .proto = IPPROTO_UDP,
+ .uli_u = { .ports =
+ { .sport = inet->sport,
+ .dport = dport } } };
+ err = ip_route_output_flow(&rt, &fl, sk, !(msg->msg_flags&MSG_DONTWAIT));
+ if (err)
+ goto out;
+
+ err = -EACCES;
+ if ((rt->rt_flags & RTCF_BROADCAST) &&
+ !sock_flag(sk, SOCK_BROADCAST))
+ goto out;
+ if (connected)
+ sk_dst_set(sk, dst_clone(&rt->u.dst));
+ }
+
+ if (msg->msg_flags&MSG_CONFIRM)
+ goto do_confirm;
+back_from_confirm:
+
+ saddr = rt->rt_src;
+ if (!ipc.addr)
+ daddr = ipc.addr = rt->rt_dst;
+
+ lock_sock(sk);
+ if (unlikely(up->pending)) {
+ /* The socket is already corked while preparing it. */
+ /* ... which is an evident application bug. --ANK */
+ release_sock(sk);
+
+ NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 2\n"));
+ err = -EINVAL;
+ goto out;
+ }
+ /*
+ * Now cork the socket to pend data.
+ */
+ inet->cork.fl.fl4_dst = daddr;
+ inet->cork.fl.fl_ip_dport = dport;
+ inet->cork.fl.fl4_src = saddr;
+ inet->cork.fl.fl_ip_sport = inet->sport;
+ up->pending = AF_INET;
+
+do_append_data:
+ up->len += ulen;
+ err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, ulen,
+ sizeof(struct udphdr), &ipc, rt,
+ corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
+ if (err)
+ udp_flush_pending_frames(sk);
+ else if (!corkreq)
+ err = udp_push_pending_frames(sk, up);
+ release_sock(sk);
+
+out:
+ ip_rt_put(rt);
+ if (free)
+ kfree(ipc.opt);
+ if (!err) {
+ UDP_INC_STATS_USER(UDP_MIB_OUTDATAGRAMS);
+ return len;
+ }
+ return err;
+
+do_confirm:
+ dst_confirm(&rt->u.dst);
+ if (!(msg->msg_flags&MSG_PROBE) || len)
+ goto back_from_confirm;
+ err = 0;
+ goto out;
+}
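
The corking logic above (up->corkflag and MSG_MORE) is what lets user space merge several sends into one datagram: while corked, each send only appends to the pending frame, and the datagram is pushed out when the cork is removed. A minimal sketch, assuming fd is an already-connected UDP socket (helper name and payloads are made up):

#include <sys/socket.h>
#include <netinet/in.h>

#ifndef UDP_CORK
#define UDP_CORK 1		/* value from linux/udp.h */
#endif

static int send_as_one_datagram(int fd)
{
	int on = 1, off = 0;

	if (setsockopt(fd, IPPROTO_UDP, UDP_CORK, &on, sizeof(on)) < 0)
		return -1;
	send(fd, "header,", 7, 0);	/* appended to the pending frame */
	send(fd, "payload", 7, 0);	/* still corked, nothing sent yet */
	/* uncorking pushes everything out as a single UDP datagram */
	return setsockopt(fd, IPPROTO_UDP, UDP_CORK, &off, sizeof(off));
}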
+
+static int udp_sendpage(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags)
+{
+ struct udp_sock *up = udp_sk(sk);
+ int ret;
+
+ if (!up->pending) {
+ struct msghdr msg = { .msg_flags = flags|MSG_MORE };
+
+		/* Call udp_sendmsg to specify the destination address, which
+		 * the sendpage interface can't pass.
+ * This will succeed only when the socket is connected.
+ */
+ ret = udp_sendmsg(NULL, sk, &msg, 0);
+ if (ret < 0)
+ return ret;
+ }
+
+ lock_sock(sk);
+
+ if (unlikely(!up->pending)) {
+ release_sock(sk);
+
+ NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 3\n"));
+ return -EINVAL;
+ }
+
+ ret = ip_append_page(sk, page, offset, size, flags);
+ if (ret == -EOPNOTSUPP) {
+ release_sock(sk);
+ return sock_no_sendpage(sk->sk_socket, page, offset,
+ size, flags);
+ }
+ if (ret < 0) {
+ udp_flush_pending_frames(sk);
+ goto out;
+ }
+
+ up->len += size;
+ if (!(up->corkflag || (flags&MSG_MORE)))
+ ret = udp_push_pending_frames(sk, up);
+ if (!ret)
+ ret = size;
+out:
+ release_sock(sk);
+ return ret;
+}
+
+/*
+ * IOCTL requests applicable to the UDP protocol
+ */
+
+int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+ switch(cmd)
+ {
+ case SIOCOUTQ:
+ {
+ int amount = atomic_read(&sk->sk_wmem_alloc);
+ return put_user(amount, (int __user *)arg);
+ }
+
+ case SIOCINQ:
+ {
+ struct sk_buff *skb;
+ unsigned long amount;
+
+ amount = 0;
+ spin_lock_irq(&sk->sk_receive_queue.lock);
+ skb = skb_peek(&sk->sk_receive_queue);
+ if (skb != NULL) {
+ /*
+ * We will only return the amount
+ * of this packet since that is all
+ * that will be read.
+ */
+ amount = skb->len - sizeof(struct udphdr);
+ }
+ spin_unlock_irq(&sk->sk_receive_queue.lock);
+ return put_user(amount, (int __user *)arg);
+ }
+
+ default:
+ return -ENOIOCTLCMD;
+ }
+ return(0);
+}
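
From user space these two requests are issued with plain ioctl(): SIOCINQ reports the payload size of the next queued datagram and SIOCOUTQ the bytes still sitting in the send queue. A short sketch (helper name is illustrative):

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/sockios.h>	/* SIOCINQ, SIOCOUTQ */

static void show_udp_queues(int fd)
{
	int inq = 0, outq = 0;

	if (ioctl(fd, SIOCINQ, &inq) == 0 && ioctl(fd, SIOCOUTQ, &outq) == 0)
		printf("next datagram: %d bytes, unsent: %d bytes\n",
		       inq, outq);
}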
+
+static __inline__ int __udp_checksum_complete(struct sk_buff *skb)
+{
+ return (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum));
+}
+
+static __inline__ int udp_checksum_complete(struct sk_buff *skb)
+{
+ return skb->ip_summed != CHECKSUM_UNNECESSARY &&
+ __udp_checksum_complete(skb);
+}
+
+/*
+ * 	This should be easy: if there is something there, we
+ * return it, otherwise we block.
+ */
+
+static int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+ size_t len, int noblock, int flags, int *addr_len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
+ struct sk_buff *skb;
+ int copied, err;
+
+ /*
+ * Check any passed addresses
+ */
+ if (addr_len)
+ *addr_len=sizeof(*sin);
+
+ if (flags & MSG_ERRQUEUE)
+ return ip_recv_error(sk, msg, len);
+
+try_again:
+ skb = skb_recv_datagram(sk, flags, noblock, &err);
+ if (!skb)
+ goto out;
+
+ copied = skb->len - sizeof(struct udphdr);
+ if (copied > len) {
+ copied = len;
+ msg->msg_flags |= MSG_TRUNC;
+ }
+
+ if (skb->ip_summed==CHECKSUM_UNNECESSARY) {
+ err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
+ copied);
+ } else if (msg->msg_flags&MSG_TRUNC) {
+ if (__udp_checksum_complete(skb))
+ goto csum_copy_err;
+ err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
+ copied);
+ } else {
+ err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov);
+
+ if (err == -EINVAL)
+ goto csum_copy_err;
+ }
+
+ if (err)
+ goto out_free;
+
+ sock_recv_timestamp(msg, sk, skb);
+
+ /* Copy the address. */
+ if (sin)
+ {
+ sin->sin_family = AF_INET;
+ sin->sin_port = skb->h.uh->source;
+ sin->sin_addr.s_addr = skb->nh.iph->saddr;
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ }
+ if (inet->cmsg_flags)
+ ip_cmsg_recv(msg, skb);
+
+ err = copied;
+ if (flags & MSG_TRUNC)
+ err = skb->len - sizeof(struct udphdr);
+
+out_free:
+ skb_free_datagram(sk, skb);
+out:
+ return err;
+
+csum_copy_err:
+ UDP_INC_STATS_BH(UDP_MIB_INERRORS);
+
+ /* Clear queue. */
+ if (flags&MSG_PEEK) {
+ int clear = 0;
+ spin_lock_irq(&sk->sk_receive_queue.lock);
+ if (skb == skb_peek(&sk->sk_receive_queue)) {
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ clear = 1;
+ }
+ spin_unlock_irq(&sk->sk_receive_queue.lock);
+ if (clear)
+ kfree_skb(skb);
+ }
+
+ skb_free_datagram(sk, skb);
+
+ if (noblock)
+ return -EAGAIN;
+ goto try_again;
+}
+
+
+int udp_disconnect(struct sock *sk, int flags)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ /*
+ * 1003.1g - break association.
+ */
+
+ sk->sk_state = TCP_CLOSE;
+ inet->daddr = 0;
+ inet->dport = 0;
+ sk->sk_bound_dev_if = 0;
+ if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
+ inet_reset_saddr(sk);
+
+ if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) {
+ sk->sk_prot->unhash(sk);
+ inet->sport = 0;
+ }
+ sk_dst_reset(sk);
+ return 0;
+}
+
+static void udp_close(struct sock *sk, long timeout)
+{
+ sk_common_release(sk);
+}
+
+/* return:
+ * 	1  if the UDP system should process it
+ * 0 if we should drop this packet
+ * -1 if it should get processed by xfrm4_rcv_encap
+ */
+static int udp_encap_rcv(struct sock * sk, struct sk_buff *skb)
+{
+#ifndef CONFIG_XFRM
+ return 1;
+#else
+ struct udp_sock *up = udp_sk(sk);
+ struct udphdr *uh = skb->h.uh;
+ struct iphdr *iph;
+ int iphlen, len;
+
+ __u8 *udpdata = (__u8 *)uh + sizeof(struct udphdr);
+ __u32 *udpdata32 = (__u32 *)udpdata;
+ __u16 encap_type = up->encap_type;
+
+ /* if we're overly short, let UDP handle it */
+ if (udpdata > skb->tail)
+ return 1;
+
+	/* if this is not an encapsulation socket, just return now */
+ if (!encap_type)
+ return 1;
+
+ len = skb->tail - udpdata;
+
+ switch (encap_type) {
+ default:
+ case UDP_ENCAP_ESPINUDP:
+ /* Check if this is a keepalive packet. If so, eat it. */
+ if (len == 1 && udpdata[0] == 0xff) {
+ return 0;
+ } else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0 ) {
+ /* ESP Packet without Non-ESP header */
+ len = sizeof(struct udphdr);
+ } else
+ /* Must be an IKE packet.. pass it through */
+ return 1;
+ break;
+ case UDP_ENCAP_ESPINUDP_NON_IKE:
+ /* Check if this is a keepalive packet. If so, eat it. */
+ if (len == 1 && udpdata[0] == 0xff) {
+ return 0;
+ } else if (len > 2 * sizeof(u32) + sizeof(struct ip_esp_hdr) &&
+ udpdata32[0] == 0 && udpdata32[1] == 0) {
+
+ /* ESP Packet with Non-IKE marker */
+ len = sizeof(struct udphdr) + 2 * sizeof(u32);
+ } else
+ /* Must be an IKE packet.. pass it through */
+ return 1;
+ break;
+ }
+
+ /* At this point we are sure that this is an ESPinUDP packet,
+ * so we need to remove 'len' bytes from the packet (the UDP
+	 * header and optional ESP marker bytes), modify the
+	 * protocol to ESP, and then call into the transform receiver.
+ */
+
+ /* Now we can update and verify the packet length... */
+ iph = skb->nh.iph;
+ iphlen = iph->ihl << 2;
+ iph->tot_len = htons(ntohs(iph->tot_len) - len);
+ if (skb->len < iphlen + len) {
+ /* packet is too small!?! */
+ return 0;
+ }
+
+ /* pull the data buffer up to the ESP header and set the
+ * transport header to point to ESP. Keep UDP on the stack
+ * for later.
+ */
+ skb->h.raw = skb_pull(skb, len);
+
+ /* modify the protocol (it's ESP!) */
+ iph->protocol = IPPROTO_ESP;
+
+ /* and let the caller know to send this into the ESP processor... */
+ return -1;
+#endif
+}
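
The encap_type consulted here is set from user space by the IKE daemon on its NAT-traversal socket, via the UDP_ENCAP option handled in udp_setsockopt() further down. A minimal sketch, with the option values taken from linux/udp.h:

#include <sys/socket.h>
#include <netinet/in.h>

#ifndef UDP_ENCAP			/* values from linux/udp.h */
#define UDP_ENCAP		100
#define UDP_ENCAP_ESPINUDP	2
#endif

static int enable_esp_in_udp(int fd)
{
	int type = UDP_ENCAP_ESPINUDP;	/* ESP-in-UDP encapsulation */

	return setsockopt(fd, IPPROTO_UDP, UDP_ENCAP, &type, sizeof(type));
}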
+
+/* returns:
+ * -1: error
+ * 0: success
+ * >0: "udp encap" protocol resubmission
+ *
+ * Note that in the success and error cases, the skb is assumed to
+ * have either been requeued or freed.
+ */
+static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
+{
+ struct udp_sock *up = udp_sk(sk);
+
+ /*
+ * Charge it to the socket, dropping if the queue is full.
+ */
+ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
+ kfree_skb(skb);
+ return -1;
+ }
+
+ if (up->encap_type) {
+ /*
+ * This is an encapsulation socket, so let's see if this is
+ * an encapsulated packet.
+ * If it's a keepalive packet, then just eat it.
+		 * If it's an encapsulated packet, then pass it to the
+ * IPsec xfrm input and return the response
+ * appropriately. Otherwise, just fall through and
+ * pass this up the UDP socket.
+ */
+ int ret;
+
+ ret = udp_encap_rcv(sk, skb);
+ if (ret == 0) {
+ /* Eat the packet .. */
+ kfree_skb(skb);
+ return 0;
+ }
+ if (ret < 0) {
+ /* process the ESP packet */
+ ret = xfrm4_rcv_encap(skb, up->encap_type);
+ UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS);
+ return -ret;
+ }
+ /* FALLTHROUGH -- it's a UDP Packet */
+ }
+
+ if (sk->sk_filter && skb->ip_summed != CHECKSUM_UNNECESSARY) {
+ if (__udp_checksum_complete(skb)) {
+ UDP_INC_STATS_BH(UDP_MIB_INERRORS);
+ kfree_skb(skb);
+ return -1;
+ }
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+
+ if (sock_queue_rcv_skb(sk,skb)<0) {
+ UDP_INC_STATS_BH(UDP_MIB_INERRORS);
+ kfree_skb(skb);
+ return -1;
+ }
+ UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS);
+ return 0;
+}
+
+/*
+ * Multicasts and broadcasts go to each listener.
+ *
+ * Note: called only from the BH handler context,
+ * so we don't need to lock the hashes.
+ */
+static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh,
+ u32 saddr, u32 daddr)
+{
+ struct sock *sk;
+ int dif;
+
+ read_lock(&udp_hash_lock);
+ sk = sk_head(&udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]);
+ dif = skb->dev->ifindex;
+ sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
+ if (sk) {
+ struct sock *sknext = NULL;
+
+ do {
+ struct sk_buff *skb1 = skb;
+
+ sknext = udp_v4_mcast_next(sk_next(sk), uh->dest, daddr,
+ uh->source, saddr, dif);
+ if(sknext)
+ skb1 = skb_clone(skb, GFP_ATOMIC);
+
+ if(skb1) {
+ int ret = udp_queue_rcv_skb(sk, skb1);
+ if (ret > 0)
+ /* we should probably re-process instead
+ * of dropping packets here. */
+ kfree_skb(skb1);
+ }
+ sk = sknext;
+ } while(sknext);
+ } else
+ kfree_skb(skb);
+ read_unlock(&udp_hash_lock);
+ return 0;
+}
+
+/* Initialize the UDP checksum. If this returns zero (success) and
+ * skb->ip_summed is CHECKSUM_UNNECESSARY, no more checks are required.
+ * Otherwise, checksum completion requires checksumming the packet body,
+ * including the UDP header, and folding the result into skb->csum.
+ */
+static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
+ unsigned short ulen, u32 saddr, u32 daddr)
+{
+ if (uh->check == 0) {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ } else if (skb->ip_summed == CHECKSUM_HW) {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ if (!udp_check(uh, ulen, saddr, daddr, skb->csum))
+ return 0;
+ NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp v4 hw csum failure.\n"));
+ skb->ip_summed = CHECKSUM_NONE;
+ }
+ if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+ skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
+ /* Probably, we should checksum udp header (it should be in cache
+ * in any case) and data in tiny packets (< rx copybreak).
+ */
+ return 0;
+}
+
+/*
+ * All we need to do is get the socket, and then do a checksum.
+ */
+
+int udp_rcv(struct sk_buff *skb)
+{
+ struct sock *sk;
+ struct udphdr *uh;
+ unsigned short ulen;
+ struct rtable *rt = (struct rtable*)skb->dst;
+ u32 saddr = skb->nh.iph->saddr;
+ u32 daddr = skb->nh.iph->daddr;
+ int len = skb->len;
+
+ /*
+ * Validate the packet and the UDP length.
+ */
+ if (!pskb_may_pull(skb, sizeof(struct udphdr)))
+ goto no_header;
+
+ uh = skb->h.uh;
+
+ ulen = ntohs(uh->len);
+
+ if (ulen > len || ulen < sizeof(*uh))
+ goto short_packet;
+
+ if (pskb_trim(skb, ulen))
+ goto short_packet;
+
+ if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0)
+ goto csum_error;
+
+ if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
+ return udp_v4_mcast_deliver(skb, uh, saddr, daddr);
+
+ sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, skb->dev->ifindex);
+
+ if (sk != NULL) {
+ int ret = udp_queue_rcv_skb(sk, skb);
+ sock_put(sk);
+
+ /* a return value > 0 means to resubmit the input, but
+		 * it wants the return to be -protocol or 0
+ */
+ if (ret > 0)
+ return -ret;
+ return 0;
+ }
+
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+ goto drop;
+
+ /* No socket. Drop packet silently, if checksum is wrong */
+ if (udp_checksum_complete(skb))
+ goto csum_error;
+
+ UDP_INC_STATS_BH(UDP_MIB_NOPORTS);
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+ /*
+	 * Hmm. We got a UDP packet to a port we are not
+	 * listening on. Ignore it.
+ */
+ kfree_skb(skb);
+ return(0);
+
+short_packet:
+ NETDEBUG(if (net_ratelimit())
+ printk(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n",
+ NIPQUAD(saddr),
+ ntohs(uh->source),
+ ulen,
+ len,
+ NIPQUAD(daddr),
+ ntohs(uh->dest)));
+no_header:
+ UDP_INC_STATS_BH(UDP_MIB_INERRORS);
+ kfree_skb(skb);
+ return(0);
+
+csum_error:
+ /*
+ * RFC1122: OK. Discards the bad packet silently (as far as
+ * the network is concerned, anyway) as per 4.1.3.4 (MUST).
+ */
+ NETDEBUG(if (net_ratelimit())
+ printk(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n",
+ NIPQUAD(saddr),
+ ntohs(uh->source),
+ NIPQUAD(daddr),
+ ntohs(uh->dest),
+ ulen));
+drop:
+ UDP_INC_STATS_BH(UDP_MIB_INERRORS);
+ kfree_skb(skb);
+ return(0);
+}
+
+static int udp_destroy_sock(struct sock *sk)
+{
+ lock_sock(sk);
+ udp_flush_pending_frames(sk);
+ release_sock(sk);
+ return 0;
+}
+
+/*
+ * Socket option code for UDP
+ */
+static int udp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int optlen)
+{
+ struct udp_sock *up = udp_sk(sk);
+ int val;
+ int err = 0;
+
+ if (level != SOL_UDP)
+ return ip_setsockopt(sk, level, optname, optval, optlen);
+
+ if(optlen<sizeof(int))
+ return -EINVAL;
+
+ if (get_user(val, (int __user *)optval))
+ return -EFAULT;
+
+ switch(optname) {
+ case UDP_CORK:
+ if (val != 0) {
+ up->corkflag = 1;
+ } else {
+ up->corkflag = 0;
+ lock_sock(sk);
+ udp_push_pending_frames(sk, up);
+ release_sock(sk);
+ }
+ break;
+
+ case UDP_ENCAP:
+ switch (val) {
+ case 0:
+ case UDP_ENCAP_ESPINUDP:
+ case UDP_ENCAP_ESPINUDP_NON_IKE:
+ up->encap_type = val;
+ break;
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ };
+
+ return err;
+}
+
+static int udp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct udp_sock *up = udp_sk(sk);
+ int val, len;
+
+ if (level != SOL_UDP)
+ return ip_getsockopt(sk, level, optname, optval, optlen);
+
+ if(get_user(len,optlen))
+ return -EFAULT;
+
+ len = min_t(unsigned int, len, sizeof(int));
+
+ if(len < 0)
+ return -EINVAL;
+
+ switch(optname) {
+ case UDP_CORK:
+ val = up->corkflag;
+ break;
+
+ case UDP_ENCAP:
+ val = up->encap_type;
+ break;
+
+ default:
+ return -ENOPROTOOPT;
+ };
+
+ if(put_user(len, optlen))
+ return -EFAULT;
+ if(copy_to_user(optval, &val,len))
+ return -EFAULT;
+ return 0;
+}
+
+/**
+ * udp_poll - wait for a UDP event.
+ *	@file: file struct
+ *	@sock: socket
+ *	@wait: poll table
+ *
+ *	This is the same as datagram poll, except for the special case of
+ *	blocking sockets. If an application is using a blocking fd
+ *	and a packet with a checksum error is in the queue,
+ *	select() could indicate that data is available, but the
+ *	subsequent read would then block. Add special-case code
+ *	to work around these arguably broken applications.
+ */
+unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
+{
+ unsigned int mask = datagram_poll(file, sock, wait);
+ struct sock *sk = sock->sk;
+
+ /* Check for false positives due to checksum errors */
+ if ( (mask & POLLRDNORM) &&
+ !(file->f_flags & O_NONBLOCK) &&
+ !(sk->sk_shutdown & RCV_SHUTDOWN)){
+ struct sk_buff_head *rcvq = &sk->sk_receive_queue;
+ struct sk_buff *skb;
+
+ spin_lock_irq(&rcvq->lock);
+ while ((skb = skb_peek(rcvq)) != NULL) {
+ if (udp_checksum_complete(skb)) {
+ UDP_INC_STATS_BH(UDP_MIB_INERRORS);
+ __skb_unlink(skb, rcvq);
+ kfree_skb(skb);
+ } else {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ break;
+ }
+ }
+ spin_unlock_irq(&rcvq->lock);
+
+ /* nothing to see, move along */
+ if (skb == NULL)
+ mask &= ~(POLLIN | POLLRDNORM);
+ }
+
+ return mask;
+
+}
+
+struct proto udp_prot = {
+ .name = "UDP",
+ .owner = THIS_MODULE,
+ .close = udp_close,
+ .connect = ip4_datagram_connect,
+ .disconnect = udp_disconnect,
+ .ioctl = udp_ioctl,
+ .destroy = udp_destroy_sock,
+ .setsockopt = udp_setsockopt,
+ .getsockopt = udp_getsockopt,
+ .sendmsg = udp_sendmsg,
+ .recvmsg = udp_recvmsg,
+ .sendpage = udp_sendpage,
+ .backlog_rcv = udp_queue_rcv_skb,
+ .hash = udp_v4_hash,
+ .unhash = udp_v4_unhash,
+ .get_port = udp_v4_get_port,
+ .obj_size = sizeof(struct udp_sock),
+};
+
+/* ------------------------------------------------------------------------ */
+#ifdef CONFIG_PROC_FS
+
+static struct sock *udp_get_first(struct seq_file *seq)
+{
+ struct sock *sk;
+ struct udp_iter_state *state = seq->private;
+
+ for (state->bucket = 0; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) {
+ struct hlist_node *node;
+ sk_for_each(sk, node, &udp_hash[state->bucket]) {
+ if (sk->sk_family == state->family)
+ goto found;
+ }
+ }
+ sk = NULL;
+found:
+ return sk;
+}
+
+static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
+{
+ struct udp_iter_state *state = seq->private;
+
+ do {
+ sk = sk_next(sk);
+try_again:
+ ;
+ } while (sk && sk->sk_family != state->family);
+
+ if (!sk && ++state->bucket < UDP_HTABLE_SIZE) {
+ sk = sk_head(&udp_hash[state->bucket]);
+ goto try_again;
+ }
+ return sk;
+}
+
+static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct sock *sk = udp_get_first(seq);
+
+ if (sk)
+ while(pos && (sk = udp_get_next(seq, sk)) != NULL)
+ --pos;
+ return pos ? NULL : sk;
+}
+
+static void *udp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ read_lock(&udp_hash_lock);
+ return *pos ? udp_get_idx(seq, *pos-1) : (void *)1;
+}
+
+static void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct sock *sk;
+
+ if (v == (void *)1)
+ sk = udp_get_idx(seq, 0);
+ else
+ sk = udp_get_next(seq, v);
+
+ ++*pos;
+ return sk;
+}
+
+static void udp_seq_stop(struct seq_file *seq, void *v)
+{
+ read_unlock(&udp_hash_lock);
+}
+
+static int udp_seq_open(struct inode *inode, struct file *file)
+{
+ struct udp_seq_afinfo *afinfo = PDE(inode)->data;
+ struct seq_file *seq;
+ int rc = -ENOMEM;
+ struct udp_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+ if (!s)
+ goto out;
+ memset(s, 0, sizeof(*s));
+ s->family = afinfo->family;
+ s->seq_ops.start = udp_seq_start;
+ s->seq_ops.next = udp_seq_next;
+ s->seq_ops.show = afinfo->seq_show;
+ s->seq_ops.stop = udp_seq_stop;
+
+ rc = seq_open(file, &s->seq_ops);
+ if (rc)
+ goto out_kfree;
+
+ seq = file->private_data;
+ seq->private = s;
+out:
+ return rc;
+out_kfree:
+ kfree(s);
+ goto out;
+}
+
+/* ------------------------------------------------------------------------ */
+int udp_proc_register(struct udp_seq_afinfo *afinfo)
+{
+ struct proc_dir_entry *p;
+ int rc = 0;
+
+ if (!afinfo)
+ return -EINVAL;
+ afinfo->seq_fops->owner = afinfo->owner;
+ afinfo->seq_fops->open = udp_seq_open;
+ afinfo->seq_fops->read = seq_read;
+ afinfo->seq_fops->llseek = seq_lseek;
+ afinfo->seq_fops->release = seq_release_private;
+
+ p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
+ if (p)
+ p->data = afinfo;
+ else
+ rc = -ENOMEM;
+ return rc;
+}
+
+void udp_proc_unregister(struct udp_seq_afinfo *afinfo)
+{
+ if (!afinfo)
+ return;
+ proc_net_remove(afinfo->name);
+ memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
+}
+
+/* ------------------------------------------------------------------------ */
+static void udp4_format_sock(struct sock *sp, char *tmpbuf, int bucket)
+{
+ struct inet_sock *inet = inet_sk(sp);
+ unsigned int dest = inet->daddr;
+ unsigned int src = inet->rcv_saddr;
+ __u16 destp = ntohs(inet->dport);
+ __u16 srcp = ntohs(inet->sport);
+
+ sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
+ " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p",
+ bucket, src, srcp, dest, destp, sp->sk_state,
+ atomic_read(&sp->sk_wmem_alloc),
+ atomic_read(&sp->sk_rmem_alloc),
+ 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
+ atomic_read(&sp->sk_refcnt), sp);
+}
+
+static int udp4_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN)
+ seq_printf(seq, "%-127s\n",
+ " sl local_address rem_address st tx_queue "
+ "rx_queue tr tm->when retrnsmt uid timeout "
+ "inode");
+ else {
+ char tmpbuf[129];
+ struct udp_iter_state *state = seq->private;
+
+ udp4_format_sock(v, tmpbuf, state->bucket);
+ seq_printf(seq, "%-127s\n", tmpbuf);
+ }
+ return 0;
+}
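
The row layout produced above is what user-space tools such as netstat parse out of /proc/net/udp: the address and port columns are the raw 32- and 16-bit values printed in hex, so on the same host they can be scanned back and reused directly. A short sketch of reading the local_address column (names are illustrative):

#include <stdio.h>
#include <arpa/inet.h>

static void list_udp_sockets(void)
{
	char line[256];
	FILE *f = fopen("/proc/net/udp", "r");

	if (!f)
		return;
	fgets(line, sizeof(line), f);		/* skip the header line */
	while (fgets(line, sizeof(line), f)) {
		unsigned int addr, port;

		if (sscanf(line, "%*d: %8X:%4X", &addr, &port) == 2) {
			/* the address is the raw network-order word */
			struct in_addr in = { .s_addr = addr };
			printf("%s:%u\n", inet_ntoa(in), port);
		}
	}
	fclose(f);
}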
+
+/* ------------------------------------------------------------------------ */
+static struct file_operations udp4_seq_fops;
+static struct udp_seq_afinfo udp4_seq_afinfo = {
+ .owner = THIS_MODULE,
+ .name = "udp",
+ .family = AF_INET,
+ .seq_show = udp4_seq_show,
+ .seq_fops = &udp4_seq_fops,
+};
+
+int __init udp4_proc_init(void)
+{
+ return udp_proc_register(&udp4_seq_afinfo);
+}
+
+void udp4_proc_exit(void)
+{
+ udp_proc_unregister(&udp4_seq_afinfo);
+}
+#endif /* CONFIG_PROC_FS */
+
+EXPORT_SYMBOL(udp_disconnect);
+EXPORT_SYMBOL(udp_hash);
+EXPORT_SYMBOL(udp_hash_lock);
+EXPORT_SYMBOL(udp_ioctl);
+EXPORT_SYMBOL(udp_port_rover);
+EXPORT_SYMBOL(udp_prot);
+EXPORT_SYMBOL(udp_sendmsg);
+EXPORT_SYMBOL(udp_poll);
+
+#ifdef CONFIG_PROC_FS
+EXPORT_SYMBOL(udp_proc_register);
+EXPORT_SYMBOL(udp_proc_unregister);
+#endif
diff --git a/net/ipv4/utils.c b/net/ipv4/utils.c
new file mode 100644
index 00000000000..6aecd7a4353
--- /dev/null
+++ b/net/ipv4/utils.c
@@ -0,0 +1,59 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Various kernel-resident INET utility functions; mainly
+ * for format conversion and debugging output.
+ *
+ * Version: $Id: utils.c,v 1.8 2000/10/03 07:29:01 anton Exp $
+ *
+ * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *
+ * Fixes:
+ * Alan Cox : verify_area check.
+ * Alan Cox : removed old debugging.
+ * Andi Kleen : add net_ratelimit()
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <asm/byteorder.h>
+
+/*
+ * Convert an ASCII string to binary IP.
+ */
+
+__u32 in_aton(const char *str)
+{
+ unsigned long l;
+ unsigned int val;
+ int i;
+
+ l = 0;
+ for (i = 0; i < 4; i++)
+ {
+ l <<= 8;
+ if (*str != '\0')
+ {
+ val = 0;
+ while (*str != '\0' && *str != '.')
+ {
+ val *= 10;
+ val += *str - '0';
+ str++;
+ }
+ l |= val;
+ if (*str != '\0')
+ str++;
+ }
+ }
+ return(htonl(l));
+}
+
+EXPORT_SYMBOL(in_aton);
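
For illustration, in_aton("192.168.0.1") walks the four octets and returns htonl(0xC0A80001), i.e. the address already in network byte order, which is why callers can assign the result straight into address fields. The user-space counterpart of this conversion is inet_addr():

#include <stdio.h>
#include <arpa/inet.h>

int main(void)
{
	/* prints c0a80001, the same value in_aton() would produce (pre-htonl) */
	printf("%08x\n", ntohl(inet_addr("192.168.0.1")));
	return 0;
}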
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
new file mode 100644
index 00000000000..2d3849c38a0
--- /dev/null
+++ b/net/ipv4/xfrm4_input.c
@@ -0,0 +1,160 @@
+/*
+ * xfrm4_input.c
+ *
+ * Changes:
+ * YOSHIFUJI Hideaki @USAGI
+ * Split up af-specific portion
+ * Derek Atkins <derek@ihtfp.com>
+ * Add Encapsulation support
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <net/inet_ecn.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+
+int xfrm4_rcv(struct sk_buff *skb)
+{
+ return xfrm4_rcv_encap(skb, 0);
+}
+
+EXPORT_SYMBOL(xfrm4_rcv);
+
+static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
+{
+ struct iphdr *outer_iph = skb->nh.iph;
+ struct iphdr *inner_iph = skb->h.ipiph;
+
+ if (INET_ECN_is_ce(outer_iph->tos))
+ IP_ECN_set_ce(inner_iph);
+}
+
+static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq)
+{
+ switch (nexthdr) {
+ case IPPROTO_IPIP:
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ return -EINVAL;
+ *spi = skb->nh.iph->saddr;
+ *seq = 0;
+ return 0;
+ }
+
+ return xfrm_parse_spi(skb, nexthdr, spi, seq);
+}
+
+int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type)
+{
+ int err;
+ u32 spi, seq;
+ struct sec_decap_state xfrm_vec[XFRM_MAX_DEPTH];
+ struct xfrm_state *x;
+ int xfrm_nr = 0;
+ int decaps = 0;
+
+ if ((err = xfrm4_parse_spi(skb, skb->nh.iph->protocol, &spi, &seq)) != 0)
+ goto drop;
+
+ do {
+ struct iphdr *iph = skb->nh.iph;
+
+ if (xfrm_nr == XFRM_MAX_DEPTH)
+ goto drop;
+
+ x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, spi, iph->protocol, AF_INET);
+ if (x == NULL)
+ goto drop;
+
+ spin_lock(&x->lock);
+ if (unlikely(x->km.state != XFRM_STATE_VALID))
+ goto drop_unlock;
+
+ if (x->props.replay_window && xfrm_replay_check(x, seq))
+ goto drop_unlock;
+
+ if (xfrm_state_check_expire(x))
+ goto drop_unlock;
+
+ xfrm_vec[xfrm_nr].decap.decap_type = encap_type;
+ if (x->type->input(x, &(xfrm_vec[xfrm_nr].decap), skb))
+ goto drop_unlock;
+
+ /* only the first xfrm gets the encap type */
+ encap_type = 0;
+
+ if (x->props.replay_window)
+ xfrm_replay_advance(x, seq);
+
+ x->curlft.bytes += skb->len;
+ x->curlft.packets++;
+
+ spin_unlock(&x->lock);
+
+ xfrm_vec[xfrm_nr++].xvec = x;
+
+ iph = skb->nh.iph;
+
+ if (x->props.mode) {
+ if (iph->protocol != IPPROTO_IPIP)
+ goto drop;
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ goto drop;
+ if (skb_cloned(skb) &&
+ pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ goto drop;
+ if (x->props.flags & XFRM_STATE_DECAP_DSCP)
+ ipv4_copy_dscp(iph, skb->h.ipiph);
+ if (!(x->props.flags & XFRM_STATE_NOECN))
+ ipip_ecn_decapsulate(skb);
+ skb->mac.raw = memmove(skb->data - skb->mac_len,
+ skb->mac.raw, skb->mac_len);
+ skb->nh.raw = skb->data;
+ memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
+ decaps = 1;
+ break;
+ }
+
+ if ((err = xfrm_parse_spi(skb, skb->nh.iph->protocol, &spi, &seq)) < 0)
+ goto drop;
+ } while (!err);
+
+ /* Allocate new secpath or COW existing one. */
+
+ if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) {
+ struct sec_path *sp;
+ sp = secpath_dup(skb->sp);
+ if (!sp)
+ goto drop;
+ if (skb->sp)
+ secpath_put(skb->sp);
+ skb->sp = sp;
+ }
+ if (xfrm_nr + skb->sp->len > XFRM_MAX_DEPTH)
+ goto drop;
+
+ memcpy(skb->sp->x+skb->sp->len, xfrm_vec, xfrm_nr*sizeof(struct sec_decap_state));
+ skb->sp->len += xfrm_nr;
+
+ if (decaps) {
+ if (!(skb->dev->flags&IFF_LOOPBACK)) {
+ dst_release(skb->dst);
+ skb->dst = NULL;
+ }
+ netif_rx(skb);
+ return 0;
+ } else {
+ return -skb->nh.iph->protocol;
+ }
+
+drop_unlock:
+ spin_unlock(&x->lock);
+ xfrm_state_put(x);
+drop:
+ while (--xfrm_nr >= 0)
+ xfrm_state_put(xfrm_vec[xfrm_nr].xvec);
+
+ kfree_skb(skb);
+ return 0;
+}
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
new file mode 100644
index 00000000000..af2392ae576
--- /dev/null
+++ b/net/ipv4/xfrm4_output.c
@@ -0,0 +1,141 @@
+/*
+ * xfrm4_output.c - Common IPsec encapsulation code for IPv4.
+ * Copyright (c) 2004 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <net/inet_ecn.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/icmp.h>
+
+/* Add encapsulation header.
+ *
+ * In transport mode, the IP header will be moved forward to make space
+ * for the encapsulation header.
+ *
+ * In tunnel mode, the top IP header will be constructed per RFC 2401.
+ * The following fields in it shall be filled in by x->type->output:
+ * tot_len
+ * check
+ *
+ * On exit, skb->h will be set to the start of the payload to be processed
+ * by x->type->output and skb->nh will be set to the top IP header.
+ */
+static void xfrm4_encap(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb->dst;
+ struct xfrm_state *x = dst->xfrm;
+ struct iphdr *iph, *top_iph;
+
+ iph = skb->nh.iph;
+ skb->h.ipiph = iph;
+
+ skb->nh.raw = skb_push(skb, x->props.header_len);
+ top_iph = skb->nh.iph;
+
+ if (!x->props.mode) {
+ skb->h.raw += iph->ihl*4;
+ memmove(top_iph, iph, iph->ihl*4);
+ return;
+ }
+
+ top_iph->ihl = 5;
+ top_iph->version = 4;
+
+ /* DS disclosed */
+ top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos);
+ if (x->props.flags & XFRM_STATE_NOECN)
+ IP_ECN_clear(top_iph);
+
+ top_iph->frag_off = iph->frag_off & htons(IP_DF);
+ if (!top_iph->frag_off)
+ __ip_select_ident(top_iph, dst, 0);
+
+ top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT);
+
+ top_iph->saddr = x->props.saddr.a4;
+ top_iph->daddr = x->id.daddr.a4;
+ top_iph->protocol = IPPROTO_IPIP;
+
+ memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
+}
+
+static int xfrm4_tunnel_check_size(struct sk_buff *skb)
+{
+ int mtu, ret = 0;
+ struct dst_entry *dst;
+ struct iphdr *iph = skb->nh.iph;
+
+ if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE)
+ goto out;
+
+ IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE;
+
+ if (!(iph->frag_off & htons(IP_DF)) || skb->local_df)
+ goto out;
+
+ dst = skb->dst;
+ mtu = dst_mtu(dst);
+ if (skb->len > mtu) {
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+ ret = -EMSGSIZE;
+ }
+out:
+ return ret;
+}
+
+int xfrm4_output(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb->dst;
+ struct xfrm_state *x = dst->xfrm;
+ int err;
+
+ if (skb->ip_summed == CHECKSUM_HW) {
+ err = skb_checksum_help(skb, 0);
+ if (err)
+ goto error_nolock;
+ }
+
+ if (x->props.mode) {
+ err = xfrm4_tunnel_check_size(skb);
+ if (err)
+ goto error_nolock;
+ }
+
+ spin_lock_bh(&x->lock);
+ err = xfrm_state_check(x, skb);
+ if (err)
+ goto error;
+
+ xfrm4_encap(skb);
+
+ err = x->type->output(x, skb);
+ if (err)
+ goto error;
+
+ x->curlft.bytes += skb->len;
+ x->curlft.packets++;
+
+ spin_unlock_bh(&x->lock);
+
+ if (!(skb->dst = dst_pop(dst))) {
+ err = -EHOSTUNREACH;
+ goto error_nolock;
+ }
+ err = NET_XMIT_BYPASS;
+
+out_exit:
+ return err;
+error:
+ spin_unlock_bh(&x->lock);
+error_nolock:
+ kfree_skb(skb);
+ goto out_exit;
+}
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
new file mode 100644
index 00000000000..7fe2afd2e66
--- /dev/null
+++ b/net/ipv4/xfrm4_policy.c
@@ -0,0 +1,281 @@
+/*
+ * xfrm4_policy.c
+ *
+ * Changes:
+ * Kazunori MIYAZAWA @USAGI
+ * YOSHIFUJI Hideaki @USAGI
+ * Split up af-specific portion
+ *
+ */
+
+#include <linux/config.h>
+#include <net/xfrm.h>
+#include <net/ip.h>
+
+static struct dst_ops xfrm4_dst_ops;
+static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
+
+static struct xfrm_type_map xfrm4_type_map = { .lock = RW_LOCK_UNLOCKED };
+
+static int xfrm4_dst_lookup(struct xfrm_dst **dst, struct flowi *fl)
+{
+ return __ip_route_output_key((struct rtable**)dst, fl);
+}
+
+static struct dst_entry *
+__xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
+{
+ struct dst_entry *dst;
+
+ read_lock_bh(&policy->lock);
+ for (dst = policy->bundles; dst; dst = dst->next) {
+ struct xfrm_dst *xdst = (struct xfrm_dst*)dst;
+ if (xdst->u.rt.fl.oif == fl->oif && /*XXX*/
+ xdst->u.rt.fl.fl4_dst == fl->fl4_dst &&
+ xdst->u.rt.fl.fl4_src == fl->fl4_src &&
+ xfrm_bundle_ok(xdst, fl, AF_INET)) {
+ dst_clone(dst);
+ break;
+ }
+ }
+ read_unlock_bh(&policy->lock);
+ return dst;
+}
+
+/* Allocate a chain of dst_entry's, attach the known xfrm's, and calculate
+ * all the metrics... In short, bundle a bundle.
+ */
+
+static int
+__xfrm4_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx,
+ struct flowi *fl, struct dst_entry **dst_p)
+{
+ struct dst_entry *dst, *dst_prev;
+ struct rtable *rt0 = (struct rtable*)(*dst_p);
+ struct rtable *rt = rt0;
+ u32 remote = fl->fl4_dst;
+ u32 local = fl->fl4_src;
+ struct flowi fl_tunnel = {
+ .nl_u = {
+ .ip4_u = {
+ .saddr = local,
+ .daddr = remote
+ }
+ }
+ };
+ int i;
+ int err;
+ int header_len = 0;
+ int trailer_len = 0;
+
+ dst = dst_prev = NULL;
+ dst_hold(&rt->u.dst);
+
+ for (i = 0; i < nx; i++) {
+ struct dst_entry *dst1 = dst_alloc(&xfrm4_dst_ops);
+ struct xfrm_dst *xdst;
+ int tunnel = 0;
+
+ if (unlikely(dst1 == NULL)) {
+ err = -ENOBUFS;
+ dst_release(&rt->u.dst);
+ goto error;
+ }
+
+ if (!dst)
+ dst = dst1;
+ else {
+ dst_prev->child = dst1;
+ dst1->flags |= DST_NOHASH;
+ dst_clone(dst1);
+ }
+
+ xdst = (struct xfrm_dst *)dst1;
+ xdst->route = &rt->u.dst;
+
+ dst1->next = dst_prev;
+ dst_prev = dst1;
+ if (xfrm[i]->props.mode) {
+ remote = xfrm[i]->id.daddr.a4;
+ local = xfrm[i]->props.saddr.a4;
+ tunnel = 1;
+ }
+ header_len += xfrm[i]->props.header_len;
+ trailer_len += xfrm[i]->props.trailer_len;
+
+ if (tunnel) {
+ fl_tunnel.fl4_src = local;
+ fl_tunnel.fl4_dst = remote;
+ err = xfrm_dst_lookup((struct xfrm_dst **)&rt,
+ &fl_tunnel, AF_INET);
+ if (err)
+ goto error;
+ } else
+ dst_hold(&rt->u.dst);
+ }
+
+ dst_prev->child = &rt->u.dst;
+ dst->path = &rt->u.dst;
+
+ *dst_p = dst;
+ dst = dst_prev;
+
+ dst_prev = *dst_p;
+ i = 0;
+ for (; dst_prev != &rt->u.dst; dst_prev = dst_prev->child) {
+ struct xfrm_dst *x = (struct xfrm_dst*)dst_prev;
+ x->u.rt.fl = *fl;
+
+ dst_prev->xfrm = xfrm[i++];
+ dst_prev->dev = rt->u.dst.dev;
+ if (rt->u.dst.dev)
+ dev_hold(rt->u.dst.dev);
+ dst_prev->obsolete = -1;
+ dst_prev->flags |= DST_HOST;
+ dst_prev->lastuse = jiffies;
+ dst_prev->header_len = header_len;
+ dst_prev->trailer_len = trailer_len;
+ memcpy(&dst_prev->metrics, &x->route->metrics, sizeof(dst_prev->metrics));
+
+		/* Copy neighbour for reachability confirmation */
+ dst_prev->neighbour = neigh_clone(rt->u.dst.neighbour);
+ dst_prev->input = rt->u.dst.input;
+ dst_prev->output = xfrm4_output;
+ if (rt->peer)
+ atomic_inc(&rt->peer->refcnt);
+ x->u.rt.peer = rt->peer;
+ /* Sheit... I remember I did this right. Apparently,
+ * it was magically lost, so this code needs audit */
+ x->u.rt.rt_flags = rt0->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL);
+ x->u.rt.rt_type = rt->rt_type;
+ x->u.rt.rt_src = rt0->rt_src;
+ x->u.rt.rt_dst = rt0->rt_dst;
+ x->u.rt.rt_gateway = rt->rt_gateway;
+ x->u.rt.rt_spec_dst = rt0->rt_spec_dst;
+ header_len -= x->u.dst.xfrm->props.header_len;
+ trailer_len -= x->u.dst.xfrm->props.trailer_len;
+ }
+
+ xfrm_init_pmtu(dst);
+ return 0;
+
+error:
+ if (dst)
+ dst_free(dst);
+ return err;
+}
+
+static void
+_decode_session4(struct sk_buff *skb, struct flowi *fl)
+{
+ struct iphdr *iph = skb->nh.iph;
+ u8 *xprth = skb->nh.raw + iph->ihl*4;
+
+ memset(fl, 0, sizeof(struct flowi));
+ if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) {
+ switch (iph->protocol) {
+ case IPPROTO_UDP:
+ case IPPROTO_TCP:
+ case IPPROTO_SCTP:
+ if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
+ u16 *ports = (u16 *)xprth;
+
+ fl->fl_ip_sport = ports[0];
+ fl->fl_ip_dport = ports[1];
+ }
+ break;
+
+ case IPPROTO_ICMP:
+ if (pskb_may_pull(skb, xprth + 2 - skb->data)) {
+ u8 *icmp = xprth;
+
+ fl->fl_icmp_type = icmp[0];
+ fl->fl_icmp_code = icmp[1];
+ }
+ break;
+
+ case IPPROTO_ESP:
+ if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
+ u32 *ehdr = (u32 *)xprth;
+
+ fl->fl_ipsec_spi = ehdr[0];
+ }
+ break;
+
+ case IPPROTO_AH:
+ if (pskb_may_pull(skb, xprth + 8 - skb->data)) {
+ u32 *ah_hdr = (u32*)xprth;
+
+ fl->fl_ipsec_spi = ah_hdr[1];
+ }
+ break;
+
+ case IPPROTO_COMP:
+ if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
+ u16 *ipcomp_hdr = (u16 *)xprth;
+
+ fl->fl_ipsec_spi = ntohl(ntohs(ipcomp_hdr[1]));
+ }
+ break;
+ default:
+ fl->fl_ipsec_spi = 0;
+ break;
+ };
+ }
+ fl->proto = iph->protocol;
+ fl->fl4_dst = iph->daddr;
+ fl->fl4_src = iph->saddr;
+}
+
+static inline int xfrm4_garbage_collect(void)
+{
+ read_lock(&xfrm4_policy_afinfo.lock);
+ xfrm4_policy_afinfo.garbage_collect();
+ read_unlock(&xfrm4_policy_afinfo.lock);
+ return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2);
+}
+
+static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu)
+{
+ struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+ struct dst_entry *path = xdst->route;
+
+ path->ops->update_pmtu(path, mtu);
+}
+
+static struct dst_ops xfrm4_dst_ops = {
+ .family = AF_INET,
+ .protocol = __constant_htons(ETH_P_IP),
+ .gc = xfrm4_garbage_collect,
+ .update_pmtu = xfrm4_update_pmtu,
+ .gc_thresh = 1024,
+ .entry_size = sizeof(struct xfrm_dst),
+};
+
+static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
+ .family = AF_INET,
+ .lock = RW_LOCK_UNLOCKED,
+ .type_map = &xfrm4_type_map,
+ .dst_ops = &xfrm4_dst_ops,
+ .dst_lookup = xfrm4_dst_lookup,
+ .find_bundle = __xfrm4_find_bundle,
+ .bundle_create = __xfrm4_bundle_create,
+ .decode_session = _decode_session4,
+};
+
+static void __init xfrm4_policy_init(void)
+{
+ xfrm_policy_register_afinfo(&xfrm4_policy_afinfo);
+}
+
+static void __exit xfrm4_policy_fini(void)
+{
+ xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo);
+}
+
+void __init xfrm4_init(void)
+{
+ xfrm4_state_init();
+ xfrm4_policy_init();
+}
+
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
new file mode 100644
index 00000000000..223a2e83853
--- /dev/null
+++ b/net/ipv4/xfrm4_state.c
@@ -0,0 +1,126 @@
+/*
+ * xfrm4_state.c
+ *
+ * Changes:
+ * YOSHIFUJI Hideaki @USAGI
+ * Split up af-specific portion
+ *
+ */
+
+#include <net/xfrm.h>
+#include <linux/pfkeyv2.h>
+#include <linux/ipsec.h>
+
+static struct xfrm_state_afinfo xfrm4_state_afinfo;
+
+static void
+__xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl,
+ struct xfrm_tmpl *tmpl,
+ xfrm_address_t *daddr, xfrm_address_t *saddr)
+{
+ x->sel.daddr.a4 = fl->fl4_dst;
+ x->sel.saddr.a4 = fl->fl4_src;
+ x->sel.dport = xfrm_flowi_dport(fl);
+ x->sel.dport_mask = ~0;
+ x->sel.sport = xfrm_flowi_sport(fl);
+ x->sel.sport_mask = ~0;
+ x->sel.prefixlen_d = 32;
+ x->sel.prefixlen_s = 32;
+ x->sel.proto = fl->proto;
+ x->sel.ifindex = fl->oif;
+ x->id = tmpl->id;
+ if (x->id.daddr.a4 == 0)
+ x->id.daddr.a4 = daddr->a4;
+ x->props.saddr = tmpl->saddr;
+ if (x->props.saddr.a4 == 0)
+ x->props.saddr.a4 = saddr->a4;
+ x->props.mode = tmpl->mode;
+ x->props.reqid = tmpl->reqid;
+ x->props.family = AF_INET;
+}
+
+static struct xfrm_state *
+__xfrm4_state_lookup(xfrm_address_t *daddr, u32 spi, u8 proto)
+{
+ unsigned h = __xfrm4_spi_hash(daddr, spi, proto);
+ struct xfrm_state *x;
+
+ list_for_each_entry(x, xfrm4_state_afinfo.state_byspi+h, byspi) {
+ if (x->props.family == AF_INET &&
+ spi == x->id.spi &&
+ daddr->a4 == x->id.daddr.a4 &&
+ proto == x->id.proto) {
+ xfrm_state_hold(x);
+ return x;
+ }
+ }
+ return NULL;
+}
+
+static struct xfrm_state *
+__xfrm4_find_acq(u8 mode, u32 reqid, u8 proto,
+ xfrm_address_t *daddr, xfrm_address_t *saddr,
+ int create)
+{
+ struct xfrm_state *x, *x0;
+ unsigned h = __xfrm4_dst_hash(daddr);
+
+ x0 = NULL;
+
+ list_for_each_entry(x, xfrm4_state_afinfo.state_bydst+h, bydst) {
+ if (x->props.family == AF_INET &&
+ daddr->a4 == x->id.daddr.a4 &&
+ mode == x->props.mode &&
+ proto == x->id.proto &&
+ saddr->a4 == x->props.saddr.a4 &&
+ reqid == x->props.reqid &&
+ x->km.state == XFRM_STATE_ACQ &&
+ !x->id.spi) {
+ x0 = x;
+ break;
+ }
+ }
+ if (!x0 && create && (x0 = xfrm_state_alloc()) != NULL) {
+ x0->sel.daddr.a4 = daddr->a4;
+ x0->sel.saddr.a4 = saddr->a4;
+ x0->sel.prefixlen_d = 32;
+ x0->sel.prefixlen_s = 32;
+ x0->props.saddr.a4 = saddr->a4;
+ x0->km.state = XFRM_STATE_ACQ;
+ x0->id.daddr.a4 = daddr->a4;
+ x0->id.proto = proto;
+ x0->props.family = AF_INET;
+ x0->props.mode = mode;
+ x0->props.reqid = reqid;
+ x0->props.family = AF_INET;
+ x0->lft.hard_add_expires_seconds = XFRM_ACQ_EXPIRES;
+ xfrm_state_hold(x0);
+ x0->timer.expires = jiffies + XFRM_ACQ_EXPIRES*HZ;
+ add_timer(&x0->timer);
+ xfrm_state_hold(x0);
+ list_add_tail(&x0->bydst, xfrm4_state_afinfo.state_bydst+h);
+ wake_up(&km_waitq);
+ }
+ if (x0)
+ xfrm_state_hold(x0);
+ return x0;
+}
+
+static struct xfrm_state_afinfo xfrm4_state_afinfo = {
+ .family = AF_INET,
+ .lock = RW_LOCK_UNLOCKED,
+ .init_tempsel = __xfrm4_init_tempsel,
+ .state_lookup = __xfrm4_state_lookup,
+ .find_acq = __xfrm4_find_acq,
+};
+
+void __init xfrm4_state_init(void)
+{
+ xfrm_state_register_afinfo(&xfrm4_state_afinfo);
+}
+
+void __exit xfrm4_state_fini(void)
+{
+ xfrm_state_unregister_afinfo(&xfrm4_state_afinfo);
+}
+
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
new file mode 100644
index 00000000000..413191f585f
--- /dev/null
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -0,0 +1,144 @@
+/* xfrm4_tunnel.c: Generic IP tunnel transformer.
+ *
+ * Copyright (C) 2003 David S. Miller (davem@redhat.com)
+ */
+
+#include <linux/skbuff.h>
+#include <linux/module.h>
+#include <net/xfrm.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+
+static int ipip_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct iphdr *iph;
+
+ iph = skb->nh.iph;
+ iph->tot_len = htons(skb->len);
+ ip_send_check(iph);
+
+ return 0;
+}
+
+static int ipip_xfrm_rcv(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb)
+{
+ return 0;
+}
+
+static struct xfrm_tunnel *ipip_handler;
+static DECLARE_MUTEX(xfrm4_tunnel_sem);
+
+int xfrm4_tunnel_register(struct xfrm_tunnel *handler)
+{
+ int ret;
+
+ down(&xfrm4_tunnel_sem);
+ ret = 0;
+ if (ipip_handler != NULL)
+ ret = -EINVAL;
+ if (!ret)
+ ipip_handler = handler;
+ up(&xfrm4_tunnel_sem);
+
+ return ret;
+}
+
+EXPORT_SYMBOL(xfrm4_tunnel_register);
+
+int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler)
+{
+ int ret;
+
+ down(&xfrm4_tunnel_sem);
+ ret = 0;
+ if (ipip_handler != handler)
+ ret = -EINVAL;
+ if (!ret)
+ ipip_handler = NULL;
+ up(&xfrm4_tunnel_sem);
+
+ synchronize_net();
+
+ return ret;
+}
+
+EXPORT_SYMBOL(xfrm4_tunnel_deregister);
+
+static int ipip_rcv(struct sk_buff *skb)
+{
+ struct xfrm_tunnel *handler = ipip_handler;
+
+ /* Tunnel devices take precedence. */
+ if (handler && handler->handler(skb) == 0)
+ return 0;
+
+ return xfrm4_rcv(skb);
+}
+
+static void ipip_err(struct sk_buff *skb, u32 info)
+{
+ struct xfrm_tunnel *handler = ipip_handler;
+ u32 arg = info;
+
+ if (handler)
+ handler->err_handler(skb, &arg);
+}
+
+static int ipip_init_state(struct xfrm_state *x, void *args)
+{
+ if (!x->props.mode)
+ return -EINVAL;
+
+ if (x->encap)
+ return -EINVAL;
+
+ x->props.header_len = sizeof(struct iphdr);
+
+ return 0;
+}
+
+static void ipip_destroy(struct xfrm_state *x)
+{
+}
+
+static struct xfrm_type ipip_type = {
+ .description = "IPIP",
+ .owner = THIS_MODULE,
+ .proto = IPPROTO_IPIP,
+ .init_state = ipip_init_state,
+ .destructor = ipip_destroy,
+ .input = ipip_xfrm_rcv,
+ .output = ipip_output
+};
+
+static struct net_protocol ipip_protocol = {
+ .handler = ipip_rcv,
+ .err_handler = ipip_err,
+ .no_policy = 1,
+};
+
+static int __init ipip_init(void)
+{
+ if (xfrm_register_type(&ipip_type, AF_INET) < 0) {
+ printk(KERN_INFO "ipip init: can't add xfrm type\n");
+ return -EAGAIN;
+ }
+ if (inet_add_protocol(&ipip_protocol, IPPROTO_IPIP) < 0) {
+ printk(KERN_INFO "ipip init: can't add protocol\n");
+ xfrm_unregister_type(&ipip_type, AF_INET);
+ return -EAGAIN;
+ }
+ return 0;
+}
+
+static void __exit ipip_fini(void)
+{
+ if (inet_del_protocol(&ipip_protocol, IPPROTO_IPIP) < 0)
+ printk(KERN_INFO "ipip close: can't remove protocol\n");
+ if (xfrm_unregister_type(&ipip_type, AF_INET) < 0)
+ printk(KERN_INFO "ipip close: can't remove xfrm type\n");
+}
+
+module_init(ipip_init);
+module_exit(ipip_fini);
+MODULE_LICENSE("GPL");